gsMap 1.64.tar.gz → 1.66.tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (68)
  1. {gsmap-1.64 → gsmap-1.66}/PKG-INFO +1 -1
  2. {gsmap-1.64 → gsmap-1.66}/src/gsMap/__init__.py +1 -1
  3. gsmap-1.66/src/gsMap/latent_to_gene.py +241 -0
  4. gsmap-1.66/src/gsMap/setup.py +5 -0
  5. gsmap-1.64/src/gsMap/latent_to_gene.py +0 -218
  6. gsmap-1.64/test/GPS-snakemake-workflow-macaque.smk +0 -268
  7. gsmap-1.64/test/GPS-snakemake-workflow.smk +0 -229
  8. {gsmap-1.64 → gsmap-1.66}/.github/workflows/publish-to-pypi.yml +0 -0
  9. {gsmap-1.64 → gsmap-1.66}/.gitignore +0 -0
  10. {gsmap-1.64 → gsmap-1.66}/LICENSE +0 -0
  11. {gsmap-1.64 → gsmap-1.66}/README.md +0 -0
  12. {gsmap-1.64 → gsmap-1.66}/docs/Makefile +0 -0
  13. {gsmap-1.64 → gsmap-1.66}/docs/make.bat +0 -0
  14. {gsmap-1.64 → gsmap-1.66}/docs/requirements.txt +0 -0
  15. {gsmap-1.64 → gsmap-1.66}/docs/source/_static/schematic.svg +0 -0
  16. {gsmap-1.64 → gsmap-1.66}/docs/source/api/cauchy_combination.rst +0 -0
  17. {gsmap-1.64 → gsmap-1.66}/docs/source/api/find_latent_representations.rst +0 -0
  18. {gsmap-1.64 → gsmap-1.66}/docs/source/api/format_sumstats.rst +0 -0
  19. {gsmap-1.64 → gsmap-1.66}/docs/source/api/generate_ldscore.rst +0 -0
  20. {gsmap-1.64 → gsmap-1.66}/docs/source/api/latent_to_gene.rst +0 -0
  21. {gsmap-1.64 → gsmap-1.66}/docs/source/api/quick_mode.rst +0 -0
  22. {gsmap-1.64 → gsmap-1.66}/docs/source/api/report.rst +0 -0
  23. {gsmap-1.64 → gsmap-1.66}/docs/source/api/spatial_ldsc.rst +0 -0
  24. {gsmap-1.64 → gsmap-1.66}/docs/source/api.rst +0 -0
  25. {gsmap-1.64 → gsmap-1.66}/docs/source/charts/cortex/Cortex_151507_Height.json +0 -0
  26. {gsmap-1.64 → gsmap-1.66}/docs/source/charts/cortex/Cortex_151507_IQ.json +0 -0
  27. {gsmap-1.64 → gsmap-1.66}/docs/source/charts/cortex/Cortex_151507_MCHC.json +0 -0
  28. {gsmap-1.64 → gsmap-1.66}/docs/source/charts/cortex/Cortex_151507_SCZ.json +0 -0
  29. {gsmap-1.64 → gsmap-1.66}/docs/source/charts/mouse_embryo/E16.5_E1S1_Height.json +0 -0
  30. {gsmap-1.64 → gsmap-1.66}/docs/source/charts/mouse_embryo/E16.5_E1S1_IQ.json +0 -0
  31. {gsmap-1.64 → gsmap-1.66}/docs/source/charts/mouse_embryo/E16.5_E1S1_MCHC.json +0 -0
  32. {gsmap-1.64 → gsmap-1.66}/docs/source/charts/mouse_embryo/E16.5_E1S1_SCZ.json +0 -0
  33. {gsmap-1.64 → gsmap-1.66}/docs/source/charts/test.json +0 -0
  34. {gsmap-1.64 → gsmap-1.66}/docs/source/conf.py +0 -0
  35. {gsmap-1.64 → gsmap-1.66}/docs/source/data.rst +0 -0
  36. {gsmap-1.64 → gsmap-1.66}/docs/source/data_format.md +0 -0
  37. {gsmap-1.64 → gsmap-1.66}/docs/source/index.rst +0 -0
  38. {gsmap-1.64 → gsmap-1.66}/docs/source/install.rst +0 -0
  39. {gsmap-1.64 → gsmap-1.66}/docs/source/mouse.rst +0 -0
  40. {gsmap-1.64 → gsmap-1.66}/docs/source/mouse_example.md +0 -0
  41. {gsmap-1.64 → gsmap-1.66}/docs/source/quick_mode.md +0 -0
  42. {gsmap-1.64 → gsmap-1.66}/docs/source/release.rst +0 -0
  43. {gsmap-1.64 → gsmap-1.66}/docs/source/tutorials.rst +0 -0
  44. {gsmap-1.64 → gsmap-1.66}/pyproject.toml +0 -0
  45. {gsmap-1.64 → gsmap-1.66}/schematic.png +0 -0
  46. {gsmap-1.64 → gsmap-1.66}/src/gsMap/GNN_VAE/__init__.py +0 -0
  47. {gsmap-1.64 → gsmap-1.66}/src/gsMap/GNN_VAE/adjacency_matrix.py +0 -0
  48. {gsmap-1.64 → gsmap-1.66}/src/gsMap/GNN_VAE/model.py +0 -0
  49. {gsmap-1.64 → gsmap-1.66}/src/gsMap/GNN_VAE/train.py +0 -0
  50. {gsmap-1.64 → gsmap-1.66}/src/gsMap/__main__.py +0 -0
  51. {gsmap-1.64 → gsmap-1.66}/src/gsMap/cauchy_combination_test.py +0 -0
  52. {gsmap-1.64 → gsmap-1.66}/src/gsMap/config.py +0 -0
  53. {gsmap-1.64 → gsmap-1.66}/src/gsMap/diagnosis.py +0 -0
  54. {gsmap-1.64 → gsmap-1.66}/src/gsMap/find_latent_representation.py +0 -0
  55. {gsmap-1.64 → gsmap-1.66}/src/gsMap/format_sumstats.py +0 -0
  56. {gsmap-1.64 → gsmap-1.66}/src/gsMap/generate_ldscore.py +0 -0
  57. {gsmap-1.64 → gsmap-1.66}/src/gsMap/main.py +0 -0
  58. {gsmap-1.64 → gsmap-1.66}/src/gsMap/report.py +0 -0
  59. {gsmap-1.64 → gsmap-1.66}/src/gsMap/run_all_mode.py +0 -0
  60. {gsmap-1.64 → gsmap-1.66}/src/gsMap/spatial_ldsc_multiple_sumstats.py +0 -0
  61. {gsmap-1.64 → gsmap-1.66}/src/gsMap/templates/report_template.html +0 -0
  62. {gsmap-1.64 → gsmap-1.66}/src/gsMap/utils/__init__.py +0 -0
  63. {gsmap-1.64 → gsmap-1.66}/src/gsMap/utils/generate_r2_matrix.py +0 -0
  64. {gsmap-1.64 → gsmap-1.66}/src/gsMap/utils/jackknife.py +0 -0
  65. {gsmap-1.64 → gsmap-1.66}/src/gsMap/utils/make_annotations.py +0 -0
  66. {gsmap-1.64 → gsmap-1.66}/src/gsMap/utils/manhattan_plot.py +0 -0
  67. {gsmap-1.64 → gsmap-1.66}/src/gsMap/utils/regression_read.py +0 -0
  68. {gsmap-1.64 → gsmap-1.66}/src/gsMap/visualize.py +0 -0
{gsmap-1.64 → gsmap-1.66}/PKG-INFO
@@ -1,6 +1,6 @@
  Metadata-Version: 2.1
  Name: gsMap
- Version: 1.64
+ Version: 1.66
  Summary: Genetics-informed pathogenic spatial mapping
  Author-email: liyang <songliyang@westlake.edu.cn>, wenhao <chenwenhao@westlake.edu.cn>
  Requires-Python: >=3.8
{gsmap-1.64 → gsmap-1.66}/src/gsMap/__init__.py
@@ -2,4 +2,4 @@
  Genetics-informed pathogenic spatial mapping
  '''
  
- __version__ = '1.64'
+ __version__ = '1.66'
gsmap-1.66/src/gsMap/latent_to_gene.py
@@ -0,0 +1,241 @@
+ import logging
+ from pathlib import Path
+
+ import numpy as np
+ import pandas as pd
+ import scanpy as sc
+ from scipy.sparse import csr_matrix
+ from scipy.stats import gmean
+ from scipy.stats import rankdata
+ from sklearn.metrics.pairwise import cosine_similarity
+ from sklearn.neighbors import NearestNeighbors
+ from joblib import Parallel, delayed
+ from tqdm import tqdm
+
+ from gsMap.config import LatentToGeneConfig
+
+ logger = logging.getLogger(__name__)
+
+
+ def find_neighbors(coor, num_neighbour):
+     """
+     Find Neighbors of each cell (based on spatial coordinates).
+     """
+     nbrs = NearestNeighbors(n_neighbors=num_neighbour).fit(coor)
+     distances, indices = nbrs.kneighbors(coor, return_distance=True)
+     cell_indices = np.arange(coor.shape[0])
+     cell1 = np.repeat(cell_indices, indices.shape[1])
+     cell2 = indices.flatten()
+     distance = distances.flatten()
+     spatial_net = pd.DataFrame({'Cell1': cell1, 'Cell2': cell2, 'Distance': distance})
+     return spatial_net
+
+
+ def build_spatial_net(adata, annotation, num_neighbour):
+     """
+     Build spatial neighbourhood matrix for each spot (cell) based on the spatial coordinates.
+     """
+     logger.info(f'------Building spatial graph based on spatial coordinates...')
+
+     coor = adata.obsm['spatial']
+     if annotation is not None:
+         logger.info(f'Cell annotations are provided...')
+         spatial_net_list = []
+         # Cells with annotations
+         for ct in adata.obs[annotation].dropna().unique():
+             idx = np.where(adata.obs[annotation] == ct)[0]
+             coor_temp = coor[idx, :]
+             spatial_net_temp = find_neighbors(coor_temp, min(num_neighbour, coor_temp.shape[0]))
+             # Map back to original indices
+             spatial_net_temp['Cell1'] = idx[spatial_net_temp['Cell1'].values]
+             spatial_net_temp['Cell2'] = idx[spatial_net_temp['Cell2'].values]
+             spatial_net_list.append(spatial_net_temp)
+             logger.info(f'{ct}: {coor_temp.shape[0]} cells')
+
+         # Cells labeled as nan
+         if pd.isnull(adata.obs[annotation]).any():
+             idx_nan = np.where(pd.isnull(adata.obs[annotation]))[0]
+             logger.info(f'Nan: {len(idx_nan)} cells')
+             spatial_net_temp = find_neighbors(coor, num_neighbour)
+             spatial_net_temp = spatial_net_temp[spatial_net_temp['Cell1'].isin(idx_nan)]
+             spatial_net_list.append(spatial_net_temp)
+         spatial_net = pd.concat(spatial_net_list, axis=0)
+     else:
+         logger.info(f'Cell annotations are not provided...')
+         spatial_net = find_neighbors(coor, num_neighbour)
+
+     return spatial_net
+
+
+ def find_neighbors_regional(cell_pos, spatial_net_dict, coor_latent, config, cell_annotations):
+     num_neighbour = config.num_neighbour
+     annotations = config.annotation
+
+     cell_use_pos = spatial_net_dict.get(cell_pos, [])
+     if len(cell_use_pos) == 0:
+         return []
+
+     cell_latent = coor_latent[cell_pos, :].reshape(1, -1)
+     neighbors_latent = coor_latent[cell_use_pos, :]
+     similarity = cosine_similarity(cell_latent, neighbors_latent).reshape(-1)
+
+     if annotations is not None:
+         cell_annotation = cell_annotations[cell_pos]
+         neighbor_annotations = cell_annotations[cell_use_pos]
+         mask = neighbor_annotations == cell_annotation
+         if not np.any(mask):
+             return []
+         similarity = similarity[mask]
+         cell_use_pos = cell_use_pos[mask]
+
+     if len(similarity) == 0:
+         return []
+
+     indices = np.argsort(-similarity)  # descending order
+     top_indices = indices[:num_neighbour]
+     cell_select_pos = cell_use_pos[top_indices]
+     return cell_select_pos
+
+
+ def compute_regional_mkscore(cell_pos, spatial_net_dict, coor_latent, config, cell_annotations,
+                              ranks, frac_whole, adata_X_bool):
+     """
+     Compute gmean ranks of a region.
+     """
+     cell_select_pos = find_neighbors_regional(
+         cell_pos, spatial_net_dict, coor_latent, config, cell_annotations
+     )
+     if len(cell_select_pos) == 0:
+         return np.zeros(ranks.shape[1], dtype=np.float16)
+
+     # Ratio of expression ranks
+     ranks_tg = ranks[cell_select_pos, :]
+     gene_ranks_region = gmean(ranks_tg, axis=0)
+     gene_ranks_region[gene_ranks_region <= 1] = 0
+
+     if not config.no_expression_fraction:
+         # Ratio of expression fractions
+         frac_focal = adata_X_bool[cell_select_pos, :].sum(axis=0).A1 / len(cell_select_pos)
+         frac_region = frac_focal / frac_whole
+         frac_region[frac_region <= 1] = 0
+         frac_region[frac_region > 1] = 1
+
+         # Simultaneously consider the ratio of expression fractions and ranks
+         gene_ranks_region = gene_ranks_region * frac_region
+
+     mkscore = np.exp(gene_ranks_region ** 1.5) - 1
+     return mkscore.astype(np.float16, copy=False)
+
+
+ def run_latent_to_gene(config: LatentToGeneConfig):
+     logger.info('------Loading the spatial data...')
+     adata = sc.read_h5ad(config.hdf5_with_latent_path)
+
+     if config.annotation is not None:
+         logger.info(f'------Cell annotations are provided as {config.annotation}...')
+         adata = adata[~pd.isnull(adata.obs[config.annotation]), :]
+
+     # Homologs transformation
+     if config.homolog_file is not None:
+         logger.info(f'------Transforming the {config.species} to HUMAN_GENE_SYM...')
+         homologs = pd.read_csv(config.homolog_file, sep='\t')
+         if homologs.shape[1] != 2:
+             raise ValueError(
+                 "Homologs file must have two columns: one for the species and one for the human gene symbol.")
+
+         homologs.columns = [config.species, 'HUMAN_GENE_SYM']
+         homologs.set_index(config.species, inplace=True)
+         adata = adata[:, adata.var_names.isin(homologs.index)]
+         logger.info(f"{adata.shape[1]} genes retained after homolog transformation.")
+         if adata.shape[1] < 100:
+             raise ValueError("Too few genes retained in ST data (<100).")
+         adata.var_names = homologs.loc[adata.var_names, 'HUMAN_GENE_SYM'].values
+         adata = adata[:, ~adata.var_names.duplicated()]
+
+     # Remove cells and genes that are not expressed
+     logger.info(f'Number of cells, genes of the input data: {adata.shape[0]},{adata.shape[1]}')
+     adata = adata[adata.X.sum(axis=1) > 0, adata.X.sum(axis=0) > 0]
+     logger.info(f'Number of cells, genes after transformation: {adata.shape[0]},{adata.shape[1]}')
+
+     # Create mappings
+     n_cells = adata.n_obs
+     n_genes = adata.n_vars
+
+     if config.annotation is not None:
+         cell_annotations = adata.obs[config.annotation].values
+     else:
+         cell_annotations = None
+
+     # Build the spatial graph
+     spatial_net = build_spatial_net(adata, config.annotation, config.num_neighbour_spatial)
+     spatial_net_dict = spatial_net.groupby('Cell1')['Cell2'].apply(np.array).to_dict()
+
+     # Extract the latent representation
+     coor_latent = adata.obsm[config.latent_representation]
+     coor_latent = coor_latent.astype(np.float32)
+
+     # Compute ranks
+     logger.info('------Ranking the spatial data...')
+     adata_X = adata.X.tocsr()
+     ranks = np.zeros((n_cells, n_genes), dtype=np.float32)
+
+     for i in tqdm(range(n_cells), desc="Computing ranks per cell"):
+         data = adata_X[i, :].toarray().flatten()
+         ranks[i, :] = rankdata(data, method='average')
+
+     # Geometric mean across slices
+     if config.gM_slices is not None:
+         logger.info('Geometrical mean across multiple slices is provided.')
+         gM_df = pd.read_parquet(config.gM_slices)
+         if config.species is not None:
+             homologs = pd.read_csv(config.homolog_file, sep='\t', header=None)
+             if homologs.shape[1] < 2:
+                 raise ValueError(
+                     "Homologs file must have at least two columns: one for the species and one for the human gene symbol.")
+             homologs.columns = [config.species, 'HUMAN_GENE_SYM']
+             homologs.set_index(config.species, inplace=True)
+             gM_df = gM_df.loc[gM_df.index.isin(homologs.index)]
+             gM_df.index = homologs.loc[gM_df.index, 'HUMAN_GENE_SYM'].values
+         common_genes = np.intersect1d(adata.var_names, gM_df.index)
+         gM_df = gM_df.loc[common_genes]
+         gM = gM_df['G_Mean'].values
+         adata = adata[:, common_genes]
+         ranks = ranks[:, np.isin(adata.var_names, common_genes)]
+     else:
+         gM = gmean(ranks, axis=0)
+
+     # Compute the fraction of each gene across cells
+     adata_X_bool = adata_X.astype(bool)
+     frac_whole = np.asarray(adata_X_bool.sum(axis=0)).flatten() / n_cells
+
+     # Normalize the ranks
+     ranks = ranks / gM
+
+     # Compute marker scores in parallel
+     logger.info('------Computing marker scores...')
+
+     def compute_mk_score_wrapper(cell_pos):
+         return compute_regional_mkscore(
+             cell_pos, spatial_net_dict, coor_latent, config, cell_annotations, ranks, frac_whole, adata_X_bool
+         )
+
+     mk_scores = [compute_mk_score_wrapper(cell_pos) for cell_pos in tqdm(range(n_cells), desc="Calculating marker scores")]
+     mk_score = np.vstack(mk_scores).T
+
+     # Remove mitochondrial genes
+     gene_names = adata.var_names.values.astype(str)
+     mt_gene_mask = ~(np.char.startswith(gene_names, 'MT-') | np.char.startswith(gene_names, 'mt-'))
+     mk_score = mk_score[mt_gene_mask, :]
+     gene_names = gene_names[mt_gene_mask]
+
+     # Save the marker scores
+     logger.info(f'------Saving marker scores ...')
+     output_file_path = Path(config.mkscore_feather_path)
+     output_file_path.parent.mkdir(parents=True, exist_ok=True, mode=0o755)
+     mk_score_df = pd.DataFrame(mk_score, index=gene_names, columns=adata.obs_names)
+     mk_score_df.reset_index(inplace=True)
+     mk_score_df.rename(columns={'index': 'HUMAN_GENE_SYM'}, inplace=True)
+     mk_score_df.to_feather(output_file_path)
+
+     # Save the modified adata object to disk
+     adata.write(config.hdf5_with_latent_path)
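
For orientation, a minimal usage sketch of the new module follows. It is illustrative only: the attribute names are the ones the code above reads from its config object, but whether LatentToGeneConfig accepts them as constructor keyword arguments (and its defaults) is an assumption, and all paths and values are hypothetical placeholders.

    # Hypothetical invocation of the rewritten latent_to_gene module.
    # Assumption: LatentToGeneConfig can be built with these keyword arguments;
    # only the attribute names are taken from the diff above.
    from gsMap.config import LatentToGeneConfig
    from gsMap.latent_to_gene import run_latent_to_gene

    config = LatentToGeneConfig(
        hdf5_with_latent_path='sample_add_latent.h5ad',            # AnnData with latent embedding in .obsm
        mkscore_feather_path='sample_gene_marker_score.feather',   # output marker-score table
        latent_representation='latent_GVAE',                       # key in adata.obsm
        annotation='layer_guess',                                  # obs column with cell annotations, or None
        num_neighbour=51,                                          # latent-space neighbours kept per spot
        num_neighbour_spatial=201,                                 # spatial neighbours searched per spot
        species=None,                                              # set together with homolog_file for non-human data
        homolog_file=None,                                         # two-column species → HUMAN_GENE_SYM table
        gM_slices=None,                                            # optional parquet of per-gene geometric means
        no_expression_fraction=False,                              # keep the expression-fraction filter
    )
    run_latent_to_gene(config)
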
gsmap-1.66/src/gsMap/setup.py
@@ -0,0 +1,5 @@
+ #!/usr/bin/env python
+ import setuptools
+
+ if __name__ == "__main__":
+     setuptools.setup(name='gsMap')
@@ -1,218 +0,0 @@
1
- import logging
2
- from pathlib import Path
3
-
4
- import numpy as np
5
- import pandas as pd
6
- import scanpy as sc
7
- from scipy.stats import gmean
8
- from scipy.stats import rankdata
9
- from sklearn.metrics.pairwise import cosine_similarity
10
- from sklearn.neighbors import NearestNeighbors
11
- from tqdm import tqdm
12
-
13
- from gsMap.config import LatentToGeneConfig
14
-
15
- logger = logging.getLogger(__name__)
16
-
17
-
18
- def find_Neighbors(coor, num_neighbour):
19
- """
20
- find Neighbors of each cell (based on spatial coordinates)
21
- """
22
- nbrs = NearestNeighbors(n_neighbors=num_neighbour).fit(coor)
23
- distances, indices = nbrs.kneighbors(coor, return_distance=True)
24
-
25
- KNN_list = [pd.DataFrame(zip([it] * indices[it].shape[0], indices[it], distances[it])) for it in
26
- range(indices.shape[0])]
27
- KNN_df = pd.concat(KNN_list)
28
- KNN_df.columns = ['Cell1', 'Cell2', 'Distance']
29
-
30
- spatial_net = KNN_df.copy()
31
- id_cell_trans = dict(zip(range(coor.shape[0]), np.array(coor.index)))
32
-
33
- spatial_net['Cell1'] = spatial_net['Cell1'].map(id_cell_trans)
34
- spatial_net['Cell2'] = spatial_net['Cell2'].map(id_cell_trans)
35
-
36
- return spatial_net
37
-
38
-
39
- def _build_spatial_net(adata, annotation, num_neighbour):
40
- """
41
- 1 Build spatial neighbourhood matrix for each spot (cell) based on the spatial coord
42
- """
43
- logger.info(f'------Building spatial graph based on spatial coordinates...')
44
-
45
- coor = pd.DataFrame(adata.obsm['spatial'])
46
- coor.index = adata.obs.index
47
-
48
- if not annotation is None:
49
- logger.info(f'Cell annotations are provided...')
50
- spatial_net = pd.DataFrame()
51
- # Cells with annotations
52
- for ct in adata.obs[annotation].dropna().unique():
53
- coor_temp = coor.loc[adata.obs[annotation] == ct, :]
54
- spatial_net_temp = find_Neighbors(coor_temp, min(num_neighbour, coor_temp.shape[0]))
55
- spatial_net = pd.concat((spatial_net, spatial_net_temp), axis=0)
56
- logger.info(f'{ct}: {coor_temp.shape[0]} cells')
57
-
58
- # Cells labeled as nan
59
- if pd.isnull(adata.obs[annotation]).any():
60
- cell_nan = adata.obs.index[np.where(pd.isnull(adata.obs[annotation]))[0]]
61
- logger.info(f'Nan: {len(cell_nan)} cells')
62
-
63
- spatial_net_temp = find_Neighbors(coor, num_neighbour)
64
- spatial_net_temp = spatial_net_temp.loc[spatial_net_temp.Cell1.isin(cell_nan), :]
65
- spatial_net = pd.concat((spatial_net, spatial_net_temp), axis=0)
66
- else:
67
- logger.info(f'Cell annotations are not provided...')
68
- spatial_net = find_Neighbors(coor, num_neighbour)
69
-
70
- return spatial_net
71
-
72
-
73
- def find_Neighbors_Regional(cell):
74
- cell_use = spatial_net_dict[cell]
75
- similarity = cosine_similarity(coor_latent.loc[cell].values.reshape(1, -1),
76
- coor_latent.loc[cell_use].values).reshape(-1)
77
- if not args.annotation is None:
78
- annotation = adata.obs[args.annotation]
79
- df = pd.DataFrame({'Cell2': cell_use, 'Similarity': similarity, 'Annotation': annotation[cell_use]})
80
- df = df.loc[df.loc[cell_use, 'Annotation'] == df.loc[cell, 'Annotation']]
81
- else:
82
- df = pd.DataFrame({'Cell2': cell_use, 'Similarity': similarity})
83
-
84
- df = df.sort_values(by='Similarity', ascending=False)
85
- cell_select = df.Cell2[0:args.num_neighbour].to_list()
86
-
87
- return cell_select
88
-
89
-
90
- def _compute_regional_mkscore(cell_tg, ):
91
- """
92
- compute gmean ranks of a region
93
- """
94
- cell_select = find_Neighbors_Regional(cell_tg)
95
-
96
- # Ratio of expression ranks
97
- ranks_tg = ranks.loc[cell_select]
98
- gene_ranks_region = gmean(ranks_tg, axis=0)
99
- gene_ranks_region[gene_ranks_region <= 1] = 0
100
-
101
- if not args.no_expression_fraction:
102
- # Ratio of expression fractions
103
- frac_focal = expressed_mask.loc[cell_select].sum(0) / len(cell_select)
104
- frac_region = frac_focal / frac_whole
105
- frac_region[frac_region <= 1] = 0
106
- frac_region[frac_region > 1] = 1
107
-
108
- # Simultaneously consider the ratio of expression fractions and ranks
109
- gene_ranks_region = (gene_ranks_region * frac_region).values
110
-
111
- mkscore = np.exp(gene_ranks_region ** 1.5) - 1
112
- return mkscore.astype(np.float16, copy=False)
113
-
114
-
115
- def run_latent_to_gene(config: LatentToGeneConfig):
116
- global adata, coor_latent, spatial_net, ranks, frac_whole, args, spatial_net_dict, expressed_mask
117
- args = config
118
- # Load and process the spatial data
119
- logger.info('------Loading the spatial data...')
120
- adata = sc.read_h5ad(config.hdf5_with_latent_path)
121
-
122
- logger.info('------Ranking the spatial data...')
123
- adata.layers['rank'] = rankdata(adata.X.toarray().astype(np.float32), axis=1).astype(np.float32)
124
-
125
- if not config.annotation is None:
126
- logger.info(f'------Cell annotations are provided as {config.annotation}...')
127
- adata = adata[~pd.isnull(adata.obs[config.annotation]), :]
128
-
129
- # Homologs transformation
130
- if not config.homolog_file is None:
131
- logger.info(f'------Transforming the {config.species} to HUMAN_GENE_SYM...')
132
- homologs = pd.read_csv(config.homolog_file, sep='\t')
133
- if homologs.shape[1] != 2:
134
- raise ValueError(
135
- "Homologs file must have two columns: one for the species and one for the human gene symbol.")
136
-
137
- homologs.columns = [config.species, 'HUMAN_GENE_SYM']
138
- homologs.set_index(config.species, inplace=True)
139
- adata = adata[:, adata.var_names.isin(homologs.index)]
140
- # Log the number of genes left after homolog transformation
141
- logger.info(f"{adata.shape[1]} genes retained after homolog transformation.")
142
- if adata.shape[1] < 100:
143
- raise ValueError("Too few genes retained in ST data (<100).")
144
- adata.var_names = homologs.loc[adata.var_names, 'HUMAN_GENE_SYM'].values
145
- # drop duplicated genes
146
- adata = adata[:, ~adata.var_names.duplicated()]
147
-
148
- # Remove cells that do not express any genes after transformation, and genes that are not expressed in any cells.
149
- logger.info(f'Number of cells, genes of the input data: {adata.shape[0]},{adata.shape[1]}')
150
- adata = adata[adata.X.sum(axis=1) > 0, adata.X.sum(axis=0) > 0]
151
- logger.info(f'Number of cells, genes after transformation: {adata.shape[0]},{adata.shape[1]}')
152
- # Buid the spatial graph
153
- spatial_net = _build_spatial_net(adata, config.annotation, config.num_neighbour_spatial)
154
- spatial_net.set_index('Cell1', inplace=True)
155
- # convert the spatial graph to a dictionary cell1 to cells in the neighbourhood
156
- spatial_net_dict = spatial_net.groupby(spatial_net.index).Cell2.apply(list).to_dict()
157
-
158
- # Extract the latent representation
159
- coor_latent = pd.DataFrame(adata.obsm[config.latent_representation])
160
- coor_latent.index = adata.obs.index
161
- # Find marker genes
162
- cell_list = adata.obs.index.tolist()
163
-
164
- # Load the geometrical mean across slices
165
- if config.gM_slices is not None:
166
- logger.info('Geometrical mean across multiple slices is provided.')
167
- gM = pd.read_parquet(config.gM_slices)
168
- if config.species is not None:
169
- homologs = pd.read_csv(config.homolog_file, sep='\t', header=None)
170
- if homologs.shape[1] < 2:
171
- raise ValueError(
172
- "Homologs file must have at least two columns: one for the species and one for the human gene symbol.")
173
- homologs.columns = [config.species, 'HUMAN_GENE_SYM']
174
- homologs.set_index(config.species, inplace=True)
175
- gM = gM.loc[gM.index.isin(homologs.index)]
176
- gM.index = homologs.loc[gM.index, 'HUMAN_GENE_SYM'].values
177
- common_gene = np.intersect1d(adata.var_names, gM.index)
178
- gM = gM.loc[common_gene]
179
- gM = gM['G_Mean'].to_numpy()
180
- adata = adata[:, common_gene]
181
- else:
182
- gM = gmean(adata.layers['rank'], axis=0)
183
-
184
- # Compute the fraction of each gene across cells
185
- expressed_mask = pd.DataFrame((adata.X > 0).toarray(), index=adata.obs.index, columns=adata.var.index)
186
- # frac_whole = np.array((adata_layer > 0).sum(axis=0))[0] / (adata.shape[0])
187
- frac_whole = np.array(expressed_mask.sum(axis=0)) / (adata.shape[0])
188
- # Normalize the geometrical mean
189
- ranks = adata.layers['rank'] / gM
190
- ranks = pd.DataFrame(ranks, index=adata.obs_names)
191
- ranks.columns = adata.var.index
192
- mk_score = [
193
- _compute_regional_mkscore(cell_tg)
194
- for cell_tg in tqdm(cell_list,
195
- desc="Finding markers (Rank-based approach) | cells")
196
- ]
197
- # Normalize the marker scores
198
- mk_score = pd.DataFrame(np.vstack(mk_score).T, index=adata.var_names, columns=cell_list)
199
- # mk_score_normalized = mk_score.div(mk_score.sum())*1e+2
200
-
201
- # Remove the mitochondrial genes from mk_score
202
- mt_gene_mask = ~adata.var_names.str.startswith(('MT-', 'mt-'))
203
- mk_score = mk_score[mt_gene_mask]
204
- adata = adata[:, mt_gene_mask]
205
-
206
- # # Save the mk_score DataFrame to an adata layer
207
- # adata.layers['mkscore'] = mk_score.values.T
208
-
209
- # Save the marker scores
210
- logger.info(f'------Saving marker scores ...')
211
- output_file_path = Path(config.mkscore_feather_path)
212
- output_file_path.parent.mkdir(parents=True, exist_ok=True, mode=0o755)
213
- mk_score.reset_index(inplace=True)
214
- mk_score.rename(columns={mk_score.columns[0]: 'HUMAN_GENE_SYM'}, inplace=True)
215
- mk_score.to_feather(output_file_path)
216
-
217
- # Save the modified adata object to disk
218
- adata.write(config.hdf5_with_latent_path)
@@ -1,268 +0,0 @@
1
- from pathlib import Path
2
-
3
- import numpy as np
4
-
5
- workdir: '/storage/yangjianLab/chenwenhao/projects/202312_GPS/data/macaque/processed'
6
- # workdir: '/storage/yangjianLab/chenwenhao/projects/202312_GPS/data/GPS_test/macaque'
7
- sample_name = "Cortex_151507"
8
- chrom = "all"
9
- QOS = "huge"
10
- # chrom = range(1,23)
11
- trait_names = [
12
- 'PGC3_SCZ_wave3_public_INFO80'
13
- ]
14
- root = "/storage/yangjianLab/songliyang/SpatialData/Data/Brain/macaque/Cell/processed/h5ad"
15
- # sample_names = [file.strip().split('.')[0]
16
- # for file in open(f'{root}/representative_slices2').readlines()]
17
- #
18
- # sample_names = '''
19
- # T33_macaque1 T44_macaque1 T82_macaque1 T97_macaque1 T125_macaque1 T127_macaque1 T129_macaque1 T131_macaque1 T135_macaque1 T137_macaque1 T139_macaque1
20
- # '''.strip().split()
21
- sample_names=[]
22
- for file in Path(root).glob('*.h5ad'):
23
- sample_names.append(file.stem)
24
- sample_names.remove('T825_macaque3') # due to 25% of spot don't have spatial coordinates
25
-
26
- annotation = "SubClass"
27
- data_type = "SCT"
28
- # sample_names = ['T584_macaque2']
29
-
30
- rule all:
31
- input:
32
- expand('{sample_name}/spatial_ldsc/{sample_name}.spatial_ldsc.done',trait_name=trait_names,sample_name=sample_names)
33
-
34
-
35
- # expand('{sample_name}/cauchy_combination/{sample_name}_{trait_name}.Cauchy.csv.gz',trait_name=trait_names,sample_name=sample_names)
36
- # expand('{sample_name}/cauchy_combination/{sample_name}_{trait_name}.Cauchy.csv.gz',trait_name=trait_names,sample_name=sample_names)
37
-
38
- rule test_run:
39
- input:
40
- [f'{sample_name}/generate_ldscore/{sample_name}_generate_ldscore_chr{chrom}.done' for sample_name in
41
- sample_names]
42
-
43
- # localrules: find_latent_representations,latent_to_gene
44
- def get_annotation(wildcards):
45
- if wildcards.sample_name.endswith('3'):
46
- print(wildcards.sample_name,'will use None as annotation')
47
- return None
48
- else:
49
- print(wildcards.sample_name,'will use SubClass as annotation')
50
- return 'SubClass'
51
-
52
-
53
- rule find_latent_representations:
54
- input:
55
- hdf5_path=f'{root}/{{sample_name}}.h5ad'
56
- output:
57
- hdf5_output='{sample_name}/find_latent_representations/{sample_name}_add_latent.h5ad'
58
- params:
59
- annotation= get_annotation,
60
- type=data_type,
61
- epochs=300,
62
- feat_hidden1=256,
63
- feat_hidden2=128,
64
- feat_cell=3000,
65
- gcn_hidden1=64,
66
- gcn_hidden2=30,
67
- p_drop=0.1,
68
- gcn_lr=0.001,
69
- gcn_decay=0.01,
70
- n_neighbors=11,
71
- label_w=1,
72
- rec_w=1,
73
- n_comps=300,
74
- weighted_adj=False,
75
- nheads=3,
76
- var=False,
77
- convergence_threshold=1e-4,
78
- hierarchically=False
79
- threads:
80
- 3
81
- benchmark: '{sample_name}/find_latent_representations/{sample_name}_add_latent.h5ad.benchmark'
82
- resources:
83
- mem_mb_per_cpu=lambda wildcards, threads, attempt: 20_000 * np.log2(attempt + 1),
84
- qos=QOS
85
- run:
86
- command = f"""
87
- gsmap run_find_latent_representations \
88
- --input_hdf5_path {input.hdf5_path} \
89
- --sample_name {wildcards.sample_name} \
90
- --output_hdf5_path {output.hdf5_output} \
91
- { '--annotation ' + params.annotation if params.annotation is not None else ''} \
92
- --type {params.type} \
93
- --epochs {params.epochs} \
94
- --feat_hidden1 {params.feat_hidden1} \
95
- --feat_hidden2 {params.feat_hidden2} \
96
- --feat_cell {params.feat_cell} \
97
- --gcn_hidden1 {params.gcn_hidden1} \
98
- --gcn_hidden2 {params.gcn_hidden2} \
99
- --p_drop {params.p_drop} \
100
- --gcn_lr {params.gcn_lr} \
101
- --gcn_decay {params.gcn_decay} \
102
- --n_neighbors {params.n_neighbors} \
103
- --label_w {params.label_w} \
104
- --rec_w {params.rec_w} \
105
- --n_comps {params.n_comps} \
106
- {'--weighted_adj' if params.weighted_adj else ''} \
107
- --nheads {params.nheads} \
108
- {'--var' if params.var else ''} \
109
- --convergence_threshold {params.convergence_threshold} \
110
- {'--hierarchically' if params.hierarchically else ''}
111
- """
112
- shell(
113
- f'{command}'
114
- )
115
-
116
-
117
- rule latent_to_gene:
118
- input:
119
- hdf5_with_latent_path=rules.find_latent_representations.output.hdf5_output
120
- output:
121
- feather_path='{sample_name}/latent_to_gene/{sample_name}_gene_marker_score.feather'
122
- params:
123
- latent_representation="latent_GVAE",
124
- num_neighbour=51,
125
- num_neighbour_spatial=201,
126
- species='MACAQUE_GENE_SYM',
127
- gs_species='/storage/yangjianLab/songliyang/SpatialData/homologs/macaque_human_homologs.txt',
128
- gM_slices=None,
129
- annotation=get_annotation,
130
- type=data_type
131
- threads:
132
- 1
133
- resources:
134
- mem_mb_per_cpu=lambda wildcards, threads, attempt: 70_000 * np.log2(attempt + 1),
135
- qos=QOS
136
- benchmark: '{sample_name}/latent_to_gene/{sample_name}_gene_marker_score.feather.benchmark'
137
- run:
138
- command = f"""
139
- gsmap run_latent_to_gene \
140
- --input_hdf5_with_latent_path {input.hdf5_with_latent_path} \
141
- --sample_name {wildcards.sample_name} \
142
- --output_feather_path {output.feather_path} \
143
- { '--annotation ' + params.annotation if params.annotation is not None else ''} \
144
- --type {params.type} \
145
- --latent_representation {params.latent_representation} \
146
- --num_neighbour {params.num_neighbour} \
147
- --num_neighbour_spatial {params.num_neighbour_spatial} \
148
- {'--species ' + params.species if params.species is not None else ''} \
149
- {'--gs_species ' + params.gs_species if params.gs_species is not None else ''} \
150
- {'--gM_slices ' + params.gM_slices if params.gM_slices is not None else ''}
151
- """
152
- shell(
153
- f'{command}'
154
- )
155
-
156
-
157
- rule generate_ldscore:
158
- input:
159
- mkscore_feather_file=rules.latent_to_gene.output.feather_path
160
- output:
161
- done='{sample_name}/generate_ldscore/{sample_name}_generate_ldscore_chr{chrom}.done'
162
- params:
163
- ld_score_save_dir='{sample_name}/generate_ldscore',
164
- gtf_annotation_file="/storage/yangjianLab/songliyang/ReferenceGenome/GRCh37/gencode.v39lift37.annotation.gtf",
165
- bfile_root="/storage/yangjianLab/sharedata/LDSC_resource/1000G_EUR_Phase3_plink/1000G.EUR.QC",
166
- keep_snp_root="/storage/yangjianLab/sharedata/LDSC_resource/hapmap3_snps/hm",
167
- gene_window_size=50000,
168
- enhancer_annotation_file=None,
169
- snp_multiple_enhancer_strategy='max_mkscore',
170
- gene_window_enhancer_priority=None,
171
- spots_per_chunk=1000,
172
- ld_wind=1,
173
- ld_unit="CM",
174
- additional_baseline_annotation_dir_path=None
175
- # additional_baseline_annotation_dir_path='/storage/yangjianLab/chenwenhao/projects/202312_GPS/data/resource/ldsc/baseline_v1.2/remove_base'
176
- benchmark: '{sample_name}/generate_ldscore/{sample_name}_generate_ldscore_chr{chrom}.done.benchmark'
177
- threads:
178
- 3
179
- resources:
180
- mem_mb_per_cpu=lambda wildcards, threads, attempt: 50_000 / threads * np.log2(attempt + 1),
181
- qos=QOS
182
- run:
183
- command = f"""
184
- gsmap run_generate_ldscore \
185
- --sample_name {wildcards.sample_name} \
186
- --chrom {wildcards.chrom} \
187
- --ldscore_save_dir {params.ld_score_save_dir} \
188
- --mkscore_feather_file {input.mkscore_feather_file} \
189
- --bfile_root {params.bfile_root} \
190
- --keep_snp_root {params.keep_snp_root} \
191
- --gtf_annotation_file {params.gtf_annotation_file} \
192
- --gene_window_size {params.gene_window_size} \
193
- {'--enhancer_annotation_file ' + params.enhancer_annotation_file if params.enhancer_annotation_file is not None else ''} \
194
- --snp_multiple_enhancer_strategy {params.snp_multiple_enhancer_strategy} \
195
- {'--gene_window_enhancer_priority ' + params.gene_window_enhancer_priority if params.gene_window_enhancer_priority is not None else ''} \
196
- --spots_per_chunk {params.spots_per_chunk} \
197
- --ld_wind {params.ld_wind} \
198
- --ld_unit {params.ld_unit} \
199
- { '--additional_baseline_annotation_dir_path ' + params.additional_baseline_annotation_dir_path if params.additional_baseline_annotation_dir_path is not None else '' }
200
- """
201
- shell(command)
202
- shell('touch {output.done}')
203
-
204
-
205
- def get_h2_file(wildcards):
206
- gwas_root = "/storage/yangjianLab/songliyang/GWAS_trait/LDSC"
207
- return f"{gwas_root}/{wildcards.trait_name}.sumstats.gz",
208
-
209
-
210
- def get_ldscore(wildcards):
211
- if chrom == "all":
212
- return f"{wildcards.sample_name}/generate_ldscore/{wildcards.sample_name}_generate_ldscore_chr{chrom}.done"
213
- else:
214
- assert tuple(chrom) == tuple(range(1,23)), "chrom must be all or range(1,23)"
215
- return [f"{wildcards.sample_name}/generate_ldscore/{wildcards.sample_name}_generate_ldscore_chr{c}.done" for
216
- c in chrom]
217
-
218
-
219
- rule spatial_ldsc:
220
- input:
221
- # h2_file=get_h2_file,
222
- generate_ldscore_done=get_ldscore
223
- output:
224
- done='{sample_name}/spatial_ldsc/{sample_name}.spatial_ldsc.done'
225
- params:
226
- ldscore_input_dir=rules.generate_ldscore.params.ld_score_save_dir,
227
- ldsc_save_dir='{sample_name}/spatial_ldsc',
228
- w_file="/storage/yangjianLab/sharedata/LDSC_resource/LDSC_SEG_ldscores/weights_hm3_no_hla/weights.",
229
- sumstats_config_file='/storage/yangjianLab/chenwenhao/projects/202312_GPS/src/gsMap/example/sumstats_config_sub.yaml',
230
- all_chunk = None
231
- threads:
232
- 10
233
- benchmark:
234
- '{sample_name}/spatial_ldsc/{sample_name}.spatial_ldsc.done.benchmark'
235
- resources:
236
- mem_mb_per_cpu=lambda wildcards, threads, attempt: 40_000 / threads * np.log2(attempt + 1),
237
- qos=QOS,
238
- partition='intel-sc3,amd-ep2'
239
- run:
240
- command = f"""
241
- gsmap run_spatial_ldsc --w_file {params.w_file} --sample_name {wildcards.sample_name} --num_processes {threads} --ldscore_input_dir {params.ldscore_input_dir} --ldsc_save_dir {params.ldsc_save_dir} --sumstats_config_file {params.sumstats_config_file} {f'--all_chunk {params.all_chunk}' if params.all_chunk else ''}
242
- """
243
- shell(
244
- f'{command}'
245
- 'touch {output.done}'
246
- )
247
-
248
-
249
- rule cauchy_combination:
250
- output:
251
- done='{sample_name}/cauchy_combination/{sample_name}_{trait_name}.Cauchy.csv.gz'
252
- input:
253
- hdf5_path=rules.find_latent_representations.output.hdf5_output,
254
- ldsc_done=rules.spatial_ldsc.output.done
255
- params:
256
- cauchy_save_dir='{sample_name}/cauchy_combination',
257
- annotation=annotation,
258
- ldsc_dir=rules.spatial_ldsc.params.ldsc_save_dir
259
- benchmark:
260
- '{sample_name}/cauchy_combination/{sample_name}_{trait_name}.Cauchy.csv.gz.benchmark'
261
- threads:
262
- 2
263
- resources:
264
- mem_mb_per_cpu=25_000
265
- shell:
266
- """
267
- gsmap run_cauchy_combination --input_hdf5_path {input.hdf5_path} --input_ldsc_dir {params.ldsc_dir} --sample_name {wildcards.sample_name} --output_cauchy_dir {params.cauchy_save_dir} --trait_name {wildcards.trait_name} --annotation {params.annotation}
268
- """
@@ -1,229 +0,0 @@
1
- import numpy as np
2
-
3
- workdir: '/storage/yangjianLab/chenwenhao/projects/202312_GPS/data/GPS_test/Nature_Neuroscience_2021/snake_workdir'
4
- sample_names = ["Cortex_151507"]
5
- # chrom = "all"
6
-
7
- chrom = range(1,23)
8
- # trait_names=[
9
- # 'ADULT1_ADULT2_ONSET_ASTHMA'
10
- # ]
11
- annotation= "layer_guess"
12
- data_type = 'count'
13
- rule all:
14
- input:
15
- expand('{sample_name}/spatial_ldsc/{sample_name}.spatial_ldsc.done', sample_name=sample_names)
16
- # expand('{sample_name}/cauchy_combination/{sample_name}_{trait_name}.Cauchy.csv.gz', trait_name=trait_names, sample_name=sample_names)
17
-
18
- rule find_latent_representations:
19
- input:
20
- hdf5_path = "/storage/yangjianLab/songliyang/SpatialData/Data/Brain/Human/Nature_Neuroscience_2021/processed/h5ad/Cortex_151507.h5ad"
21
- output:
22
- hdf5_output='{sample_name}/find_latent_representations/{sample_name}_add_latent.h5ad'
23
- params:
24
- annotation=annotation,
25
- type=data_type,
26
- epochs=300,
27
- feat_hidden1=256,
28
- feat_hidden2=128,
29
- feat_cell=3000,
30
- gcn_hidden1=64,
31
- gcn_hidden2=30,
32
- p_drop=0.1,
33
- gcn_lr=0.001,
34
- gcn_decay=0.01,
35
- n_neighbors=11,
36
- label_w=1,
37
- rec_w=1,
38
- n_comps=300,
39
- weighted_adj=False,
40
- nheads=3,
41
- var=False,
42
- convergence_threshold=1e-4,
43
- hierarchically=False
44
- threads:
45
- 1
46
- benchmark: '{sample_name}/find_latent_representations/{sample_name}_add_latent.h5ad.benchmark'
47
- run:
48
- command = f"""
49
- gsmap run_find_latent_representations \
50
- --input_hdf5_path {input.hdf5_path} \
51
- --sample_name {wildcards.sample_name} \
52
- --output_hdf5_path {output.hdf5_output} \
53
- { '--annotation ' + params.annotation if params.annotation is not None else ''} \
54
- --type {params.type} \
55
- --epochs {params.epochs} \
56
- --feat_hidden1 {params.feat_hidden1} \
57
- --feat_hidden2 {params.feat_hidden2} \
58
- --feat_cell {params.feat_cell} \
59
- --gcn_hidden1 {params.gcn_hidden1} \
60
- --gcn_hidden2 {params.gcn_hidden2} \
61
- --p_drop {params.p_drop} \
62
- --gcn_lr {params.gcn_lr} \
63
- --gcn_decay {params.gcn_decay} \
64
- --n_neighbors {params.n_neighbors} \
65
- --label_w {params.label_w} \
66
- --rec_w {params.rec_w} \
67
- --n_comps {params.n_comps} \
68
- {'--weighted_adj' if params.weighted_adj else ''} \
69
- --nheads {params.nheads} \
70
- {'--var' if params.var else ''} \
71
- --convergence_threshold {params.convergence_threshold} \
72
- {'--hierarchically' if params.hierarchically else ''}
73
- """
74
- shell(
75
- f'{command}'
76
- )
77
-
78
-
79
- rule latent_to_gene:
80
- input:
81
- hdf5_with_latent_path=rules.find_latent_representations.output.hdf5_output
82
- output:
83
- feather_path='{sample_name}/latent_to_gene/{sample_name}_gene_marker_score.feather'
84
- params:
85
- latent_representation="latent_GVAE",
86
- num_neighbour=51,
87
- num_neighbour_spatial=201,
88
- species=None,
89
- gs_species=None,
90
- gM_slices=None,
91
- annotation=annotation,
92
- type=data_type
93
- threads:
94
- 1
95
- resources:
96
- mem_mb_per_cpu=lambda wildcards, threads, attempt: 70_000 * np.log2(attempt + 1),
97
- qos='huge'
98
- benchmark: '{sample_name}/latent_to_gene/{sample_name}_gene_marker_score.feather.benchmark'
99
- run:
100
- command = f"""
101
- gsmap run_latent_to_gene \
102
- --input_hdf5_with_latent_path {input.hdf5_with_latent_path} \
103
- --sample_name {wildcards.sample_name} \
104
- --output_feather_path {output.feather_path} \
105
- { '--annotation ' + params.annotation if params.annotation is not None else ''} \
106
- --type {params.type} \
107
- --latent_representation {params.latent_representation} \
108
- --num_neighbour {params.num_neighbour} \
109
- --num_neighbour_spatial {params.num_neighbour_spatial} \
110
- {'--species ' + params.species if params.species is not None else ''} \
111
- {'--gs_species ' + params.gs_species if params.gs_species is not None else ''} \
112
- {'--gM_slices ' + params.gM_slices if params.gM_slices is not None else ''}
113
- """
114
- shell(
115
- f'{command}'
116
- )
117
-
118
-
119
-
120
- rule generate_ldscore:
121
- input:
122
- mkscore_feather_file=rules.latent_to_gene.output.feather_path
123
- output:
124
- done='{sample_name}/generate_ldscore/{sample_name}_generate_ldscore_chr{chrom}.done'
125
- params:
126
- ld_score_save_dir='{sample_name}/generate_ldscore',
127
- gtf_annotation_file="/storage/yangjianLab/songliyang/ReferenceGenome/GRCh37/gencode.v39lift37.annotation.gtf",
128
- bfile_root="/storage/yangjianLab/sharedata/LDSC_resource/1000G_EUR_Phase3_plink/1000G.EUR.QC",
129
- keep_snp_root="/storage/yangjianLab/sharedata/LDSC_resource/hapmap3_snps/hm",
130
- gene_window_size=50000,
131
- enhancer_annotation_file=None,
132
- snp_multiple_enhancer_strategy='max_mkscore',
133
- gene_window_enhancer_priority=None,
134
- spots_per_chunk=5000,
135
- ld_wind=1,
136
- ld_unit="CM",
137
- additional_baseline_annotation_dir_path='/storage/yangjianLab/chenwenhao/projects/202312_GPS/data/resource/ldsc/baseline_v1.2/remove_base'
138
- benchmark: '{sample_name}/generate_ldscore/{sample_name}_generate_ldscore_chr{chrom}.done.benchmark'
139
- threads:
140
- 3
141
- resources:
142
- mem_mb_per_cpu=lambda wildcards, threads, attempt: 45_000 / threads * np.log2(attempt + 1),
143
- qos='huge'
144
- run:
145
- command = f"""
146
- gsmap run_generate_ldscore \
147
- --sample_name {wildcards.sample_name} \
148
- --chrom {wildcards.chrom} \
149
- --ldscore_save_dir {params.ld_score_save_dir} \
150
- --mkscore_feather_file {input.mkscore_feather_file} \
151
- --bfile_root {params.bfile_root} \
152
- --keep_snp_root {params.keep_snp_root} \
153
- --gtf_annotation_file {params.gtf_annotation_file} \
154
- --gene_window_size {params.gene_window_size} \
155
- {'--enhancer_annotation_file ' + params.enhancer_annotation_file if params.enhancer_annotation_file is not None else ''} \
156
- --snp_multiple_enhancer_strategy {params.snp_multiple_enhancer_strategy} \
157
- {'--gene_window_enhancer_priority ' + params.gene_window_enhancer_priority if params.gene_window_enhancer_priority is not None else ''} \
158
- --spots_per_chunk {params.spots_per_chunk} \
159
- --ld_wind {params.ld_wind} \
160
- --ld_unit {params.ld_unit} \
161
- { '--additional_baseline_annotation_dir_path' + params.additional_baseline_annotation_dir_path if params.additional_baseline_annotation_dir_path is not None else '' }
162
- """
163
- shell(command)
164
- shell('touch {output.done}')
165
-
166
-
167
- def get_h2_file(wildcards):
168
- gwas_root = "/storage/yangjianLab/songliyang/GWAS_trait/LDSC"
169
- return f"{gwas_root}/{wildcards.trait_name}.sumstats.gz",
170
-
171
-
172
- def get_ldscore(wildcards):
173
- if chrom == "all":
174
- return f"{wildcards.sample_name}/generate_ldscore/{wildcards.sample_name}_generate_ldscore_chr{chrom}.done"
175
- else:
176
- assert tuple(chrom) == tuple(range(1,23)), "chrom must be all or range(1,23)"
177
- return [f"{wildcards.sample_name}/generate_ldscore/{wildcards.sample_name}_generate_ldscore_chr{c}.done" for
178
- c in chrom]
179
-
180
-
181
- rule spatial_ldsc:
182
- input:
183
- # h2_file=get_h2_file,
184
- generate_ldscore_done=get_ldscore
185
- output:
186
- done='{sample_name}/spatial_ldsc/{sample_name}.spatial_ldsc.done'
187
- params:
188
- ldscore_input_dir=rules.generate_ldscore.params.ld_score_save_dir,
189
- ldsc_save_dir='{sample_name}/spatial_ldsc',
190
- w_file="/storage/yangjianLab/sharedata/LDSC_resource/LDSC_SEG_ldscores/weights_hm3_no_hla/weights.",
191
- sumstats_config_file='/storage/yangjianLab/chenwenhao/projects/202312_GPS/src/gsMap/example/sumstats_config_sub.yaml',
192
- all_chunk = None
193
- threads:
194
- 2
195
- benchmark:
196
- '{sample_name}/spatial_ldsc/{sample_name}.spatial_ldsc.done.benchmark'
197
- resources:
198
- mem_mb_per_cpu=lambda wildcards, threads, attempt: 60_000 / threads * np.log2(attempt + 1),
199
- qos='huge'
200
- run:
201
- command = f"""
202
- gsmap run_spatial_ldsc --w_file {params.w_file} --sample_name {wildcards.sample_name} --num_processes {threads} --ldscore_input_dir {params.ldscore_input_dir} --ldsc_save_dir {params.ldsc_save_dir} --sumstats_config_file {params.sumstats_config_file} {f'--all_chunk {params.all_chunk}' if params.all_chunk else ''}
203
- """
204
- shell(
205
- f'{command}'
206
- 'touch {output.done}'
207
- )
208
-
209
-
210
- rule cauchy_combination:
211
- output:
212
- done='{sample_name}/cauchy_combination/{sample_name}_{trait_name}.Cauchy.csv.gz'
213
- input:
214
- hdf5_path=rules.find_latent_representations.output.hdf5_output,
215
- ldsc_done=rules.spatial_ldsc.output.done
216
- params:
217
- cauchy_save_dir='{sample_name}/cauchy_combination',
218
- annotation=annotation,
219
- ldsc_dir=rules.spatial_ldsc.params.ldsc_save_dir
220
- benchmark:
221
- '{sample_name}/cauchy_combination/{sample_name}_{trait_name}.Cauchy.csv.gz.benchmark'
222
- threads:
223
- 2
224
- resources:
225
- mem_mb_per_cpu=25_000
226
- shell:
227
- """
228
- gsmap run_cauchy_combination --input_hdf5_path {input.hdf5_path} --input_ldsc_dir {params.ldsc_dir} --sample_name {wildcards.sample_name} --output_cauchy_dir {params.cauchy_save_dir} --trait_name {wildcards.trait_name} --annotation {params.annotation}
229
- """