gsMap 1.64__tar.gz → 1.66__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- {gsmap-1.64 → gsmap-1.66}/PKG-INFO +1 -1
- {gsmap-1.64 → gsmap-1.66}/src/gsMap/__init__.py +1 -1
- gsmap-1.66/src/gsMap/latent_to_gene.py +241 -0
- gsmap-1.66/src/gsMap/setup.py +5 -0
- gsmap-1.64/src/gsMap/latent_to_gene.py +0 -218
- gsmap-1.64/test/GPS-snakemake-workflow-macaque.smk +0 -268
- gsmap-1.64/test/GPS-snakemake-workflow.smk +0 -229
- {gsmap-1.64 → gsmap-1.66}/.github/workflows/publish-to-pypi.yml +0 -0
- {gsmap-1.64 → gsmap-1.66}/.gitignore +0 -0
- {gsmap-1.64 → gsmap-1.66}/LICENSE +0 -0
- {gsmap-1.64 → gsmap-1.66}/README.md +0 -0
- {gsmap-1.64 → gsmap-1.66}/docs/Makefile +0 -0
- {gsmap-1.64 → gsmap-1.66}/docs/make.bat +0 -0
- {gsmap-1.64 → gsmap-1.66}/docs/requirements.txt +0 -0
- {gsmap-1.64 → gsmap-1.66}/docs/source/_static/schematic.svg +0 -0
- {gsmap-1.64 → gsmap-1.66}/docs/source/api/cauchy_combination.rst +0 -0
- {gsmap-1.64 → gsmap-1.66}/docs/source/api/find_latent_representations.rst +0 -0
- {gsmap-1.64 → gsmap-1.66}/docs/source/api/format_sumstats.rst +0 -0
- {gsmap-1.64 → gsmap-1.66}/docs/source/api/generate_ldscore.rst +0 -0
- {gsmap-1.64 → gsmap-1.66}/docs/source/api/latent_to_gene.rst +0 -0
- {gsmap-1.64 → gsmap-1.66}/docs/source/api/quick_mode.rst +0 -0
- {gsmap-1.64 → gsmap-1.66}/docs/source/api/report.rst +0 -0
- {gsmap-1.64 → gsmap-1.66}/docs/source/api/spatial_ldsc.rst +0 -0
- {gsmap-1.64 → gsmap-1.66}/docs/source/api.rst +0 -0
- {gsmap-1.64 → gsmap-1.66}/docs/source/charts/cortex/Cortex_151507_Height.json +0 -0
- {gsmap-1.64 → gsmap-1.66}/docs/source/charts/cortex/Cortex_151507_IQ.json +0 -0
- {gsmap-1.64 → gsmap-1.66}/docs/source/charts/cortex/Cortex_151507_MCHC.json +0 -0
- {gsmap-1.64 → gsmap-1.66}/docs/source/charts/cortex/Cortex_151507_SCZ.json +0 -0
- {gsmap-1.64 → gsmap-1.66}/docs/source/charts/mouse_embryo/E16.5_E1S1_Height.json +0 -0
- {gsmap-1.64 → gsmap-1.66}/docs/source/charts/mouse_embryo/E16.5_E1S1_IQ.json +0 -0
- {gsmap-1.64 → gsmap-1.66}/docs/source/charts/mouse_embryo/E16.5_E1S1_MCHC.json +0 -0
- {gsmap-1.64 → gsmap-1.66}/docs/source/charts/mouse_embryo/E16.5_E1S1_SCZ.json +0 -0
- {gsmap-1.64 → gsmap-1.66}/docs/source/charts/test.json +0 -0
- {gsmap-1.64 → gsmap-1.66}/docs/source/conf.py +0 -0
- {gsmap-1.64 → gsmap-1.66}/docs/source/data.rst +0 -0
- {gsmap-1.64 → gsmap-1.66}/docs/source/data_format.md +0 -0
- {gsmap-1.64 → gsmap-1.66}/docs/source/index.rst +0 -0
- {gsmap-1.64 → gsmap-1.66}/docs/source/install.rst +0 -0
- {gsmap-1.64 → gsmap-1.66}/docs/source/mouse.rst +0 -0
- {gsmap-1.64 → gsmap-1.66}/docs/source/mouse_example.md +0 -0
- {gsmap-1.64 → gsmap-1.66}/docs/source/quick_mode.md +0 -0
- {gsmap-1.64 → gsmap-1.66}/docs/source/release.rst +0 -0
- {gsmap-1.64 → gsmap-1.66}/docs/source/tutorials.rst +0 -0
- {gsmap-1.64 → gsmap-1.66}/pyproject.toml +0 -0
- {gsmap-1.64 → gsmap-1.66}/schematic.png +0 -0
- {gsmap-1.64 → gsmap-1.66}/src/gsMap/GNN_VAE/__init__.py +0 -0
- {gsmap-1.64 → gsmap-1.66}/src/gsMap/GNN_VAE/adjacency_matrix.py +0 -0
- {gsmap-1.64 → gsmap-1.66}/src/gsMap/GNN_VAE/model.py +0 -0
- {gsmap-1.64 → gsmap-1.66}/src/gsMap/GNN_VAE/train.py +0 -0
- {gsmap-1.64 → gsmap-1.66}/src/gsMap/__main__.py +0 -0
- {gsmap-1.64 → gsmap-1.66}/src/gsMap/cauchy_combination_test.py +0 -0
- {gsmap-1.64 → gsmap-1.66}/src/gsMap/config.py +0 -0
- {gsmap-1.64 → gsmap-1.66}/src/gsMap/diagnosis.py +0 -0
- {gsmap-1.64 → gsmap-1.66}/src/gsMap/find_latent_representation.py +0 -0
- {gsmap-1.64 → gsmap-1.66}/src/gsMap/format_sumstats.py +0 -0
- {gsmap-1.64 → gsmap-1.66}/src/gsMap/generate_ldscore.py +0 -0
- {gsmap-1.64 → gsmap-1.66}/src/gsMap/main.py +0 -0
- {gsmap-1.64 → gsmap-1.66}/src/gsMap/report.py +0 -0
- {gsmap-1.64 → gsmap-1.66}/src/gsMap/run_all_mode.py +0 -0
- {gsmap-1.64 → gsmap-1.66}/src/gsMap/spatial_ldsc_multiple_sumstats.py +0 -0
- {gsmap-1.64 → gsmap-1.66}/src/gsMap/templates/report_template.html +0 -0
- {gsmap-1.64 → gsmap-1.66}/src/gsMap/utils/__init__.py +0 -0
- {gsmap-1.64 → gsmap-1.66}/src/gsMap/utils/generate_r2_matrix.py +0 -0
- {gsmap-1.64 → gsmap-1.66}/src/gsMap/utils/jackknife.py +0 -0
- {gsmap-1.64 → gsmap-1.66}/src/gsMap/utils/make_annotations.py +0 -0
- {gsmap-1.64 → gsmap-1.66}/src/gsMap/utils/manhattan_plot.py +0 -0
- {gsmap-1.64 → gsmap-1.66}/src/gsMap/utils/regression_read.py +0 -0
- {gsmap-1.64 → gsmap-1.66}/src/gsMap/visualize.py +0 -0
@@ -0,0 +1,241 @@
|
|
1
|
+
import logging
|
2
|
+
from pathlib import Path
|
3
|
+
|
4
|
+
import numpy as np
|
5
|
+
import pandas as pd
|
6
|
+
import scanpy as sc
|
7
|
+
from scipy.sparse import csr_matrix
|
8
|
+
from scipy.stats import gmean
|
9
|
+
from scipy.stats import rankdata
|
10
|
+
from sklearn.metrics.pairwise import cosine_similarity
|
11
|
+
from sklearn.neighbors import NearestNeighbors
|
12
|
+
from joblib import Parallel, delayed
|
13
|
+
from tqdm import tqdm
|
14
|
+
|
15
|
+
from gsMap.config import LatentToGeneConfig
|
16
|
+
|
17
|
+
logger = logging.getLogger(__name__)
|
18
|
+
|
19
|
+
|
20
|
+
def find_neighbors(coor, num_neighbour):
    """
    Find the nearest neighbours of every cell from its coordinates.

    Parameters
    ----------
    coor : array-like with a ``shape`` attribute
        One row of coordinates per cell.
    num_neighbour : int
        Number of neighbours requested per cell. It is clamped to the number
        of rows in ``coor`` because scikit-learn raises ``ValueError`` when
        more neighbours are requested than points were fitted.

    Returns
    -------
    pandas.DataFrame
        One row per (cell, neighbour) pair with columns ``Cell1`` (row
        position of the query cell), ``Cell2`` (row position of the
        neighbour) and ``Distance``.
    """
    n_cells = coor.shape[0]
    if n_cells == 0:
        # Nothing to fit; return an empty edge list with the usual columns.
        return pd.DataFrame({'Cell1': [], 'Cell2': [], 'Distance': []})

    k = min(num_neighbour, n_cells)
    nbrs = NearestNeighbors(n_neighbors=k).fit(coor)
    distances, indices = nbrs.kneighbors(coor, return_distance=True)

    # Each query cell is repeated once per neighbour so the three columns line up.
    cell1 = np.repeat(np.arange(n_cells), indices.shape[1])
    return pd.DataFrame({
        'Cell1': cell1,
        'Cell2': indices.ravel(),
        'Distance': distances.ravel(),
    })
|
32
|
+
|
33
|
+
|
34
|
+
def build_spatial_net(adata, annotation, num_neighbour):
    """
    Build the spatial neighbourhood edge list for every spot (cell).

    Without an annotation the neighbours are searched among all cells. With
    an annotation the search is done separately inside each annotation group,
    and cells whose annotation is missing get neighbours from the whole
    dataset. ``Cell1``/``Cell2`` in the result are row positions in ``adata``.
    """
    logger.info(f'------Building spatial graph based on spatial coordinates...')

    coor = adata.obsm['spatial']

    if annotation is None:
        logger.info(f'Cell annotations are not provided...')
        return find_neighbors(coor, num_neighbour)

    logger.info(f'Cell annotations are provided...')
    labels = adata.obs[annotation]
    group_nets = []

    # Cells with annotations: neighbours are restricted to the same group.
    for ct in labels.dropna().unique():
        member_idx = np.where(labels == ct)[0]
        member_coor = coor[member_idx, :]
        group_net = find_neighbors(member_coor, min(num_neighbour, member_coor.shape[0]))
        # find_neighbors numbers cells within the group; map back to adata rows.
        group_net['Cell1'] = member_idx[group_net['Cell1'].values]
        group_net['Cell2'] = member_idx[group_net['Cell2'].values]
        group_nets.append(group_net)
        logger.info(f'{ct}: {member_coor.shape[0]} cells')

    # Cells labeled as nan: keep only the edges that start at such a cell.
    missing = pd.isnull(labels)
    if missing.any():
        idx_nan = np.where(missing)[0]
        logger.info(f'Nan: {len(idx_nan)} cells')
        whole_net = find_neighbors(coor, num_neighbour)
        group_nets.append(whole_net[whole_net['Cell1'].isin(idx_nan)])

    return pd.concat(group_nets, axis=0)
|
68
|
+
|
69
|
+
|
70
|
+
def find_neighbors_regional(cell_pos, spatial_net_dict, coor_latent, config, cell_annotations):
    """
    Pick, among the spatial neighbours of one cell, those closest in latent space.

    The candidates come from ``spatial_net_dict[cell_pos]``. They are ranked
    by cosine similarity of their latent vectors to the focal cell and, when
    ``config.annotation`` is set, restricted to candidates sharing the focal
    cell's annotation. At most ``config.num_neighbour`` positions are
    returned; an empty list is returned when nothing qualifies.
    """
    candidates = spatial_net_dict.get(cell_pos, [])
    if len(candidates) == 0:
        return []

    focal_latent = coor_latent[cell_pos, :].reshape(1, -1)
    similarity = cosine_similarity(focal_latent, coor_latent[candidates, :]).reshape(-1)

    if config.annotation is not None:
        same_label = cell_annotations[candidates] == cell_annotations[cell_pos]
        if not np.any(same_label):
            return []
        similarity = similarity[same_label]
        candidates = candidates[same_label]

    if len(similarity) == 0:
        return []

    # Negating the similarities makes argsort return the most similar first.
    order = np.argsort(-similarity)
    return candidates[order[:config.num_neighbour]]
|
98
|
+
|
99
|
+
|
100
|
+
def compute_regional_mkscore(cell_pos, spatial_net_dict, coor_latent, config, cell_annotations,
                             ranks, frac_whole, adata_X_bool):
    """
    Compute the marker score of every gene for the region around one cell.

    The region is the set of cells returned by ``find_neighbors_regional``.
    The score starts from the geometric mean of the region's rows of
    ``ranks``; values not above 1 are zeroed. Unless
    ``config.no_expression_fraction`` is set, genes whose expressed fraction
    in the region does not exceed ``frac_whole`` are zeroed as well. The
    result is ``exp(score ** 1.5) - 1`` as float16, or all zeros when the
    region is empty.
    """
    region = find_neighbors_regional(
        cell_pos, spatial_net_dict, coor_latent, config, cell_annotations
    )
    if len(region) == 0:
        return np.zeros(ranks.shape[1], dtype=np.float16)

    # Ratio of expression ranks
    score = gmean(ranks[region, :], axis=0)
    score[score <= 1] = 0

    if not config.no_expression_fraction:
        # Ratio of expression fractions, turned into a 0/1 gate per gene
        n_region = len(region)
        expressed_fraction = adata_X_bool[region, :].sum(axis=0).A1 / n_region
        enrichment = expressed_fraction / frac_whole
        enrichment[enrichment <= 1] = 0
        enrichment[enrichment > 1] = 1

        # Simultaneously consider the ratio of expression fractions and ranks
        score = score * enrichment

    return (np.exp(score ** 1.5) - 1).astype(np.float16, copy=False)
|
128
|
+
|
129
|
+
|
130
|
+
def run_latent_to_gene(config: LatentToGeneConfig):
    """
    Compute per-cell gene marker scores from a latent representation.

    Steps: load the h5ad at ``config.hdf5_with_latent_path``; optionally drop
    cells without annotation and map gene names through the homolog table;
    drop empty cells/genes; build the spatial neighbour graph; rank genes
    within every cell; normalise the ranks by a per-gene geometric mean
    (read from ``config.gM_slices`` when given, otherwise computed here);
    and score every cell's region with ``compute_regional_mkscore``.

    Side effects: writes the gene x cell score table (mitochondrial genes
    removed) as feather to ``config.mkscore_feather_path`` and overwrites
    ``config.hdf5_with_latent_path`` with the filtered AnnData.

    Raises
    ------
    ValueError
        If the homolog file has the wrong number of columns or fewer than
        100 genes survive the homolog mapping.
    """
    logger.info('------Loading the spatial data...')
    adata = sc.read_h5ad(config.hdf5_with_latent_path)

    if config.annotation is not None:
        logger.info(f'------Cell annotations are provided as {config.annotation}...')
        adata = adata[~pd.isnull(adata.obs[config.annotation]), :]

    # Homologs transformation
    if config.homolog_file is not None:
        logger.info(f'------Transforming the {config.species} to HUMAN_GENE_SYM...')
        homologs = pd.read_csv(config.homolog_file, sep='\t')
        if homologs.shape[1] != 2:
            raise ValueError(
                "Homologs file must have two columns: one for the species and one for the human gene symbol.")

        homologs.columns = [config.species, 'HUMAN_GENE_SYM']
        homologs.set_index(config.species, inplace=True)
        adata = adata[:, adata.var_names.isin(homologs.index)]
        logger.info(f"{adata.shape[1]} genes retained after homolog transformation.")
        if adata.shape[1] < 100:
            raise ValueError("Too few genes retained in ST data (<100).")
        adata.var_names = homologs.loc[adata.var_names, 'HUMAN_GENE_SYM'].values
        # Several source genes can map to one human symbol; keep the first.
        adata = adata[:, ~adata.var_names.duplicated()]

    # Remove cells and genes that are not expressed
    logger.info(f'Number of cells, genes of the input data: {adata.shape[0]},{adata.shape[1]}')
    adata = adata[adata.X.sum(axis=1) > 0, adata.X.sum(axis=0) > 0]
    logger.info(f'Number of cells, genes after transformation: {adata.shape[0]},{adata.shape[1]}')

    n_cells = adata.n_obs
    n_genes = adata.n_vars

    if config.annotation is not None:
        cell_annotations = adata.obs[config.annotation].values
    else:
        cell_annotations = None

    # Build the spatial graph: row position of a cell -> array of neighbour positions
    spatial_net = build_spatial_net(adata, config.annotation, config.num_neighbour_spatial)
    spatial_net_dict = spatial_net.groupby('Cell1')['Cell2'].apply(np.array).to_dict()

    # Extract the latent representation
    coor_latent = adata.obsm[config.latent_representation]
    coor_latent = coor_latent.astype(np.float32)

    # Compute ranks one cell at a time so the whole matrix is never densified at once
    logger.info('------Ranking the spatial data...')
    # csr_matrix() accepts dense and sparse input alike, unlike adata.X.tocsr().
    adata_X = csr_matrix(adata.X)
    ranks = np.zeros((n_cells, n_genes), dtype=np.float32)

    for i in tqdm(range(n_cells), desc="Computing ranks per cell"):
        data = adata_X[i, :].toarray().flatten()
        ranks[i, :] = rankdata(data, method='average')

    # Geometric mean across slices
    if config.gM_slices is not None:
        logger.info('Geometrical mean across multiple slices is provided.')
        gM_df = pd.read_parquet(config.gM_slices)
        if config.species is not None:
            homologs = pd.read_csv(config.homolog_file, sep='\t', header=None)
            if homologs.shape[1] < 2:
                raise ValueError(
                    "Homologs file must have at least two columns: one for the species and one for the human gene symbol.")
            homologs.columns = [config.species, 'HUMAN_GENE_SYM']
            homologs.set_index(config.species, inplace=True)
            gM_df = gM_df.loc[gM_df.index.isin(homologs.index)]
            gM_df.index = homologs.loc[gM_df.index, 'HUMAN_GENE_SYM'].values
        common_genes = np.intersect1d(adata.var_names, gM_df.index)
        gM_df = gM_df.loc[common_genes]
        gM = gM_df['G_Mean'].values
        # Resolve the column positions BEFORE subsetting adata, and apply the
        # same selection (in common_genes order) to every gene-indexed array so
        # that ranks, the expression mask, gM and adata.var_names stay aligned.
        gene_idx = adata.var_names.get_indexer(common_genes)
        adata = adata[:, common_genes]
        ranks = ranks[:, gene_idx]
        adata_X = adata_X[:, gene_idx]
    else:
        gM = gmean(ranks, axis=0)

    # Compute the fraction of cells expressing each gene
    adata_X_bool = adata_X.astype(bool)
    frac_whole = np.asarray(adata_X_bool.sum(axis=0)).flatten() / n_cells

    # Normalize the ranks
    ranks = ranks / gM

    # Compute marker scores, one cell after another
    logger.info('------Computing marker scores...')
    mk_scores = [
        compute_regional_mkscore(
            cell_pos, spatial_net_dict, coor_latent, config, cell_annotations,
            ranks, frac_whole, adata_X_bool
        )
        for cell_pos in tqdm(range(n_cells), desc="Calculating marker scores")
    ]
    # Stack to cells x genes, then transpose to genes x cells for the output table.
    mk_score = np.vstack(mk_scores).T

    # Remove mitochondrial genes
    gene_names = adata.var_names.values.astype(str)
    mt_gene_mask = ~(np.char.startswith(gene_names, 'MT-') | np.char.startswith(gene_names, 'mt-'))
    mk_score = mk_score[mt_gene_mask, :]
    gene_names = gene_names[mt_gene_mask]

    # Save the marker scores
    logger.info(f'------Saving marker scores ...')
    output_file_path = Path(config.mkscore_feather_path)
    output_file_path.parent.mkdir(parents=True, exist_ok=True, mode=0o755)
    mk_score_df = pd.DataFrame(mk_score, index=gene_names, columns=adata.obs_names)
    mk_score_df.reset_index(inplace=True)
    mk_score_df.rename(columns={'index': 'HUMAN_GENE_SYM'}, inplace=True)
    mk_score_df.to_feather(output_file_path)

    # Save the modified adata object to disk
    adata.write(config.hdf5_with_latent_path)
|
@@ -1,218 +0,0 @@
|
|
1
|
-
import logging
|
2
|
-
from pathlib import Path
|
3
|
-
|
4
|
-
import numpy as np
|
5
|
-
import pandas as pd
|
6
|
-
import scanpy as sc
|
7
|
-
from scipy.stats import gmean
|
8
|
-
from scipy.stats import rankdata
|
9
|
-
from sklearn.metrics.pairwise import cosine_similarity
|
10
|
-
from sklearn.neighbors import NearestNeighbors
|
11
|
-
from tqdm import tqdm
|
12
|
-
|
13
|
-
from gsMap.config import LatentToGeneConfig
|
14
|
-
|
15
|
-
logger = logging.getLogger(__name__)
|
16
|
-
|
17
|
-
|
18
|
-
def find_Neighbors(coor, num_neighbour):
    """
    Find the neighbours of each cell (based on spatial coordinates).

    Parameters
    ----------
    coor : pandas.DataFrame
        Coordinates, one row per cell; the index holds the cell identifiers.
    num_neighbour : int
        Number of neighbours to query per cell.

    Returns
    -------
    pandas.DataFrame
        Columns ``Cell1`` and ``Cell2`` (cell identifiers taken from
        ``coor.index``) and ``Distance``, one row per (cell, neighbour) pair.
        The index restarts at 0 for every query cell.
    """
    nbrs = NearestNeighbors(n_neighbors=num_neighbour).fit(coor)
    distances, indices = nbrs.kneighbors(coor, return_distance=True)

    # Build the whole edge list in one shot instead of one DataFrame per cell.
    n_cells, k = indices.shape
    cell_ids = np.array(coor.index)
    spatial_net = pd.DataFrame(
        {
            'Cell1': np.repeat(cell_ids, k),
            'Cell2': cell_ids[indices.ravel()],
            'Distance': distances.ravel(),
        },
        index=np.tile(np.arange(k), n_cells),
    )

    return spatial_net
|
37
|
-
|
38
|
-
|
39
|
-
def _build_spatial_net(adata, annotation, num_neighbour):
    """
    Build the spatial neighbourhood edge list for each spot (cell).

    Without an annotation, neighbours are searched among all cells. With an
    annotation, the search runs separately inside each annotation group, and
    cells with a missing annotation get neighbours from the whole dataset.
    Cells are identified by ``adata.obs.index`` in the result.
    """
    logger.info(f'------Building spatial graph based on spatial coordinates...')

    coor = pd.DataFrame(adata.obsm['spatial'])
    coor.index = adata.obs.index

    if annotation is None:
        logger.info(f'Cell annotations are not provided...')
        return find_Neighbors(coor, num_neighbour)

    logger.info(f'Cell annotations are provided...')
    labels = adata.obs[annotation]
    # Collect the per-group tables and concatenate once at the end; growing a
    # DataFrame with pd.concat inside the loop copies all earlier rows each time.
    nets = []

    # Cells with annotations
    for ct in labels.dropna().unique():
        coor_temp = coor.loc[labels == ct, :]
        nets.append(find_Neighbors(coor_temp, min(num_neighbour, coor_temp.shape[0])))
        logger.info(f'{ct}: {coor_temp.shape[0]} cells')

    # Cells labeled as nan
    missing = pd.isnull(labels)
    if missing.any():
        cell_nan = adata.obs.index[np.where(missing)[0]]
        logger.info(f'Nan: {len(cell_nan)} cells')

        spatial_net_temp = find_Neighbors(coor, num_neighbour)
        nets.append(spatial_net_temp.loc[spatial_net_temp.Cell1.isin(cell_nan), :])

    # pd.concat refuses an empty list; keep returning an empty frame in that case.
    return pd.concat(nets, axis=0) if nets else pd.DataFrame()
|
71
|
-
|
72
|
-
|
73
|
-
def find_Neighbors_Regional(cell):
    """
    Select the spatial neighbours of ``cell`` that are most similar in latent space.

    Reads the module-level ``spatial_net_dict``, ``coor_latent``, ``args`` and
    ``adata``. Neighbours are ranked by cosine similarity to ``cell``; when
    ``args.annotation`` is set, only neighbours with the same annotation as
    ``cell`` are kept. Returns a list of at most ``args.num_neighbour`` cell
    identifiers.
    """
    neighbours = spatial_net_dict[cell]
    focal_vector = coor_latent.loc[cell].values.reshape(1, -1)
    neighbour_vectors = coor_latent.loc[neighbours].values
    similarity = cosine_similarity(focal_vector, neighbour_vectors).reshape(-1)

    columns = {'Cell2': neighbours, 'Similarity': similarity}
    if args.annotation is not None:
        annotation = adata.obs[args.annotation]
        # The Series gives the frame its index (the neighbour identifiers).
        columns['Annotation'] = annotation[neighbours]
        table = pd.DataFrame(columns)
        same_label = table.loc[neighbours, 'Annotation'] == table.loc[cell, 'Annotation']
        table = table.loc[same_label]
    else:
        table = pd.DataFrame(columns)

    ranked = table.sort_values(by='Similarity', ascending=False)
    return ranked.Cell2[0:args.num_neighbour].to_list()
|
88
|
-
|
89
|
-
|
90
|
-
def _compute_regional_mkscore(cell_tg, ):
    """
    Compute the marker score of every gene for the region around ``cell_tg``.

    Reads the module-level ``ranks``, ``expressed_mask``, ``frac_whole`` and
    ``args``. The score is the geometric mean of the region's ranks (values
    not above 1 zeroed), optionally gated by whether the gene's expressed
    fraction in the region exceeds ``frac_whole``, then mapped through
    ``exp(score ** 1.5) - 1`` and returned as float16.
    """
    region_cells = find_Neighbors_Regional(cell_tg)

    # Ratio of expression ranks
    region_score = gmean(ranks.loc[region_cells], axis=0)
    region_score[region_score <= 1] = 0

    if not args.no_expression_fraction:
        # Ratio of expression fractions, reduced to a 0/1 gate per gene
        n_region = len(region_cells)
        focal_fraction = expressed_mask.loc[region_cells].sum(0) / n_region
        enrichment = focal_fraction / frac_whole
        enrichment[enrichment <= 1] = 0
        enrichment[enrichment > 1] = 1

        # Simultaneously consider the ratio of expression fractions and ranks
        region_score = (region_score * enrichment).values

    return (np.exp(region_score ** 1.5) - 1).astype(np.float16, copy=False)
|
113
|
-
|
114
|
-
|
115
|
-
def run_latent_to_gene(config: LatentToGeneConfig):
    """
    Compute per-cell gene marker scores and write them to a feather file.

    The intermediate objects are published as module-level globals because
    ``find_Neighbors_Regional`` and ``_compute_regional_mkscore`` read them
    by name instead of taking them as arguments.

    Side effects: writes ``config.mkscore_feather_path`` and overwrites
    ``config.hdf5_with_latent_path`` with the filtered AnnData.

    Raises ``ValueError`` if the homolog file has an unexpected number of
    columns or fewer than 100 genes survive the homolog mapping.
    """
    global adata, coor_latent, spatial_net, ranks, frac_whole, args, spatial_net_dict, expressed_mask
    args = config
    # Load and process the spatial data
    logger.info('------Loading the spatial data...')
    adata = sc.read_h5ad(config.hdf5_with_latent_path)

    # Ranks are computed within each cell (axis=1) on the densified matrix,
    # before any cell or gene filtering below.
    logger.info('------Ranking the spatial data...')
    adata.layers['rank'] = rankdata(adata.X.toarray().astype(np.float32), axis=1).astype(np.float32)

    if not config.annotation is None:
        logger.info(f'------Cell annotations are provided as {config.annotation}...')
        # Keep only the cells that carry an annotation.
        adata = adata[~pd.isnull(adata.obs[config.annotation]), :]

    # Homologs transformation
    if not config.homolog_file is None:
        logger.info(f'------Transforming the {config.species} to HUMAN_GENE_SYM...')
        homologs = pd.read_csv(config.homolog_file, sep='\t')
        if homologs.shape[1] != 2:
            raise ValueError(
                "Homologs file must have two columns: one for the species and one for the human gene symbol.")

        homologs.columns = [config.species, 'HUMAN_GENE_SYM']
        homologs.set_index(config.species, inplace=True)
        adata = adata[:, adata.var_names.isin(homologs.index)]
        # Log the number of genes left after homolog transformation
        logger.info(f"{adata.shape[1]} genes retained after homolog transformation.")
        if adata.shape[1] < 100:
            raise ValueError("Too few genes retained in ST data (<100).")
        adata.var_names = homologs.loc[adata.var_names, 'HUMAN_GENE_SYM'].values
        # drop duplicated genes
        adata = adata[:, ~adata.var_names.duplicated()]

    # Remove cells that do not express any genes after transformation, and genes that are not expressed in any cells.
    logger.info(f'Number of cells, genes of the input data: {adata.shape[0]},{adata.shape[1]}')
    adata = adata[adata.X.sum(axis=1) > 0, adata.X.sum(axis=0) > 0]
    logger.info(f'Number of cells, genes after transformation: {adata.shape[0]},{adata.shape[1]}')
    # Build the spatial graph
    spatial_net = _build_spatial_net(adata, config.annotation, config.num_neighbour_spatial)
    spatial_net.set_index('Cell1', inplace=True)
    # convert the spatial graph to a dictionary cell1 to cells in the neighbourhood
    spatial_net_dict = spatial_net.groupby(spatial_net.index).Cell2.apply(list).to_dict()

    # Extract the latent representation
    coor_latent = pd.DataFrame(adata.obsm[config.latent_representation])
    coor_latent.index = adata.obs.index
    # Find marker genes
    cell_list = adata.obs.index.tolist()

    # Load the geometrical mean across slices
    if config.gM_slices is not None:
        logger.info('Geometrical mean across multiple slices is provided.')
        gM = pd.read_parquet(config.gM_slices)
        if config.species is not None:
            # NOTE(review): this second read uses header=None while the read
            # above uses the default header handling - confirm which is intended.
            homologs = pd.read_csv(config.homolog_file, sep='\t', header=None)
            if homologs.shape[1] < 2:
                raise ValueError(
                    "Homologs file must have at least two columns: one for the species and one for the human gene symbol.")
            homologs.columns = [config.species, 'HUMAN_GENE_SYM']
            homologs.set_index(config.species, inplace=True)
            gM = gM.loc[gM.index.isin(homologs.index)]
            gM.index = homologs.loc[gM.index, 'HUMAN_GENE_SYM'].values
        # Restrict both the reference means and the data to the shared genes.
        common_gene = np.intersect1d(adata.var_names, gM.index)
        gM = gM.loc[common_gene]
        gM = gM['G_Mean'].to_numpy()
        adata = adata[:, common_gene]
    else:
        gM = gmean(adata.layers['rank'], axis=0)

    # Compute the fraction of each gene across cells
    expressed_mask = pd.DataFrame((adata.X > 0).toarray(), index=adata.obs.index, columns=adata.var.index)
    # frac_whole = np.array((adata_layer > 0).sum(axis=0))[0] / (adata.shape[0])
    frac_whole = np.array(expressed_mask.sum(axis=0)) / (adata.shape[0])
    # Normalize the ranks by the geometrical mean
    ranks = adata.layers['rank'] / gM
    ranks = pd.DataFrame(ranks, index=adata.obs_names)
    ranks.columns = adata.var.index
    # One score vector per cell; each call reads the globals assigned above.
    mk_score = [
        _compute_regional_mkscore(cell_tg)
        for cell_tg in tqdm(cell_list,
                            desc="Finding markers (Rank-based approach) | cells")
    ]
    # Assemble the genes x cells score table
    mk_score = pd.DataFrame(np.vstack(mk_score).T, index=adata.var_names, columns=cell_list)
    # mk_score_normalized = mk_score.div(mk_score.sum())*1e+2

    # Remove the mitochondrial genes from mk_score
    mt_gene_mask = ~adata.var_names.str.startswith(('MT-', 'mt-'))
    mk_score = mk_score[mt_gene_mask]
    adata = adata[:, mt_gene_mask]

    # # Save the mk_score DataFrame to an adata layer
    # adata.layers['mkscore'] = mk_score.values.T

    # Save the marker scores
    logger.info(f'------Saving marker scores ...')
    output_file_path = Path(config.mkscore_feather_path)
    output_file_path.parent.mkdir(parents=True, exist_ok=True, mode=0o755)
    mk_score.reset_index(inplace=True)
    # After reset_index the former index (gene names) is the first column.
    mk_score.rename(columns={mk_score.columns[0]: 'HUMAN_GENE_SYM'}, inplace=True)
    mk_score.to_feather(output_file_path)

    # Save the modified adata object to disk
    adata.write(config.hdf5_with_latent_path)
|
@@ -1,268 +0,0 @@
|
|
1
|
-
from pathlib import Path
|
2
|
-
|
3
|
-
import numpy as np
|
4
|
-
|
5
|
-
# Workflow-level configuration: working directory, run constants and the list
# of samples discovered from the h5ad files under `root`.
workdir: '/storage/yangjianLab/chenwenhao/projects/202312_GPS/data/macaque/processed'

sample_name = "Cortex_151507"
# Either "all" or range(1, 23); get_ldscore() accepts both forms.
chrom = "all"
QOS = "huge"
trait_names = [
    'PGC3_SCZ_wave3_public_INFO80',
]
root = "/storage/yangjianLab/songliyang/SpatialData/Data/Brain/macaque/Cell/processed/h5ad"

# Slices left out of the run. T825_macaque3: 25% of its spots have no spatial
# coordinates.
EXCLUDED_SAMPLES = {'T825_macaque3'}

# One sample per h5ad file. Filtering (rather than list.remove) keeps the
# workflow loadable when an excluded slice is not present under `root`.
sample_names = [
    h5ad_file.stem
    for h5ad_file in Path(root).glob('*.h5ad')
    if h5ad_file.stem not in EXCLUDED_SAMPLES
]

annotation = "SubClass"
data_type = "SCT"
|
29
|
-
|
30
|
-
# Default target: request the spatial_ldsc ".done" marker of every sample.
rule all:
    input:
        # The pattern only contains {sample_name}; trait_name is passed to
        # expand() but does not appear in the pattern.
        expand('{sample_name}/spatial_ldsc/{sample_name}.spatial_ldsc.done',trait_name=trait_names,sample_name=sample_names)
|
33
|
-
|
34
|
-
|
35
|
-
# expand('{sample_name}/cauchy_combination/{sample_name}_{trait_name}.Cauchy.csv.gz',trait_name=trait_names,sample_name=sample_names)
|
36
|
-
# expand('{sample_name}/cauchy_combination/{sample_name}_{trait_name}.Cauchy.csv.gz',trait_name=trait_names,sample_name=sample_names)
|
37
|
-
|
38
|
-
# Shorter target: stop after generate_ldscore for every sample, using the
# module-level `chrom` value in the marker file name.
rule test_run:
    input:
        [f'{sample_name}/generate_ldscore/{sample_name}_generate_ldscore_chr{chrom}.done' for sample_name in
         sample_names]
|
42
|
-
|
43
|
-
# localrules: find_latent_representations,latent_to_gene
|
44
|
-
def get_annotation(wildcards):
    """
    Choose the annotation column for a sample from its name.

    Samples whose name ends with '3' get no annotation (``None``); every
    other sample uses 'SubClass'. The decision is printed.
    """
    has_annotation = not wildcards.sample_name.endswith('3')
    chosen = 'SubClass' if has_annotation else None
    # str(None) is 'None', so the printed text matches the returned choice.
    print(wildcards.sample_name, f'will use {chosen} as annotation')
    return chosen
|
51
|
-
|
52
|
-
|
53
|
-
# Step 1: run `gsmap run_find_latent_representations` on one sample's h5ad and
# produce the h5ad with the latent representation added.
rule find_latent_representations:
    input:
        hdf5_path=f'{root}/{{sample_name}}.h5ad'
    output:
        hdf5_output='{sample_name}/find_latent_representations/{sample_name}_add_latent.h5ad'
    params:
        # Resolved per sample; may be None, in which case --annotation is omitted.
        annotation= get_annotation,
        type=data_type,
        epochs=300,
        feat_hidden1=256,
        feat_hidden2=128,
        feat_cell=3000,
        gcn_hidden1=64,
        gcn_hidden2=30,
        p_drop=0.1,
        gcn_lr=0.001,
        gcn_decay=0.01,
        n_neighbors=11,
        label_w=1,
        rec_w=1,
        n_comps=300,
        weighted_adj=False,
        nheads=3,
        var=False,
        convergence_threshold=1e-4,
        hierarchically=False
    threads:
        3
    benchmark: '{sample_name}/find_latent_representations/{sample_name}_add_latent.h5ad.benchmark'
    resources:
        # Memory request grows with log2(attempt + 1) on each retry.
        mem_mb_per_cpu=lambda wildcards, threads, attempt: 20_000 * np.log2(attempt + 1),
        qos=QOS
    run:
        # Boolean params become bare flags only when True; None-valued
        # options are left out of the command line.
        command = f"""
        gsmap run_find_latent_representations \
            --input_hdf5_path {input.hdf5_path} \
            --sample_name {wildcards.sample_name} \
            --output_hdf5_path {output.hdf5_output} \
            { '--annotation ' + params.annotation if params.annotation is not None else ''} \
            --type {params.type} \
            --epochs {params.epochs} \
            --feat_hidden1 {params.feat_hidden1} \
            --feat_hidden2 {params.feat_hidden2} \
            --feat_cell {params.feat_cell} \
            --gcn_hidden1 {params.gcn_hidden1} \
            --gcn_hidden2 {params.gcn_hidden2} \
            --p_drop {params.p_drop} \
            --gcn_lr {params.gcn_lr} \
            --gcn_decay {params.gcn_decay} \
            --n_neighbors {params.n_neighbors} \
            --label_w {params.label_w} \
            --rec_w {params.rec_w} \
            --n_comps {params.n_comps} \
            {'--weighted_adj' if params.weighted_adj else ''} \
            --nheads {params.nheads} \
            {'--var' if params.var else ''} \
            --convergence_threshold {params.convergence_threshold} \
            {'--hierarchically' if params.hierarchically else ''}
        """
        shell(
            f'{command}'
        )
|
115
|
-
|
116
|
-
|
117
|
-
# Step 2: run `gsmap run_latent_to_gene` on the h5ad produced by
# find_latent_representations and write the gene marker score feather file.
rule latent_to_gene:
    input:
        hdf5_with_latent_path=rules.find_latent_representations.output.hdf5_output
    output:
        feather_path='{sample_name}/latent_to_gene/{sample_name}_gene_marker_score.feather'
    params:
        latent_representation="latent_GVAE",
        num_neighbour=51,
        num_neighbour_spatial=201,
        species='MACAQUE_GENE_SYM',
        gs_species='/storage/yangjianLab/songliyang/SpatialData/homologs/macaque_human_homologs.txt',
        gM_slices=None,
        # Resolved per sample; may be None, in which case --annotation is omitted.
        annotation=get_annotation,
        type=data_type
    threads:
        1
    resources:
        # Memory request grows with log2(attempt + 1) on each retry.
        mem_mb_per_cpu=lambda wildcards, threads, attempt: 70_000 * np.log2(attempt + 1),
        qos=QOS
    benchmark: '{sample_name}/latent_to_gene/{sample_name}_gene_marker_score.feather.benchmark'
    run:
        # Options whose param is None are left out of the command line.
        command = f"""
        gsmap run_latent_to_gene \
            --input_hdf5_with_latent_path {input.hdf5_with_latent_path} \
            --sample_name {wildcards.sample_name} \
            --output_feather_path {output.feather_path} \
            { '--annotation ' + params.annotation if params.annotation is not None else ''} \
            --type {params.type} \
            --latent_representation {params.latent_representation} \
            --num_neighbour {params.num_neighbour} \
            --num_neighbour_spatial {params.num_neighbour_spatial} \
            {'--species ' + params.species if params.species is not None else ''} \
            {'--gs_species ' + params.gs_species if params.gs_species is not None else ''} \
            {'--gM_slices ' + params.gM_slices if params.gM_slices is not None else ''}
        """
        shell(
            f'{command}'
        )
|
155
|
-
|
156
|
-
|
157
|
-
# Step 3: run `gsmap run_generate_ldscore` for one sample and chromosome
# selection, then touch a ".done" marker file as the rule's output.
rule generate_ldscore:
    input:
        mkscore_feather_file=rules.latent_to_gene.output.feather_path
    output:
        done='{sample_name}/generate_ldscore/{sample_name}_generate_ldscore_chr{chrom}.done'
    params:
        ld_score_save_dir='{sample_name}/generate_ldscore',
        gtf_annotation_file="/storage/yangjianLab/songliyang/ReferenceGenome/GRCh37/gencode.v39lift37.annotation.gtf",
        bfile_root="/storage/yangjianLab/sharedata/LDSC_resource/1000G_EUR_Phase3_plink/1000G.EUR.QC",
        keep_snp_root="/storage/yangjianLab/sharedata/LDSC_resource/hapmap3_snps/hm",
        gene_window_size=50000,
        enhancer_annotation_file=None,
        snp_multiple_enhancer_strategy='max_mkscore',
        gene_window_enhancer_priority=None,
        spots_per_chunk=1000,
        ld_wind=1,
        ld_unit="CM",
        additional_baseline_annotation_dir_path=None
        # additional_baseline_annotation_dir_path='/storage/yangjianLab/chenwenhao/projects/202312_GPS/data/resource/ldsc/baseline_v1.2/remove_base'
    benchmark: '{sample_name}/generate_ldscore/{sample_name}_generate_ldscore_chr{chrom}.done.benchmark'
    threads:
        3
    resources:
        # Per-CPU memory: 50_000 split over the threads, scaled by
        # log2(attempt + 1) on each retry.
        mem_mb_per_cpu=lambda wildcards, threads, attempt: 50_000 / threads * np.log2(attempt + 1),
        qos=QOS
    run:
        # Options whose param is None are left out of the command line.
        command = f"""
        gsmap run_generate_ldscore \
            --sample_name {wildcards.sample_name} \
            --chrom {wildcards.chrom} \
            --ldscore_save_dir {params.ld_score_save_dir} \
            --mkscore_feather_file {input.mkscore_feather_file} \
            --bfile_root {params.bfile_root} \
            --keep_snp_root {params.keep_snp_root} \
            --gtf_annotation_file {params.gtf_annotation_file} \
            --gene_window_size {params.gene_window_size} \
            {'--enhancer_annotation_file ' + params.enhancer_annotation_file if params.enhancer_annotation_file is not None else ''} \
            --snp_multiple_enhancer_strategy {params.snp_multiple_enhancer_strategy} \
            {'--gene_window_enhancer_priority ' + params.gene_window_enhancer_priority if params.gene_window_enhancer_priority is not None else ''} \
            --spots_per_chunk {params.spots_per_chunk} \
            --ld_wind {params.ld_wind} \
            --ld_unit {params.ld_unit} \
            { '--additional_baseline_annotation_dir_path ' + params.additional_baseline_annotation_dir_path if params.additional_baseline_annotation_dir_path is not None else '' }
        """
        shell(command)
        # The marker file is created only after the command above has run.
        shell('touch {output.done}')
|
203
|
-
|
204
|
-
|
205
|
-
def get_h2_file(wildcards):
|
206
|
-
gwas_root = "/storage/yangjianLab/songliyang/GWAS_trait/LDSC"
|
207
|
-
return f"{gwas_root}/{wildcards.trait_name}.sumstats.gz",
|
208
|
-
|
209
|
-
|
210
|
-
def get_ldscore(wildcards):
|
211
|
-
if chrom == "all":
|
212
|
-
return f"{wildcards.sample_name}/generate_ldscore/{wildcards.sample_name}_generate_ldscore_chr{chrom}.done"
|
213
|
-
else:
|
214
|
-
assert tuple(chrom) == tuple(range(1,23)), "chrom must be all or range(1,23)"
|
215
|
-
return [f"{wildcards.sample_name}/generate_ldscore/{wildcards.sample_name}_generate_ldscore_chr{c}.done" for
|
216
|
-
c in chrom]
|
217
|
-
|
218
|
-
|
219
|
-
rule spatial_ldsc:
|
220
|
-
input:
|
221
|
-
# h2_file=get_h2_file,
|
222
|
-
generate_ldscore_done=get_ldscore
|
223
|
-
output:
|
224
|
-
done='{sample_name}/spatial_ldsc/{sample_name}.spatial_ldsc.done'
|
225
|
-
params:
|
226
|
-
ldscore_input_dir=rules.generate_ldscore.params.ld_score_save_dir,
|
227
|
-
ldsc_save_dir='{sample_name}/spatial_ldsc',
|
228
|
-
w_file="/storage/yangjianLab/sharedata/LDSC_resource/LDSC_SEG_ldscores/weights_hm3_no_hla/weights.",
|
229
|
-
sumstats_config_file='/storage/yangjianLab/chenwenhao/projects/202312_GPS/src/gsMap/example/sumstats_config_sub.yaml',
|
230
|
-
all_chunk = None
|
231
|
-
threads:
|
232
|
-
10
|
233
|
-
benchmark:
|
234
|
-
'{sample_name}/spatial_ldsc/{sample_name}.spatial_ldsc.done.benchmark'
|
235
|
-
resources:
|
236
|
-
mem_mb_per_cpu=lambda wildcards, threads, attempt: 40_000 / threads * np.log2(attempt + 1),
|
237
|
-
qos=QOS,
|
238
|
-
partition='intel-sc3,amd-ep2'
|
239
|
-
run:
|
240
|
-
command = f"""
|
241
|
-
gsmap run_spatial_ldsc --w_file {params.w_file} --sample_name {wildcards.sample_name} --num_processes {threads} --ldscore_input_dir {params.ldscore_input_dir} --ldsc_save_dir {params.ldsc_save_dir} --sumstats_config_file {params.sumstats_config_file} {f'--all_chunk {params.all_chunk}' if params.all_chunk else ''}
|
242
|
-
"""
|
243
|
-
shell(
|
244
|
-
f'{command}'
|
245
|
-
'touch {output.done}'
|
246
|
-
)
|
247
|
-
|
248
|
-
|
249
|
-
rule cauchy_combination:
|
250
|
-
output:
|
251
|
-
done='{sample_name}/cauchy_combination/{sample_name}_{trait_name}.Cauchy.csv.gz'
|
252
|
-
input:
|
253
|
-
hdf5_path=rules.find_latent_representations.output.hdf5_output,
|
254
|
-
ldsc_done=rules.spatial_ldsc.output.done
|
255
|
-
params:
|
256
|
-
cauchy_save_dir='{sample_name}/cauchy_combination',
|
257
|
-
annotation=annotation,
|
258
|
-
ldsc_dir=rules.spatial_ldsc.params.ldsc_save_dir
|
259
|
-
benchmark:
|
260
|
-
'{sample_name}/cauchy_combination/{sample_name}_{trait_name}.Cauchy.csv.gz.benchmark'
|
261
|
-
threads:
|
262
|
-
2
|
263
|
-
resources:
|
264
|
-
mem_mb_per_cpu=25_000
|
265
|
-
shell:
|
266
|
-
"""
|
267
|
-
gsmap run_cauchy_combination --input_hdf5_path {input.hdf5_path} --input_ldsc_dir {params.ldsc_dir} --sample_name {wildcards.sample_name} --output_cauchy_dir {params.cauchy_save_dir} --trait_name {wildcards.trait_name} --annotation {params.annotation}
|
268
|
-
"""
|
@@ -1,229 +0,0 @@
|
|
1
|
-
import numpy as np
|
2
|
-
|
3
|
-
workdir: '/storage/yangjianLab/chenwenhao/projects/202312_GPS/data/GPS_test/Nature_Neuroscience_2021/snake_workdir'
|
4
|
-
sample_names = ["Cortex_151507"]
|
5
|
-
# chrom = "all"
|
6
|
-
|
7
|
-
chrom = range(1,23)
|
8
|
-
# trait_names=[
|
9
|
-
# 'ADULT1_ADULT2_ONSET_ASTHMA'
|
10
|
-
# ]
|
11
|
-
annotation= "layer_guess"
|
12
|
-
data_type = 'count'
|
13
|
-
rule all:
|
14
|
-
input:
|
15
|
-
expand('{sample_name}/spatial_ldsc/{sample_name}.spatial_ldsc.done', sample_name=sample_names)
|
16
|
-
# expand('{sample_name}/cauchy_combination/{sample_name}_{trait_name}.Cauchy.csv.gz', trait_name=trait_names, sample_name=sample_names)
|
17
|
-
|
18
|
-
rule find_latent_representations:
|
19
|
-
input:
|
20
|
-
hdf5_path = "/storage/yangjianLab/songliyang/SpatialData/Data/Brain/Human/Nature_Neuroscience_2021/processed/h5ad/Cortex_151507.h5ad"
|
21
|
-
output:
|
22
|
-
hdf5_output='{sample_name}/find_latent_representations/{sample_name}_add_latent.h5ad'
|
23
|
-
params:
|
24
|
-
annotation=annotation,
|
25
|
-
type=data_type,
|
26
|
-
epochs=300,
|
27
|
-
feat_hidden1=256,
|
28
|
-
feat_hidden2=128,
|
29
|
-
feat_cell=3000,
|
30
|
-
gcn_hidden1=64,
|
31
|
-
gcn_hidden2=30,
|
32
|
-
p_drop=0.1,
|
33
|
-
gcn_lr=0.001,
|
34
|
-
gcn_decay=0.01,
|
35
|
-
n_neighbors=11,
|
36
|
-
label_w=1,
|
37
|
-
rec_w=1,
|
38
|
-
n_comps=300,
|
39
|
-
weighted_adj=False,
|
40
|
-
nheads=3,
|
41
|
-
var=False,
|
42
|
-
convergence_threshold=1e-4,
|
43
|
-
hierarchically=False
|
44
|
-
threads:
|
45
|
-
1
|
46
|
-
benchmark: '{sample_name}/find_latent_representations/{sample_name}_add_latent.h5ad.benchmark'
|
47
|
-
run:
|
48
|
-
command = f"""
|
49
|
-
gsmap run_find_latent_representations \
|
50
|
-
--input_hdf5_path {input.hdf5_path} \
|
51
|
-
--sample_name {wildcards.sample_name} \
|
52
|
-
--output_hdf5_path {output.hdf5_output} \
|
53
|
-
{ '--annotation ' + params.annotation if params.annotation is not None else ''} \
|
54
|
-
--type {params.type} \
|
55
|
-
--epochs {params.epochs} \
|
56
|
-
--feat_hidden1 {params.feat_hidden1} \
|
57
|
-
--feat_hidden2 {params.feat_hidden2} \
|
58
|
-
--feat_cell {params.feat_cell} \
|
59
|
-
--gcn_hidden1 {params.gcn_hidden1} \
|
60
|
-
--gcn_hidden2 {params.gcn_hidden2} \
|
61
|
-
--p_drop {params.p_drop} \
|
62
|
-
--gcn_lr {params.gcn_lr} \
|
63
|
-
--gcn_decay {params.gcn_decay} \
|
64
|
-
--n_neighbors {params.n_neighbors} \
|
65
|
-
--label_w {params.label_w} \
|
66
|
-
--rec_w {params.rec_w} \
|
67
|
-
--n_comps {params.n_comps} \
|
68
|
-
{'--weighted_adj' if params.weighted_adj else ''} \
|
69
|
-
--nheads {params.nheads} \
|
70
|
-
{'--var' if params.var else ''} \
|
71
|
-
--convergence_threshold {params.convergence_threshold} \
|
72
|
-
{'--hierarchically' if params.hierarchically else ''}
|
73
|
-
"""
|
74
|
-
shell(
|
75
|
-
f'{command}'
|
76
|
-
)
|
77
|
-
|
78
|
-
|
79
|
-
rule latent_to_gene:
|
80
|
-
input:
|
81
|
-
hdf5_with_latent_path=rules.find_latent_representations.output.hdf5_output
|
82
|
-
output:
|
83
|
-
feather_path='{sample_name}/latent_to_gene/{sample_name}_gene_marker_score.feather'
|
84
|
-
params:
|
85
|
-
latent_representation="latent_GVAE",
|
86
|
-
num_neighbour=51,
|
87
|
-
num_neighbour_spatial=201,
|
88
|
-
species=None,
|
89
|
-
gs_species=None,
|
90
|
-
gM_slices=None,
|
91
|
-
annotation=annotation,
|
92
|
-
type=data_type
|
93
|
-
threads:
|
94
|
-
1
|
95
|
-
resources:
|
96
|
-
mem_mb_per_cpu=lambda wildcards, threads, attempt: 70_000 * np.log2(attempt + 1),
|
97
|
-
qos='huge'
|
98
|
-
benchmark: '{sample_name}/latent_to_gene/{sample_name}_gene_marker_score.feather.benchmark'
|
99
|
-
run:
|
100
|
-
command = f"""
|
101
|
-
gsmap run_latent_to_gene \
|
102
|
-
--input_hdf5_with_latent_path {input.hdf5_with_latent_path} \
|
103
|
-
--sample_name {wildcards.sample_name} \
|
104
|
-
--output_feather_path {output.feather_path} \
|
105
|
-
{ '--annotation ' + params.annotation if params.annotation is not None else ''} \
|
106
|
-
--type {params.type} \
|
107
|
-
--latent_representation {params.latent_representation} \
|
108
|
-
--num_neighbour {params.num_neighbour} \
|
109
|
-
--num_neighbour_spatial {params.num_neighbour_spatial} \
|
110
|
-
{'--species ' + params.species if params.species is not None else ''} \
|
111
|
-
{'--gs_species ' + params.gs_species if params.gs_species is not None else ''} \
|
112
|
-
{'--gM_slices ' + params.gM_slices if params.gM_slices is not None else ''}
|
113
|
-
"""
|
114
|
-
shell(
|
115
|
-
f'{command}'
|
116
|
-
)
|
117
|
-
|
118
|
-
|
119
|
-
|
120
|
-
rule generate_ldscore:
|
121
|
-
input:
|
122
|
-
mkscore_feather_file=rules.latent_to_gene.output.feather_path
|
123
|
-
output:
|
124
|
-
done='{sample_name}/generate_ldscore/{sample_name}_generate_ldscore_chr{chrom}.done'
|
125
|
-
params:
|
126
|
-
ld_score_save_dir='{sample_name}/generate_ldscore',
|
127
|
-
gtf_annotation_file="/storage/yangjianLab/songliyang/ReferenceGenome/GRCh37/gencode.v39lift37.annotation.gtf",
|
128
|
-
bfile_root="/storage/yangjianLab/sharedata/LDSC_resource/1000G_EUR_Phase3_plink/1000G.EUR.QC",
|
129
|
-
keep_snp_root="/storage/yangjianLab/sharedata/LDSC_resource/hapmap3_snps/hm",
|
130
|
-
gene_window_size=50000,
|
131
|
-
enhancer_annotation_file=None,
|
132
|
-
snp_multiple_enhancer_strategy='max_mkscore',
|
133
|
-
gene_window_enhancer_priority=None,
|
134
|
-
spots_per_chunk=5000,
|
135
|
-
ld_wind=1,
|
136
|
-
ld_unit="CM",
|
137
|
-
additional_baseline_annotation_dir_path='/storage/yangjianLab/chenwenhao/projects/202312_GPS/data/resource/ldsc/baseline_v1.2/remove_base'
|
138
|
-
benchmark: '{sample_name}/generate_ldscore/{sample_name}_generate_ldscore_chr{chrom}.done.benchmark'
|
139
|
-
threads:
|
140
|
-
3
|
141
|
-
resources:
|
142
|
-
mem_mb_per_cpu=lambda wildcards, threads, attempt: 45_000 / threads * np.log2(attempt + 1),
|
143
|
-
qos='huge'
|
144
|
-
run:
|
145
|
-
command = f"""
|
146
|
-
gsmap run_generate_ldscore \
|
147
|
-
--sample_name {wildcards.sample_name} \
|
148
|
-
--chrom {wildcards.chrom} \
|
149
|
-
--ldscore_save_dir {params.ld_score_save_dir} \
|
150
|
-
--mkscore_feather_file {input.mkscore_feather_file} \
|
151
|
-
--bfile_root {params.bfile_root} \
|
152
|
-
--keep_snp_root {params.keep_snp_root} \
|
153
|
-
--gtf_annotation_file {params.gtf_annotation_file} \
|
154
|
-
--gene_window_size {params.gene_window_size} \
|
155
|
-
{'--enhancer_annotation_file ' + params.enhancer_annotation_file if params.enhancer_annotation_file is not None else ''} \
|
156
|
-
--snp_multiple_enhancer_strategy {params.snp_multiple_enhancer_strategy} \
|
157
|
-
{'--gene_window_enhancer_priority ' + params.gene_window_enhancer_priority if params.gene_window_enhancer_priority is not None else ''} \
|
158
|
-
--spots_per_chunk {params.spots_per_chunk} \
|
159
|
-
--ld_wind {params.ld_wind} \
|
160
|
-
--ld_unit {params.ld_unit} \
|
161
|
-
{ '--additional_baseline_annotation_dir_path' + params.additional_baseline_annotation_dir_path if params.additional_baseline_annotation_dir_path is not None else '' }
|
162
|
-
"""
|
163
|
-
shell(command)
|
164
|
-
shell('touch {output.done}')
|
165
|
-
|
166
|
-
|
167
|
-
def get_h2_file(wildcards):
|
168
|
-
gwas_root = "/storage/yangjianLab/songliyang/GWAS_trait/LDSC"
|
169
|
-
return f"{gwas_root}/{wildcards.trait_name}.sumstats.gz",
|
170
|
-
|
171
|
-
|
172
|
-
def get_ldscore(wildcards):
|
173
|
-
if chrom == "all":
|
174
|
-
return f"{wildcards.sample_name}/generate_ldscore/{wildcards.sample_name}_generate_ldscore_chr{chrom}.done"
|
175
|
-
else:
|
176
|
-
assert tuple(chrom) == tuple(range(1,23)), "chrom must be all or range(1,23)"
|
177
|
-
return [f"{wildcards.sample_name}/generate_ldscore/{wildcards.sample_name}_generate_ldscore_chr{c}.done" for
|
178
|
-
c in chrom]
|
179
|
-
|
180
|
-
|
181
|
-
rule spatial_ldsc:
|
182
|
-
input:
|
183
|
-
# h2_file=get_h2_file,
|
184
|
-
generate_ldscore_done=get_ldscore
|
185
|
-
output:
|
186
|
-
done='{sample_name}/spatial_ldsc/{sample_name}.spatial_ldsc.done'
|
187
|
-
params:
|
188
|
-
ldscore_input_dir=rules.generate_ldscore.params.ld_score_save_dir,
|
189
|
-
ldsc_save_dir='{sample_name}/spatial_ldsc',
|
190
|
-
w_file="/storage/yangjianLab/sharedata/LDSC_resource/LDSC_SEG_ldscores/weights_hm3_no_hla/weights.",
|
191
|
-
sumstats_config_file='/storage/yangjianLab/chenwenhao/projects/202312_GPS/src/gsMap/example/sumstats_config_sub.yaml',
|
192
|
-
all_chunk = None
|
193
|
-
threads:
|
194
|
-
2
|
195
|
-
benchmark:
|
196
|
-
'{sample_name}/spatial_ldsc/{sample_name}.spatial_ldsc.done.benchmark'
|
197
|
-
resources:
|
198
|
-
mem_mb_per_cpu=lambda wildcards, threads, attempt: 60_000 / threads * np.log2(attempt + 1),
|
199
|
-
qos='huge'
|
200
|
-
run:
|
201
|
-
command = f"""
|
202
|
-
gsmap run_spatial_ldsc --w_file {params.w_file} --sample_name {wildcards.sample_name} --num_processes {threads} --ldscore_input_dir {params.ldscore_input_dir} --ldsc_save_dir {params.ldsc_save_dir} --sumstats_config_file {params.sumstats_config_file} {f'--all_chunk {params.all_chunk}' if params.all_chunk else ''}
|
203
|
-
"""
|
204
|
-
shell(
|
205
|
-
f'{command}'
|
206
|
-
'touch {output.done}'
|
207
|
-
)
|
208
|
-
|
209
|
-
|
210
|
-
rule cauchy_combination:
|
211
|
-
output:
|
212
|
-
done='{sample_name}/cauchy_combination/{sample_name}_{trait_name}.Cauchy.csv.gz'
|
213
|
-
input:
|
214
|
-
hdf5_path=rules.find_latent_representations.output.hdf5_output,
|
215
|
-
ldsc_done=rules.spatial_ldsc.output.done
|
216
|
-
params:
|
217
|
-
cauchy_save_dir='{sample_name}/cauchy_combination',
|
218
|
-
annotation=annotation,
|
219
|
-
ldsc_dir=rules.spatial_ldsc.params.ldsc_save_dir
|
220
|
-
benchmark:
|
221
|
-
'{sample_name}/cauchy_combination/{sample_name}_{trait_name}.Cauchy.csv.gz.benchmark'
|
222
|
-
threads:
|
223
|
-
2
|
224
|
-
resources:
|
225
|
-
mem_mb_per_cpu=25_000
|
226
|
-
shell:
|
227
|
-
"""
|
228
|
-
gsmap run_cauchy_combination --input_hdf5_path {input.hdf5_path} --input_ldsc_dir {params.ldsc_dir} --sample_name {wildcards.sample_name} --output_cauchy_dir {params.cauchy_save_dir} --trait_name {wildcards.trait_name} --annotation {params.annotation}
|
229
|
-
"""
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|