gsMap 1.67__py3-none-any.whl → 1.70__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- gsMap/{GNN_VAE → GNN}/__init__.py +0 -0
- gsMap/{GNN_VAE → GNN}/adjacency_matrix.py +75 -75
- gsMap/{GNN_VAE → GNN}/model.py +89 -89
- gsMap/{GNN_VAE → GNN}/train.py +88 -86
- gsMap/__init__.py +5 -5
- gsMap/__main__.py +2 -2
- gsMap/cauchy_combination_test.py +141 -141
- gsMap/config.py +805 -803
- gsMap/diagnosis.py +273 -273
- gsMap/find_latent_representation.py +133 -145
- gsMap/format_sumstats.py +407 -407
- gsMap/generate_ldscore.py +618 -618
- gsMap/latent_to_gene.py +234 -234
- gsMap/main.py +31 -31
- gsMap/report.py +160 -160
- gsMap/run_all_mode.py +194 -194
- gsMap/setup.py +0 -0
- gsMap/spatial_ldsc_multiple_sumstats.py +380 -380
- gsMap/templates/report_template.html +198 -198
- gsMap/utils/__init__.py +0 -0
- gsMap/utils/generate_r2_matrix.py +735 -735
- gsMap/utils/jackknife.py +514 -514
- gsMap/utils/make_annotations.py +518 -518
- gsMap/utils/manhattan_plot.py +639 -639
- gsMap/utils/regression_read.py +294 -294
- gsMap/visualize.py +198 -198
- {gsmap-1.67.dist-info → gsmap-1.70.dist-info}/LICENSE +21 -21
- {gsmap-1.67.dist-info → gsmap-1.70.dist-info}/METADATA +28 -22
- gsmap-1.70.dist-info/RECORD +31 -0
- gsmap-1.67.dist-info/RECORD +0 -31
- {gsmap-1.67.dist-info → gsmap-1.70.dist-info}/WHEEL +0 -0
- {gsmap-1.67.dist-info → gsmap-1.70.dist-info}/entry_points.txt +0 -0
gsMap/diagnosis.py
CHANGED
@@ -1,273 +1,273 @@
|
|
1
|
-
import logging
|
2
|
-
import warnings
|
3
|
-
from pathlib import Path
|
4
|
-
|
5
|
-
import numpy as np
|
6
|
-
import pandas as pd
|
7
|
-
import scanpy as sc
|
8
|
-
from scipy.stats import norm
|
9
|
-
|
10
|
-
from gsMap.config import DiagnosisConfig
|
11
|
-
from gsMap.utils.manhattan_plot import ManhattanPlot
|
12
|
-
from gsMap.visualize import draw_scatter, load_st_coord, estimate_point_size_for_plot
|
13
|
-
|
14
|
-
|
15
|
-
warnings.filterwarnings("ignore", category=FutureWarning)
|
16
|
-
logger = logging.getLogger(__name__)
|
17
|
-
|
18
|
-
|
19
|
-
def convert_z_to_p(gwas_data):
|
20
|
-
"""Convert Z-scores to P-values."""
|
21
|
-
gwas_data['P'] = norm.sf(abs(gwas_data['Z'])) * 2
|
22
|
-
min_p_value = 1e-300
|
23
|
-
gwas_data['P'] = gwas_data['P'].clip(lower=min_p_value)
|
24
|
-
return gwas_data
|
25
|
-
|
26
|
-
|
27
|
-
def load_ldsc(ldsc_input_file):
|
28
|
-
"""Load LDSC data and calculate logp."""
|
29
|
-
ldsc = pd.read_csv(ldsc_input_file, compression='gzip')
|
30
|
-
ldsc['spot'] = ldsc['spot'].astype(str).replace('\.0', '', regex=True)
|
31
|
-
ldsc.set_index('spot', inplace=True)
|
32
|
-
ldsc['logp'] = -np.log10(ldsc['p'])
|
33
|
-
return ldsc
|
34
|
-
|
35
|
-
|
36
|
-
def load_gene_diagnostic_info(config:DiagnosisConfig):
|
37
|
-
"""Load or compute gene diagnostic info."""
|
38
|
-
gene_diagnostic_info_save_path = config.get_gene_diagnostic_info_save_path(config.trait_name)
|
39
|
-
if gene_diagnostic_info_save_path.exists():
|
40
|
-
logger.info(f'Loading gene diagnostic information from {gene_diagnostic_info_save_path}...')
|
41
|
-
return pd.read_csv(gene_diagnostic_info_save_path)
|
42
|
-
else:
|
43
|
-
logger.info('Gene diagnostic information not found. Calculating gene diagnostic information...')
|
44
|
-
return compute_gene_diagnostic_info(config)
|
45
|
-
|
46
|
-
|
47
|
-
def compute_gene_diagnostic_info(config: DiagnosisConfig):
|
48
|
-
"""Calculate gene diagnostic info and save it to adata."""
|
49
|
-
logger.info('Loading ST data and LDSC results...')
|
50
|
-
# adata = sc.read_h5ad(config.hdf5_with_latent_path, backed='r')
|
51
|
-
mk_score = pd.read_feather(config.mkscore_feather_path)
|
52
|
-
mk_score.set_index('HUMAN_GENE_SYM', inplace=True)
|
53
|
-
mk_score = mk_score.T
|
54
|
-
trait_ldsc_result = load_ldsc(config.get_ldsc_result_file(config.trait_name))
|
55
|
-
|
56
|
-
# Align marker scores with trait LDSC results
|
57
|
-
mk_score = mk_score.loc[trait_ldsc_result.index]
|
58
|
-
mk_score = mk_score.loc[:, mk_score.sum(axis=0) != 0]
|
59
|
-
|
60
|
-
logger.info('Calculating correlation between gene marker scores and trait logp-values...')
|
61
|
-
corr = mk_score.corrwith(trait_ldsc_result['logp'])
|
62
|
-
corr.name = 'PCC'
|
63
|
-
|
64
|
-
grouped_mk_score = mk_score.groupby(adata.obs[config.annotation]).median()
|
65
|
-
max_annotations = grouped_mk_score.idxmax()
|
66
|
-
|
67
|
-
high_GSS_Gene_annotation_pair = pd.DataFrame({
|
68
|
-
'Gene': max_annotations.index,
|
69
|
-
'Annotation': max_annotations.values,
|
70
|
-
'Median_GSS': grouped_mk_score.max().values
|
71
|
-
})
|
72
|
-
|
73
|
-
# Filter based on median GSS score
|
74
|
-
high_GSS_Gene_annotation_pair = high_GSS_Gene_annotation_pair[high_GSS_Gene_annotation_pair['Median_GSS'] >= 1.0]
|
75
|
-
high_GSS_Gene_annotation_pair = high_GSS_Gene_annotation_pair.merge(corr, left_on='Gene', right_index=True)
|
76
|
-
|
77
|
-
# Prepare the final gene diagnostic info dataframe
|
78
|
-
gene_diagnostic_info_cols = ['Gene', 'Annotation', 'Median_GSS', 'PCC']
|
79
|
-
gene_diagnostic_info = high_GSS_Gene_annotation_pair[gene_diagnostic_info_cols].drop_duplicates().dropna(
|
80
|
-
subset=['Gene'])
|
81
|
-
gene_diagnostic_info.sort_values('PCC', ascending=False, inplace=True)
|
82
|
-
|
83
|
-
# Save gene diagnostic info to a file
|
84
|
-
gene_diagnostic_info_save_path = config.get_gene_diagnostic_info_save_path(config.trait_name)
|
85
|
-
gene_diagnostic_info.to_csv(gene_diagnostic_info_save_path, index=False)
|
86
|
-
logger.info(f'Gene diagnostic information saved to {gene_diagnostic_info_save_path}.')
|
87
|
-
|
88
|
-
# Save to adata.var with the trait_name prefix
|
89
|
-
logger.info('Saving gene diagnostic info to adata.var...')
|
90
|
-
gene_diagnostic_info.set_index('Gene', inplace=True) # Use 'Gene' as the index to align with adata.var
|
91
|
-
adata.var[f'{config.trait_name}_Annotation'] = gene_diagnostic_info['Annotation']
|
92
|
-
adata.var[f'{config.trait_name}_Median_GSS'] = gene_diagnostic_info['Median_GSS']
|
93
|
-
adata.var[f'{config.trait_name}_PCC'] = gene_diagnostic_info['PCC']
|
94
|
-
|
95
|
-
# Save trait_ldsc_result to adata.obs
|
96
|
-
logger.info(f'Saving trait LDSC results to adata.obs as gsMap_{config.trait_name}_p_value...')
|
97
|
-
adata.obs[f'gsMap_{config.trait_name}_p_value'] = trait_ldsc_result['p']
|
98
|
-
adata.write(config.hdf5_with_latent_path, )
|
99
|
-
|
100
|
-
return gene_diagnostic_info.reset_index()
|
101
|
-
|
102
|
-
|
103
|
-
def load_gwas_data(config:DiagnosisConfig):
|
104
|
-
"""Load and process GWAS data."""
|
105
|
-
logger.info('Loading and processing GWAS data...')
|
106
|
-
gwas_data = pd.read_csv(config.sumstats_file, compression='gzip', sep='\t')
|
107
|
-
return convert_z_to_p(gwas_data)
|
108
|
-
|
109
|
-
|
110
|
-
def load_snp_gene_pairs(config:DiagnosisConfig):
|
111
|
-
"""Load SNP-gene pairs from multiple chromosomes."""
|
112
|
-
ldscore_save_dir = Path(config.ldscore_save_dir)
|
113
|
-
return pd.concat([
|
114
|
-
pd.read_feather(ldscore_save_dir / f'SNP_gene_pair/SNP_gene_pair_chr{chrom}.feather')
|
115
|
-
for chrom in range(1, 23)
|
116
|
-
])
|
117
|
-
|
118
|
-
|
119
|
-
def filter_snps(gwas_data_with_gene_annotation_sort, SUBSAMPLE_SNP_NUMBER):
|
120
|
-
"""Filter the SNPs based on significance levels."""
|
121
|
-
pass_suggestive_line_mask = gwas_data_with_gene_annotation_sort['P'] < 1e-5
|
122
|
-
pass_suggestive_line_number = pass_suggestive_line_mask.sum()
|
123
|
-
|
124
|
-
if pass_suggestive_line_number > SUBSAMPLE_SNP_NUMBER:
|
125
|
-
snps2plot = gwas_data_with_gene_annotation_sort[pass_suggestive_line_mask].SNP
|
126
|
-
logger.info(f'To reduce the number of SNPs to plot, only {snps2plot.shape[0]} SNPs with P < 1e-5 are plotted.')
|
127
|
-
else:
|
128
|
-
snps2plot = gwas_data_with_gene_annotation_sort.head(SUBSAMPLE_SNP_NUMBER).SNP
|
129
|
-
logger.info(
|
130
|
-
f'To reduce the number of SNPs to plot, only {SUBSAMPLE_SNP_NUMBER} SNPs with the smallest P-values are plotted.')
|
131
|
-
|
132
|
-
return snps2plot
|
133
|
-
|
134
|
-
|
135
|
-
def generate_manhattan_plot(config: DiagnosisConfig):
|
136
|
-
"""Generate Manhattan plot."""
|
137
|
-
report_save_dir = config.get_report_dir(config.trait_name)
|
138
|
-
gwas_data = load_gwas_data(config)
|
139
|
-
snp_gene_pair = load_snp_gene_pairs(config)
|
140
|
-
gwas_data_with_gene = snp_gene_pair.merge(gwas_data, on='SNP', how='inner').rename(columns={'gene_name': 'GENE'})
|
141
|
-
gene_diagnostic_info = load_gene_diagnostic_info(config)
|
142
|
-
gwas_data_with_gene_annotation = gwas_data_with_gene.merge(gene_diagnostic_info, left_on='GENE', right_on='Gene',
|
143
|
-
how='left')
|
144
|
-
|
145
|
-
gwas_data_with_gene_annotation = gwas_data_with_gene_annotation[
|
146
|
-
~gwas_data_with_gene_annotation['Annotation'].isna()]
|
147
|
-
gwas_data_with_gene_annotation_sort = gwas_data_with_gene_annotation.sort_values('P')
|
148
|
-
|
149
|
-
snps2plot = filter_snps(gwas_data_with_gene_annotation_sort, SUBSAMPLE_SNP_NUMBER=100_000)
|
150
|
-
gwas_data_to_plot = gwas_data_with_gene_annotation[
|
151
|
-
gwas_data_with_gene_annotation['SNP'].isin(snps2plot)].reset_index(drop=True)
|
152
|
-
gwas_data_to_plot['Annotation_text'] = 'PCC: ' + gwas_data_to_plot['PCC'].round(2).astype(
|
153
|
-
str) + '<br>' + 'Annotation: ' + gwas_data_to_plot['Annotation'].astype(str)
|
154
|
-
|
155
|
-
fig = ManhattanPlot(
|
156
|
-
dataframe=gwas_data_to_plot,
|
157
|
-
title='gsMap Diagnosis Manhattan Plot',
|
158
|
-
point_size=3,
|
159
|
-
highlight_gene_list=config.selected_genes or gene_diagnostic_info.Gene.iloc[:config.top_corr_genes].tolist(),
|
160
|
-
suggestiveline_value=-np.log10(1e-5),
|
161
|
-
annotation='Annotation_text',
|
162
|
-
)
|
163
|
-
|
164
|
-
save_manhattan_plot_path = config.get_manhattan_html_plot_path(config.trait_name)
|
165
|
-
fig.write_html(save_manhattan_plot_path)
|
166
|
-
logger.info(f'Diagnostic Manhattan Plot saved to {save_manhattan_plot_path}.')
|
167
|
-
|
168
|
-
|
169
|
-
def generate_GSS_distribution(config: DiagnosisConfig):
|
170
|
-
"""Generate GSS distribution plots."""
|
171
|
-
# logger.info('Loading ST data...')
|
172
|
-
# adata = sc.read_h5ad(config.hdf5_with_latent_path)
|
173
|
-
mk_score = pd.read_feather(config.mkscore_feather_path).set_index('HUMAN_GENE_SYM').T
|
174
|
-
|
175
|
-
plot_genes = config.selected_genes or load_gene_diagnostic_info(config).Gene.iloc[:config.top_corr_genes].tolist()
|
176
|
-
if config.selected_genes is not None:
|
177
|
-
logger.info(f'Generating GSS & Expression distribution plot for selected genes: {plot_genes}...')
|
178
|
-
else:
|
179
|
-
logger.info(f'Generating GSS & Expression distribution plot for top {config.top_corr_genes} correlated genes...')
|
180
|
-
|
181
|
-
if config.customize_fig:
|
182
|
-
pixel_width, pixel_height, point_size = config.fig_width, config.fig_height, config.point_size
|
183
|
-
else:
|
184
|
-
(pixel_width, pixel_height), point_size = estimate_point_size_for_plot(adata.obsm['spatial'])
|
185
|
-
sub_fig_save_dir = config.get_GSS_plot_dir(config.trait_name)
|
186
|
-
|
187
|
-
# save plot gene list
|
188
|
-
config.get_GSS_plot_select_gene_file(config.trait_name).write_text('\n'.join(plot_genes))
|
189
|
-
|
190
|
-
for selected_gene in plot_genes:
|
191
|
-
expression_series = pd.Series(adata[:, selected_gene].X.toarray().flatten(), index=adata.obs.index,name='Expression')
|
192
|
-
threshold = np.quantile(expression_series,0.9999)
|
193
|
-
expression_series[expression_series > threshold] = threshold
|
194
|
-
generate_and_save_plots(adata, mk_score, expression_series, selected_gene, point_size, pixel_width,
|
195
|
-
pixel_height, sub_fig_save_dir, config.sample_name, config.annotation)
|
196
|
-
|
197
|
-
|
198
|
-
def generate_and_save_plots(adata, mk_score, expression_series, selected_gene, point_size, pixel_width, pixel_height,
|
199
|
-
sub_fig_save_dir, sample_name, annotation):
|
200
|
-
"""Generate and save the plots."""
|
201
|
-
select_gene_expression_with_space_coord = load_st_coord(adata, expression_series, annotation)
|
202
|
-
sub_fig_1 = draw_scatter(select_gene_expression_with_space_coord, title=f'{selected_gene} (Expression)',
|
203
|
-
annotation='annotation', color_by='Expression', point_size=point_size, width=pixel_width,
|
204
|
-
height=pixel_height)
|
205
|
-
save_plot(sub_fig_1, sub_fig_save_dir, sample_name, selected_gene, 'Expression')
|
206
|
-
|
207
|
-
select_gene_GSS_with_space_coord = load_st_coord(adata, mk_score[selected_gene].rename('GSS'), annotation)
|
208
|
-
sub_fig_2 = draw_scatter(select_gene_GSS_with_space_coord, title=f'{selected_gene} (GSS)', annotation='annotation',
|
209
|
-
color_by='GSS', point_size=point_size, width=pixel_width, height=pixel_height)
|
210
|
-
save_plot(sub_fig_2, sub_fig_save_dir, sample_name, selected_gene, 'GSS')
|
211
|
-
|
212
|
-
# combined_fig = make_subplots(rows=1, cols=2,
|
213
|
-
# subplot_titles=(f'{selected_gene} (Expression)', f'{selected_gene} (GSS)'))
|
214
|
-
# for trace in sub_fig_1.data:
|
215
|
-
# combined_fig.add_trace(trace, row=1, col=1)
|
216
|
-
# for trace in sub_fig_2.data:
|
217
|
-
# combined_fig.add_trace(trace, row=1, col=2)
|
218
|
-
#
|
219
|
-
|
220
|
-
def save_plot(sub_fig, sub_fig_save_dir, sample_name, selected_gene, plot_type):
|
221
|
-
"""Save the plot to HTML and PNG."""
|
222
|
-
save_sub_fig_path = sub_fig_save_dir / f'{sample_name}_{selected_gene}_{plot_type}_Distribution.html'
|
223
|
-
# sub_fig.write_html(str(save_sub_fig_path))
|
224
|
-
sub_fig.update_layout(showlegend=False)
|
225
|
-
sub_fig.write_image(str(save_sub_fig_path).replace('.html', '.png'))
|
226
|
-
|
227
|
-
|
228
|
-
def generate_gsMap_plot(config: DiagnosisConfig):
|
229
|
-
"""Generate gsMap plot."""
|
230
|
-
logger.info('Creating gsMap plot...')
|
231
|
-
|
232
|
-
trait_ldsc_result = load_ldsc(config.get_ldsc_result_file(config.trait_name))
|
233
|
-
space_coord_concat = load_st_coord(adata, trait_ldsc_result, annotation=config.annotation)
|
234
|
-
|
235
|
-
if config.customize_fig:
|
236
|
-
pixel_width, pixel_height, point_size = config.fig_width, config.fig_height, config.point_size
|
237
|
-
else:
|
238
|
-
(pixel_width, pixel_height), point_size = estimate_point_size_for_plot(adata.obsm['spatial'])
|
239
|
-
fig = draw_scatter(space_coord_concat,
|
240
|
-
title=f'{config.trait_name} (gsMap)',
|
241
|
-
point_size=point_size,
|
242
|
-
width=pixel_width,
|
243
|
-
height=pixel_height,
|
244
|
-
annotation=config.annotation
|
245
|
-
)
|
246
|
-
|
247
|
-
output_dir = config.get_gsMap_plot_save_dir(config.trait_name)
|
248
|
-
output_file_html = config.get_gsMap_html_plot_save_path(config.trait_name)
|
249
|
-
output_file_png = output_file_html.with_suffix('.png')
|
250
|
-
output_file_csv = output_file_html.with_suffix('.csv')
|
251
|
-
|
252
|
-
fig.write_html(output_file_html)
|
253
|
-
fig.write_image(output_file_png)
|
254
|
-
space_coord_concat.to_csv(output_file_csv)
|
255
|
-
|
256
|
-
logger.info(f'gsMap plot created and saved in {output_dir}.')
|
257
|
-
|
258
|
-
|
259
|
-
def run_Diagnosis(config: DiagnosisConfig):
|
260
|
-
"""Main function to run the diagnostic plot generation."""
|
261
|
-
global adata
|
262
|
-
adata = sc.read_h5ad(config.hdf5_with_latent_path)
|
263
|
-
if 'log1p' not in adata.uns.keys() and adata.X.max() > 14:
|
264
|
-
sc.pp.normalize_total(adata, target_sum=1e4)
|
265
|
-
sc.pp.log1p(adata)
|
266
|
-
|
267
|
-
if config.plot_type in ['manhattan', 'all']:
|
268
|
-
generate_manhattan_plot(config)
|
269
|
-
if config.plot_type in ['GSS', 'all']:
|
270
|
-
generate_GSS_distribution(config)
|
271
|
-
if config.plot_type in ['gsMap', 'all']:
|
272
|
-
generate_gsMap_plot(config)
|
273
|
-
|
1
|
+
import logging
|
2
|
+
import warnings
|
3
|
+
from pathlib import Path
|
4
|
+
|
5
|
+
import numpy as np
|
6
|
+
import pandas as pd
|
7
|
+
import scanpy as sc
|
8
|
+
from scipy.stats import norm
|
9
|
+
|
10
|
+
from gsMap.config import DiagnosisConfig
|
11
|
+
from gsMap.utils.manhattan_plot import ManhattanPlot
|
12
|
+
from gsMap.visualize import draw_scatter, load_st_coord, estimate_point_size_for_plot
|
13
|
+
|
14
|
+
|
15
|
+
warnings.filterwarnings("ignore", category=FutureWarning)
|
16
|
+
logger = logging.getLogger(__name__)
|
17
|
+
|
18
|
+
|
19
|
+
def convert_z_to_p(gwas_data):
    """Add a two-sided P-value column computed from the ``Z`` column.

    The table is modified in place: column ``P`` is set to
    ``2 * norm.sf(|Z|)`` and then floored at 1e-300, so a P-value that
    underflows to 0 for a very large |Z| is replaced by the floor.

    Returns:
        The same ``gwas_data`` object that was passed in.
    """
    p_floor = 1e-300
    z_magnitude = abs(gwas_data['Z'])
    gwas_data['P'] = 2 * norm.sf(z_magnitude)
    gwas_data['P'] = gwas_data['P'].clip(lower=p_floor)
    return gwas_data
|
25
|
+
|
26
|
+
|
27
|
+
def load_ldsc(ldsc_input_file):
    """Load a gzip-compressed LDSC result CSV and add a ``logp`` column.

    Args:
        ldsc_input_file: Path of a gzip-compressed CSV that has at least the
            columns ``spot`` and ``p``.

    Returns:
        A DataFrame indexed by ``spot`` (as strings) with an extra column
        ``logp`` equal to ``-log10(p)``.
    """
    ldsc = pd.read_csv(ldsc_input_file, compression='gzip')
    # Ids parsed as floats stringify as e.g. "12.0"; drop only that trailing
    # ".0". The pattern is a raw string and anchored with `$` so that a ".0"
    # in the middle of an id (e.g. "a.05") is left intact.
    ldsc['spot'] = ldsc['spot'].astype(str).str.replace(r'\.0$', '', regex=True)
    ldsc.set_index('spot', inplace=True)
    ldsc['logp'] = -np.log10(ldsc['p'])
    return ldsc
|
34
|
+
|
35
|
+
|
36
|
+
def load_gene_diagnostic_info(config:DiagnosisConfig):
    """Return the gene diagnostic table for ``config.trait_name``.

    The CSV at the path returned by
    ``config.get_gene_diagnostic_info_save_path`` is read when it exists;
    otherwise the table is produced by ``compute_gene_diagnostic_info``.
    """
    cached_csv = config.get_gene_diagnostic_info_save_path(config.trait_name)
    if not cached_csv.exists():
        logger.info('Gene diagnostic information not found. Calculating gene diagnostic information...')
        return compute_gene_diagnostic_info(config)

    logger.info(f'Loading gene diagnostic information from {cached_csv}...')
    return pd.read_csv(cached_csv)
|
45
|
+
|
46
|
+
|
47
|
+
def compute_gene_diagnostic_info(config: DiagnosisConfig):
    """Build the gene diagnostic table, save it as CSV and store it on ``adata``.

    For each gene the table holds the annotation group with the highest
    median marker score (``Annotation``), that median (``Median_GSS``, only
    genes with a value >= 1.0 are kept) and the correlation between the
    gene's marker scores and the trait ``logp`` values (``PCC``). Rows are
    sorted by ``PCC`` in descending order.

    This function reads and mutates the module-level ``adata`` that
    ``run_Diagnosis`` assigns; it raises ``NameError`` if that global has
    not been set.

    Returns:
        The table as a DataFrame with columns ``Gene``, ``Annotation``,
        ``Median_GSS`` and ``PCC``.
    """
    logger.info('Loading ST data and LDSC results...')
    mk_score = pd.read_feather(config.mkscore_feather_path).set_index('HUMAN_GENE_SYM').T
    trait_ldsc_result = load_ldsc(config.get_ldsc_result_file(config.trait_name))

    # Keep only the spots present in the LDSC result (same order), then drop
    # genes whose scores sum to zero over those spots.
    mk_score = mk_score.loc[trait_ldsc_result.index]
    nonzero_gene_mask = mk_score.sum(axis=0) != 0
    mk_score = mk_score.loc[:, nonzero_gene_mask]

    logger.info('Calculating correlation between gene marker scores and trait logp-values...')
    corr = mk_score.corrwith(trait_ldsc_result['logp']).rename('PCC')

    median_by_annotation = mk_score.groupby(adata.obs[config.annotation]).median()
    top_annotation = median_by_annotation.idxmax()

    gene_annotation_pair = pd.DataFrame({
        'Gene': top_annotation.index,
        'Annotation': top_annotation.values,
        'Median_GSS': median_by_annotation.max().values,
    })

    # Filter on the median GSS before attaching the correlation.
    passes_gss_cutoff = gene_annotation_pair['Median_GSS'] >= 1.0
    gene_annotation_pair = gene_annotation_pair[passes_gss_cutoff].merge(
        corr, left_on='Gene', right_index=True)

    output_columns = ['Gene', 'Annotation', 'Median_GSS', 'PCC']
    gene_diagnostic_info = gene_annotation_pair[output_columns].drop_duplicates().dropna(subset=['Gene'])
    gene_diagnostic_info = gene_diagnostic_info.sort_values('PCC', ascending=False)

    # Persist the table so later calls can load it instead of recomputing.
    gene_diagnostic_info_save_path = config.get_gene_diagnostic_info_save_path(config.trait_name)
    gene_diagnostic_info.to_csv(gene_diagnostic_info_save_path, index=False)
    logger.info(f'Gene diagnostic information saved to {gene_diagnostic_info_save_path}.')

    # Index by gene so the assignments below align with adata.var.
    logger.info('Saving gene diagnostic info to adata.var...')
    info_by_gene = gene_diagnostic_info.set_index('Gene')
    adata.var[f'{config.trait_name}_Annotation'] = info_by_gene['Annotation']
    adata.var[f'{config.trait_name}_Median_GSS'] = info_by_gene['Median_GSS']
    adata.var[f'{config.trait_name}_PCC'] = info_by_gene['PCC']

    logger.info(f'Saving trait LDSC results to adata.obs as gsMap_{config.trait_name}_p_value...')
    adata.obs[f'gsMap_{config.trait_name}_p_value'] = trait_ldsc_result['p']
    # NOTE(review): this writes back to the same path run_Diagnosis read from,
    # after run_Diagnosis may have normalised adata in memory -- confirm that
    # overwriting the input file is intended.
    adata.write(config.hdf5_with_latent_path, )

    return info_by_gene.reset_index()
|
101
|
+
|
102
|
+
|
103
|
+
def load_gwas_data(config:DiagnosisConfig):
    """Read the GWAS summary statistics and attach a ``P`` column.

    ``config.sumstats_file`` is read as a gzip-compressed, tab-separated
    table and passed through ``convert_z_to_p``.
    """
    logger.info('Loading and processing GWAS data...')
    sumstats = pd.read_csv(config.sumstats_file, sep='\t', compression='gzip')
    gwas_with_p = convert_z_to_p(sumstats)
    return gwas_with_p
|
108
|
+
|
109
|
+
|
110
|
+
def load_snp_gene_pairs(config:DiagnosisConfig):
    """Concatenate the per-chromosome SNP-gene pair tables for chr1 to chr22.

    Each table is read from
    ``<config.ldscore_save_dir>/SNP_gene_pair/SNP_gene_pair_chr<N>.feather``.
    """
    ldscore_save_dir = Path(config.ldscore_save_dir)
    per_chromosome = []
    for chrom in range(1, 23):
        feather_path = ldscore_save_dir / f'SNP_gene_pair/SNP_gene_pair_chr{chrom}.feather'
        per_chromosome.append(pd.read_feather(feather_path))
    return pd.concat(per_chromosome)
|
117
|
+
|
118
|
+
|
119
|
+
def filter_snps(gwas_data_with_gene_annotation_sort, SUBSAMPLE_SNP_NUMBER):
    """Choose which SNPs to plot.

    If more than ``SUBSAMPLE_SNP_NUMBER`` rows have ``P < 1e-5``, all of
    those rows are selected. Otherwise the first ``SUBSAMPLE_SNP_NUMBER``
    rows are selected, which relies on the table already being ordered by
    ``P`` (the caller in this module sorts it before calling).

    Returns:
        The ``SNP`` column of the selected rows.
    """
    table = gwas_data_with_gene_annotation_sort
    is_suggestive = table['P'] < 1e-5

    if is_suggestive.sum() <= SUBSAMPLE_SNP_NUMBER:
        snps2plot = table.head(SUBSAMPLE_SNP_NUMBER).SNP
        logger.info(
            f'To reduce the number of SNPs to plot, only {SUBSAMPLE_SNP_NUMBER} SNPs with the smallest P-values are plotted.')
        return snps2plot

    snps2plot = table[is_suggestive].SNP
    logger.info(f'To reduce the number of SNPs to plot, only {snps2plot.shape[0]} SNPs with P < 1e-5 are plotted.')
    return snps2plot
|
133
|
+
|
134
|
+
|
135
|
+
def generate_manhattan_plot(config: DiagnosisConfig):
    """Build the diagnostic Manhattan plot and write it to an HTML file.

    GWAS rows are joined to SNP-gene pairs and to the gene diagnostic table;
    rows without an ``Annotation`` are dropped, the SNPs are subsampled with
    ``filter_snps`` and the result is passed to ``ManhattanPlot``. Highlighted
    genes are ``config.selected_genes`` when set, otherwise the first
    ``config.top_corr_genes`` genes of the diagnostic table.
    """
    # NOTE(review): the return value is not used in this function; the call is
    # kept in case the getter has side effects -- confirm.
    report_save_dir = config.get_report_dir(config.trait_name)

    gwas_data = load_gwas_data(config)
    snp_gene_pair = load_snp_gene_pairs(config)
    gwas_with_gene = snp_gene_pair.merge(gwas_data, on='SNP', how='inner')
    gwas_with_gene = gwas_with_gene.rename(columns={'gene_name': 'GENE'})

    gene_diagnostic_info = load_gene_diagnostic_info(config)
    annotated = gwas_with_gene.merge(gene_diagnostic_info, how='left', left_on='GENE', right_on='Gene')
    # The left join leaves Annotation empty for genes missing from the
    # diagnostic table; those rows are not plotted.
    annotated = annotated[annotated['Annotation'].notna()]

    snps2plot = filter_snps(annotated.sort_values('P'), SUBSAMPLE_SNP_NUMBER=100_000)
    gwas_data_to_plot = annotated[annotated['SNP'].isin(snps2plot)].reset_index(drop=True)

    pcc_text = gwas_data_to_plot['PCC'].round(2).astype(str)
    annotation_text = gwas_data_to_plot['Annotation'].astype(str)
    gwas_data_to_plot['Annotation_text'] = 'PCC: ' + pcc_text + '<br>' + 'Annotation: ' + annotation_text

    genes_to_highlight = (
        config.selected_genes
        or gene_diagnostic_info.Gene.iloc[:config.top_corr_genes].tolist()
    )
    fig = ManhattanPlot(
        dataframe=gwas_data_to_plot,
        title='gsMap Diagnosis Manhattan Plot',
        point_size=3,
        highlight_gene_list=genes_to_highlight,
        suggestiveline_value=-np.log10(1e-5),
        annotation='Annotation_text',
    )

    save_manhattan_plot_path = config.get_manhattan_html_plot_path(config.trait_name)
    fig.write_html(save_manhattan_plot_path)
    logger.info(f'Diagnostic Manhattan Plot saved to {save_manhattan_plot_path}.')
|
167
|
+
|
168
|
+
|
169
|
+
def generate_GSS_distribution(config: DiagnosisConfig):
    """Write expression and GSS distribution plots for a set of genes.

    The genes are ``config.selected_genes`` when that is non-empty, otherwise
    the first ``config.top_corr_genes`` genes of the gene diagnostic table.
    The chosen gene names are written, one per line, to the file returned by
    ``config.get_GSS_plot_select_gene_file``, and each gene is plotted via
    ``generate_and_save_plots``.

    Reads the module-level ``adata`` that ``run_Diagnosis`` assigns.
    """
    mk_score = pd.read_feather(config.mkscore_feather_path).set_index('HUMAN_GENE_SYM').T

    # One truthiness test drives both the gene choice and the log message, so
    # an empty selection is reported as the top-correlated fallback it is.
    if config.selected_genes:
        plot_genes = config.selected_genes
        logger.info(f'Generating GSS & Expression distribution plot for selected genes: {plot_genes}...')
    else:
        plot_genes = load_gene_diagnostic_info(config).Gene.iloc[:config.top_corr_genes].tolist()
        logger.info(f'Generating GSS & Expression distribution plot for top {config.top_corr_genes} correlated genes...')

    if config.customize_fig:
        pixel_width, pixel_height, point_size = config.fig_width, config.fig_height, config.point_size
    else:
        (pixel_width, pixel_height), point_size = estimate_point_size_for_plot(adata.obsm['spatial'])
    sub_fig_save_dir = config.get_GSS_plot_dir(config.trait_name)

    # save plot gene list
    config.get_GSS_plot_select_gene_file(config.trait_name).write_text('\n'.join(plot_genes))

    for selected_gene in plot_genes:
        gene_values = adata[:, selected_gene].X
        # X may be a sparse matrix or a dense array; only sparse matrices
        # provide (and need) toarray().
        if hasattr(gene_values, 'toarray'):
            gene_values = gene_values.toarray()
        expression_series = pd.Series(np.asarray(gene_values).flatten(), index=adata.obs.index,
                                      name='Expression')
        # Cap values above the 99.99th percentile at that percentile.
        threshold = np.quantile(expression_series, 0.9999)
        expression_series[expression_series > threshold] = threshold
        generate_and_save_plots(adata, mk_score, expression_series, selected_gene, point_size, pixel_width,
                                pixel_height, sub_fig_save_dir, config.sample_name, config.annotation)
|
196
|
+
|
197
|
+
|
198
|
+
def generate_and_save_plots(adata, mk_score, expression_series, selected_gene, point_size, pixel_width, pixel_height,
                            sub_fig_save_dir, sample_name, annotation):
    """Draw and save the expression and GSS scatter plots for one gene.

    The expression plot uses ``expression_series``; the GSS plot uses the
    ``selected_gene`` column of ``mk_score`` renamed to ``GSS``. Each series
    is combined with coordinates by ``load_st_coord``, drawn with
    ``draw_scatter`` and saved with ``save_plot``.
    """
    # Drawing options shared by both figures.
    scatter_kwargs = dict(annotation='annotation', point_size=point_size, width=pixel_width,
                          height=pixel_height)

    expression_with_coord = load_st_coord(adata, expression_series, annotation)
    expression_fig = draw_scatter(expression_with_coord, title=f'{selected_gene} (Expression)',
                                  color_by='Expression', **scatter_kwargs)
    save_plot(expression_fig, sub_fig_save_dir, sample_name, selected_gene, 'Expression')

    gss_series = mk_score[selected_gene].rename('GSS')
    gss_with_coord = load_st_coord(adata, gss_series, annotation)
    gss_fig = draw_scatter(gss_with_coord, title=f'{selected_gene} (GSS)', color_by='GSS',
                           **scatter_kwargs)
    save_plot(gss_fig, sub_fig_save_dir, sample_name, selected_gene, 'GSS')
|
211
|
+
|
212
|
+
# combined_fig = make_subplots(rows=1, cols=2,
|
213
|
+
# subplot_titles=(f'{selected_gene} (Expression)', f'{selected_gene} (GSS)'))
|
214
|
+
# for trace in sub_fig_1.data:
|
215
|
+
# combined_fig.add_trace(trace, row=1, col=1)
|
216
|
+
# for trace in sub_fig_2.data:
|
217
|
+
# combined_fig.add_trace(trace, row=1, col=2)
|
218
|
+
#
|
219
|
+
|
220
|
+
def save_plot(sub_fig, sub_fig_save_dir, sample_name, selected_gene, plot_type):
    """Hide the figure's legend and write it to a PNG file.

    The file is
    ``<sub_fig_save_dir>/<sample_name>_<selected_gene>_<plot_type>_Distribution.png``.
    No HTML file is written.

    Args:
        sub_fig: Figure object providing ``update_layout`` and ``write_image``.
        sub_fig_save_dir: Directory as a path object supporting ``/``.
        sample_name: Sample name used in the file name.
        selected_gene: Gene name used in the file name.
        plot_type: Plot label used in the file name (this module passes
            ``'Expression'`` or ``'GSS'``).
    """
    # Build the .png name directly. Deriving it from an .html path with
    # str.replace would also rewrite any ".html" that appears in the
    # directory, sample or gene name.
    png_path = sub_fig_save_dir / f'{sample_name}_{selected_gene}_{plot_type}_Distribution.png'
    sub_fig.update_layout(showlegend=False)
    sub_fig.write_image(str(png_path))
|
226
|
+
|
227
|
+
|
228
|
+
def generate_gsMap_plot(config: DiagnosisConfig):
    """Draw the gsMap scatter plot for the trait and save it as HTML, PNG and CSV.

    The PNG and CSV paths are the HTML path returned by
    ``config.get_gsMap_html_plot_save_path`` with the suffix replaced. The
    CSV holds the table returned by ``load_st_coord``.

    Reads the module-level ``adata`` that ``run_Diagnosis`` assigns.
    """
    logger.info('Creating gsMap plot...')

    trait_ldsc_result = load_ldsc(config.get_ldsc_result_file(config.trait_name))
    space_coord_concat = load_st_coord(adata, trait_ldsc_result, annotation=config.annotation)

    # Figure geometry comes from the config when customised, otherwise it is
    # estimated from the spatial coordinates.
    if not config.customize_fig:
        (pixel_width, pixel_height), point_size = estimate_point_size_for_plot(adata.obsm['spatial'])
    else:
        pixel_width, pixel_height, point_size = config.fig_width, config.fig_height, config.point_size

    fig = draw_scatter(
        space_coord_concat,
        title=f'{config.trait_name} (gsMap)',
        point_size=point_size,
        width=pixel_width,
        height=pixel_height,
        annotation=config.annotation,
    )

    output_dir = config.get_gsMap_plot_save_dir(config.trait_name)
    html_path = config.get_gsMap_html_plot_save_path(config.trait_name)
    png_path = html_path.with_suffix('.png')
    csv_path = html_path.with_suffix('.csv')

    fig.write_html(html_path)
    fig.write_image(png_path)
    space_coord_concat.to_csv(csv_path)

    logger.info(f'gsMap plot created and saved in {output_dir}.')
|
257
|
+
|
258
|
+
|
259
|
+
def run_Diagnosis(config: DiagnosisConfig):
    """Load the AnnData file and generate the plots selected by ``config.plot_type``.

    The loaded object is stored in the module-level ``adata``, which the plot
    functions in this module read. ``config.plot_type`` may be
    ``'manhattan'``, ``'GSS'``, ``'gsMap'`` or ``'all'``; any other value
    produces no plot.
    """
    global adata
    adata = sc.read_h5ad(config.hdf5_with_latent_path)

    # Normalise and log-transform only when there is no 'log1p' entry in
    # adata.uns and the largest value in X exceeds 14.
    looks_unnormalised = 'log1p' not in adata.uns and adata.X.max() > 14
    if looks_unnormalised:
        sc.pp.normalize_total(adata, target_sum=1e4)
        sc.pp.log1p(adata)

    plot_steps = (
        ('manhattan', generate_manhattan_plot),
        ('GSS', generate_GSS_distribution),
        ('gsMap', generate_gsMap_plot),
    )
    for plot_name, make_plot in plot_steps:
        if config.plot_type in (plot_name, 'all'):
            make_plot(config)
|
273
|
+
|