gsMap 1.60 (py3-none-any.whl)
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- gsMap/GNN_VAE/__init__.py +0 -0
- gsMap/GNN_VAE/adjacency_matrix.py +95 -0
- gsMap/GNN_VAE/model.py +87 -0
- gsMap/GNN_VAE/train.py +97 -0
- gsMap/__init__.py +5 -0
- gsMap/__main__.py +3 -0
- gsMap/cauchy_combination_test.py +163 -0
- gsMap/config.py +734 -0
- gsMap/find_latent_representation.py +209 -0
- gsMap/format_sumstats.py +410 -0
- gsMap/generate_ldscore.py +551 -0
- gsMap/generate_r2_matrix.py +743 -0
- gsMap/jackknife.py +514 -0
- gsMap/latent_to_gene.py +257 -0
- gsMap/main.py +39 -0
- gsMap/make_annotations.py +560 -0
- gsMap/regression_read.py +294 -0
- gsMap/spatial_ldsc_multiple_sumstats.py +307 -0
- gsMap/visualize.py +154 -0
- gsmap-1.60.dist-info/LICENSE +21 -0
- gsmap-1.60.dist-info/METADATA +124 -0
- gsmap-1.60.dist-info/RECORD +24 -0
- gsmap-1.60.dist-info/WHEEL +4 -0
- gsmap-1.60.dist-info/entry_points.txt +3 -0
gsMap/regression_read.py
ADDED
@@ -0,0 +1,294 @@
import numpy as np
import pandas as pd
import os


# Functions for reading GWAS summary statistics
def _read_sumstats(fh, alleles=False, dropna=False):
    '''
    Parse GWAS summary statistics.
    '''
    print('Reading summary statistics from {S} ...'.format(S=fh))
    sumstats = ps_sumstats(fh, alleles=alleles, dropna=dropna)
    print('Read summary statistics for {N} SNPs.'.format(N=len(sumstats)))

    m = len(sumstats)
    sumstats = sumstats.drop_duplicates(subset='SNP')
    if m > len(sumstats):
        print('Dropped {M} SNPs with duplicated rs numbers.'.format(M=m - len(sumstats)))

    return sumstats


def ps_sumstats(fh, alleles=False, dropna=True):
    '''
    Parses .sumstats files. See docs/file_formats_sumstats.txt.
    '''
    dtype_dict = {'SNP': str, 'Z': float, 'N': float, 'A1': str, 'A2': str}
    compression = get_compression(fh)
    usecols = ['SNP', 'Z', 'N']
    if alleles:
        usecols += ['A1', 'A2']

    try:
        x = read_csv(fh, usecols=usecols, dtype=dtype_dict, compression=compression)
    except (AttributeError, ValueError) as e:
        raise ValueError('Improperly formatted sumstats file: ' + str(e.args))

    if dropna:
        x = x.dropna(how='any')

    return x


def get_compression(fh):
    '''
    Determine the compression format to pass to read_csv.
    '''
    if fh.endswith('gz'):
        compression = 'gzip'
    elif fh.endswith('bz2'):
        compression = 'bz2'
    else:
        compression = None

    return compression


def read_csv(fh, **kwargs):
    '''
    Read whitespace-delimited CSV data.
    '''
    return pd.read_csv(fh, delim_whitespace=True, na_values='.', **kwargs)

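A minimal usage sketch of the readers above, assuming a whitespace-delimited (optionally gzipped) summary-statistics file with SNP, Z, and N columns; the file path is hypothetical:

# Illustrative sketch only, not part of regression_read.py; 'trait.sumstats.gz' is a hypothetical path.
from gsMap.regression_read import _read_sumstats

sumstats = _read_sumstats('trait.sumstats.gz', alleles=False, dropna=True)
print(sumstats.columns.tolist())   # expected columns: SNP, Z, N (order follows the input file)
print(sumstats['N'].mean())        # average GWAS sample size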

# Functions for loading LD scores
def which_compression(fh):
    '''
    Given a file prefix, figure out what sort of compression to use.
    '''
    if os.access(fh + '.bz2', 4):
        suffix = '.bz2'
        compression = 'bz2'
    elif os.access(fh + '.gz', 4):
        suffix = '.gz'
        compression = 'gzip'
    elif os.access(fh + '.parquet', 4):
        suffix = '.parquet'
        compression = 'parquet'
    elif os.access(fh + '.feather', 4):
        suffix = '.feather'
        compression = 'feather'
    elif os.access(fh, 4):
        suffix = ''
        compression = None
    else:
        raise IOError('Could not open {F}[./gz/bz2/parquet/feather]'.format(F=fh))

    return suffix, compression


def _read_ref_ld(ld_file):
    suffix = '.l2.ldscore'
    file = ld_file
    first_fh = f'{file}1{suffix}'
    s, compression = which_compression(first_fh)

    ldscore_array = []
    print(f'Reading ld score annotations from {file}[1-22]{suffix}.{compression}')

    for chr in range(1, 23):
        file_chr = f'{file}{chr}{suffix}{s}'

        if compression == 'parquet':
            x = pd.read_parquet(file_chr)
        elif compression == 'feather':
            x = pd.read_feather(file_chr)
        else:
            x = pd.read_csv(file_chr, compression=compression, sep='\t')

        x = x.sort_values(by=['CHR', 'BP'])  # SEs will be wrong unless sorted

        columns_to_drop = ['MAF', 'CM', 'Gene', 'TSS', 'CHR', 'BP']
        columns_to_drop = [col for col in columns_to_drop if col in x.columns]
        x = x.drop(columns_to_drop, axis=1)

        ldscore_array.append(x)

    ref_ld = pd.concat(ldscore_array, axis=0)
    return ref_ld


def _read_ref_ld_v2(ld_file):
    suffix = '.l2.ldscore'
    file = ld_file
    first_fh = f'{file}1{suffix}'
    s, compression = which_compression(first_fh)
    print(f'Reading ld score annotations from {file}[1-22]{suffix}.{compression}')
    ref_ld = pd.concat(
        [pd.read_feather(f'{file}{chr}{suffix}{s}') for chr in range(1, 23)], axis=0
    )
    # set the SNP identifier column as the index
    ref_ld.rename(columns={'index': 'SNP'}, inplace=True)
    ref_ld.set_index('SNP', inplace=True)
    return ref_ld


def _read_M_v2(ld_file, n_annot, not_M_5_50):
    suffix = '.l2.M'
    if not not_M_5_50:
        suffix += '_5_50'
    M_annot = np.array(
        [np.loadtxt(f'{ld_file}{chr}{suffix}') for chr in range(1, 23)]
    )
    assert M_annot.shape == (22, n_annot)
    return M_annot.sum(axis=0).reshape((1, n_annot))
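The readers above assume per-chromosome files sharing one prefix: `<prefix>{chr}.l2.ldscore` (optionally suffixed .gz/.bz2/.parquet/.feather) for the LD scores and `<prefix>{chr}.l2.M` or `.l2.M_5_50` for the SNP counts, with chr running from 1 to 22; `_read_ref_ld_v2` additionally expects feather files whose SNP identifier sits in an `index` column. A sketch of how they combine (the directory is hypothetical; the `baseline.` prefix matches the one used by the spatial LDSC module below):

# Illustrative sketch only, not part of regression_read.py; the directory is hypothetical.
from gsMap.regression_read import _read_ref_ld_v2, _read_M_v2

ld_prefix = '/path/to/generate_ldscore/baseline/baseline.'
ref_ld = _read_ref_ld_v2(ld_prefix)                           # SNP-indexed frame, one column per annotation
M_annot = _read_M_v2(ld_prefix, n_annot=ref_ld.shape[1], not_M_5_50=False)
print(M_annot.shape)                                           # (1, n_annot): per-annotation SNP counts summed over chr 1-22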

# Functions for reading M annotations
def _read_M(ld_file, n_annot, not_M_5_50):
    '''
    Read M (--M, --M-file, etc).
    '''
    M_annot = M(ld_file, common=(not not_M_5_50))

    try:
        M_annot = np.array(M_annot).reshape((1, n_annot))
    except ValueError as e:
        raise ValueError('# terms in --M must match # of LD Scores in --ref-ld.\n' + str(e.args))
    return M_annot


def M(fh, common=False):
    '''
    Parses .l{N}.M files, split across 22 chromosomes.
    '''
    suffix = '.l2.M'
    if common:
        suffix += '_5_50'

    M_array = []
    for i in range(1, 23):
        M_current = pd.read_csv(f'{fh}{i}' + suffix, header=None)
        M_array.append(M_current)

    M_array = pd.concat(M_array, axis=1).sum(axis=1)

    return np.array(M_array).reshape((1, len(M_array)))


def _check_variance(M_annot, ref_ld):
    '''
    Remove zero-variance LD Scores.
    '''
    ii = ref_ld.iloc[:, 1:].var() == 0  # NB there is a SNP column here
    if ii.all():
        raise ValueError('All LD Scores have zero variance.')
    else:
        print('Removing partitioned LD Scores with zero variance.')
        ii_snp = np.array([True] + list(~ii))
        ii_m = np.array(~ii)
        ref_ld = ref_ld.iloc[:, ii_snp]
        M_annot = M_annot[:, ii_m]

    return M_annot, ref_ld, ii


def _check_variance_v2(M_annot, ref_ld):
    ii = ref_ld.var() == 0
    if ii.all():
        raise ValueError('All LD Scores have zero variance.')
    elif not ii.any():
        print('No partitioned LD Scores have zero variance.')
    else:
        ii_snp = ii_m = np.array(~ii)
        print(f'Removing {sum(ii)} partitioned LD Scores with zero variance.')
        ref_ld = ref_ld.iloc[:, ii_snp]
        M_annot = M_annot[:, ii_m]
    return M_annot, ref_ld

# Functions for reading regression weights
def which_compression(fh):
    '''
    Given a file prefix, figure out what sort of compression to use.
    (Identical to the which_compression defined above.)
    '''
    if os.access(fh + '.bz2', 4):
        suffix = '.bz2'
        compression = 'bz2'
    elif os.access(fh + '.gz', 4):
        suffix = '.gz'
        compression = 'gzip'
    elif os.access(fh + '.parquet', 4):
        suffix = '.parquet'
        compression = 'parquet'
    elif os.access(fh + '.feather', 4):
        suffix = '.feather'
        compression = 'feather'
    elif os.access(fh, 4):
        suffix = ''
        compression = None
    else:
        raise IOError('Could not open {F}[./gz/bz2/parquet/feather]'.format(F=fh))

    return suffix, compression


def _read_w_ld(w_file):
    suffix = '.l2.ldscore'
    file = w_file
    first_fh = f'{file}1{suffix}'
    s, compression = which_compression(first_fh)

    w_array = []
    print(f'Reading ld score annotations from {file}[1-22]{suffix}.{compression}')

    for chr in range(1, 23):
        file_chr = f'{file}{chr}{suffix}{s}'

        if compression == 'parquet':
            x = pd.read_parquet(file_chr)
        elif compression == 'feather':
            x = pd.read_feather(file_chr)
        else:
            x = pd.read_csv(file_chr, compression=compression, sep='\t')

        x = x.sort_values(by=['CHR', 'BP'])

        columns_to_drop = ['MAF', 'CM', 'Gene', 'TSS', 'CHR', 'BP']
        columns_to_drop = [col for col in columns_to_drop if col in x.columns]
        x = x.drop(columns_to_drop, axis=1)

        w_array.append(x)

    w_ld = pd.concat(w_array, axis=0)
    w_ld.columns = ['SNP', 'LD_weights']

    return w_ld


# Functions for merging
def _merge_and_log(ld, sumstats, noun):
    '''
    Wrap smart merge with log messages about # of SNPs.
    '''
    sumstats = smart_merge(ld, sumstats)
    msg = 'After merging with {F}, {N} SNPs remain.'
    if len(sumstats) == 0:
        raise ValueError(msg.format(N=len(sumstats), F=noun))
    else:
        print(msg.format(N=len(sumstats), F=noun))

    return sumstats


def smart_merge(x, y):
    '''
    Check if SNP columns are equal. If so, save time by using concat instead of merge.
    '''
    if len(x) == len(y) and (x.index == y.index).all() and (x.SNP == y.SNP).all():
        x = x.reset_index(drop=True)
        y = y.reset_index(drop=True).drop('SNP', 1)
        out = pd.concat([x, y], axis=1)
    else:
        out = pd.merge(x, y, how='inner', on='SNP')
    return out
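Two details in this module depend on behaviour that newer pandas releases reject or deprecate: `smart_merge` drops the SNP column with a positional axis argument (`.drop('SNP', 1)`), which pandas 2.x no longer accepts, and `read_csv` passes `delim_whitespace=True`, which recent pandas deprecates in favour of `sep=r'\s+'`. A version-tolerant sketch of the same two operations, in case the module has to run against current pandas (the helper names here are illustrative, not part of the package):

# Illustrative compatibility sketch, not part of regression_read.py.
import pandas as pd

def read_csv_compat(fh, **kwargs):
    # Same behaviour as read_csv above, without the deprecated delim_whitespace flag.
    return pd.read_csv(fh, sep=r'\s+', na_values='.', **kwargs)

def drop_snp_column(df):
    # Keyword form of df.drop('SNP', 1), accepted by both pandas 1.x and 2.x.
    return df.drop(columns='SNP')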
gsMap/spatial_ldsc_multiple_sumstats.py
ADDED
@@ -0,0 +1,307 @@
import os
import numpy as np
import pandas as pd

import argparse
import logging
import multiprocessing
from collections import defaultdict
from pathlib import Path

from scipy.stats import norm
from tqdm.contrib.concurrent import process_map

import gsMap.jackknife as jk
from gsMap.config import add_spatial_ldsc_args, SpatialLDSCConfig
from gsMap.regression_read import _read_sumstats, _read_w_ld, _read_ref_ld_v2, _read_M_v2

logger = logging.getLogger(__name__)


# %%
def _coef_new(jknife):
    # Return the leading coefficient and its jackknife SE, rescaled by the mean sample size (Nbar)
    est_ = jknife.est[0, 0] / Nbar
    se_ = jknife.jknife_se[0, 0] / Nbar
    return est_, se_


def append_intercept(x):
    n_row = x.shape[0]
    intercept = np.ones((n_row, 1))
    x_new = np.concatenate((x, intercept), axis=1)
    return x_new


def filter_sumstats_by_chisq(sumstats, chisq_max):
    before_len = len(sumstats)
    if chisq_max is None:
        chisq_max = max(0.001 * sumstats.N.max(), 80)
        logger.info(f'No chi^2 threshold provided, using {chisq_max} as default')
    sumstats['chisq'] = sumstats.Z ** 2
    sumstats = sumstats[sumstats.chisq < chisq_max]
    after_len = len(sumstats)
    if after_len < before_len:
        logger.info(f'Removed {before_len - after_len} SNPs with chi^2 > {chisq_max} ({after_len} SNPs remain)')
    else:
        logger.info(f'No SNPs removed with chi^2 > {chisq_max} ({after_len} SNPs remain)')
    return sumstats


def aggregate(y, x, N, M, intercept=1):
    num = M * (np.mean(y) - intercept)
    denom = np.mean(np.multiply(x, N))
    return num / denom


def weights(ld, w_ld, N, M, hsq, intercept=1):
    M = float(M)
    hsq = np.clip(hsq, 0.0, 1.0)
    ld = np.maximum(ld, 1.0)
    w_ld = np.maximum(w_ld, 1.0)
    c = hsq * N / M
    het_w = 1.0 / (2 * np.square(intercept + np.multiply(c, ld)))
    oc_w = 1.0 / w_ld
    w = np.multiply(het_w, oc_w)
    return w

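`weights` implements the standard LDSC regression weights: a heteroskedasticity term 1 / (2 * (intercept + N * hsq * ld / M)^2), which downweights SNPs whose expected chi-square (and therefore variance) is large, multiplied by an overcounting term 1 / w_ld, which downweights SNPs tagged by many other regression SNPs. A small numeric sketch with made-up values:

# Illustrative sketch only; all numbers below are made up.
import numpy as np

N, M, hsq, intercept = 100_000.0, 1_000_000.0, 0.5, 1.0
ld = np.array([1.0, 50.0, 200.0])      # total LD scores of three hypothetical SNPs
w_ld = np.array([1.0, 30.0, 120.0])    # regression-weight LD scores of the same SNPs
c = hsq * N / M
het_w = 1.0 / (2 * np.square(intercept + c * ld))   # heteroskedasticity correction
oc_w = 1.0 / w_ld                                   # overcounting correction
print(het_w * oc_w)                                 # weights shrink as LD grows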

def jackknife_for_processmap(spot_id):
    # calculate the initial weight for each spot
    initial_w = (
        get_weight_optimized(sumstats, x_tot_precomputed_common_snp[:, spot_id], 10000, w_ld_common_snp, intercept=1)
        .astype(np.float32)
        .reshape((-1, 1)))

    # apply the weight to baseline annotation, spatial annotation and CHISQ
    initial_w_scaled = initial_w / np.sum(initial_w)
    baseline_annotation_spot = baseline_annotation * initial_w_scaled
    spatial_annotation_spot = spatial_annotation.iloc[:, spot_id].values.reshape((-1, 1)) * initial_w_scaled
    CHISQ = sumstats.chisq.to_numpy(dtype=np.float32).reshape((-1, 1)).copy()
    y = CHISQ * initial_w_scaled

    # run the jackknife
    x_focal = np.concatenate((spatial_annotation_spot,
                              baseline_annotation_spot), axis=1)
    try:
        jknife = jk.LstsqJackknifeFast(x_focal, y, n_blocks)
    # LinAlgError
    except np.linalg.LinAlgError as e:
        logger.warning(f'LinAlgError: {e}')
        return np.nan, np.nan
    return _coef_new(jknife)

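For each spot, the function above regresses the weighted per-SNP chi-square on a design matrix whose first column is that spot's spatial LD score and whose remaining columns are the sample-size-scaled baseline LD scores plus an intercept; `jk.LstsqJackknifeFast` then supplies block-jackknife estimates, of which `_coef_new` keeps only the leading (spatial) coefficient and its standard error. A shape sketch with random data (n_snp, k, and the inputs are placeholders, not real annotations):

# Illustrative shape sketch only; random data stands in for real annotations.
import numpy as np
import gsMap.jackknife as jk

n_snp, k, n_blocks = 1_000, 5, 200
x_focal = np.random.rand(n_snp, 1 + k + 1).astype(np.float32)    # [spatial | baseline | intercept]
y = np.random.rand(n_snp, 1).astype(np.float32)                  # weighted chi-square
jknife = jk.LstsqJackknifeFast(x_focal, y, n_blocks)
beta, se = jknife.est[0, 0], jknife.jknife_se[0, 0]               # leading (spatial) term only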

# Updated function
def get_weight_optimized(sumstats, x_tot_precomputed, M_tot, w_ld, intercept=1):
    tot_agg = aggregate(sumstats.chisq, x_tot_precomputed, sumstats.N, M_tot, intercept)
    initial_w = weights(x_tot_precomputed, w_ld.LD_weights.values, sumstats.N.values, M_tot, tot_agg, intercept)
    initial_w = np.sqrt(initial_w)
    return initial_w


def _preprocess_sumstats(trait_name, sumstat_file_path, baseline_and_w_ld_common_snp: pd.Index, chisq_max=None):
    # Load the GWAS summary statistics
    sumstats = _read_sumstats(fh=sumstat_file_path, alleles=False, dropna=False)
    sumstats.set_index('SNP', inplace=True)
    sumstats = sumstats.astype(np.float32)
    sumstats = filter_sumstats_by_chisq(sumstats, chisq_max)

    # NB: The intersection order is essential for keeping the SNPs ordered by their BP location
    common_snp = baseline_and_w_ld_common_snp.intersection(sumstats.index)
    if len(common_snp) < 200000:
        logger.warning(f'WARNING: number of SNPs less than 200k; for {trait_name} this is almost always bad.')

    sumstats = sumstats.loc[common_snp]
    return sumstats


def _get_sumstats_from_sumstats_dict(sumstats_config_dict: dict, baseline_and_w_ld_common_snp: pd.Index,
                                     chisq_max=None):
    # First validate that all sumstats files exist
    logger.info('Validating sumstats files...')
    for trait_name, sumstat_file_path in sumstats_config_dict.items():
        if not os.path.exists(sumstat_file_path):
            raise FileNotFoundError(f'{sumstat_file_path} not found')
    # Then load all sumstats
    sumstats_cleaned_dict = {}
    for trait_name, sumstat_file_path in sumstats_config_dict.items():
        sumstats_cleaned_dict[trait_name] = _preprocess_sumstats(trait_name, sumstat_file_path,
                                                                 baseline_and_w_ld_common_snp, chisq_max)
    logger.info('Cleaned sumstats loaded')
    return sumstats_cleaned_dict


def run_spatial_ldsc(config: SpatialLDSCConfig):
    global spatial_annotation, baseline_annotation, n_blocks, Nbar, sumstats, x_tot_precomputed_common_snp, w_ld_common_snp
    # config
    n_blocks = config.n_blocks
    sample_name = config.sample_name

    print(f'------Running Spatial LDSC for {sample_name}...')
    # Load the regression weights
    w_ld = _read_w_ld(config.w_file)
    w_ld_cname = w_ld.columns[1]
    w_ld.set_index('SNP', inplace=True)

    # Load the baseline annotations
    ld_file_baseline = f'{config.ldscore_input_dir}/baseline/baseline.'
    ref_ld_baseline = _read_ref_ld_v2(ld_file_baseline)
    n_annot_baseline = len(ref_ld_baseline.columns)
    M_annot_baseline = _read_M_v2(ld_file_baseline, n_annot_baseline, config.not_M_5_50)

    # Common SNPs between the baseline annotations and the regression weights
    baseline_and_w_ld_common_snp = ref_ld_baseline.index.intersection(w_ld.index)
    if len(baseline_and_w_ld_common_snp) < 200000:
        logger.warning(f'WARNING: number of SNPs less than 200k; for {sample_name} this is almost always bad.')
    ref_ld_baseline = ref_ld_baseline.loc[baseline_and_w_ld_common_snp]

    # Load additional baseline annotations
    if config.use_additional_baseline_annotation:
        ld_file_baseline_additional = f'{config.ldscore_input_dir}/additional_baseline/baseline.'
        ref_ld_baseline_additional = _read_ref_ld_v2(ld_file_baseline_additional)
        n_annot_baseline_additional = len(ref_ld_baseline_additional.columns)
        logger.info(f'{len(ref_ld_baseline_additional.columns)} additional baseline annotations loaded')
        # M_annot_baseline_additional = _read_M_v2(ld_file_baseline_additional, n_annot_baseline_additional,
        #                                          config.not_M_5_50)
        ref_ld_baseline_additional = ref_ld_baseline_additional.loc[baseline_and_w_ld_common_snp]
        ref_ld_baseline = pd.concat([ref_ld_baseline, ref_ld_baseline_additional], axis=1)
        del ref_ld_baseline_additional

    w_ld = w_ld.loc[baseline_and_w_ld_common_snp]

    # Clean the sumstats
    sumstats_cleaned_dict = _get_sumstats_from_sumstats_dict(config.sumstats_config_dict, baseline_and_w_ld_common_snp,
                                                             chisq_max=config.chisq_max)

    # Detect available chunk files
    all_file = os.listdir(config.ldscore_input_dir)
    if config.all_chunk is None:
        all_chunk = sum('chunk' in name for name in all_file)
        print(f'\t')
        print(f'Found {all_chunk} chunked files')
    else:
        all_chunk = config.all_chunk
        print(f'Using {all_chunk} chunked files as provided by argument')
        print(f'\t')
        print(f'Input {all_chunk} chunked files')

    # Process each chunk
    output_dict = defaultdict(list)
    for chunk_index in range(1, all_chunk + 1):
        print(f'------Processing chunk-{chunk_index}')

        # Load the spatial annotations for this chunk
        ld_file_spatial = f'{config.ldscore_input_dir}/{sample_name}_chunk{chunk_index}/{sample_name}.'
        ref_ld_spatial = _read_ref_ld_v2(ld_file_spatial)
        ref_ld_spatial = ref_ld_spatial.loc[baseline_and_w_ld_common_snp]
        ref_ld_spatial = ref_ld_spatial.astype(np.float32, copy=False)

        # Get the x_tot_precomputed matrix by adding baseline and spatial annotation
        x_tot_precomputed = ref_ld_spatial + ref_ld_baseline.sum(axis=1).values.reshape((-1, 1))

        for trait_name, sumstats in sumstats_cleaned_dict.items():
            logger.info(f'Processing {trait_name}...')

            # Filter LD scores by common SNPs
            common_snp = sumstats.index
            spatial_annotation = ref_ld_spatial.loc[common_snp].astype(np.float32, copy=False)
            spatial_annotation_cnames = spatial_annotation.columns
            baseline_annotation = ref_ld_baseline.loc[common_snp].astype(np.float32, copy=False)
            w_ld_common_snp = w_ld.loc[common_snp].astype(np.float32, copy=False)
            x_tot_precomputed_common_snp = x_tot_precomputed.loc[common_snp].values

            # Weight the baseline annotation by N
            baseline_annotation = baseline_annotation * sumstats.N.values.reshape((-1, 1)) / sumstats.N.mean()
            # Append the intercept column
            baseline_annotation = append_intercept(baseline_annotation)

            # Run the jackknife
            Nbar = sumstats.N.mean()
            chunk_size = spatial_annotation.shape[1]
            out_chunk = process_map(jackknife_for_processmap, range(chunk_size),
                                    max_workers=config.num_processes,
                                    chunksize=10,
                                    desc=f'LDSC chunk-{chunk_index}: {trait_name}')

            # Cache the results
            out_chunk = pd.DataFrame.from_records(out_chunk,
                                                  columns=['beta', 'se'],
                                                  index=spatial_annotation_cnames)
            # Get and drop the spots with NaN estimates
            nan_spots = out_chunk[out_chunk.isna().any(axis=1)].index
            logger.info(f'Nan spots: {nan_spots} in chunk-{chunk_index} for {trait_name}. They are removed.')
            out_chunk = out_chunk.dropna()

            out_chunk['z'] = out_chunk.beta / out_chunk.se
            out_chunk['p'] = norm.sf(out_chunk['z'])
            output_dict[trait_name].append(out_chunk)

        # Garbage collection
        del spatial_annotation

    # Save the results
    out_dir = Path(config.ldsc_save_dir)
    out_dir.mkdir(parents=True, exist_ok=True, mode=0o777)
    for trait_name, out_chunk_list in output_dict.items():
        out_all = pd.concat(out_chunk_list, axis=0)
        out_file_name = out_dir / f'{sample_name}_{trait_name}.csv.gz'
        out_all['spot'] = out_all.index
        out_all = out_all[['spot', 'beta', 'se', 'z', 'p']]
        out_all.to_csv(out_file_name, compression='gzip', index=False)
        logger.info(f'Output saved to {out_file_name} for {trait_name}')
    logger.info(f'------Spatial LDSC for {sample_name} finished!')

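Each trait is written out as a gzipped CSV named `{sample_name}_{trait_name}.csv.gz` under `ldsc_save_dir`, with one row per spot and columns spot, beta, se, z, and p, where p is the upper-tail probability `norm.sf(z)` (a one-sided test for a positive spatial effect). A sketch of reading one of these outputs back (the file name is hypothetical, assembled from the sample and trait names used in the test block below):

# Illustrative sketch only; the file name is hypothetical.
import pandas as pd

results = pd.read_csv('Cortex_151507_adult1_adult2_onset_asthma.csv.gz')
print(results.columns.tolist())            # ['spot', 'beta', 'se', 'z', 'p']
top_spots = results.nsmallest(10, 'p')     # spots with the strongest trait association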

# %%
if __name__ == '__main__':
    # Main function of analysis
    parser = argparse.ArgumentParser(
        description="Run Spatial LD Score Regression (LDSC) analysis for GWAS and spatial transcriptomic data."
    )
    parser = add_spatial_ldsc_args(parser)
    TEST = True
    if TEST:
        gwas_root = "/storage/yangjianLab/songliyang/GWAS_trait/LDSC"
        gwas_trait = "/storage/yangjianLab/songliyang/GWAS_trait/GWAS_Public_Use_MaxPower.csv"
        root = "/storage/yangjianLab/songliyang/SpatialData/Data/Brain/Human/Nature_Neuroscience_2021/processed/h5ad"

        name = 'Cortex_151507'
        spe_name = name
        # ld_pth = f"/storage/yangjianLab/songliyang/SpatialData/Data/Brain/Human/Nature_Neuroscience_2021/annotation/{spe_name}/snp_annotation"
        ld_pth = f"/storage/yangjianLab/chenwenhao/projects/202312_gsMap/data/gsMap_test/Nature_Neuroscience_2021/snake_workdir/{name}/generate_ldscore"
        out_pth = f"/storage/yangjianLab/chenwenhao/projects/202312_gsMap/data/gsMap_test/Nature_Neuroscience_2021/snake_workdir/{name}/ldsc"
        gwas_file = "ADULT1_ADULT2_ONSET_ASTHMA"
        # Prepare the arguments list using f-strings
        args_list = [
            "--h2", f"{gwas_root}/{gwas_file}.sumstats.gz",
            "--w_file", "/storage/yangjianLab/sharedata/LDSC_resource/LDSC_SEG_ldscores/weights_hm3_no_hla/weights.",
            "--sample_name", spe_name,
            "--num_processes", '4',
            "--ldscore_input_dir", ld_pth,
            "--ldsc_save_dir", out_pth,
            '--trait_name', 'adult1_adult2_onset_asthma'
        ]
        # args = parser.parse_args(args_list)
    else:
        args = parser.parse_args()

    os.chdir('/storage/yangjianLab/chenwenhao/tmp/gsMap_Height_debug')
    TASK_ID = 16
    spe_name = f'E{TASK_ID}.5_E1S1'
    config = SpatialLDSCConfig(**{'all_chunk': None,
                                  'chisq_max': None,
                                  # 'sumstats_file': '/storage/yangjianLab/songliyang/GWAS_trait/LDSC/GIANT_EUR_Height_2022_Nature.sumstats.gz',
                                  'ldsc_save_dir': f'{spe_name}/ldsc_results_three_row_sum_sub_config_traits',
                                  'ldscore_input_dir': '/storage/yangjianLab/songliyang/SpatialData/Data/Embryo/Mice/Cell_MOSTA/annotation/E16.5_E1S1/generate_ldscore_new',
                                  'n_blocks': 200,
                                  'not_M_5_50': False,
                                  'num_processes': 15,
                                  'sample_name': spe_name,
                                  # 'trait_name': 'GIANT_EUR_Height_2022_Nature',
                                  'sumstats_config_file': '/storage/yangjianLab/chenwenhao/projects/202312_gsMap/src/gsMap/example/sumstats_config_sub.yaml',
                                  'w_file': '/storage/yangjianLab/sharedata/LDSC_resource/LDSC_SEG_ldscores/weights_hm3_no_hla/weights.'
                                  })
    # config = SpatialLDSCConfig(**vars(args))
    run_spatial_ldsc(config)
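As shipped, the `__main__` block is a debugging harness: with `TEST = True`, `args_list` is assembled but never parsed (the `parse_args(args_list)` call is commented out), the working directory is switched to a hard-coded debug path, and the `SpatialLDSCConfig` that actually runs is built from literal cluster paths rather than from `args`. The commented-out argparse route would look roughly like the following sketch (assuming the flags registered by `add_spatial_ldsc_args` mirror the `SpatialLDSCConfig` fields):

# Illustrative sketch of the non-TEST path, not part of the shipped file.
import argparse
from gsMap.config import add_spatial_ldsc_args, SpatialLDSCConfig
from gsMap.spatial_ldsc_multiple_sumstats import run_spatial_ldsc

parser = argparse.ArgumentParser(description="Run Spatial LDSC")
parser = add_spatial_ldsc_args(parser)
args = parser.parse_args()                 # e.g. --sample_name, --w_file, --ldscore_input_dir, --ldsc_save_dir
config = SpatialLDSCConfig(**vars(args))
run_spatial_ldsc(config)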