gsMap 1.71__py3-none-any.whl → 1.71.1__py3-none-any.whl
This diff shows the content of publicly available package versions released to a supported registry. It is provided for informational purposes only and reflects the changes between the two versions as they appear in the public registry.
- gsMap/GNN/__init__.py +0 -0
- gsMap/GNN/adjacency_matrix.py +75 -75
- gsMap/GNN/model.py +90 -89
- gsMap/GNN/train.py +0 -0
- gsMap/__init__.py +5 -5
- gsMap/__main__.py +2 -2
- gsMap/cauchy_combination_test.py +141 -141
- gsMap/config.py +805 -805
- gsMap/diagnosis.py +273 -273
- gsMap/find_latent_representation.py +133 -133
- gsMap/format_sumstats.py +407 -407
- gsMap/generate_ldscore.py +618 -618
- gsMap/latent_to_gene.py +234 -234
- gsMap/main.py +31 -31
- gsMap/report.py +160 -160
- gsMap/run_all_mode.py +194 -194
- gsMap/setup.py +0 -0
- gsMap/spatial_ldsc_multiple_sumstats.py +380 -380
- gsMap/templates/report_template.html +198 -198
- gsMap/utils/__init__.py +0 -0
- gsMap/utils/generate_r2_matrix.py +735 -735
- gsMap/utils/jackknife.py +514 -514
- gsMap/utils/make_annotations.py +518 -518
- gsMap/utils/manhattan_plot.py +639 -639
- gsMap/utils/regression_read.py +294 -294
- gsMap/visualize.py +198 -198
- {gsmap-1.71.dist-info → gsmap-1.71.1.dist-info}/LICENSE +21 -21
- {gsmap-1.71.dist-info → gsmap-1.71.1.dist-info}/METADATA +2 -2
- gsmap-1.71.1.dist-info/RECORD +31 -0
- gsmap-1.71.dist-info/RECORD +0 -31
- {gsmap-1.71.dist-info → gsmap-1.71.1.dist-info}/WHEEL +0 -0
- {gsmap-1.71.dist-info → gsmap-1.71.1.dist-info}/entry_points.txt +0 -0
gsMap/utils/regression_read.py
CHANGED
@@ -1,294 +1,294 @@
The hunk rewrites the entire file: all 294 lines are removed and re-added, and the removed and added content is line-for-line identical in the rendered diff (no visible content change). The file reads:

import os

import numpy as np
import pandas as pd


# Functions for reading GWAS data
def _read_sumstats(fh, alleles=False, dropna=False):
    '''
    Parse GWAS summary statistics.
    '''
    print('Reading summary statistics from {S} ...'.format(S=fh))
    sumstats = ps_sumstats(fh, alleles=alleles, dropna=dropna)
    print('Read summary statistics for {N} SNPs.'.format(N=len(sumstats)))

    m = len(sumstats)
    sumstats = sumstats.drop_duplicates(subset='SNP')
    if m > len(sumstats):
        print('Dropped {M} SNPs with duplicated rs numbers.'.format(M=m - len(sumstats)))

    return sumstats


def ps_sumstats(fh, alleles=False, dropna=True):
    '''
    Parses .sumstats files. See docs/file_formats_sumstats.txt.
    '''
    dtype_dict = {'SNP': str, 'Z': float, 'N': float, 'A1': str, 'A2': str}
    compression = get_compression(fh)
    usecols = ['SNP', 'Z', 'N']
    if alleles:
        usecols += ['A1', 'A2']

    try:
        x = read_csv(fh, usecols=usecols, dtype=dtype_dict, compression=compression)
    except (AttributeError, ValueError) as e:
        raise ValueError('Improperly formatted sumstats file: ' + str(e.args))

    if dropna:
        x = x.dropna(how='any')

    return x


def get_compression(fh):
    '''
    Determine the compression format to pass to read_csv from the file extension.
    '''
    if fh.endswith('gz'):
        compression = 'gzip'
    elif fh.endswith('bz2'):
        compression = 'bz2'
    else:
        compression = None

    return compression


def read_csv(fh, **kwargs):
    '''
    Read whitespace-delimited data, treating '.' as missing.
    '''
    return pd.read_csv(fh, sep=r'\s+', na_values='.', **kwargs)


# Functions for reading LD scores
def which_compression(fh):
    '''
    Given a file prefix, figure out what sort of compression to use.
    '''
    if os.access(fh + '.bz2', os.R_OK):
        suffix = '.bz2'
        compression = 'bz2'
    elif os.access(fh + '.gz', os.R_OK):
        suffix = '.gz'
        compression = 'gzip'
    elif os.access(fh + '.parquet', os.R_OK):
        suffix = '.parquet'
        compression = 'parquet'
    elif os.access(fh + '.feather', os.R_OK):
        suffix = '.feather'
        compression = 'feather'
    elif os.access(fh, os.R_OK):
        suffix = ''
        compression = None
    else:
        raise IOError('Could not open {F}[./gz/bz2/parquet/feather]'.format(F=fh))

    return suffix, compression


def _read_ref_ld(ld_file):
    '''
    Read reference LD scores for chromosomes 1-22 and concatenate them.
    '''
    suffix = '.l2.ldscore'
    file = ld_file
    first_fh = f'{file}1{suffix}'
    s, compression = which_compression(first_fh)

    ldscore_array = []
    print(f'Reading ld score annotations from {file}[1-22]{suffix}.{compression}')

    for chr in range(1, 23):
        file_chr = f'{file}{chr}{suffix}{s}'
        if compression == 'parquet':
            x = pd.read_parquet(file_chr)
        elif compression == 'feather':
            x = pd.read_feather(file_chr)
        else:
            x = pd.read_csv(file_chr, compression=compression, sep='\t')

        x = x.sort_values(by=['CHR', 'BP'])  # SEs will be wrong unless sorted

        columns_to_drop = ['MAF', 'CM', 'Gene', 'TSS', 'CHR', 'BP']
        columns_to_drop = [col for col in columns_to_drop if col in x.columns]
        x = x.drop(columns_to_drop, axis=1)

        ldscore_array.append(x)

    ref_ld = pd.concat(ldscore_array, axis=0)
    return ref_ld


def _read_ref_ld_v2(ld_file):
    '''
    Feather-only variant of _read_ref_ld that indexes the result by SNP.
    '''
    suffix = '.l2.ldscore'
    file = ld_file
    first_fh = f'{file}1{suffix}'
    s, compression = which_compression(first_fh)
    print(f'Reading ld score annotations from {file}[1-22]{suffix}.{compression}')
    ref_ld = pd.concat(
        [pd.read_feather(f'{file}{chr}{suffix}{s}') for chr in range(1, 23)], axis=0
    )
    # Set the SNP column as the index.
    ref_ld.rename(columns={'index': 'SNP'}, inplace=True)
    ref_ld.set_index('SNP', inplace=True)
    return ref_ld


def _read_M_v2(ld_file, n_annot, not_M_5_50):
    '''
    Read per-chromosome M files with np.loadtxt and sum them across chromosomes.
    '''
    suffix = '.l2.M'
    if not not_M_5_50:
        suffix += '_5_50'
    M_annot = np.array(
        [np.loadtxt(f'{ld_file}{chr}{suffix}') for chr in range(1, 23)]
    )
    assert M_annot.shape == (22, n_annot)
    return M_annot.sum(axis=0).reshape((1, n_annot))


# Functions for reading M annotations
def _read_M(ld_file, n_annot, not_M_5_50):
    '''
    Read M (--M, --M-file, etc).
    '''
    M_annot = M(ld_file, common=(not not_M_5_50))

    try:
        M_annot = np.array(M_annot).reshape((1, n_annot))
    except ValueError as e:
        raise ValueError('# terms in --M must match # of LD Scores in --ref-ld.\n' + str(e.args))
    return M_annot


def M(fh, common=False):
    '''
    Parses .l{N}.M files, split across the 22 autosomes.
    '''
    suffix = '.l2.M'
    if common:
        suffix += '_5_50'

    M_array = []
    for i in range(1, 23):
        M_current = pd.read_csv(f'{fh}{i}' + suffix, header=None)
        M_array.append(M_current)

    M_array = pd.concat(M_array, axis=1).sum(axis=1)

    return np.array(M_array).reshape((1, len(M_array)))


def _check_variance(M_annot, ref_ld):
    '''
    Remove zero-variance LD Scores.
    '''
    ii = ref_ld.iloc[:, 1:].var() == 0  # NB there is a SNP column here
    if ii.all():
        raise ValueError('All LD Scores have zero variance.')
    else:
        print('Removing partitioned LD Scores with zero variance.')
        ii_snp = np.array([True] + list(~ii))  # keep the SNP column
        ii_m = np.array(~ii)
        ref_ld = ref_ld.iloc[:, ii_snp]
        M_annot = M_annot[:, ii_m]

    return M_annot, ref_ld, ii


def _check_variance_v2(M_annot, ref_ld):
    '''
    Variant of _check_variance for a SNP-indexed ref_ld (no SNP column).
    '''
    ii = ref_ld.var() == 0
    if ii.all():
        raise ValueError('All LD Scores have zero variance.')
    elif not ii.any():
        print('No partitioned LD Scores have zero variance.')
    else:
        ii_snp = ii_m = np.array(~ii)
        print(f'Removing {sum(ii)} partitioned LD Scores with zero variance.')
        ref_ld = ref_ld.iloc[:, ii_snp]
        M_annot = M_annot[:, ii_m]
    return M_annot, ref_ld


# Functions for reading regression weights.
# NB: the file redefines which_compression here, identically to the
# definition above; Python simply rebinds the name.
def which_compression(fh):
    '''
    Given a file prefix, figure out what sort of compression to use.
    '''
    if os.access(fh + '.bz2', os.R_OK):
        suffix = '.bz2'
        compression = 'bz2'
    elif os.access(fh + '.gz', os.R_OK):
        suffix = '.gz'
        compression = 'gzip'
    elif os.access(fh + '.parquet', os.R_OK):
        suffix = '.parquet'
        compression = 'parquet'
    elif os.access(fh + '.feather', os.R_OK):
        suffix = '.feather'
        compression = 'feather'
    elif os.access(fh, os.R_OK):
        suffix = ''
        compression = None
    else:
        raise IOError('Could not open {F}[./gz/bz2/parquet/feather]'.format(F=fh))

    return suffix, compression


def _read_w_ld(w_file):
    '''
    Read regression-weight LD scores for chromosomes 1-22 and concatenate them.
    '''
    suffix = '.l2.ldscore'
    file = w_file
    first_fh = f'{file}1{suffix}'
    s, compression = which_compression(first_fh)

    w_array = []
    print(f'Reading ld score annotations from {file}[1-22]{suffix}.{compression}')

    for chr in range(1, 23):
        file_chr = f'{file}{chr}{suffix}{s}'
        if compression == 'parquet':
            x = pd.read_parquet(file_chr)
        elif compression == 'feather':
            x = pd.read_feather(file_chr)
        else:
            x = pd.read_csv(file_chr, compression=compression, sep='\t')

        x = x.sort_values(by=['CHR', 'BP'])

        columns_to_drop = ['MAF', 'CM', 'Gene', 'TSS', 'CHR', 'BP']
        columns_to_drop = [col for col in columns_to_drop if col in x.columns]
        x = x.drop(columns_to_drop, axis=1)

        w_array.append(x)

    w_ld = pd.concat(w_array, axis=0)
    w_ld.columns = ['SNP', 'LD_weights']

    return w_ld


# Functions for merging
def _merge_and_log(ld, sumstats, noun):
    '''
    Wrap smart merge with log messages about # of SNPs.
    '''
    sumstats = smart_merge(ld, sumstats)
    msg = 'After merging with {F}, {N} SNPs remain.'
    if len(sumstats) == 0:
        raise ValueError(msg.format(N=len(sumstats), F=noun))
    else:
        print(msg.format(N=len(sumstats), F=noun))

    return sumstats


def smart_merge(x, y):
    '''
    Check if SNP columns are equal. If so, save time by using concat instead of merge.
    '''
    if len(x) == len(y) and (x.index == y.index).all() and (x.SNP == y.SNP).all():
        x = x.reset_index(drop=True)
        y = y.reset_index(drop=True).drop(columns='SNP')  # positional axis arg was removed in pandas 2.0
        out = pd.concat([x, y], axis=1)
    else:
        out = pd.merge(x, y, how='inner', on='SNP')
    return out
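For orientation, a minimal sketch of how these readers might be chained in an LDSC-style workflow. The paths 'trait.sumstats.gz', 'baseline.', and 'weights.' are hypothetical placeholders (each prefix must resolve to per-chromosome files such as baseline.1.l2.ldscore.gz through baseline.22.l2.ldscore.gz, plus the matching .l2.M_5_50 files); this is not gsMap's shipped pipeline code.

from gsMap.utils import regression_read as rr

# Hypothetical inputs: a munged sumstats file plus per-chromosome
# reference LD scores and regression weights under two prefixes.
sumstats = rr._read_sumstats('trait.sumstats.gz', alleles=False, dropna=True)

ref_ld = rr._read_ref_ld('baseline.')   # SNP column plus one column per annotation
n_annot = ref_ld.shape[1] - 1           # first column is SNP
M_annot = rr._read_M('baseline.', n_annot, not_M_5_50=False)
M_annot, ref_ld, _ = rr._check_variance(M_annot, ref_ld)

w_ld = rr._read_w_ld('weights.')        # columns ['SNP', 'LD_weights']

# Align summary statistics, reference LD scores, and weights on shared SNPs.
sumstats = rr._merge_and_log(ref_ld, sumstats, 'reference panel LD')
sumstats = rr._merge_and_log(w_ld, sumstats, 'regression SNP LD')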