gsMap 1.71-py3-none-any.whl → 1.71.1-py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -1,294 +1,294 @@
import numpy as np
import pandas as pd
import os


# Functions for reading GWAS data
def _read_sumstats(fh, alleles=False, dropna=False):
    '''
    Parse GWAS summary statistics.
    '''
    print('Reading summary statistics from {S} ...'.format(S=fh))
    sumstats = ps_sumstats(fh, alleles=alleles, dropna=dropna)
    print('Read summary statistics for {N} SNPs.'.format(N=len(sumstats)))

    m = len(sumstats)
    sumstats = sumstats.drop_duplicates(subset='SNP')
    if m > len(sumstats):
        print('Dropped {M} SNPs with duplicated rs numbers.'.format(M=m - len(sumstats)))

    return sumstats


def ps_sumstats(fh, alleles=False, dropna=True):
    '''
    Parses .sumstats files. See docs/file_formats_sumstats.txt.
    '''
    dtype_dict = {'SNP': str, 'Z': float, 'N': float, 'A1': str, 'A2': str}
    compression = get_compression(fh)
    usecols = ['SNP', 'Z', 'N']
    if alleles:
        usecols += ['A1', 'A2']

    try:
        x = read_csv(fh, usecols=usecols, dtype=dtype_dict, compression=compression)
    except (AttributeError, ValueError) as e:
        raise ValueError('Improperly formatted sumstats file: ' + str(e.args))

    if dropna:
        x = x.dropna(how='any')

    return x


def get_compression(fh):
    '''
    Determine the compression format, if any, to pass to read_csv.
    '''
    if fh.endswith('gz'):
        compression = 'gzip'
    elif fh.endswith('bz2'):
        compression = 'bz2'
    else:
        compression = None

    return compression


def read_csv(fh, **kwargs):
    '''
    Read whitespace-delimited csv data, treating '.' as missing.
    '''
    return pd.read_csv(fh, sep=r'\s+', na_values='.', **kwargs)
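

# Usage sketch (hedged): 'trait.sumstats.gz' is a hypothetical gzip-compressed,
# whitespace-delimited file with SNP, Z and N columns. get_compression infers
# 'gzip' from the extension, and _read_sumstats drops duplicated rs numbers.
def _example_read_sumstats():
    return _read_sumstats('trait.sumstats.gz', alleles=False, dropna=True)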


# Functions for reading LD scores
def which_compression(fh):
    '''
    Given a file prefix, figure out what sort of compression to use.
    '''
    if os.access(fh + '.bz2', os.R_OK):
        suffix = '.bz2'
        compression = 'bz2'
    elif os.access(fh + '.gz', os.R_OK):
        suffix = '.gz'
        compression = 'gzip'
    elif os.access(fh + '.parquet', os.R_OK):
        suffix = '.parquet'
        compression = 'parquet'
    elif os.access(fh + '.feather', os.R_OK):
        suffix = '.feather'
        compression = 'feather'
    elif os.access(fh, os.R_OK):
        suffix = ''
        compression = None
    else:
        raise IOError('Could not open {F}[./gz/bz2/parquet/feather]'.format(F=fh))

    return suffix, compression


def _read_ref_ld(ld_file):
    suffix = '.l2.ldscore'
    file = ld_file
    first_fh = f'{file}1{suffix}'
    s, compression = which_compression(first_fh)

    ldscore_array = []
    print(f'Reading ld score annotations from {file}[1-22]{suffix}.{compression}')

    for chrom in range(1, 23):
        file_chr = f'{file}{chrom}{suffix}{s}'
        if compression == 'parquet':
            x = pd.read_parquet(file_chr)
        elif compression == 'feather':
            x = pd.read_feather(file_chr)
        else:
            x = pd.read_csv(file_chr, compression=compression, sep='\t')

        x = x.sort_values(by=['CHR', 'BP'])  # SEs will be wrong unless sorted

        columns_to_drop = ['MAF', 'CM', 'Gene', 'TSS', 'CHR', 'BP']
        columns_to_drop = [col for col in columns_to_drop if col in x.columns]
        x = x.drop(columns_to_drop, axis=1)

        ldscore_array.append(x)

    ref_ld = pd.concat(ldscore_array, axis=0)
    return ref_ld


def _read_ref_ld_v2(ld_file):
    suffix = '.l2.ldscore'
    file = ld_file
    first_fh = f'{file}1{suffix}'
    s, compression = which_compression(first_fh)
    print(f'Reading ld score annotations from {file}[1-22]{suffix}.{compression}')
    # NB: assumes the per-chromosome LD score files are feather format
    ref_ld = pd.concat(
        [pd.read_feather(f'{file}{chrom}{suffix}{s}') for chrom in range(1, 23)], axis=0
    )
    # the rsIDs arrive in an 'index' column; promote it to the index
    ref_ld.rename(columns={'index': 'SNP'}, inplace=True)
    ref_ld.set_index('SNP', inplace=True)
    return ref_ld
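

# Usage sketch (hedged): 'baseline.' is a hypothetical prefix; per-chromosome
# files such as 'baseline.1.l2.ldscore.feather' are assumed to exist for
# chromosomes 1-22.
def _example_read_ref_ld():
    return _read_ref_ld_v2('baseline.')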


def _read_M_v2(ld_file, n_annot, not_M_5_50):
    suffix = '.l2.M'
    if not not_M_5_50:
        suffix += '_5_50'
    M_annot = np.array(
        [np.loadtxt(f'{ld_file}{chrom}{suffix}') for chrom in range(1, 23)]
    )
    assert M_annot.shape == (22, n_annot)
    return M_annot.sum(axis=0).reshape((1, n_annot))


# Functions for reading M annotations
def _read_M(ld_file, n_annot, not_M_5_50):
    '''
    Read M (--M, --M-file, etc).
    '''
    M_annot = M(ld_file, common=(not not_M_5_50))

    try:
        M_annot = np.array(M_annot).reshape((1, n_annot))
    except ValueError as e:
        raise ValueError('# terms in --M must match # of LD Scores in --ref-ld.\n' + str(e.args))
    return M_annot


def M(fh, common=False):
    '''
    Parses .l{N}.M files split across the 22 autosomes.
    '''
    suffix = '.l2.M'
    if common:
        suffix += '_5_50'

    M_array = []
    for i in range(1, 23):
        M_current = pd.read_csv(f'{fh}{i}{suffix}', header=None)
        M_array.append(M_current)

    M_array = pd.concat(M_array, axis=1).sum(axis=1)

    return np.array(M_array).reshape((1, len(M_array)))
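

# Usage sketch (hedged): with the hypothetical 'baseline.' prefix, this reads
# 'baseline.{chrom}.l2.M_5_50' for chromosomes 1-22 and returns a 1 x n_annot
# array of per-annotation SNP counts.
def _example_read_M(n_annot):
    return _read_M('baseline.', n_annot, not_M_5_50=False)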


def _check_variance(M_annot, ref_ld):
    '''
    Remove zero-variance LD Scores.
    '''
    ii = ref_ld.iloc[:, 1:].var() == 0  # NB there is a SNP column here
    if ii.all():
        raise ValueError('All LD Scores have zero variance.')
    else:
        print('Removing partitioned LD Scores with zero variance.')
        ii_snp = np.array([True] + list(~ii))
        ii_m = np.array(~ii)
        ref_ld = ref_ld.iloc[:, ii_snp]
        M_annot = M_annot[:, ii_m]

    return M_annot, ref_ld, ii


def _check_variance_v2(M_annot, ref_ld):
    ii = ref_ld.var() == 0
    if ii.all():
        raise ValueError('All LD Scores have zero variance.')
    elif not ii.any():
        print('No partitioned LD Scores have zero variance.')
    else:
        ii_snp = ii_m = np.array(~ii)
        print(f'Removing {sum(ii)} partitioned LD Scores with zero variance.')
        ref_ld = ref_ld.iloc[:, ii_snp]
        M_annot = M_annot[:, ii_m]
    return M_annot, ref_ld
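

# Minimal self-contained demo (hedged, illustrative data only): the second LD
# Score column is constant, so both it and the matching M entry are dropped.
def _example_check_variance():
    ref_ld = pd.DataFrame({'L2_a': [1.0, 2.0, 3.0], 'L2_b': [5.0, 5.0, 5.0]})
    M_annot = np.array([[100.0, 50.0]])
    return _check_variance_v2(M_annot, ref_ld)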


# Functions for reading regression weights (reuses which_compression above)


def _read_w_ld(w_file):
    suffix = '.l2.ldscore'
    file = w_file
    first_fh = f'{file}1{suffix}'
    s, compression = which_compression(first_fh)

    w_array = []
    print(f'Reading ld score annotations from {file}[1-22]{suffix}.{compression}')

    for chrom in range(1, 23):
        file_chr = f'{file}{chrom}{suffix}{s}'
        if compression == 'parquet':
            x = pd.read_parquet(file_chr)
        elif compression == 'feather':
            x = pd.read_feather(file_chr)
        else:
            x = pd.read_csv(file_chr, compression=compression, sep='\t')

        x = x.sort_values(by=['CHR', 'BP'])

        columns_to_drop = ['MAF', 'CM', 'Gene', 'TSS', 'CHR', 'BP']
        columns_to_drop = [col for col in columns_to_drop if col in x.columns]
        x = x.drop(columns_to_drop, axis=1)

        w_array.append(x)

    w_ld = pd.concat(w_array, axis=0)
    # after dropping the annotation columns, only SNP and one LD Score column remain
    w_ld.columns = ['SNP', 'LD_weights']

    return w_ld
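

# Usage sketch (hedged): 'weights.' is a hypothetical prefix for per-chromosome
# regression-weight LD score files, e.g. 'weights.1.l2.ldscore.gz'.
def _example_read_w_ld():
    return _read_w_ld('weights.')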


# Functions for merging
def _merge_and_log(ld, sumstats, noun):
    '''
    Wrap smart merge with log messages about # of SNPs.
    '''
    sumstats = smart_merge(ld, sumstats)
    msg = 'After merging with {F}, {N} SNPs remain.'
    if len(sumstats) == 0:
        raise ValueError(msg.format(N=len(sumstats), F=noun))
    else:
        print(msg.format(N=len(sumstats), F=noun))

    return sumstats


def smart_merge(x, y):
    '''
    Check if SNP columns are equal. If so, save time by using concat instead of merge.
    '''
    if len(x) == len(y) and (x.index == y.index).all() and (x.SNP == y.SNP).all():
        x = x.reset_index(drop=True)
        y = y.reset_index(drop=True).drop(columns='SNP')
        out = pd.concat([x, y], axis=1)
    else:
        out = pd.merge(x, y, how='inner', on='SNP')
    return out
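

# End-to-end sketch (hedged; all prefixes and paths hypothetical). Taken
# together, these helpers cover the usual LD Score regression preprocessing
# steps: read summary statistics, reference and weight LD scores, and M
# annotation counts, then merge everything on SNP via _merge_and_log.
def _example_pipeline(n_annot):
    sumstats = _read_sumstats('trait.sumstats.gz', dropna=True)
    ref_ld = _read_ref_ld('baseline.')
    w_ld = _read_w_ld('weights.')
    M_annot = _read_M('baseline.', n_annot, not_M_5_50=False)
    sumstats = _merge_and_log(ref_ld, sumstats, 'reference panel LD')
    sumstats = _merge_and_log(w_ld, sumstats, 'regression SNP LD')
    return sumstats, M_annot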