gwaslab 3.4.37-py3-none-any.whl → 3.4.39-py3-none-any.whl
This diff shows the content of publicly available package versions as released to their public registries and is provided for informational purposes only.
Potentially problematic release.
This version of gwaslab might be problematic.
- gwaslab/bd_common_data.py +6 -3
- gwaslab/bd_download.py +9 -9
- gwaslab/bd_get_hapmap3.py +43 -9
- gwaslab/data/formatbook.json +722 -721
- gwaslab/g_Log.py +22 -5
- gwaslab/g_Sumstats.py +110 -163
- gwaslab/g_SumstatsPair.py +76 -25
- gwaslab/g_SumstatsT.py +2 -2
- gwaslab/g_Sumstats_summary.py +3 -3
- gwaslab/g_version.py +10 -10
- gwaslab/hm_casting.py +36 -17
- gwaslab/hm_harmonize_sumstats.py +354 -221
- gwaslab/hm_rsid_to_chrpos.py +1 -1
- gwaslab/io_preformat_input.py +49 -43
- gwaslab/io_read_ldsc.py +49 -1
- gwaslab/io_to_formats.py +428 -295
- gwaslab/ldsc_irwls.py +198 -0
- gwaslab/ldsc_jackknife.py +514 -0
- gwaslab/ldsc_ldscore.py +417 -0
- gwaslab/ldsc_parse.py +294 -0
- gwaslab/ldsc_regressions.py +747 -0
- gwaslab/ldsc_sumstats.py +629 -0
- gwaslab/qc_check_datatype.py +3 -3
- gwaslab/qc_fix_sumstats.py +891 -778
- gwaslab/util_ex_calculate_ldmatrix.py +31 -13
- gwaslab/util_ex_gwascatalog.py +25 -25
- gwaslab/util_ex_ldproxyfinder.py +10 -10
- gwaslab/util_ex_ldsc.py +189 -0
- gwaslab/util_ex_process_ref.py +3 -3
- gwaslab/util_ex_run_coloc.py +26 -4
- gwaslab/util_in_calculate_gc.py +6 -6
- gwaslab/util_in_calculate_power.py +42 -43
- gwaslab/util_in_convert_h2.py +8 -8
- gwaslab/util_in_fill_data.py +30 -30
- gwaslab/util_in_filter_value.py +201 -74
- gwaslab/util_in_get_density.py +10 -10
- gwaslab/util_in_get_sig.py +445 -71
- gwaslab/viz_aux_annotate_plot.py +12 -12
- gwaslab/viz_aux_quickfix.py +42 -37
- gwaslab/viz_aux_reposition_text.py +10 -7
- gwaslab/viz_aux_save_figure.py +18 -8
- gwaslab/viz_plot_compare_af.py +32 -33
- gwaslab/viz_plot_compare_effect.py +63 -71
- gwaslab/viz_plot_miamiplot2.py +34 -26
- gwaslab/viz_plot_mqqplot.py +126 -75
- gwaslab/viz_plot_qqplot.py +11 -8
- gwaslab/viz_plot_regionalplot.py +36 -33
- gwaslab/viz_plot_rg_heatmap.py +28 -26
- gwaslab/viz_plot_stackedregional.py +40 -21
- gwaslab/viz_plot_trumpetplot.py +65 -61
- gwaslab-3.4.39.dist-info/LICENSE +674 -0
- {gwaslab-3.4.37.dist-info → gwaslab-3.4.39.dist-info}/METADATA +5 -4
- gwaslab-3.4.39.dist-info/RECORD +80 -0
- gwaslab-3.4.37.dist-info/RECORD +0 -72
- /gwaslab-3.4.37.dist-info/LICENSE → /gwaslab-3.4.39.dist-info/LICENSE_before_v3.4.39 +0 -0
- {gwaslab-3.4.37.dist-info → gwaslab-3.4.39.dist-info}/WHEEL +0 -0
- {gwaslab-3.4.37.dist-info → gwaslab-3.4.39.dist-info}/top_level.txt +0 -0
gwaslab/ldsc_parse.py
ADDED
@@ -0,0 +1,294 @@
+'''
+(c) 2014 Brendan Bulik-Sullivan and Hilary Finucane
+
+This module contains functions for parsing various ldsc-defined file formats.
+
+'''
+
+from __future__ import division
+import numpy as np
+import pandas as pd
+import os
+import glob
+
+def xrange(*args):
+    return range(*args)
+
+def series_eq(x, y):
+    '''Compare series, return False if lengths not equal.'''
+    return len(x) == len(y) and (x == y).all()
+
+
+def read_csv(fh, **kwargs):
+    return pd.read_csv(fh, sep='\s+', na_values='.', **kwargs)
+
+
+def sub_chr(s, chrom):
+    '''Substitute chr for @, else append chr to the end of str.'''
+    if '@' not in s:
+        s += '@'
+
+    return s.replace('@', str(chrom))
+
+
+def get_present_chrs(fh, num):
+    '''Checks which chromosomes exist, assuming that the file base will be appended by a dot in any suffix.'''
+    chrs = []
+    for chrom in xrange(1,num):
+        if glob.glob(sub_chr(fh, chrom) + '.*'):
+            chrs.append(chrom)
+    return chrs
+
+
+def which_compression(fh):
+    '''Given a file prefix, figure out what sort of compression to use.'''
+    if os.access(fh + '.bz2', 4):
+        suffix = '.bz2'
+        compression = 'bz2'
+    elif os.access(fh + '.gz', 4):
+        suffix = '.gz'
+        compression = 'gzip'
+    elif os.access(fh, 4):
+        suffix = ''
+        compression = None
+    else:
+        raise IOError('Could not open {F}[./gz/bz2]'.format(F=fh))
+
+    return suffix, compression
+
+
+def get_compression(fh):
+    '''Which sort of compression should we use with read_csv?'''
+    if fh.endswith('gz'):
+        compression = 'gzip'
+    elif fh.endswith('bz2'):
+        compression = 'bz2'
+    else:
+        compression = None
+
+    return compression
+
+
+def read_cts(fh, match_snps):
+    '''Reads files for --cts-bin.'''
+    compression = get_compression(fh)
+    cts = read_csv(fh, compression=compression, header=None, names=['SNP', 'ANNOT'])
+    if not series_eq(cts.SNP, match_snps):
+        raise ValueError('--cts-bin and the .bim file must have identical SNP columns.')
+
+    return cts.ANNOT.values
+
+
+def sumstats(fh, alleles=False, dropna=True):
+    '''Parses .sumstats files. See docs/file_formats_sumstats.txt.'''
+    dtype_dict = {'SNP': str, 'Z': float, 'N': float, 'A1': str, 'A2': str}
+    compression = get_compression(fh)
+    usecols = ['SNP', 'Z', 'N']
+    if alleles:
+        usecols += ['A1', 'A2']
+
+    try:
+        x = read_csv(fh, usecols=usecols, dtype=dtype_dict, compression=compression)
+    except (AttributeError, ValueError) as e:
+        raise ValueError('Improperly formatted sumstats file: ' + str(e.args))
+
+    if dropna:
+        x = x.dropna(how='any')
+
+    return x
+
+
+def ldscore_fromlist(flist, num=None):
+    '''Sideways concatenation of a list of LD Score files.'''
+    ldscore_array = []
+    for i, fh in enumerate(flist):
+        y = ldscore(fh, num)
+        if i > 0:
+            if not series_eq(y.SNP, ldscore_array[0].SNP):
+                raise ValueError('LD Scores for concatenation must have identical SNP columns.')
+            else:  # keep SNP column from only the first file
+                y = y.drop(['SNP'], axis=1)
+
+        new_col_dict = {c: c + '_' + str(i) for c in y.columns if c != 'SNP'}
+        y.rename(columns=new_col_dict, inplace=True)
+        ldscore_array.append(y)
+
+    return pd.concat(ldscore_array, axis=1)
+
+
+def l2_parser(fh, compression):
+    '''Parse LD Score files'''
+    x = read_csv(fh, header=0, compression=compression)
+    if 'MAF' in x.columns and 'CM' in x.columns:  # for backwards compatibility w/ v<1.0.0
+        x = x.drop(['MAF', 'CM'], axis=1)
+    return x
+
+
+def annot_parser(fh, compression, frqfile_full=None, compression_frq=None):
+    '''Parse annot files'''
+    df_annot = read_csv(fh, header=0, compression=compression).drop(['SNP','CHR', 'BP', 'CM'], axis=1, errors='ignore').astype(float)
+    if frqfile_full is not None:
+        df_frq = frq_parser(frqfile_full, compression_frq)
+        df_annot = df_annot[(.95 > df_frq.FRQ) & (df_frq.FRQ > 0.05)]
+    return df_annot
+
+
+def frq_parser(fh, compression):
+    '''Parse frequency files.'''
+    df = read_csv(fh, header=0, compression=compression)
+    if 'MAF' in df.columns:
+        df.rename(columns={'MAF': 'FRQ'}, inplace=True)
+    return df[['SNP', 'FRQ']]
+
+
+def ldscore(fh, num=None):
+    '''Parse .l2.ldscore files, split across num chromosomes. See docs/file_formats_ld.txt.'''
+    suffix = '.l2.ldscore'
+    if num is not None:  # num files, e.g., one per chromosome
+        chrs = get_present_chrs(fh, num+1)
+        first_fh = sub_chr(fh, chrs[0]) + suffix
+        s, compression = which_compression(first_fh)
+        chr_ld = [l2_parser(sub_chr(fh, i) + suffix + s, compression) for i in chrs]
+        x = pd.concat(chr_ld)  # automatically sorted by chromosome
+    else:  # just one file
+        s, compression = which_compression(fh + suffix)
+        x = l2_parser(fh + suffix + s, compression)
+
+    x = x.sort_values(by=['CHR', 'BP'])  # SEs will be wrong unless sorted
+    x = x.drop(['CHR', 'BP'], axis=1).drop_duplicates(subset='SNP')
+    return x
+
+
+def M(fh, num=None, N=2, common=False):
+    '''Parses .l{N}.M files, split across num chromosomes. See docs/file_formats_ld.txt.'''
+    parsefunc = lambda y: [float(z) for z in open(y, 'r').readline().split()]
+    suffix = '.l' + str(N) + '.M'
+    if common:
+        suffix += '_5_50'
+
+    if num is not None:
+        x = np.sum([parsefunc(sub_chr(fh, i) + suffix) for i in get_present_chrs(fh, num+1)], axis=0)
+    else:
+        x = parsefunc(fh + suffix)
+
+    return np.array(x).reshape((1, len(x)))
+
+
+def M_fromlist(flist, num=None, N=2, common=False):
+    '''Read a list of .M* files and concatenate sideways.'''
+    return np.hstack([M(fh, num, N, common) for fh in flist])
+
+
+def annot(fh_list, num=None, frqfile=None):
+    '''
+    Parses .annot files and returns an overlap matrix. See docs/file_formats_ld.txt.
+    If num is not None, parses .annot files split across [num] chromosomes (e.g., the
+    output of parallelizing ldsc.py --l2 across chromosomes).
+
+    '''
+    annot_suffix = ['.annot' for fh in fh_list]
+    annot_compression = []
+    if num is not None:  # 22 files, one for each chromosome
+        chrs = get_present_chrs(fh, num+1)
+        for i, fh in enumerate(fh_list):
+            first_fh = sub_chr(fh, chrs[0]) + annot_suffix[i]
+            annot_s, annot_comp_single = which_compression(first_fh)
+            annot_suffix[i] += annot_s
+            annot_compression.append(annot_comp_single)
+
+        if frqfile is not None:
+            frq_suffix = '.frq'
+            first_frqfile = sub_chr(frqfile, 1) + frq_suffix
+            frq_s, frq_compression = which_compression(first_frqfile)
+            frq_suffix += frq_s
+
+        y = []
+        M_tot = 0
+        for chrom in chrs:
+            if frqfile is not None:
+                df_annot_chr_list = [annot_parser(sub_chr(fh, chrom) + annot_suffix[i], annot_compression[i],
+                                                  sub_chr(frqfile, chrom) + frq_suffix, frq_compression)
+                                     for i, fh in enumerate(fh_list)]
+            else:
+                df_annot_chr_list = [annot_parser(sub_chr(fh, chrom) + annot_suffix[i], annot_compression[i])
+                                     for i, fh in enumerate(fh_list)]
+
+            annot_matrix_chr_list = [np.matrix(df_annot_chr) for df_annot_chr in df_annot_chr_list]
+            annot_matrix_chr = np.hstack(annot_matrix_chr_list)
+            y.append(np.dot(annot_matrix_chr.T, annot_matrix_chr))
+            M_tot += len(df_annot_chr_list[0])
+
+        x = sum(y)
+    else:  # just one file
+        for i, fh in enumerate(fh_list):
+            annot_s, annot_comp_single = which_compression(fh + annot_suffix[i])
+            annot_suffix[i] += annot_s
+            annot_compression.append(annot_comp_single)
+
+        if frqfile is not None:
+            frq_suffix = '.frq'
+            frq_s, frq_compression = which_compression(frqfile + frq_suffix)
+            frq_suffix += frq_s
+
+            df_annot_list = [annot_parser(fh + annot_suffix[i], annot_compression[i],
+                                          frqfile + frq_suffix, frq_compression) for i, fh in enumerate(fh_list)]
+
+        else:
+            df_annot_list = [annot_parser(fh + annot_suffix[i], annot_compression[i])
+                             for i, fh in enumerate(fh_list)]
+
+        annot_matrix_list = [np.matrix(y) for y in df_annot_list]
+        annot_matrix = np.hstack(annot_matrix_list)
+        x = np.dot(annot_matrix.T, annot_matrix)
+        M_tot = len(df_annot_list[0])
+
+    return x, M_tot
+
+
+def __ID_List_Factory__(colnames, keepcol, fname_end, header=None, usecols=None):
+
+    class IDContainer(object):
+
+        def __init__(self, fname):
+            self.__usecols__ = usecols
+            self.__colnames__ = colnames
+            self.__keepcol__ = keepcol
+            self.__fname_end__ = fname_end
+            self.__header__ = header
+            self.__read__(fname)
+            self.n = len(self.df)
+
+        def __read__(self, fname):
+            end = self.__fname_end__
+            if end and not fname.endswith(end):
+                raise ValueError('{f} filename must end in {f}'.format(f=end))
+
+            comp = get_compression(fname)
+            self.df = pd.read_csv(fname, header=self.__header__, usecols=self.__usecols__,
+                                  sep='\s+', compression=comp)
+
+            if self.__colnames__:
+                self.df.columns = self.__colnames__
+
+            if self.__keepcol__ is not None:
+                self.IDList = self.df.iloc[:, [self.__keepcol__]].astype('object')
+
+        def loj(self, externalDf):
+            '''Returns indices of those elements of self.IDList that appear in exernalDf.'''
+            r = externalDf.columns[0]
+            l = self.IDList.columns[0]
+            merge_df = externalDf.iloc[:, [0]]
+            merge_df['keep'] = True
+            z = pd.merge(self.IDList, merge_df, how='left', left_on=l, right_on=r,
+                         sort=False)
+            ii = z['keep'] == True
+            return np.nonzero(ii)[0]
+
+    return IDContainer
+
+
+PlinkBIMFile = __ID_List_Factory__(['CHR', 'SNP', 'CM', 'BP', 'A1', 'A2'], 1, '.bim', usecols=[0, 1, 2, 3, 4, 5])
+PlinkFAMFile = __ID_List_Factory__(['IID'], 0, '.fam', usecols=[1])
+FilterFile = __ID_List_Factory__(['ID'], 0, None, usecols=[0])
+AnnotFile = __ID_List_Factory__(None, 2, None, header=0, usecols=None)
+ThinAnnotFile = __ID_List_Factory__(None, None, None, header=0, usecols=None)
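For orientation, the sketch below shows how the parsers in this new module might be called. It is a minimal example rather than gwaslab's own workflow: it assumes the module is importable as gwaslab.ldsc_parse, and the file paths (eur_w_ld_chr/@, trait1.sumstats.gz, ref_panel_chr22.bim) are placeholders for whatever LD Score, munged sumstats, and PLINK files are actually on disk.

# Minimal usage sketch (assumed import path, placeholder file names).
from gwaslab import ldsc_parse as ps

# sub_chr() substitutes '@' with each chromosome number, so one pattern covers
# per-chromosome files such as eur_w_ld_chr/1.l2.ldscore.gz ... 22.l2.ldscore.gz.
ref_ld = ps.ldscore_fromlist(['eur_w_ld_chr/@'], num=22)           # SNP column plus LD Score columns
m_5_50 = ps.M_fromlist(['eur_w_ld_chr/@'], num=22, common=True)    # SNP counts from the .l2.M_5_50 files

# Munged summary statistics with SNP, Z, N (plus A1/A2 when alleles=True).
ss = ps.sumstats('trait1.sumstats.gz', alleles=True, dropna=True)

# PLINK .bim parsing via the IDContainer class produced by __ID_List_Factory__.
bim = ps.PlinkBIMFile('ref_panel_chr22.bim')
print(bim.n, ref_ld.shape, m_5_50, ss.head())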