gwaslab 3.4.38__py3-none-any.whl → 3.4.39__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of gwaslab might be problematic. Click here for more details.

Files changed (51) hide show
  1. gwaslab/bd_common_data.py +6 -3
  2. gwaslab/bd_download.py +9 -9
  3. gwaslab/bd_get_hapmap3.py +43 -9
  4. gwaslab/g_Log.py +14 -5
  5. gwaslab/g_Sumstats.py +86 -18
  6. gwaslab/g_SumstatsPair.py +70 -23
  7. gwaslab/g_SumstatsT.py +2 -2
  8. gwaslab/g_version.py +10 -10
  9. gwaslab/hm_casting.py +9 -4
  10. gwaslab/hm_harmonize_sumstats.py +88 -83
  11. gwaslab/io_preformat_input.py +14 -14
  12. gwaslab/io_read_ldsc.py +49 -1
  13. gwaslab/ldsc_irwls.py +198 -0
  14. gwaslab/ldsc_jackknife.py +514 -0
  15. gwaslab/ldsc_ldscore.py +417 -0
  16. gwaslab/ldsc_parse.py +294 -0
  17. gwaslab/ldsc_regressions.py +747 -0
  18. gwaslab/ldsc_sumstats.py +629 -0
  19. gwaslab/qc_check_datatype.py +1 -1
  20. gwaslab/qc_fix_sumstats.py +163 -161
  21. gwaslab/util_ex_calculate_ldmatrix.py +2 -2
  22. gwaslab/util_ex_gwascatalog.py +24 -24
  23. gwaslab/util_ex_ldproxyfinder.py +9 -9
  24. gwaslab/util_ex_ldsc.py +189 -0
  25. gwaslab/util_in_calculate_gc.py +6 -6
  26. gwaslab/util_in_calculate_power.py +42 -43
  27. gwaslab/util_in_convert_h2.py +8 -8
  28. gwaslab/util_in_fill_data.py +28 -28
  29. gwaslab/util_in_filter_value.py +91 -52
  30. gwaslab/util_in_get_density.py +8 -8
  31. gwaslab/util_in_get_sig.py +407 -65
  32. gwaslab/viz_aux_annotate_plot.py +12 -12
  33. gwaslab/viz_aux_quickfix.py +18 -18
  34. gwaslab/viz_aux_reposition_text.py +3 -3
  35. gwaslab/viz_aux_save_figure.py +14 -5
  36. gwaslab/viz_plot_compare_af.py +29 -30
  37. gwaslab/viz_plot_compare_effect.py +63 -71
  38. gwaslab/viz_plot_miamiplot2.py +6 -6
  39. gwaslab/viz_plot_mqqplot.py +17 -3
  40. gwaslab/viz_plot_qqplot.py +1 -1
  41. gwaslab/viz_plot_regionalplot.py +33 -32
  42. gwaslab/viz_plot_rg_heatmap.py +28 -26
  43. gwaslab/viz_plot_stackedregional.py +40 -21
  44. gwaslab/viz_plot_trumpetplot.py +50 -55
  45. gwaslab-3.4.39.dist-info/LICENSE +674 -0
  46. {gwaslab-3.4.38.dist-info → gwaslab-3.4.39.dist-info}/METADATA +4 -3
  47. gwaslab-3.4.39.dist-info/RECORD +80 -0
  48. gwaslab-3.4.38.dist-info/RECORD +0 -72
  49. /gwaslab-3.4.38.dist-info/LICENSE → /gwaslab-3.4.39.dist-info/LICENSE_before_v3.4.39 +0 -0
  50. {gwaslab-3.4.38.dist-info → gwaslab-3.4.39.dist-info}/WHEEL +0 -0
  51. {gwaslab-3.4.38.dist-info → gwaslab-3.4.39.dist-info}/top_level.txt +0 -0
gwaslab/ldsc_parse.py ADDED
@@ -0,0 +1,294 @@
1
+ '''
2
+ (c) 2014 Brendan Bulik-Sullivan and Hilary Finucane
3
+
4
+ This module contains functions for parsing various ldsc-defined file formats.
5
+
6
+ '''
7
+
8
+ from __future__ import division
9
+ import numpy as np
10
+ import pandas as pd
11
+ import os
12
+ import glob
13
+
14
def xrange(*args):
    """Python-2 compatibility shim: forwards all arguments to builtin range()."""
    return range(*args)
16
+
17
def series_eq(x, y):
    """Return True iff x and y have the same length and are element-wise equal."""
    if len(x) != len(y):
        return False
    return (x == y).all()
20
+
21
+
22
def read_csv(fh, **kwargs):
    """Whitespace-delimited pd.read_csv wrapper that treats '.' as NA.

    fh may be a path or a file-like object; extra kwargs pass through
    to pd.read_csv unchanged.
    """
    # BUG FIX: '\s' in a non-raw string is an invalid escape sequence
    # (SyntaxWarning in Python >= 3.12); use a raw string for the regex sep.
    return pd.read_csv(fh, sep=r'\s+', na_values='.', **kwargs)
24
+
25
+
26
def sub_chr(s, chrom):
    """Substitute chrom for '@' in s; if s has no '@', append chrom instead."""
    template = s if '@' in s else s + '@'
    return template.replace('@', str(chrom))
32
+
33
+
34
def get_present_chrs(fh, num):
    """Return chromosomes in [1, num) whose files exist for this prefix.

    A chromosome counts as present when any file matches the substituted
    prefix followed by a dot and any suffix.
    """
    return [chrom for chrom in range(1, num)
            if glob.glob(sub_chr(fh, chrom) + '.*')]
41
+
42
+
43
def which_compression(fh):
    """Given a file prefix, detect which compressed variant is readable.

    Checks .bz2 first, then .gz, then the bare path, and returns the
    (suffix, compression) pair to use with pd.read_csv. Raises IOError
    when none of the three candidates is readable.
    """
    candidates = (('.bz2', 'bz2'), ('.gz', 'gzip'), ('', None))
    for suffix, compression in candidates:
        if os.access(fh + suffix, 4):  # 4 == os.R_OK
            return suffix, compression
    raise IOError('Could not open {F}[./gz/bz2]'.format(F=fh))
58
+
59
+
60
def get_compression(fh):
    """Infer the pd.read_csv compression argument from a filename extension."""
    if fh.endswith('gz'):
        return 'gzip'
    if fh.endswith('bz2'):
        return 'bz2'
    return None
70
+
71
+
72
def read_cts(fh, match_snps):
    """Read a --cts-bin annotation file and return its ANNOT column values.

    Raises ValueError unless the file's SNP column matches match_snps exactly.
    """
    cts = read_csv(fh, compression=get_compression(fh), header=None,
                   names=['SNP', 'ANNOT'])
    if not series_eq(cts.SNP, match_snps):
        raise ValueError('--cts-bin and the .bim file must have identical SNP columns.')
    return cts.ANNOT.values
80
+
81
+
82
def sumstats(fh, alleles=False, dropna=True):
    """Parse a .sumstats file. See docs/file_formats_sumstats.txt.

    Always loads SNP/Z/N; A1/A2 are added when alleles is True. Rows with
    any missing value are dropped when dropna is True. Raises ValueError
    on malformed input.
    """
    dtype_dict = {'SNP': str, 'Z': float, 'N': float, 'A1': str, 'A2': str}
    compression = get_compression(fh)
    usecols = ['SNP', 'Z', 'N'] + (['A1', 'A2'] if alleles else [])
    try:
        x = read_csv(fh, usecols=usecols, dtype=dtype_dict, compression=compression)
    except (AttributeError, ValueError) as e:
        raise ValueError('Improperly formatted sumstats file: ' + str(e.args))
    return x.dropna(how='any') if dropna else x
99
+
100
+
101
def ldscore_fromlist(flist, num=None):
    """Sideways concatenation of a list of LD Score files.

    The SNP column is kept only from the first file; every later file must
    have an identical SNP column or ValueError is raised. Non-SNP columns
    get a '_<i>' suffix so names stay unique across files.
    """
    pieces = []
    for idx, path in enumerate(flist):
        scores = ldscore(path, num)
        if idx > 0:
            if not series_eq(scores.SNP, pieces[0].SNP):
                raise ValueError('LD Scores for concatenation must have identical SNP columns.')
            scores = scores.drop(['SNP'], axis=1)
        renamed = {col: col + '_' + str(idx) for col in scores.columns if col != 'SNP'}
        scores.rename(columns=renamed, inplace=True)
        pieces.append(scores)
    return pd.concat(pieces, axis=1)
117
+
118
+
119
def l2_parser(fh, compression):
    """Parse one LD Score file; drop legacy MAF/CM columns when both exist."""
    scores = read_csv(fh, header=0, compression=compression)
    # Backwards compatibility w/ v<1.0.0 output, which carried MAF and CM.
    if {'MAF', 'CM'}.issubset(scores.columns):
        scores = scores.drop(['MAF', 'CM'], axis=1)
    return scores
125
+
126
+
127
def annot_parser(fh, compression, frqfile_full=None, compression_frq=None):
    """Parse an annot file, optionally restricting to common SNPs.

    When frqfile_full is given, rows are kept only where 0.05 < FRQ < 0.95.
    NOTE(review): the frequency filter aligns by row position, which assumes
    both files list SNPs in the same order — confirm upstream.
    """
    raw = read_csv(fh, header=0, compression=compression)
    df_annot = raw.drop(['SNP', 'CHR', 'BP', 'CM'], axis=1, errors='ignore').astype(float)
    if frqfile_full is not None:
        df_frq = frq_parser(frqfile_full, compression_frq)
        common = (df_frq.FRQ < .95) & (df_frq.FRQ > 0.05)
        df_annot = df_annot[common]
    return df_annot
134
+
135
+
136
def frq_parser(fh, compression):
    """Parse a frequency file, normalizing the MAF column name to FRQ."""
    freq = read_csv(fh, header=0, compression=compression)
    if 'MAF' in freq.columns:
        freq = freq.rename(columns={'MAF': 'FRQ'})
    return freq[['SNP', 'FRQ']]
142
+
143
+
144
def ldscore(fh, num=None):
    """Parse .l2.ldscore files, optionally split across num chromosomes.

    Returns a DataFrame sorted by (CHR, BP) with CHR/BP dropped and duplicate
    SNPs removed. See docs/file_formats_ld.txt.
    """
    suffix = '.l2.ldscore'
    if num is None:
        # Single file: detect compression on the one prefix.
        s, compression = which_compression(fh + suffix)
        x = l2_parser(fh + suffix + s, compression)
    else:
        # One file per chromosome; compression detected from the first present.
        chrs = get_present_chrs(fh, num + 1)
        first_fh = sub_chr(fh, chrs[0]) + suffix
        s, compression = which_compression(first_fh)
        per_chr = [l2_parser(sub_chr(fh, c) + suffix + s, compression) for c in chrs]
        x = pd.concat(per_chr)  # automatically sorted by chromosome
    x = x.sort_values(by=['CHR', 'BP'])  # SEs will be wrong unless sorted
    return x.drop(['CHR', 'BP'], axis=1).drop_duplicates(subset='SNP')
160
+
161
+
162
def M(fh, num=None, N=2, common=False):
    """Parse .l{N}.M files, optionally split across num chromosomes.

    Each file holds one whitespace-separated row of floats; per-chromosome
    files are summed element-wise. Returns a (1, n_annot) numpy array.
    See docs/file_formats_ld.txt.
    """
    def _parse_one(path):
        # BUG FIX: the original used open(...).readline() and leaked the
        # file handle; 'with' closes it deterministically.
        with open(path, 'r') as handle:
            return [float(z) for z in handle.readline().split()]

    suffix = '.l' + str(N) + '.M'
    if common:
        suffix += '_5_50'

    if num is not None:
        x = np.sum([_parse_one(sub_chr(fh, i) + suffix)
                    for i in get_present_chrs(fh, num + 1)], axis=0)
    else:
        x = _parse_one(fh + suffix)

    return np.array(x).reshape((1, len(x)))
175
+
176
+
177
def M_fromlist(flist, num=None, N=2, common=False):
    """Read several .M* files and concatenate their rows side by side."""
    blocks = [M(path, num, N, common) for path in flist]
    return np.hstack(blocks)
180
+
181
+
182
def annot(fh_list, num=None, frqfile=None):
    """Parse .annot files and return (overlap_matrix, M_tot).

    See docs/file_formats_ld.txt. If num is not None, parses .annot files
    split across [num] chromosomes (e.g., the output of parallelizing
    ldsc.py --l2 across chromosomes). The overlap matrix is A^T A over the
    horizontally-stacked annotation matrices; M_tot counts contributing SNPs.
    When frqfile is given, SNPs are filtered to 0.05 < FRQ < 0.95
    (see annot_parser).
    """
    annot_suffix = ['.annot' for _ in fh_list]
    annot_compression = []
    if num is not None:  # one file per chromosome
        # BUG FIX: the original read `get_present_chrs(fh, num+1)` here,
        # referencing `fh` before assignment (NameError); the chromosome
        # probe uses the first annotation prefix.
        chrs = get_present_chrs(fh_list[0], num + 1)
        for i, fh in enumerate(fh_list):
            first_fh = sub_chr(fh, chrs[0]) + annot_suffix[i]
            annot_s, annot_comp_single = which_compression(first_fh)
            annot_suffix[i] += annot_s
            annot_compression.append(annot_comp_single)

        if frqfile is not None:
            frq_suffix = '.frq'
            first_frqfile = sub_chr(frqfile, 1) + frq_suffix
            frq_s, frq_compression = which_compression(first_frqfile)
            frq_suffix += frq_s

        y = []
        M_tot = 0
        for chrom in chrs:
            if frqfile is not None:
                df_annot_chr_list = [annot_parser(sub_chr(fh, chrom) + annot_suffix[i], annot_compression[i],
                                                  sub_chr(frqfile, chrom) + frq_suffix, frq_compression)
                                     for i, fh in enumerate(fh_list)]
            else:
                df_annot_chr_list = [annot_parser(sub_chr(fh, chrom) + annot_suffix[i], annot_compression[i])
                                     for i, fh in enumerate(fh_list)]

            # np.matrix is kept for backward-compatible return type, though
            # NumPy recommends ndarray for new code.
            annot_matrix_chr_list = [np.matrix(df_annot_chr) for df_annot_chr in df_annot_chr_list]
            annot_matrix_chr = np.hstack(annot_matrix_chr_list)
            y.append(np.dot(annot_matrix_chr.T, annot_matrix_chr))
            M_tot += len(df_annot_chr_list[0])

        x = sum(y)
    else:  # just one file per annotation
        for i, fh in enumerate(fh_list):
            annot_s, annot_comp_single = which_compression(fh + annot_suffix[i])
            annot_suffix[i] += annot_s
            annot_compression.append(annot_comp_single)

        if frqfile is not None:
            frq_suffix = '.frq'
            frq_s, frq_compression = which_compression(frqfile + frq_suffix)
            frq_suffix += frq_s

            df_annot_list = [annot_parser(fh + annot_suffix[i], annot_compression[i],
                                          frqfile + frq_suffix, frq_compression)
                             for i, fh in enumerate(fh_list)]
        else:
            df_annot_list = [annot_parser(fh + annot_suffix[i], annot_compression[i])
                             for i, fh in enumerate(fh_list)]

        annot_matrix_list = [np.matrix(df_annot) for df_annot in df_annot_list]
        annot_matrix = np.hstack(annot_matrix_list)
        x = np.dot(annot_matrix.T, annot_matrix)
        M_tot = len(df_annot_list[0])

    return x, M_tot
246
+
247
+
248
+ def __ID_List_Factory__(colnames, keepcol, fname_end, header=None, usecols=None):
249
+
250
+ class IDContainer(object):
251
+
252
+ def __init__(self, fname):
253
+ self.__usecols__ = usecols
254
+ self.__colnames__ = colnames
255
+ self.__keepcol__ = keepcol
256
+ self.__fname_end__ = fname_end
257
+ self.__header__ = header
258
+ self.__read__(fname)
259
+ self.n = len(self.df)
260
+
261
+ def __read__(self, fname):
262
+ end = self.__fname_end__
263
+ if end and not fname.endswith(end):
264
+ raise ValueError('{f} filename must end in {f}'.format(f=end))
265
+
266
+ comp = get_compression(fname)
267
+ self.df = pd.read_csv(fname, header=self.__header__, usecols=self.__usecols__,
268
+ sep='\s+', compression=comp)
269
+
270
+ if self.__colnames__:
271
+ self.df.columns = self.__colnames__
272
+
273
+ if self.__keepcol__ is not None:
274
+ self.IDList = self.df.iloc[:, [self.__keepcol__]].astype('object')
275
+
276
+ def loj(self, externalDf):
277
+ '''Returns indices of those elements of self.IDList that appear in exernalDf.'''
278
+ r = externalDf.columns[0]
279
+ l = self.IDList.columns[0]
280
+ merge_df = externalDf.iloc[:, [0]]
281
+ merge_df['keep'] = True
282
+ z = pd.merge(self.IDList, merge_df, how='left', left_on=l, right_on=r,
283
+ sort=False)
284
+ ii = z['keep'] == True
285
+ return np.nonzero(ii)[0]
286
+
287
+ return IDContainer
288
+
289
+
290
# Prebuilt ID-list container classes for the standard PLINK / ldsc file types.
PlinkBIMFile = __ID_List_Factory__(['CHR', 'SNP', 'CM', 'BP', 'A1', 'A2'], 1, '.bim', usecols=[0, 1, 2, 3, 4, 5])  # variant table; IDList holds SNP ids (column 1)
PlinkFAMFile = __ID_List_Factory__(['IID'], 0, '.fam', usecols=[1])  # sample table; IDList holds individual ids
FilterFile = __ID_List_Factory__(['ID'], 0, None, usecols=[0])  # generic one-column ID filter (any filename)
AnnotFile = __ID_List_Factory__(None, 2, None, header=0, usecols=None)  # annot file with header row; IDList is column 2
ThinAnnotFile = __ID_List_Factory__(None, None, None, header=0, usecols=None)  # thin annot: header row, no IDList column kept