gwaslab 3.4.37__py3-none-any.whl → 3.4.39__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of gwaslab might be problematic. Click here for more details.
- gwaslab/bd_common_data.py +6 -3
- gwaslab/bd_download.py +9 -9
- gwaslab/bd_get_hapmap3.py +43 -9
- gwaslab/data/formatbook.json +722 -721
- gwaslab/g_Log.py +22 -5
- gwaslab/g_Sumstats.py +110 -163
- gwaslab/g_SumstatsPair.py +76 -25
- gwaslab/g_SumstatsT.py +2 -2
- gwaslab/g_Sumstats_summary.py +3 -3
- gwaslab/g_version.py +10 -10
- gwaslab/hm_casting.py +36 -17
- gwaslab/hm_harmonize_sumstats.py +354 -221
- gwaslab/hm_rsid_to_chrpos.py +1 -1
- gwaslab/io_preformat_input.py +49 -43
- gwaslab/io_read_ldsc.py +49 -1
- gwaslab/io_to_formats.py +428 -295
- gwaslab/ldsc_irwls.py +198 -0
- gwaslab/ldsc_jackknife.py +514 -0
- gwaslab/ldsc_ldscore.py +417 -0
- gwaslab/ldsc_parse.py +294 -0
- gwaslab/ldsc_regressions.py +747 -0
- gwaslab/ldsc_sumstats.py +629 -0
- gwaslab/qc_check_datatype.py +3 -3
- gwaslab/qc_fix_sumstats.py +891 -778
- gwaslab/util_ex_calculate_ldmatrix.py +31 -13
- gwaslab/util_ex_gwascatalog.py +25 -25
- gwaslab/util_ex_ldproxyfinder.py +10 -10
- gwaslab/util_ex_ldsc.py +189 -0
- gwaslab/util_ex_process_ref.py +3 -3
- gwaslab/util_ex_run_coloc.py +26 -4
- gwaslab/util_in_calculate_gc.py +6 -6
- gwaslab/util_in_calculate_power.py +42 -43
- gwaslab/util_in_convert_h2.py +8 -8
- gwaslab/util_in_fill_data.py +30 -30
- gwaslab/util_in_filter_value.py +201 -74
- gwaslab/util_in_get_density.py +10 -10
- gwaslab/util_in_get_sig.py +445 -71
- gwaslab/viz_aux_annotate_plot.py +12 -12
- gwaslab/viz_aux_quickfix.py +42 -37
- gwaslab/viz_aux_reposition_text.py +10 -7
- gwaslab/viz_aux_save_figure.py +18 -8
- gwaslab/viz_plot_compare_af.py +32 -33
- gwaslab/viz_plot_compare_effect.py +63 -71
- gwaslab/viz_plot_miamiplot2.py +34 -26
- gwaslab/viz_plot_mqqplot.py +126 -75
- gwaslab/viz_plot_qqplot.py +11 -8
- gwaslab/viz_plot_regionalplot.py +36 -33
- gwaslab/viz_plot_rg_heatmap.py +28 -26
- gwaslab/viz_plot_stackedregional.py +40 -21
- gwaslab/viz_plot_trumpetplot.py +65 -61
- gwaslab-3.4.39.dist-info/LICENSE +674 -0
- {gwaslab-3.4.37.dist-info → gwaslab-3.4.39.dist-info}/METADATA +5 -4
- gwaslab-3.4.39.dist-info/RECORD +80 -0
- gwaslab-3.4.37.dist-info/RECORD +0 -72
- /gwaslab-3.4.37.dist-info/LICENSE → /gwaslab-3.4.39.dist-info/LICENSE_before_v3.4.39 +0 -0
- {gwaslab-3.4.37.dist-info → gwaslab-3.4.39.dist-info}/WHEEL +0 -0
- {gwaslab-3.4.37.dist-info → gwaslab-3.4.39.dist-info}/top_level.txt +0 -0
gwaslab/util_in_filter_value.py
CHANGED
|
@@ -8,65 +8,68 @@ from gwaslab.bd_common_data import get_chr_to_number
|
|
|
8
8
|
from gwaslab.g_Log import Log
|
|
9
9
|
from gwaslab.g_vchange_status import vchange_status
|
|
10
10
|
from gwaslab.qc_fix_sumstats import sortcoordinate
|
|
11
|
+
from gwaslab.qc_fix_sumstats import start_to
|
|
12
|
+
from gwaslab.qc_fix_sumstats import finished
|
|
13
|
+
from gwaslab.hm_harmonize_sumstats import is_palindromic
|
|
11
14
|
|
|
12
15
|
import gc
|
|
13
16
|
def filtervalues(sumstats,expr,remove=False,verbose=True,log=Log()):
    """Keep only the rows of ``sumstats`` that satisfy a query expression.

    Parameters
    ----------
    sumstats : table supporting ``.query`` (a pandas DataFrame is expected)
        Data to filter.
    expr : str
        Expression passed to ``sumstats.query`` using the python engine.
    remove : bool
        Accepted for interface compatibility; not used by this function.
    verbose : bool
        Forwarded to every ``log.write`` call.
    log : Log
        Logger that receives the progress messages.

    Returns
    -------
    A copy of the rows for which ``expr`` holds.
    """
    log.write("Start filtering values by condition:",expr, verbose=verbose)
    n_before = len(sumstats)
    matching = sumstats.query(expr,engine='python')
    sumstats = matching.copy()
    n_after = len(sumstats)
    n_dropped = n_before - n_after
    log.write(" -Removing "+ str(n_dropped) +" variants not meeting the conditions:",expr, verbose=verbose)
    log.write("Finished filtering values.", verbose=verbose)
    # Free the intermediate frames as early as possible.
    gc.collect()
    return sumstats
|
|
22
25
|
|
|
23
26
|
def filterout(sumstats,interval={},lt={},gt={},eq={},remove=False,verbose=True,log=Log()):
    """Remove rows whose column values exceed, fall below, or equal given thresholds.

    Parameters
    ----------
    sumstats : pandas DataFrame (expected)
        Data to filter.
    interval : dict
        Accepted for interface compatibility; not used by this function.
    lt : dict
        ``{column: threshold}``; rows with ``column < threshold`` are removed.
    gt : dict
        ``{column: threshold}``; rows with ``column > threshold`` are removed.
    eq : dict
        ``{column: threshold}``; rows with ``column == threshold`` are removed.
    remove : bool
        Accepted for interface compatibility; not used by this function.
    verbose : bool
        Forwarded to every ``log.write`` call.
    log : Log
        Logger that receives the progress messages.

    Returns
    -------
    A copy of the remaining rows.

    Notes
    -----
    Rows whose value is exactly equal to a ``gt`` / ``lt`` threshold are kept,
    matching what the log messages announce. Rows for which the comparison
    cannot be evaluated as true (e.g. missing values) are still dropped by the
    ``gt`` / ``lt`` filters and are not included in the logged counts.
    """
    log.write("Start filtering values:", verbose=verbose)
    for key,threshold in gt.items():
        num = len(sumstats.loc[sumstats[key]>threshold,:])
        log.write(" -Removing "+ str(num) +" variants with "+key+" > "+ str(threshold)+" ...", verbose=verbose)
        # Keep everything that is not strictly greater, including the boundary value.
        sumstats = sumstats.loc[sumstats[key]<=threshold,:]
    for key,threshold in lt.items():
        num = len(sumstats.loc[sumstats[key]<threshold,:])
        log.write(" -Removing "+ str(num) +" variants with "+key+" < "+ str(threshold)+" ...", verbose=verbose)
        # Keep everything that is not strictly smaller, including the boundary value.
        sumstats = sumstats.loc[sumstats[key]>=threshold,:]
    for key,threshold in eq.items():
        num = len(sumstats.loc[sumstats[key]==threshold,:])
        log.write(" -Removing "+ str(num) +" variants with "+key+" = "+ str(threshold)+" ...", verbose=verbose)
        sumstats = sumstats.loc[sumstats[key]!=threshold,:]
    log.write("Finished filtering values.", verbose=verbose)
    gc.collect()
    return sumstats.copy()
|
|
40
43
|
|
|
41
44
|
def filterin(sumstats,lt={},gt={},eq={},remove=False,verbose=True,log=Log()):
    """Keep only rows whose column values satisfy every given threshold rule.

    Parameters
    ----------
    sumstats : pandas DataFrame (expected)
        Data to filter.
    lt : dict
        ``{column: threshold}``; only rows with ``column < threshold`` are kept.
    gt : dict
        ``{column: threshold}``; only rows with ``column > threshold`` are kept.
    eq : dict
        ``{column: threshold}``; only rows with ``column == threshold`` are kept.
    remove : bool
        Accepted for interface compatibility; not used by this function.
    verbose : bool
        Forwarded to every ``log.write`` call.
    log : Log
        Logger that receives the progress messages.

    Returns
    -------
    A copy of the rows that passed all rules. Rules are applied in the order
    ``gt``, ``lt``, ``eq``, each on the result of the previous one.
    """
    log.write("Start filtering values:", verbose=verbose)
    # (conditions, symbol used in the log line, comparison) in application order.
    rules = (
        (gt, " > ", lambda column, limit: column > limit),
        (lt, " < ", lambda column, limit: column < limit),
        (eq, " = ", lambda column, limit: column == limit),
    )
    for conditions, symbol, compare in rules:
        for key,threshold in conditions.items():
            matched = sumstats.loc[compare(sumstats[key], threshold),:]
            log.write(" -Keeping "+ str(len(matched)) +" variants with "+key+symbol+ str(threshold)+" ...", verbose=verbose)
            sumstats = matched
    log.write("Finished filtering values.", verbose=verbose)
    gc.collect()
    return sumstats.copy()
|
|
58
61
|
|
|
59
62
|
def filterregionin(sumstats,path=None, chrom="CHR",pos="POS", high_ld=False, build="19", verbose=True,log=Log()):
|
|
60
63
|
sumstats = sortcoordinate(sumstats,verbose=verbose)
|
|
61
|
-
|
|
62
|
-
|
|
64
|
+
log.write("Start to filter in variants if in intervals defined in bed files:", verbose=verbose)
|
|
65
|
+
log.write(" -Current Dataframe shape :",len(sumstats)," x ", len(sumstats.columns), verbose=verbose)
|
|
63
66
|
|
|
64
67
|
if high_ld is True:
|
|
65
68
|
path = get_high_ld(build=build)
|
|
66
|
-
|
|
69
|
+
log.write(" -Loading bed format file for hg"+build, verbose=verbose)
|
|
67
70
|
|
|
68
71
|
else:
|
|
69
|
-
|
|
72
|
+
log.write(" -Loading bed format file: " , path, verbose=verbose)
|
|
70
73
|
bed = pd.read_csv(path,sep="\s+",header=None,dtype={0:"string",1:"Int64",2:"Int64"})
|
|
71
74
|
|
|
72
75
|
bed["tuple"] = bed.apply(lambda x: (x[1],x[2]),axis=1)
|
|
@@ -78,7 +81,7 @@ def filterregionin(sumstats,path=None, chrom="CHR",pos="POS", high_ld=False, bui
|
|
|
78
81
|
sumstats = sumstats.sort_values(["CHR","POS"])
|
|
79
82
|
|
|
80
83
|
if len(bed)<100:
|
|
81
|
-
|
|
84
|
+
log.write(" -Bed file < 100 lines: using pd IntervalIndex... ", verbose=verbose)
|
|
82
85
|
for i in sumstats[chrom].unique():
|
|
83
86
|
if sum(bed[0]==i)>0:
|
|
84
87
|
interval = pd.IntervalIndex.from_tuples(bed.loc[bed[0]==i,"tuple"])
|
|
@@ -86,7 +89,7 @@ def filterregionin(sumstats,path=None, chrom="CHR",pos="POS", high_ld=False, bui
|
|
|
86
89
|
else:
|
|
87
90
|
continue
|
|
88
91
|
else:
|
|
89
|
-
|
|
92
|
+
log.write(" -Bed file > 100 lines: using two pointers, please make files are all sorted... ", verbose=verbose)
|
|
90
93
|
bed_num =0
|
|
91
94
|
bed_chr =bed.iloc[bed_num,0]
|
|
92
95
|
bed_left =bed.iloc[bed_num,1]
|
|
@@ -134,23 +137,23 @@ def filterregionin(sumstats,path=None, chrom="CHR",pos="POS", high_ld=False, bui
|
|
|
134
137
|
## in
|
|
135
138
|
|
|
136
139
|
sumstats = sumstats.loc[sumstats["bed_indicator"],:]
|
|
137
|
-
|
|
138
|
-
|
|
140
|
+
log.write(" -Number of variants in the specified regions to keep:",sum(sumstats["bed_indicator"]), verbose=verbose)
|
|
141
|
+
log.write(" -Number of variants removed:",sum(~sumstats["bed_indicator"]), verbose=verbose)
|
|
139
142
|
sumstats = sumstats.drop(columns="bed_indicator")
|
|
140
|
-
|
|
143
|
+
log.write("Finished filtering in variants.", verbose=verbose)
|
|
141
144
|
gc.collect()
|
|
142
145
|
return sumstats
|
|
143
146
|
|
|
144
147
|
def filterregionout(sumstats, path=None, chrom="CHR",pos="POS", high_ld=False, build="19", verbose=True,log=Log()):
|
|
145
148
|
sumstats = sortcoordinate(sumstats,verbose=verbose)
|
|
146
|
-
|
|
147
|
-
|
|
149
|
+
log.write("Start to filter out variants if in intervals defined in bed files:", verbose=verbose)
|
|
150
|
+
log.write(" -Current Dataframe shape :",len(sumstats)," x ", len(sumstats.columns), verbose=verbose)
|
|
148
151
|
if high_ld is True:
|
|
149
152
|
path = get_high_ld(build=build)
|
|
150
|
-
|
|
153
|
+
log.write(" -Loading bed format file for hg"+build, verbose=verbose)
|
|
151
154
|
|
|
152
155
|
else:
|
|
153
|
-
|
|
156
|
+
log.write(" -Loading bed format file: " , path, verbose=verbose)
|
|
154
157
|
|
|
155
158
|
bed = pd.read_csv(path,sep="\s+",header=None,dtype={0:"string",1:"Int64",2:"Int64"})
|
|
156
159
|
bed["tuple"] = bed.apply(lambda x: (x[1],x[2]),axis=1)
|
|
@@ -162,7 +165,7 @@ def filterregionout(sumstats, path=None, chrom="CHR",pos="POS", high_ld=False, b
|
|
|
162
165
|
bed[0]=bed[0].astype("Int64")
|
|
163
166
|
|
|
164
167
|
if len(bed)<100:
|
|
165
|
-
|
|
168
|
+
log.write(" -Bed file < 100 lines: using pd IntervalIndex... ", verbose=verbose)
|
|
166
169
|
for i in sumstats[chrom].unique():
|
|
167
170
|
if sum(bed[0]==i)>0:
|
|
168
171
|
interval = pd.IntervalIndex.from_tuples(bed.loc[bed[0]==i,"tuple"])
|
|
@@ -170,7 +173,7 @@ def filterregionout(sumstats, path=None, chrom="CHR",pos="POS", high_ld=False, b
|
|
|
170
173
|
else:
|
|
171
174
|
continue
|
|
172
175
|
else:
|
|
173
|
-
|
|
176
|
+
log.write(" -Bed file > 100 lines: using two pointers, please make files are all sorted... ", verbose=verbose)
|
|
174
177
|
bed_num =0
|
|
175
178
|
bed_chr =bed.iloc[bed_num,0]
|
|
176
179
|
bed_left =bed.iloc[bed_num,1]
|
|
@@ -206,78 +209,93 @@ def filterregionout(sumstats, path=None, chrom="CHR",pos="POS", high_ld=False, b
|
|
|
206
209
|
## out
|
|
207
210
|
|
|
208
211
|
sumstats = sumstats.loc[~sumstats["bed_indicator"],:]
|
|
209
|
-
|
|
210
|
-
|
|
212
|
+
log.write(" -Number of variants in the specified regions to exclude:",sum(sumstats["bed_indicator"]), verbose=verbose)
|
|
213
|
+
log.write(" -Number of variants left:",len(sumstats), verbose=verbose)
|
|
211
214
|
sumstats = sumstats.drop(columns="bed_indicator")
|
|
212
|
-
|
|
215
|
+
log.write("Finished filtering out variants.", verbose=verbose)
|
|
213
216
|
gc.collect()
|
|
214
217
|
return sumstats
|
|
215
218
|
|
|
216
219
|
def inferbuild(sumstats,status="STATUS",chrom="CHR", pos="POS", ea="EA", nea="NEA",build="19", verbose=True,log=Log()):
    """Infer the genome build by matching CHR:POS against bundled HapMap3 SNP lists.

    The number of variants whose ``chrom:pos`` string appears in the hg19 list
    is compared with the number appearing in the hg38 list; the build with more
    matches wins. On a tie nothing is assigned.

    Parameters
    ----------
    sumstats : pandas DataFrame (expected)
        Must contain the ``chrom``, ``pos`` and ``status`` columns.
    status : str
        Name of the status-code column updated through ``vchange_status``.
    chrom, pos : str
        Names of the chromosome and position columns.
    ea, nea, build : str
        Accepted for interface compatibility; not used by this function.
    verbose : bool
        Forwarded to the logging calls.
    log : Log
        Logger that receives the progress messages.

    Returns
    -------
    tuple
        ``(sumstats, inferred_build)`` where ``inferred_build`` is ``"19"``,
        ``"38"`` or ``"Unknown"``. The same 2-tuple shape is returned when the
        initial check performed by ``start_to`` reports insufficient
        information, so callers can always unpack two values.
    """
    ##start function with col checking##########################################################
    _start_line = "infer genome build version using hapmap3 SNPs"
    _end_line = "inferring genome build version using hapmap3 SNPs"
    _start_cols = [chrom,pos]
    _start_function = ".infer_build()"
    _must_args ={}

    is_enough_info = start_to(sumstats=sumstats,
                              log=log,
                              verbose=verbose,
                              start_line=_start_line,
                              end_line=_end_line,
                              start_cols=_start_cols,
                              start_function=_start_function,
                              **_must_args)
    inferred_build="Unknown"
    # Return the same (sumstats, build) shape as the normal path so unpacking callers do not break.
    if is_enough_info == False: return sumstats, inferred_build
    ############################################################################################

    log.write("Start to infer genome build version using hapmap3 SNPs...", verbose=verbose)
    data_path_19 = path.dirname(__file__) + '/data/hapmap3_SNPs/hapmap3_db150_hg19.snplist.gz'
    data_path_38 = path.dirname(__file__) + '/data/hapmap3_SNPs/hapmap3_db151_hg38.snplist.gz'
    log.write(" -Loading Hapmap3 variants data...", verbose=verbose)
    # Both columns are read as strings so they can be concatenated into "chr:pos" keys.
    hapmap3_ref_19 = pd.read_csv(data_path_19,sep=r"\s+",usecols=["#CHROM","POS"],dtype={"#CHROM":"string","POS":"string"})
    hapmap3_ref_38 = pd.read_csv(data_path_38,sep=r"\s+",usecols=["#CHROM","POS"],dtype={"#CHROM":"string","POS":"string"})

    log.write(" -CHR:POS will be used for matching...", verbose=verbose)
    raw_chrpos = sumstats[chrom].astype("string")+":"+sumstats[pos].astype("string")

    hapmap3_ref_19["chr:pos"] = hapmap3_ref_19["#CHROM"]+":"+hapmap3_ref_19["POS"]
    hapmap3_ref_38["chr:pos"] = hapmap3_ref_38["#CHROM"]+":"+hapmap3_ref_38["POS"]

    match_count_for_19 = sum(raw_chrpos.isin(hapmap3_ref_19["chr:pos"].values))
    match_count_for_38 = sum(raw_chrpos.isin(hapmap3_ref_38["chr:pos"].values))

    log.write(" -Matching variants for hg19: num_hg19 = ",match_count_for_19, verbose=verbose)
    log.write(" -Matching variants for hg38: num_hg38 = ",match_count_for_38, verbose=verbose)

    if max(match_count_for_19, match_count_for_38)<10000:
        log.warning("Please be cautious due to the limited number of variants.", verbose=verbose)

    if match_count_for_19 > match_count_for_38:
        log.write(" -Since num_hg19 >> num_hg38, assigning genome build hg19...", verbose=verbose)
        # Rewrite status digits 1 and 2 from "9" to the hg19 code (digit semantics defined by vchange_status).
        sumstats[status] = vchange_status(sumstats[status],1,"9","1")
        sumstats[status] = vchange_status(sumstats[status],2,"9","9")
        inferred_build="19"
    elif match_count_for_19 < match_count_for_38:
        log.write(" -Since num_hg19 << num_hg38, assigning genome build hg38...", verbose=verbose)
        # Rewrite status digits 1 and 2 from "9" to the hg38 code (digit semantics defined by vchange_status).
        sumstats[status] = vchange_status(sumstats[status],1,"9","3")
        sumstats[status] = vchange_status(sumstats[status],2,"9","8")
        inferred_build="38"
    else:
        log.write(" -Since num_hg19 = num_hg38, unable to infer...", verbose=verbose)

    finished(log,verbose,_end_line)
    return sumstats, inferred_build
|
|
259
276
|
|
|
260
277
|
def sampling(sumstats,n=1, p=None, verbose=True,log=Log(),**args):
    """Randomly select variants (rows) from ``sumstats``.

    Parameters
    ----------
    sumstats : pandas DataFrame (expected)
        Data to sample from.
    n : int
        Number of rows to draw; used only when ``p`` is None.
    p : float or None
        Fraction of rows to draw. When given, ``n`` is recomputed as
        ``int(len(sumstats) * p)``.
    verbose : bool
        Forwarded to every ``log.write`` call.
    log : Log
        Logger that receives the progress messages.
    **args
        Extra keyword arguments forwarded to ``sumstats.sample``. If
        ``random_state`` is absent, a random seed is generated and logged so
        the draw can be reproduced.

    Returns
    -------
    The sampled rows.

    Raises
    ------
    ValueError
        If ``n`` exceeds the number of rows, or ``p`` is outside the accepted
        0-1 range (a tiny tolerance on both ends is allowed).
    """
    log.write("Start to randomly select variants from the sumstats...", verbose=verbose)
    if p is None:
        log.write(" -Number of variants selected from the sumstats:",n, verbose=verbose)
        if n > len(sumstats):
            # n equal to the number of rows is accepted, hence "<=".
            raise ValueError("Please input a number <= {}".format(len(sumstats)))
    else:
        if p>-0.00000001 and p<1.00000001:
            log.write(" -Percentage of variants selected from the sumstats: ",p, verbose=verbose)
            n = int(len(sumstats)*p)
            log.write(" -Number of variants selected from the sumstats:",n, verbose=verbose)
        else:
            raise ValueError("Please input a number in (0,1)")

    if "random_state" not in args:
        # Request a 64-bit draw explicitly: the upper bound does not fit in a
        # 32-bit integer, which is the default integer type on some platforms.
        args["random_state"] = int(np.random.randint(0,4294967295,dtype=np.int64))
    log.write(" -Random state (seed): {}".format(args["random_state"]), verbose=verbose)
    sampled = sumstats.sample(n=n,**args)
    log.write("Finished sampling...", verbose=verbose)
    gc.collect()
    return sampled
|
|
283
301
|
|
|
@@ -301,4 +319,113 @@ def _get_flanking(sumstats, snpid, windowsizekb=500, verbose=True,log=Log(),**ar
|
|
|
301
319
|
log.write("Finished extracting variants in the flanking regions.",verbose=verbose)
|
|
302
320
|
|
|
303
321
|
return flanking
|
|
304
|
-
|
|
322
|
+
|
|
323
|
+
def _get_flanking_by_id(sumstats, snpid, windowsizekb=500, verbose=True,log=Log(),**args):
    """Extract variants lying within a window around variants selected by ID.

    Parameters
    ----------
    sumstats : pandas DataFrame (expected)
        Must contain ``CHR`` and ``POS`` and at least one of ``rsID`` / ``SNPID``.
    snpid : str or list-like of str
        ID(s) of the central variant(s); matched against whichever of
        ``rsID`` / ``SNPID`` is present (either one when both are present).
    windowsizekb : int
        Half-width of the window in kb; the window is ``POS ± 1000 * windowsizekb``.
    verbose : bool
        Forwarded to every ``log.write`` call.
    log : Log
        Logger that receives the progress messages.
    **args
        Accepted for interface compatibility; not used by this function.

    Returns
    -------
    The rows of ``sumstats`` falling in the union of all windows. Empty when no
    ID matches.

    Raises
    ------
    KeyError
        If neither ``rsID`` nor ``SNPID`` is a column of ``sumstats``.
    """
    log.write("Start to extract variants in the flanking regions using rsID or SNPID...",verbose=verbose)
    log.write(" - Central variants: {}".format(snpid), verbose=verbose)
    log.write(" - Flanking windowsize in kb: {}".format(windowsizekb), verbose=verbose)

    if isinstance(snpid, str):
        snpid = [snpid]

    id_columns = [column for column in ("rsID", "SNPID") if column in sumstats.columns]
    if not id_columns:
        raise KeyError("Neither rsID nor SNPID column is available in sumstats.")

    is_specified = sumstats[id_columns[0]].isin(snpid)
    for column in id_columns[1:]:
        is_specified = is_specified | sumstats[column].isin(snpid)

    central_variants = sumstats.loc[is_specified,:]

    # Start from an all-False mask so that "no central variant found" yields an
    # empty result instead of failing on a None indexer.
    is_flanking = pd.Series(False, index=sumstats.index)
    for _, central in central_variants.iterrows():
        chrom = central["CHR"]
        left = central["POS"] - 1000 * windowsizekb
        right = central["POS"] + 1000 * windowsizekb
        is_flanking_in_this_region = (sumstats["CHR"] == chrom) & (sumstats["POS"] >= left) & (sumstats["POS"] <= right)

        log.write(" - Variants in flanking region {}:{}-{} : {}".format(chrom, left, right, sum(is_flanking_in_this_region) ), verbose=verbose)

        is_flanking = is_flanking | is_flanking_in_this_region

    flanking = sumstats.loc[is_flanking,:]

    log.write(" - Extracted {} variants in the regions.".format(len(flanking)),verbose=verbose)
    log.write("Finished extracting variants in the flanking regions.",verbose=verbose)

    return flanking
|
|
361
|
+
|
|
362
|
+
def _get_flanking_by_chrpos(sumstats, chrpos, windowsizekb=500, verbose=True,log=Log(),**args):
    """Extract variants lying within a window around given (CHR, POS) positions.

    Parameters
    ----------
    sumstats : pandas DataFrame (expected)
        Must contain ``CHR`` and ``POS`` columns.
    chrpos : tuple or iterable of tuples
        A single ``(chr, pos)`` pair or several of them.
    windowsizekb : int
        Half-width of the window in kb; the window is ``pos ± 1000 * windowsizekb``.
    verbose : bool
        Forwarded to every ``log.write`` call.
    log : Log
        Logger that receives the progress messages.
    **args
        Accepted for interface compatibility; not used by this function.

    Returns
    -------
    The rows of ``sumstats`` falling in the union of all windows. Empty when
    ``chrpos`` contains no positions.
    """
    log.write("Start to extract variants in the flanking regions using CHR and POS...",verbose=verbose)
    log.write(" - Central positions: {}".format(chrpos), verbose=verbose)
    log.write(" - Flanking windowsize in kb: {}".format(windowsizekb), verbose=verbose)

    # A lone (chr, pos) pair is wrapped so the loop below always sees pairs.
    if isinstance(chrpos, tuple):
        chrpos_to_check = [chrpos]
    else:
        chrpos_to_check = chrpos

    # Start from an all-False mask so that an empty position list yields an
    # empty result instead of failing on a None indexer.
    is_flanking = pd.Series(False, index=sumstats.index)

    for position in chrpos_to_check:
        chrom = position[0]
        left = position[1] - 1000 * windowsizekb
        right = position[1] + 1000 * windowsizekb
        is_flanking_in_this_region = (sumstats["CHR"] == chrom) & (sumstats["POS"] >= left) & (sumstats["POS"] <= right)

        log.write(" - Variants in flanking region {}:{}-{} : {}".format(chrom, left, right, sum(is_flanking_in_this_region) ), verbose=verbose)

        is_flanking = is_flanking | is_flanking_in_this_region

    flanking = sumstats.loc[is_flanking,:]

    log.write(" - Extracted {} variants in the regions.".format(len(flanking)),verbose=verbose)
    log.write("Finished extracting variants in the flanking regions.",verbose=verbose)

    return flanking
|
|
394
|
+
|
|
395
|
+
def _filter_palindromic(sumstats, mode="in", ea="EA",nea="NEA", log=Log(),verbose=True):
    """Select palindromic variants, or everything except them.

    Parameters
    ----------
    sumstats : pandas DataFrame (expected)
        Must contain the ``ea`` and ``nea`` allele columns.
    mode : str
        ``"in"`` returns the variants flagged by ``is_palindromic``; any other
        value returns the variants that were not flagged.
    ea, nea : str
        Names of the effect-allele and non-effect-allele columns.
    log : Log
        Logger that receives the progress messages.
    verbose : bool
        Forwarded to every ``log.write`` call.

    Returns
    -------
    The selected rows of ``sumstats``.
    """
    log.write("Start to filter palindromic variants...",verbose=verbose)
    allele_columns = sumstats[[nea,ea]]
    palindromic_mask = is_palindromic(allele_columns,a1=nea,a2=ea)

    n_palindromic = sum(palindromic_mask)
    log.write(" -Identified palindromic variants: {}".format(n_palindromic),verbose=verbose)

    row_selector = palindromic_mask if mode=="in" else ~palindromic_mask
    selected = sumstats.loc[row_selector,:]

    log.write("Finished filtering palindromic variants.",verbose=verbose)
    return selected
|
|
408
|
+
|
|
409
|
+
def _filter_indel(sumstats, mode="in", ea="EA",nea="NEA", log=Log(),verbose=True):
    """Select indels, or everything except them.

    A variant is treated as an indel when its two allele strings differ in
    length.

    Parameters
    ----------
    sumstats : pandas DataFrame (expected)
        Must contain the ``ea`` and ``nea`` allele columns (string-like).
    mode : str
        ``"in"`` returns the indels; any other value returns the non-indels.
    ea, nea : str
        Names of the effect-allele and non-effect-allele columns.
    log : Log
        Logger that receives the progress messages.
    verbose : bool
        Forwarded to every ``log.write`` call.

    Returns
    -------
    The selected rows of ``sumstats``.
    """
    log.write("Start to filter indels...",verbose=verbose)
    ea_length = sumstats[ea].str.len()
    nea_length = sumstats[nea].str.len()
    indel_mask = (ea_length!=nea_length)

    n_indel = sum(indel_mask)
    log.write(" -Identified indels: {}".format(n_indel),verbose=verbose)

    row_selector = indel_mask if mode=="in" else ~indel_mask
    selected = sumstats.loc[row_selector,:]
    log.write("Finished filtering indels.",verbose=verbose)
    return selected
|
|
420
|
+
|
|
421
|
+
def _filter_snp(sumstats, mode="in", ea="EA",nea="NEA", log=Log(),verbose=True):
    """Select SNPs, or everything except them.

    A variant is treated as a SNP when both allele strings have length 1.

    Parameters
    ----------
    sumstats : pandas DataFrame (expected)
        Must contain the ``ea`` and ``nea`` allele columns (string-like).
    mode : str
        ``"in"`` returns the SNPs; any other value returns the non-SNPs.
    ea, nea : str
        Names of the effect-allele and non-effect-allele columns.
    log : Log
        Logger that receives the progress messages.
    verbose : bool
        Forwarded to every ``log.write`` call.

    Returns
    -------
    The selected rows of ``sumstats``.
    """
    log.write("Start to filter SNPs...",verbose=verbose)
    ea_is_single_base = sumstats[ea].str.len()==1
    nea_is_single_base = sumstats[nea].str.len()==1
    snp_mask = ea_is_single_base & nea_is_single_base

    n_snp = sum(snp_mask)
    log.write(" -Identified SNPs: {}".format(n_snp),verbose=verbose)

    row_selector = snp_mask if mode=="in" else ~snp_mask
    selected = sumstats.loc[row_selector,:]
    log.write("Finished filtering SNPs.",verbose=verbose)
    return selected
|
gwaslab/util_in_get_density.py
CHANGED
|
@@ -5,9 +5,9 @@ from gwaslab.g_Log import Log
|
|
|
5
5
|
import gc
|
|
6
6
|
|
|
7
7
|
def getsignaldensity(insumstats, id="SNPID", chrom="CHR",pos="POS", bwindowsizekb=100,log=Log(),verbose=True):
|
|
8
|
-
|
|
9
|
-
sumstats = insumstats
|
|
10
|
-
|
|
8
|
+
log.write("Start to calculate signal DENSITY..." ,verbose=verbose)
|
|
9
|
+
sumstats = insumstats[[id,chrom,pos]].copy()
|
|
10
|
+
log.write(" -Calculating DENSITY with windowsize of ",bwindowsizekb ," kb",verbose=verbose)
|
|
11
11
|
#stack=[]
|
|
12
12
|
|
|
13
13
|
large_number = 1000000000
|
|
@@ -58,13 +58,13 @@ def getsignaldensity(insumstats, id="SNPID", chrom="CHR",pos="POS", bwindowsizek
|
|
|
58
58
|
bmax = sumstats["DENSITY"].max()
|
|
59
59
|
bmaxid = sumstats["DENSITY"].idxmax()
|
|
60
60
|
|
|
61
|
-
|
|
62
|
-
|
|
63
|
-
|
|
64
|
-
|
|
61
|
+
log.write(" -Mean : {} signals per {} kb".format(bmean,bwindowsizekb),verbose=verbose)
|
|
62
|
+
log.write(" -SD : {}".format(bsd),verbose=verbose)
|
|
63
|
+
log.write(" -Median : {} signals per {} kb".format(bmedian,bwindowsizekb),verbose=verbose)
|
|
64
|
+
log.write(" -Max : {} signals per {} kb at variant(s) {}".format(bmax,bwindowsizekb,sumstats.loc[bmaxid,id]),verbose=verbose)
|
|
65
65
|
|
|
66
66
|
sumstats = sumstats.drop("TCHR+POS",axis=1)
|
|
67
|
-
|
|
67
|
+
log.write("Finished calculating signal DENSITY successfully!",verbose=verbose)
|
|
68
68
|
return sumstats["DENSITY"]
|
|
69
69
|
|
|
70
70
|
def assigndensity(insumstats,
|
|
@@ -81,7 +81,7 @@ def assigndensity(insumstats,
|
|
|
81
81
|
large_number = int(large_number * 10)
|
|
82
82
|
else:
|
|
83
83
|
break
|
|
84
|
-
sumstats = insumstats
|
|
84
|
+
sumstats = insumstats[[id,chrom,pos]].copy()
|
|
85
85
|
sumstats["DENSITY"] = 0
|
|
86
86
|
sumstats["TCHR+POS"] = sumstats[chrom]*large_number + sumstats[pos]
|
|
87
87
|
sig_sumstats["TCHR+POS"] = sig_sumstats[chrom]*large_number + sig_sumstats[pos]
|
|
@@ -92,7 +92,7 @@ def assigndensity(insumstats,
|
|
|
92
92
|
to_add =(sumstats["TCHR+POS"]>=(row["TCHR+POS"]- 1000*bwindowsizekb)) & (sumstats["TCHR+POS"]<=(row["TCHR+POS"]+ 1000*bwindowsizekb))
|
|
93
93
|
sumstats.loc[to_add,"DENSITY"] += 1
|
|
94
94
|
if counter%1000==0:
|
|
95
|
-
|
|
95
|
+
log.write(" -Processed {} signals".format(counter//1000),verbose=verbose)
|
|
96
96
|
gc.collect()
|
|
97
97
|
|
|
98
98
|
return sumstats["DENSITY"]
|