gwaslab 3.4.36__py3-none-any.whl → 3.4.37__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of gwaslab might be problematic. Click here for more details.
- gwaslab/__init__.py +1 -1
- gwaslab/g_Sumstats.py +54 -31
- gwaslab/g_meta.py +13 -3
- gwaslab/g_version.py +2 -2
- gwaslab/hm_harmonize_sumstats.py +43 -18
- gwaslab/io_preformat_input.py +3 -0
- gwaslab/qc_check_datatype.py +14 -0
- gwaslab/qc_fix_sumstats.py +217 -91
- gwaslab/util_ex_process_h5.py +26 -17
- gwaslab/util_in_fill_data.py +42 -3
- gwaslab/viz_aux_quickfix.py +2 -2
- gwaslab/viz_plot_compare_effect.py +22 -5
- gwaslab/viz_plot_mqqplot.py +127 -48
- gwaslab/viz_plot_regionalplot.py +13 -8
- {gwaslab-3.4.36.dist-info → gwaslab-3.4.37.dist-info}/METADATA +2 -2
- {gwaslab-3.4.36.dist-info → gwaslab-3.4.37.dist-info}/RECORD +19 -19
- {gwaslab-3.4.36.dist-info → gwaslab-3.4.37.dist-info}/LICENSE +0 -0
- {gwaslab-3.4.36.dist-info → gwaslab-3.4.37.dist-info}/WHEEL +0 -0
- {gwaslab-3.4.36.dist-info → gwaslab-3.4.37.dist-info}/top_level.txt +0 -0
gwaslab/__init__.py
CHANGED
|
@@ -42,6 +42,6 @@ from gwaslab.util_in_calculate_power import get_power
|
|
|
42
42
|
from gwaslab.util_in_calculate_power import get_beta
|
|
43
43
|
from gwaslab.viz_plot_trumpetplot import plot_power
|
|
44
44
|
from gwaslab.viz_plot_trumpetplot import plot_power_x
|
|
45
|
-
from gwaslab.util_ex_process_h5 import
|
|
45
|
+
from gwaslab.util_ex_process_h5 import process_vcf_to_hfd5
|
|
46
46
|
from gwaslab.util_ex_run_susie import _run_susie_rss as run_susie_rss
|
|
47
47
|
from gwaslab.io_read_tabular import _read_tabular as read_tabular
|
gwaslab/g_Sumstats.py
CHANGED
|
@@ -52,7 +52,8 @@ from gwaslab.bd_common_data import get_format_dict
|
|
|
52
52
|
from gwaslab.bd_common_data import get_formats_list
|
|
53
53
|
from gwaslab.g_version import _show_version
|
|
54
54
|
from gwaslab.g_version import gwaslab_info
|
|
55
|
-
from gwaslab.g_meta import init_meta
|
|
55
|
+
from gwaslab.g_meta import _init_meta
|
|
56
|
+
from gwaslab.g_meta import _append_meta_record
|
|
56
57
|
from gwaslab.util_ex_run_clumping import _clump
|
|
57
58
|
from gwaslab.util_ex_calculate_ldmatrix import tofinemapping
|
|
58
59
|
from gwaslab.util_ex_calculate_prs import _calculate_prs
|
|
@@ -60,6 +61,7 @@ from gwaslab.viz_plot_mqqplot import mqqplot
|
|
|
60
61
|
from gwaslab.viz_plot_trumpetplot import plottrumpet
|
|
61
62
|
from gwaslab.viz_plot_compare_af import plotdaf
|
|
62
63
|
from gwaslab.util_ex_run_susie import _run_susie_rss
|
|
64
|
+
from gwaslab.qc_fix_sumstats import _check_data_consistency
|
|
63
65
|
import gc
|
|
64
66
|
|
|
65
67
|
#20220309
|
|
@@ -119,10 +121,9 @@ class Sumstats():
|
|
|
119
121
|
self.log = Log()
|
|
120
122
|
|
|
121
123
|
# meta information
|
|
122
|
-
self.meta = init_meta()
|
|
124
|
+
self.meta = _init_meta()
|
|
123
125
|
self.build = build
|
|
124
|
-
self.meta["gwaslab"]["study_name"] =
|
|
125
|
-
#self.meta["gwaslab"]["genome_build"] = build
|
|
126
|
+
self.meta["gwaslab"]["study_name"] = study
|
|
126
127
|
self.meta["gwaslab"]["species"] = species
|
|
127
128
|
|
|
128
129
|
# initialize attributes for clumping and finmapping
|
|
@@ -217,8 +218,22 @@ class Sumstats():
|
|
|
217
218
|
return lookupstatus(self.data[status])
|
|
218
219
|
|
|
219
220
|
def set_build(self, build, verbose=True):
|
|
220
|
-
self.data = _set_build(self.data, build=build, log=self.log,verbose=verbose)
|
|
221
|
+
self.data, self.meta["gwaslab"]["genome_build"] = _set_build(self.data, build=build, log=self.log,verbose=verbose)
|
|
221
222
|
gc.collect()
|
|
223
|
+
|
|
224
|
+
def infer_build(self,**args):
|
|
225
|
+
self.data, self.meta["gwaslab"]["genome_build"] = inferbuild(self.data,**args)
|
|
226
|
+
|
|
227
|
+
def liftover(self,to_build, from_build=None,**args):
|
|
228
|
+
if from_build is None:
|
|
229
|
+
if self.meta["gwaslab"]["genome_build"]=="99":
|
|
230
|
+
self.data, self.meta["gwaslab"]["genome_build"] = inferbuild(self.data,**args)
|
|
231
|
+
from_build = self.meta["gwaslab"]["genome_build"]
|
|
232
|
+
self.data = parallelizeliftovervariant(self.data,from_build=from_build, to_build=to_build, log=self.log,**args)
|
|
233
|
+
self.meta["is_sorted"] = False
|
|
234
|
+
self.meta["is_harmonised"] = False
|
|
235
|
+
self.meta["gwaslab"]["genome_build"]=to_build
|
|
236
|
+
|
|
222
237
|
# QC ######################################################################################
|
|
223
238
|
#clean the sumstats with one line
|
|
224
239
|
def basic_check(self,
|
|
@@ -231,6 +246,7 @@ class Sumstats():
|
|
|
231
246
|
fixpos_args={},
|
|
232
247
|
fixallele_args={},
|
|
233
248
|
sanitycheckstats_args={},
|
|
249
|
+
consistencycheck_args={},
|
|
234
250
|
normalize=True,
|
|
235
251
|
normalizeallele_args={},
|
|
236
252
|
verbose=True):
|
|
@@ -241,6 +257,8 @@ class Sumstats():
|
|
|
241
257
|
self.data = fixpos(self.data,log=self.log,remove=remove,verbose=verbose,**fixpos_args)
|
|
242
258
|
self.data = fixallele(self.data,log=self.log,remove=remove,verbose=verbose,**fixallele_args)
|
|
243
259
|
self.data = sanitycheckstats(self.data,log=self.log,verbose=verbose,**sanitycheckstats_args)
|
|
260
|
+
_check_data_consistency(self.data,log=self.log,verbose=verbose,**consistencycheck_args)
|
|
261
|
+
|
|
244
262
|
if normalize is True:
|
|
245
263
|
self.data = parallelnormalizeallele(self.data,n_cores=n_cores,verbose=verbose,log=self.log,**normalizeallele_args)
|
|
246
264
|
if remove_dup is True:
|
|
@@ -329,9 +347,9 @@ class Sumstats():
|
|
|
329
347
|
|
|
330
348
|
self.data= parallelinferstrand(self.data,ref_infer = ref_infer,ref_alt_freq=ref_alt_freq,maf_threshold=maf_threshold,
|
|
331
349
|
n_cores=n_cores,log=self.log,**inferstrand_args)
|
|
332
|
-
|
|
333
|
-
self.meta["gwaslab"]["references"]["ref_infer"] = ref_infer
|
|
334
350
|
|
|
351
|
+
self.meta["gwaslab"]["references"]["ref_infer"] = _append_meta_record(self.meta["gwaslab"]["references"]["ref_infer"] , ref_infer)
|
|
352
|
+
|
|
335
353
|
self.data =flipallelestats(self.data,log=self.log,**flipallelestats_args)
|
|
336
354
|
|
|
337
355
|
gc.collect()
|
|
@@ -341,13 +359,18 @@ class Sumstats():
|
|
|
341
359
|
|
|
342
360
|
self.data = parallelizeassignrsid(self.data,path=ref_rsid_tsv,ref_mode="tsv",
|
|
343
361
|
n_cores=n_cores,log=self.log,**assignrsid_args)
|
|
362
|
+
|
|
363
|
+
|
|
364
|
+
|
|
344
365
|
self.meta["gwaslab"]["references"]["ref_rsid_tsv"] = ref_rsid_tsv
|
|
345
366
|
gc.collect()
|
|
367
|
+
|
|
346
368
|
if ref_rsid_vcf is not None:
|
|
347
|
-
|
|
348
369
|
self.data = parallelizeassignrsid(self.data,path=ref_rsid_vcf,ref_mode="vcf",
|
|
349
370
|
n_cores=n_cores,log=self.log,**assignrsid_args)
|
|
350
|
-
|
|
371
|
+
|
|
372
|
+
self.meta["gwaslab"]["references"]["ref_rsid_vcf"] = _append_meta_record(self.meta["gwaslab"]["references"]["ref_rsid_vcf"] , ref_rsid_vcf)
|
|
373
|
+
|
|
351
374
|
gc.collect()
|
|
352
375
|
######################################################
|
|
353
376
|
if remove is True:
|
|
@@ -376,17 +399,23 @@ class Sumstats():
|
|
|
376
399
|
self.data = removedup(self.data,log=self.log,**args)
|
|
377
400
|
def check_sanity(self,**args):
|
|
378
401
|
self.data = sanitycheckstats(self.data,log=self.log,**args)
|
|
379
|
-
|
|
402
|
+
def check_data_consistency(self, **args):
|
|
403
|
+
_check_data_consistency(self.data,log=self.log,**args)
|
|
380
404
|
def check_id(self,**args):
|
|
381
405
|
pass
|
|
382
|
-
|
|
383
|
-
|
|
384
|
-
|
|
385
|
-
self.data =
|
|
406
|
+
|
|
407
|
+
def check_ref(self,ref_seq,**args):
|
|
408
|
+
self.meta["gwaslab"]["references"]["ref_seq"] = ref_seq
|
|
409
|
+
self.data = checkref(self.data,ref_seq,log=self.log,**args)
|
|
410
|
+
def infer_strand(self,ref_infer,**args):
|
|
411
|
+
self.meta["gwaslab"]["references"]["ref_infer"] = _append_meta_record(self.meta["gwaslab"]["references"]["ref_infer"] , ref_infer)
|
|
412
|
+
self.data = parallelinferstrand(self.data,ref_infer=ref_infer,log=self.log,**args)
|
|
413
|
+
|
|
386
414
|
def flip_allele_stats(self,**args):
|
|
387
415
|
self.data = flipallelestats(self.data,log=self.log,**args)
|
|
388
416
|
def normalize_allele(self,**args):
|
|
389
417
|
self.data = parallelnormalizeallele(self.data,log=self.log,**args)
|
|
418
|
+
|
|
390
419
|
def assign_rsid(self,
|
|
391
420
|
ref_rsid_tsv=None,
|
|
392
421
|
ref_rsid_vcf=None,
|
|
@@ -396,21 +425,15 @@ class Sumstats():
|
|
|
396
425
|
self.meta["gwaslab"]["references"]["ref_rsid_tsv"] = ref_rsid_tsv
|
|
397
426
|
if ref_rsid_vcf is not None:
|
|
398
427
|
self.data = parallelizeassignrsid(self.data,path=ref_rsid_vcf,ref_mode="vcf",log=self.log,**args)
|
|
399
|
-
self.meta["gwaslab"]["references"]["ref_rsid_vcf"] = ref_rsid_vcf
|
|
428
|
+
self.meta["gwaslab"]["references"]["ref_rsid_vcf"] = _append_meta_record(self.meta["gwaslab"]["references"]["ref_rsid_vcf"] , ref_rsid_vcf)
|
|
429
|
+
|
|
400
430
|
def rsid_to_chrpos(self,**args):
|
|
401
431
|
self.data = rsidtochrpos(self.data,log=self.log,**args)
|
|
432
|
+
|
|
402
433
|
def rsid_to_chrpos2(self,**args):
|
|
403
434
|
self.data = parallelrsidtochrpos(self.data,log=self.log,**args)
|
|
404
435
|
|
|
405
|
-
|
|
406
|
-
if from_build is None:
|
|
407
|
-
if self.meta["gwaslab"]["genome_build"]=="99":
|
|
408
|
-
self.data, self.meta["gwaslab"]["genome_build"] = inferbuild(self.data,**args)
|
|
409
|
-
from_build = self.meta["gwaslab"]["genome_build"]
|
|
410
|
-
self.data = parallelizeliftovervariant(self.data,from_build=from_build, to_build=to_build, log=self.log,**args)
|
|
411
|
-
self.meta["is_sorted"] = False
|
|
412
|
-
self.meta["is_harmonised"] = False
|
|
413
|
-
self.meta["gwaslab"]["genome_build"]=to_build
|
|
436
|
+
|
|
414
437
|
############################################################################################################
|
|
415
438
|
|
|
416
439
|
def sort_coordinate(self,**sort_args):
|
|
@@ -420,11 +443,10 @@ class Sumstats():
|
|
|
420
443
|
self.data = sortcolumn(self.data,log=self.log,**args)
|
|
421
444
|
|
|
422
445
|
############################################################################################################
|
|
423
|
-
def fill_data(self, **args):
|
|
424
|
-
self.data = filldata(self.data
|
|
425
|
-
|
|
426
|
-
|
|
427
|
-
self.data, self.meta["gwaslab"]["genome_build"] = inferbuild(self.data,**args)
|
|
446
|
+
def fill_data(self, verbose=True, **args):
|
|
447
|
+
self.data = filldata(self.data, verbose=verbose, **args)
|
|
448
|
+
self.data = sortcolumn(self.data, verbose=verbose, log=self.log)
|
|
449
|
+
|
|
428
450
|
# utilities ############################################################################################################
|
|
429
451
|
# filter series ######################################################################
|
|
430
452
|
def get_flanking(self, inplace=False,**args):
|
|
@@ -485,11 +507,12 @@ class Sumstats():
|
|
|
485
507
|
|
|
486
508
|
def check_af(self,ref_infer,**args):
|
|
487
509
|
self.data = parallelecheckaf(self.data,ref_infer=ref_infer,log=self.log,**args)
|
|
488
|
-
self.meta["gwaslab"]["references"]["ref_infer_daf"] = ref_infer
|
|
489
|
-
|
|
510
|
+
self.meta["gwaslab"]["references"]["ref_infer_daf"] = _append_meta_record(self.meta["gwaslab"]["references"]["ref_infer_daf"] , ref_infer)
|
|
511
|
+
|
|
490
512
|
def infer_af(self,ref_infer,**args):
|
|
491
513
|
self.data = paralleleinferaf(self.data,ref_infer=ref_infer,log=self.log,**args)
|
|
492
514
|
self.meta["gwaslab"]["references"]["ref_infer_af"] = ref_infer
|
|
515
|
+
self.meta["gwaslab"]["references"]["ref_infer_af"] = _append_meta_record(self.meta["gwaslab"]["references"]["ref_infer_af"] , ref_infer)
|
|
493
516
|
|
|
494
517
|
def plot_daf(self, **args):
|
|
495
518
|
fig,outliers = plotdaf(self.data, **args)
|
gwaslab/g_meta.py
CHANGED
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
from gwaslab.g_version import gwaslab_info
|
|
2
2
|
|
|
3
|
-
def init_meta():
|
|
3
|
+
def _init_meta():
|
|
4
4
|
metadata = {"gwaslab":{
|
|
5
5
|
"gwaslab_version": gwaslab_info()["version"],
|
|
6
6
|
"study_name":"Sumstats_1",
|
|
@@ -23,7 +23,11 @@ def init_meta():
|
|
|
23
23
|
"ref_rsid_tsv":"Unknown",
|
|
24
24
|
"ref_rsid_vcf":"Unknown",
|
|
25
25
|
"ref_seq":"Unknown",
|
|
26
|
-
"ref_infer":"Unknown"
|
|
26
|
+
"ref_infer":"Unknown",
|
|
27
|
+
"ref_infer_af":"Unknown",
|
|
28
|
+
"ref_infer_daf":"Unknown",
|
|
29
|
+
"ref_rsid_to_chrpos_tsv":"Unknown",
|
|
30
|
+
"ref_rsid_to_chrpos_vcf":"Unknown"
|
|
27
31
|
}
|
|
28
32
|
},
|
|
29
33
|
"genotyping_technology":"Unknown",
|
|
@@ -45,4 +49,10 @@ def init_meta():
|
|
|
45
49
|
"coordinate_system":"1-based",
|
|
46
50
|
"sex": "M|F|combined"
|
|
47
51
|
}
|
|
48
|
-
return metadata.copy()
|
|
52
|
+
return metadata.copy()
|
|
53
|
+
|
|
54
|
+
def _append_meta_record(old, new):
|
|
55
|
+
if old == "Unknown" or old== "Unchecked":
|
|
56
|
+
return new
|
|
57
|
+
else:
|
|
58
|
+
return "{}, {}".format(old, new)
|
gwaslab/g_version.py
CHANGED
gwaslab/hm_harmonize_sumstats.py
CHANGED
|
@@ -11,6 +11,8 @@ import gc
|
|
|
11
11
|
from gwaslab.g_Log import Log
|
|
12
12
|
from gwaslab.qc_fix_sumstats import fixchr
|
|
13
13
|
from gwaslab.qc_fix_sumstats import fixpos
|
|
14
|
+
from gwaslab.qc_fix_sumstats import sortcolumn
|
|
15
|
+
from gwaslab.qc_check_datatype import check_dataframe_shape
|
|
14
16
|
from gwaslab.bd_common_data import get_number_to_chr
|
|
15
17
|
from gwaslab.bd_common_data import get_chr_list
|
|
16
18
|
from gwaslab.bd_common_data import get_chr_to_number
|
|
@@ -27,7 +29,7 @@ from gwaslab.g_version import _get_version
|
|
|
27
29
|
|
|
28
30
|
###~!!!!
|
|
29
31
|
def rsidtochrpos(sumstats,
|
|
30
|
-
path=
|
|
32
|
+
path=None, ref_rsid_to_chrpos_tsv=None, snpid="SNPID",
|
|
31
33
|
rsid="rsID", chrom="CHR",pos="POS",ref_rsid="rsID",ref_chr="CHR",ref_pos="POS", build="19",
|
|
32
34
|
overwrite=False,remove=False,chunksize=5000000,verbose=True,log=Log()):
|
|
33
35
|
'''
|
|
@@ -35,9 +37,12 @@ def rsidtochrpos(sumstats,
|
|
|
35
37
|
'''
|
|
36
38
|
#########################################################################################################
|
|
37
39
|
if verbose: log.write("Start to update chromosome and position information based on rsID...{}".format(_get_version()))
|
|
38
|
-
|
|
40
|
+
check_dataframe_shape(sumstats, log, verbose)
|
|
39
41
|
if verbose: log.write(" -rsID dictionary file: "+ path)
|
|
40
42
|
|
|
43
|
+
if ref_rsid_to_chrpos_tsv is not None:
|
|
44
|
+
path = ref_rsid_to_chrpos_tsv
|
|
45
|
+
|
|
41
46
|
if snpid in sumstats.columns and sum(sumstats[rsid].isna())>0:
|
|
42
47
|
if verbose: log.write(" -Filling na in rsID columns with SNPID...")
|
|
43
48
|
sumstats.loc[sumstats[rsid].isna(),rsid] = sumstats.loc[sumstats[rsid].isna(),snpid]
|
|
@@ -75,6 +80,7 @@ def rsidtochrpos(sumstats,
|
|
|
75
80
|
if verbose: log.write(" -Updating CHR and POS finished.Start to re-fixing CHR and POS... ")
|
|
76
81
|
sumstats = fixchr(sumstats,verbose=verbose)
|
|
77
82
|
sumstats = fixpos(sumstats,verbose=verbose)
|
|
83
|
+
sumstats = sortcolumn(sumstats,verbose=verbose)
|
|
78
84
|
return sumstats
|
|
79
85
|
####################################################################################################
|
|
80
86
|
|
|
@@ -96,9 +102,19 @@ def merge_chrpos(sumstats_part,all_groups_max,path,build,status):
|
|
|
96
102
|
return sumstats_part
|
|
97
103
|
|
|
98
104
|
|
|
99
|
-
def parallelrsidtochrpos(sumstats, rsid="rsID", chrom="CHR",pos="POS", path=None,build="99",status="STATUS",
|
|
105
|
+
def parallelrsidtochrpos(sumstats, rsid="rsID", chrom="CHR",pos="POS", path=None, ref_rsid_to_chrpos_vcf = None, ref_rsid_to_chrpos_hdf5 = None, build="99",status="STATUS",
|
|
100
106
|
n_cores=4,block_size=20000000,verbose=True,log=Log()):
|
|
107
|
+
|
|
108
|
+
if ref_rsid_to_chrpos_hdf5 is not None:
|
|
109
|
+
path = ref_rsid_to_chrpos_hdf5
|
|
110
|
+
elif ref_rsid_to_chrpos_vcf is not None:
|
|
111
|
+
vcf_file_name = os.path.basename(ref_rsid_to_chrpos_vcf)
|
|
112
|
+
vcf_dir_path = os.path.dirname(ref_rsid_to_chrpos_vcf)
|
|
113
|
+
path = "{}/{}.rsID_CHR_POS_groups_{}.h5".format(vcf_dir_path,vcf_file_name,int(block_size))
|
|
114
|
+
|
|
101
115
|
if verbose: log.write("Start to assign CHR and POS using rsIDs...{}".format(_get_version()))
|
|
116
|
+
check_dataframe_shape(sumstats, log, verbose)
|
|
117
|
+
|
|
102
118
|
if path is None:
|
|
103
119
|
raise ValueError("Please provide path to hdf5 file.")
|
|
104
120
|
|
|
@@ -164,13 +180,16 @@ def parallelrsidtochrpos(sumstats, rsid="rsID", chrom="CHR",pos="POS", path=None
|
|
|
164
180
|
# merge back
|
|
165
181
|
if verbose: log.write(" -Append data... ")
|
|
166
182
|
sumstats = pd.concat([sumstats_rs,sumstats_nonrs],ignore_index=True)
|
|
183
|
+
|
|
167
184
|
del sumstats_rs
|
|
168
185
|
del sumstats_nonrs
|
|
169
186
|
gc.collect()
|
|
170
187
|
|
|
171
188
|
# check
|
|
172
|
-
sumstats = fixchr(sumstats,verbose=
|
|
173
|
-
sumstats = fixpos(sumstats,verbose=
|
|
189
|
+
sumstats = fixchr(sumstats,verbose=verbose)
|
|
190
|
+
sumstats = fixpos(sumstats,verbose=verbose)
|
|
191
|
+
sumstats = sortcolumn(sumstats,verbose=verbose)
|
|
192
|
+
|
|
174
193
|
pool.close()
|
|
175
194
|
pool.join()
|
|
176
195
|
gc.collect()
|
|
@@ -235,7 +254,7 @@ def check_status(row,record):
|
|
|
235
254
|
|
|
236
255
|
def checkref(sumstats,ref_path,chrom="CHR",pos="POS",ea="EA",nea="NEA",status="STATUS",chr_dict=get_chr_to_number(),remove=False,verbose=True,log=Log()):
|
|
237
256
|
if verbose: log.write("Start to check if NEA is aligned with reference sequence...{}".format(_get_version()))
|
|
238
|
-
|
|
257
|
+
check_dataframe_shape(sumstats, log, verbose)
|
|
239
258
|
if verbose: log.write(" -Reference genome fasta file: "+ ref_path)
|
|
240
259
|
if verbose: log.write(" -Checking records: ", end="")
|
|
241
260
|
chromlist = get_chr_list(add_number=True)
|
|
@@ -381,7 +400,7 @@ def parallelizeassignrsid(sumstats, path, ref_mode="vcf",snpid="SNPID",rsid="rsI
|
|
|
381
400
|
assign rsID based on chr:pos
|
|
382
401
|
'''
|
|
383
402
|
if verbose: log.write("Start to annotate rsID based on chromosome and position information...{}".format(_get_version()))
|
|
384
|
-
|
|
403
|
+
check_dataframe_shape(sumstats, log, verbose)
|
|
385
404
|
if verbose: log.write(" -SNPID-rsID text file: "+ path)
|
|
386
405
|
|
|
387
406
|
standardized_normalized = sumstats["STATUS"].str.match("\w\w\w[0][01234][0126]\w", case=False, flags=0, na=False)
|
|
@@ -517,7 +536,7 @@ def parallelinferstrand(sumstats,ref_infer,ref_alt_freq=None,maf_threshold=0.40,
|
|
|
517
536
|
chr="CHR",pos="POS",ref="NEA",alt="EA",eaf="EAF",status="STATUS",
|
|
518
537
|
chr_dict=None,verbose=True,log=Log()):
|
|
519
538
|
if verbose: log.write("Start to infer strand for palindromic SNPs...{}".format(_get_version()))
|
|
520
|
-
|
|
539
|
+
check_dataframe_shape(sumstats, log, verbose)
|
|
521
540
|
if verbose: log.write(" -Reference vcf file:", ref_infer)
|
|
522
541
|
|
|
523
542
|
chr_dict = auto_check_vcf_chr_dict(ref_infer, chr_dict, verbose, log)
|
|
@@ -528,8 +547,8 @@ def parallelinferstrand(sumstats,ref_infer,ref_alt_freq=None,maf_threshold=0.40,
|
|
|
528
547
|
if "p" in mode:
|
|
529
548
|
# ref_alt_freq INFO in vcf was provided
|
|
530
549
|
if ref_alt_freq is not None:
|
|
550
|
+
|
|
531
551
|
if verbose: log.write(" -Alternative allele frequency in INFO:", ref_alt_freq)
|
|
532
|
-
|
|
533
552
|
## checking \w\w\w\w[0]\w\w -> standardized and normalized snp
|
|
534
553
|
good_chrpos = sumstats[status].str.match(r'\w\w\w[0][0]\w\w', case=False, flags=0, na=False)
|
|
535
554
|
palindromic = good_chrpos & is_palindromic(sumstats[[ref,alt]],a1=ref,a2=alt)
|
|
@@ -538,22 +557,28 @@ def parallelinferstrand(sumstats,ref_infer,ref_alt_freq=None,maf_threshold=0.40,
|
|
|
538
557
|
##not palindromic : change status
|
|
539
558
|
sumstats.loc[not_palindromic_snp,status] = vchange_status(sumstats.loc[not_palindromic_snp,status], 7 ,"9","0")
|
|
540
559
|
if verbose: log.write(" -Identified ", sum(palindromic)," palindromic SNPs...")
|
|
541
|
-
|
|
560
|
+
|
|
542
561
|
#palindromic but can not infer
|
|
543
|
-
maf_can_infer = (sumstats.loc[:,eaf] < maf_threshold) | (sumstats.loc[:,eaf] > 1 - maf_threshold)
|
|
562
|
+
maf_can_infer = (sumstats.loc[:,eaf] < maf_threshold) | (sumstats.loc[:,eaf] > 1 - maf_threshold)
|
|
563
|
+
|
|
544
564
|
sumstats.loc[palindromic&(~maf_can_infer),status] = vchange_status(sumstats.loc[palindromic&(~maf_can_infer),status],7,"9","7")
|
|
565
|
+
|
|
566
|
+
#palindromic WITH UNKNWON OR UNCHECKED STATUS
|
|
567
|
+
unknow_palindromic = sumstats[status].str.match(r'\w\w\w\w\w[012][89]', case=False, flags=0, na=False)
|
|
545
568
|
|
|
569
|
+
unknow_palindromic_to_check = palindromic & maf_can_infer & unknow_palindromic
|
|
570
|
+
|
|
571
|
+
if verbose: log.write(" -After filtering by MAF< {} , {} palindromic SNPs with unknown strand will be inferred...".format(maf_threshold, sum(unknow_palindromic_to_check)))
|
|
546
572
|
|
|
547
|
-
if verbose: log.write(" -After filtering by MAF< ", maf_threshold ," , the strand of ", sum(palindromic & maf_can_infer)," palindromic SNPs will be inferred...")
|
|
548
573
|
#########################################################################################
|
|
549
|
-
if sum(
|
|
550
|
-
if sum(
|
|
574
|
+
if sum(unknow_palindromic_to_check)>0:
|
|
575
|
+
if sum(unknow_palindromic_to_check)<10000:
|
|
551
576
|
n_cores=1
|
|
552
|
-
df_split = np.array_split(sumstats.loc[
|
|
577
|
+
df_split = np.array_split(sumstats.loc[unknow_palindromic_to_check,[chr,pos,ref,alt,eaf,status]], n_cores)
|
|
553
578
|
pool = Pool(n_cores)
|
|
554
579
|
map_func = partial(check_strand,chr=chr,pos=pos,ref=ref,alt=alt,eaf=eaf,status=status,ref_infer=ref_infer,ref_alt_freq=ref_alt_freq,chr_dict=chr_dict)
|
|
555
580
|
status_inferred = pd.concat(pool.map(map_func,df_split))
|
|
556
|
-
sumstats.loc[
|
|
581
|
+
sumstats.loc[unknow_palindromic_to_check,status] = status_inferred.values
|
|
557
582
|
pool.close()
|
|
558
583
|
pool.join()
|
|
559
584
|
#########################################################################################
|
|
@@ -650,7 +675,7 @@ def parallelinferstrand(sumstats,ref_infer,ref_alt_freq=None,maf_threshold=0.40,
|
|
|
650
675
|
def parallelecheckaf(sumstats,ref_infer,ref_alt_freq=None,maf_threshold=0.4,column_name="DAF",suffix="",n_cores=1, chr="CHR",pos="POS",ref="NEA",alt="EA",eaf="EAF",status="STATUS",chr_dict=None,force=False, verbose=True,log=Log()):
|
|
651
676
|
|
|
652
677
|
if verbose: log.write("Start to check the difference between EAF and reference vcf alt frequency ...{}".format(_get_version()))
|
|
653
|
-
|
|
678
|
+
check_dataframe_shape(sumstats, log, verbose)
|
|
654
679
|
if verbose: log.write(" -Reference vcf file:", ref_infer)
|
|
655
680
|
if verbose: log.write(" -CPU Cores to use :",n_cores)
|
|
656
681
|
|
|
@@ -718,7 +743,7 @@ def check_daf(chr,start,end,ref,alt,eaf,vcf_reader,alt_freq,chr_dict=None):
|
|
|
718
743
|
def paralleleinferaf(sumstats,ref_infer,ref_alt_freq=None,n_cores=1, chr="CHR",pos="POS",ref="NEA",alt="EA",eaf="EAF",status="STATUS",chr_dict=None,force=False, verbose=True,log=Log()):
|
|
719
744
|
|
|
720
745
|
if verbose: log.write("Start to infer the AF and reference vcf alt frequency ...{}".format(_get_version()))
|
|
721
|
-
|
|
746
|
+
check_dataframe_shape(sumstats, log, verbose)
|
|
722
747
|
if verbose: log.write(" -Reference vcf file:", ref_infer)
|
|
723
748
|
if verbose: log.write(" -CPU Cores to use :",n_cores)
|
|
724
749
|
|
gwaslab/io_preformat_input.py
CHANGED
|
@@ -8,6 +8,7 @@ from gwaslab.bd_common_data import get_format_dict
|
|
|
8
8
|
from gwaslab.qc_fix_sumstats import sortcolumn
|
|
9
9
|
from gwaslab.qc_fix_sumstats import _process_build
|
|
10
10
|
from gwaslab.qc_check_datatype import check_datatype
|
|
11
|
+
from gwaslab.qc_check_datatype import check_dataframe_memory_usage
|
|
11
12
|
|
|
12
13
|
#20221030
|
|
13
14
|
def preformat(sumstats,
|
|
@@ -353,6 +354,8 @@ def preformat(sumstats,
|
|
|
353
354
|
sumstats = sortcolumn(sumstats=sumstats,log=log,verbose=verbose)
|
|
354
355
|
check_datatype(sumstats,log=log,verbose=verbose)
|
|
355
356
|
gc.collect()
|
|
357
|
+
check_dataframe_memory_usage(sumstats,log=log,verbose=verbose)
|
|
358
|
+
|
|
356
359
|
if verbose: log.write("Finished loading data successfully!")
|
|
357
360
|
return sumstats
|
|
358
361
|
|
gwaslab/qc_check_datatype.py
CHANGED
|
@@ -87,3 +87,17 @@ def verify_datatype(header, dtype):
|
|
|
87
87
|
return "F"
|
|
88
88
|
else:
|
|
89
89
|
return "NA"
|
|
90
|
+
|
|
91
|
+
def check_dataframe_shape(sumstats, log, verbose):
|
|
92
|
+
memory_in_mb = sumstats.memory_usage().sum()/1024/1024
|
|
93
|
+
try:
|
|
94
|
+
log.write(" -Current Dataframe shape : {} x {} ; Memory usage: {:.2f} MB".format(len(sumstats),len(sumstats.columns),memory_in_mb), verbose=verbose)
|
|
95
|
+
except:
|
|
96
|
+
log.write(" -WARNING! Error: cannot get Dataframe shape...", verbose=verbose)
|
|
97
|
+
|
|
98
|
+
def check_dataframe_memory_usage(sumstats, log, verbose):
|
|
99
|
+
memory_in_mb = sumstats.memory_usage().sum()/1024/1024
|
|
100
|
+
try:
|
|
101
|
+
log.write(" -Current Dataframe memory usage: {:.2f} MB".format(memory_in_mb), verbose=verbose)
|
|
102
|
+
except:
|
|
103
|
+
log.write(" -WARNING! Error: cannot get Memory usage...", verbose=verbose)
|