gwaslab 3.4.37__py3-none-any.whl → 3.4.39__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of gwaslab might be problematic. Click here for more details.
- gwaslab/bd_common_data.py +6 -3
- gwaslab/bd_download.py +9 -9
- gwaslab/bd_get_hapmap3.py +43 -9
- gwaslab/data/formatbook.json +722 -721
- gwaslab/g_Log.py +22 -5
- gwaslab/g_Sumstats.py +110 -163
- gwaslab/g_SumstatsPair.py +76 -25
- gwaslab/g_SumstatsT.py +2 -2
- gwaslab/g_Sumstats_summary.py +3 -3
- gwaslab/g_version.py +10 -10
- gwaslab/hm_casting.py +36 -17
- gwaslab/hm_harmonize_sumstats.py +354 -221
- gwaslab/hm_rsid_to_chrpos.py +1 -1
- gwaslab/io_preformat_input.py +49 -43
- gwaslab/io_read_ldsc.py +49 -1
- gwaslab/io_to_formats.py +428 -295
- gwaslab/ldsc_irwls.py +198 -0
- gwaslab/ldsc_jackknife.py +514 -0
- gwaslab/ldsc_ldscore.py +417 -0
- gwaslab/ldsc_parse.py +294 -0
- gwaslab/ldsc_regressions.py +747 -0
- gwaslab/ldsc_sumstats.py +629 -0
- gwaslab/qc_check_datatype.py +3 -3
- gwaslab/qc_fix_sumstats.py +891 -778
- gwaslab/util_ex_calculate_ldmatrix.py +31 -13
- gwaslab/util_ex_gwascatalog.py +25 -25
- gwaslab/util_ex_ldproxyfinder.py +10 -10
- gwaslab/util_ex_ldsc.py +189 -0
- gwaslab/util_ex_process_ref.py +3 -3
- gwaslab/util_ex_run_coloc.py +26 -4
- gwaslab/util_in_calculate_gc.py +6 -6
- gwaslab/util_in_calculate_power.py +42 -43
- gwaslab/util_in_convert_h2.py +8 -8
- gwaslab/util_in_fill_data.py +30 -30
- gwaslab/util_in_filter_value.py +201 -74
- gwaslab/util_in_get_density.py +10 -10
- gwaslab/util_in_get_sig.py +445 -71
- gwaslab/viz_aux_annotate_plot.py +12 -12
- gwaslab/viz_aux_quickfix.py +42 -37
- gwaslab/viz_aux_reposition_text.py +10 -7
- gwaslab/viz_aux_save_figure.py +18 -8
- gwaslab/viz_plot_compare_af.py +32 -33
- gwaslab/viz_plot_compare_effect.py +63 -71
- gwaslab/viz_plot_miamiplot2.py +34 -26
- gwaslab/viz_plot_mqqplot.py +126 -75
- gwaslab/viz_plot_qqplot.py +11 -8
- gwaslab/viz_plot_regionalplot.py +36 -33
- gwaslab/viz_plot_rg_heatmap.py +28 -26
- gwaslab/viz_plot_stackedregional.py +40 -21
- gwaslab/viz_plot_trumpetplot.py +65 -61
- gwaslab-3.4.39.dist-info/LICENSE +674 -0
- {gwaslab-3.4.37.dist-info → gwaslab-3.4.39.dist-info}/METADATA +5 -4
- gwaslab-3.4.39.dist-info/RECORD +80 -0
- gwaslab-3.4.37.dist-info/RECORD +0 -72
- /gwaslab-3.4.37.dist-info/LICENSE → /gwaslab-3.4.39.dist-info/LICENSE_before_v3.4.39 +0 -0
- {gwaslab-3.4.37.dist-info → gwaslab-3.4.39.dist-info}/WHEEL +0 -0
- {gwaslab-3.4.37.dist-info → gwaslab-3.4.39.dist-info}/top_level.txt +0 -0
gwaslab/g_Log.py
CHANGED
|
@@ -1,17 +1,34 @@
|
|
|
1
1
|
import time
|
|
2
2
|
class Log():
|
|
3
3
|
def __init__(self):
|
|
4
|
-
self.log_text=str(time.
|
|
4
|
+
self.log_text=str(time.strftime('%Y/%m/%d %H:%M:%S'))+ " " + "Sumstats Object created."+ "\n"
|
|
5
|
+
|
|
5
6
|
def write(self,*message,end="\n",show_time=True, verbose=True):
|
|
6
7
|
if show_time is True:
|
|
7
|
-
if verbose: print(str(time.
|
|
8
|
-
self.log_text = self.log_text + str(time.
|
|
8
|
+
if verbose: print(str(time.strftime('%Y/%m/%d %H:%M:%S')),*message,end=end)
|
|
9
|
+
self.log_text = self.log_text + str(time.strftime('%Y/%m/%d %H:%M:%S')) + " " + " ".join(map(str,message)) + end
|
|
9
10
|
else:
|
|
10
11
|
if verbose: print(*message,end=end)
|
|
11
12
|
self.log_text = self.log_text + " ".join(map(str,message)) + end
|
|
13
|
+
|
|
14
|
+
def warning(self,*message,end="\n",show_time=True, verbose=True):
|
|
15
|
+
self.write(" #WARNING! {}".format(" ".join(map(str,message))),
|
|
16
|
+
end=end,
|
|
17
|
+
show_time=show_time,
|
|
18
|
+
verbose=verbose)
|
|
19
|
+
|
|
12
20
|
def show(self):
|
|
13
21
|
print(self.log_text)
|
|
14
22
|
def save(self,path,verbose=True):
|
|
15
23
|
with open(path,"w") as f:
|
|
16
|
-
if verbose: print(str(time.
|
|
17
|
-
f.write(self.log_text)
|
|
24
|
+
if verbose: print(str(time.strftime('%Y/%m/%d %H:%M:%S')) + " " + " -Save log file to : ", path)
|
|
25
|
+
f.write(self.log_text)
|
|
26
|
+
|
|
27
|
+
|
|
28
|
+
def log(self,*message,end="\n",show_time=True, verbose=True):
|
|
29
|
+
if show_time is True:
|
|
30
|
+
if verbose: print(str(time.strftime('%Y/%m/%d %H:%M:%S')),*message,end=end)
|
|
31
|
+
self.log_text = self.log_text + str(time.strftime('%Y/%m/%d %H:%M:%S')) + " " + " ".join(map(str,message)) + end
|
|
32
|
+
else:
|
|
33
|
+
if verbose: print(*message,end=end)
|
|
34
|
+
self.log_text = self.log_text + " ".join(map(str,message)) + end
|
gwaslab/g_Sumstats.py
CHANGED
|
@@ -5,7 +5,7 @@ import copy
|
|
|
5
5
|
from gwaslab.g_Sumstats_summary import summarize
|
|
6
6
|
from gwaslab.g_Sumstats_summary import lookupstatus
|
|
7
7
|
from gwaslab.io_preformat_input import preformat
|
|
8
|
-
from gwaslab.io_to_formats import
|
|
8
|
+
from gwaslab.io_to_formats import _to_format
|
|
9
9
|
from gwaslab.g_Log import Log
|
|
10
10
|
from gwaslab.qc_fix_sumstats import fixID
|
|
11
11
|
from gwaslab.qc_fix_sumstats import removedup
|
|
@@ -32,9 +32,14 @@ from gwaslab.util_in_filter_value import filterout
|
|
|
32
32
|
from gwaslab.util_in_filter_value import filterin
|
|
33
33
|
from gwaslab.util_in_filter_value import filterregionin
|
|
34
34
|
from gwaslab.util_in_filter_value import filterregionout
|
|
35
|
+
from gwaslab.util_in_filter_value import _filter_indel
|
|
36
|
+
from gwaslab.util_in_filter_value import _filter_palindromic
|
|
37
|
+
from gwaslab.util_in_filter_value import _filter_snp
|
|
35
38
|
from gwaslab.util_in_filter_value import inferbuild
|
|
36
39
|
from gwaslab.util_in_filter_value import sampling
|
|
37
40
|
from gwaslab.util_in_filter_value import _get_flanking
|
|
41
|
+
from gwaslab.util_in_filter_value import _get_flanking_by_chrpos
|
|
42
|
+
from gwaslab.util_in_filter_value import _get_flanking_by_id
|
|
38
43
|
from gwaslab.util_in_calculate_gc import lambdaGC
|
|
39
44
|
from gwaslab.util_in_convert_h2 import _get_per_snp_r2
|
|
40
45
|
from gwaslab.util_in_get_sig import getsig
|
|
@@ -42,6 +47,8 @@ from gwaslab.util_in_get_density import getsignaldensity
|
|
|
42
47
|
from gwaslab.util_in_get_density import assigndensity
|
|
43
48
|
from gwaslab.util_in_get_sig import annogene
|
|
44
49
|
from gwaslab.util_in_get_sig import getnovel
|
|
50
|
+
from gwaslab.util_in_get_sig import _check_cis
|
|
51
|
+
from gwaslab.util_in_get_sig import _check_novel_set
|
|
45
52
|
from gwaslab.util_in_fill_data import filldata
|
|
46
53
|
from gwaslab.bd_get_hapmap3 import gethapmap3
|
|
47
54
|
from gwaslab.bd_common_data import get_chr_list
|
|
@@ -62,6 +69,9 @@ from gwaslab.viz_plot_trumpetplot import plottrumpet
|
|
|
62
69
|
from gwaslab.viz_plot_compare_af import plotdaf
|
|
63
70
|
from gwaslab.util_ex_run_susie import _run_susie_rss
|
|
64
71
|
from gwaslab.qc_fix_sumstats import _check_data_consistency
|
|
72
|
+
from gwaslab.util_ex_ldsc import _estimate_h2_by_ldsc
|
|
73
|
+
from gwaslab.util_ex_ldsc import _estimate_rg_by_ldsc
|
|
74
|
+
from gwaslab.bd_get_hapmap3 import gethapmap3
|
|
65
75
|
import gc
|
|
66
76
|
|
|
67
77
|
#20220309
|
|
@@ -119,7 +129,8 @@ class Sumstats():
|
|
|
119
129
|
# basic attributes
|
|
120
130
|
self.data = pd.DataFrame()
|
|
121
131
|
self.log = Log()
|
|
122
|
-
|
|
132
|
+
self.ldsc_h2 = None
|
|
133
|
+
self.ldsc_rg = None
|
|
123
134
|
# meta information
|
|
124
135
|
self.meta = _init_meta()
|
|
125
136
|
self.build = build
|
|
@@ -133,7 +144,7 @@ class Sumstats():
|
|
|
133
144
|
self.pipcs = pd.DataFrame()
|
|
134
145
|
|
|
135
146
|
# print gwaslab version information
|
|
136
|
-
|
|
147
|
+
_show_version(self.log, verbose=verbose)
|
|
137
148
|
|
|
138
149
|
#preformat the data
|
|
139
150
|
self.data = preformat(
|
|
@@ -403,19 +414,16 @@ class Sumstats():
|
|
|
403
414
|
_check_data_consistency(self.data,log=self.log,**args)
|
|
404
415
|
def check_id(self,**args):
|
|
405
416
|
pass
|
|
406
|
-
|
|
407
417
|
def check_ref(self,ref_seq,**args):
|
|
408
418
|
self.meta["gwaslab"]["references"]["ref_seq"] = ref_seq
|
|
409
419
|
self.data = checkref(self.data,ref_seq,log=self.log,**args)
|
|
410
420
|
def infer_strand(self,ref_infer,**args):
|
|
411
421
|
self.meta["gwaslab"]["references"]["ref_infer"] = _append_meta_record(self.meta["gwaslab"]["references"]["ref_infer"] , ref_infer)
|
|
412
422
|
self.data = parallelinferstrand(self.data,ref_infer=ref_infer,log=self.log,**args)
|
|
413
|
-
|
|
414
423
|
def flip_allele_stats(self,**args):
|
|
415
424
|
self.data = flipallelestats(self.data,log=self.log,**args)
|
|
416
425
|
def normalize_allele(self,**args):
|
|
417
426
|
self.data = parallelnormalizeallele(self.data,log=self.log,**args)
|
|
418
|
-
|
|
419
427
|
def assign_rsid(self,
|
|
420
428
|
ref_rsid_tsv=None,
|
|
421
429
|
ref_rsid_vcf=None,
|
|
@@ -426,14 +434,11 @@ class Sumstats():
|
|
|
426
434
|
if ref_rsid_vcf is not None:
|
|
427
435
|
self.data = parallelizeassignrsid(self.data,path=ref_rsid_vcf,ref_mode="vcf",log=self.log,**args)
|
|
428
436
|
self.meta["gwaslab"]["references"]["ref_rsid_vcf"] = _append_meta_record(self.meta["gwaslab"]["references"]["ref_rsid_vcf"] , ref_rsid_vcf)
|
|
429
|
-
|
|
430
437
|
def rsid_to_chrpos(self,**args):
|
|
431
438
|
self.data = rsidtochrpos(self.data,log=self.log,**args)
|
|
432
|
-
|
|
433
439
|
def rsid_to_chrpos2(self,**args):
|
|
434
440
|
self.data = parallelrsidtochrpos(self.data,log=self.log,**args)
|
|
435
441
|
|
|
436
|
-
|
|
437
442
|
############################################################################################################
|
|
438
443
|
|
|
439
444
|
def sort_coordinate(self,**sort_args):
|
|
@@ -449,14 +454,27 @@ class Sumstats():
|
|
|
449
454
|
|
|
450
455
|
# utilities ############################################################################################################
|
|
451
456
|
# filter series ######################################################################
|
|
452
|
-
def
|
|
457
|
+
def filter_flanking(self, inplace=False,**args):
|
|
453
458
|
if inplace is False:
|
|
454
459
|
new_Sumstats_object = copy.deepcopy(self)
|
|
455
460
|
new_Sumstats_object.data = _get_flanking(new_Sumstats_object.data, **args)
|
|
456
461
|
return new_Sumstats_object
|
|
457
462
|
else:
|
|
458
463
|
self.data = _get_flanking(self.data, **args)
|
|
459
|
-
|
|
464
|
+
def filter_flanking_by_chrpos(self, chrpos, inplace=False,**args):
|
|
465
|
+
if inplace is False:
|
|
466
|
+
new_Sumstats_object = copy.deepcopy(self)
|
|
467
|
+
new_Sumstats_object.data = _get_flanking_by_chrpos(new_Sumstats_object.data, chrpos, **args)
|
|
468
|
+
return new_Sumstats_object
|
|
469
|
+
else:
|
|
470
|
+
self.data = _get_flanking_by_chrpos(self.data, chrpos,**args)
|
|
471
|
+
def filter_flanking_by_id(self, snpid, inplace=False,**args):
|
|
472
|
+
if inplace is False:
|
|
473
|
+
new_Sumstats_object = copy.deepcopy(self)
|
|
474
|
+
new_Sumstats_object.data = _get_flanking_by_id(new_Sumstats_object.data, snpid, **args)
|
|
475
|
+
return new_Sumstats_object
|
|
476
|
+
else:
|
|
477
|
+
self.data = _get_flanking_by_id(self.data, snpid, **args)
|
|
460
478
|
def filter_value(self, expr, inplace=False, **args):
|
|
461
479
|
if inplace is False:
|
|
462
480
|
new_Sumstats_object = copy.deepcopy(self)
|
|
@@ -464,7 +482,6 @@ class Sumstats():
|
|
|
464
482
|
return new_Sumstats_object
|
|
465
483
|
else:
|
|
466
484
|
self.data = filtervalues(self.data, expr,log=self.log,**args)
|
|
467
|
-
|
|
468
485
|
def filter_out(self, inplace=False, **args):
|
|
469
486
|
if inplace is False:
|
|
470
487
|
new_Sumstats_object = copy.deepcopy(self)
|
|
@@ -472,7 +489,6 @@ class Sumstats():
|
|
|
472
489
|
return new_Sumstats_object
|
|
473
490
|
else:
|
|
474
491
|
self.data = filterout(self.data,log=self.log,**args)
|
|
475
|
-
|
|
476
492
|
def filter_in(self, inplace=False, **args):
|
|
477
493
|
if inplace is False:
|
|
478
494
|
new_Sumstats_object = copy.deepcopy(self)
|
|
@@ -494,7 +510,28 @@ class Sumstats():
|
|
|
494
510
|
return new_Sumstats_object
|
|
495
511
|
else:
|
|
496
512
|
self.data = filterregionout(self.data,log=self.log,**args)
|
|
497
|
-
|
|
513
|
+
def filter_palindromic(self, inplace=False, **args):
|
|
514
|
+
if inplace is False:
|
|
515
|
+
new_Sumstats_object = copy.deepcopy(self)
|
|
516
|
+
new_Sumstats_object.data = _filter_palindromic(new_Sumstats_object.data,log=new_Sumstats_object.log,**args)
|
|
517
|
+
return new_Sumstats_object
|
|
518
|
+
else:
|
|
519
|
+
self.data = _filter_palindromic(self.data,log=self.log,**args)
|
|
520
|
+
def filter_snp(self, inplace=False, **args):
|
|
521
|
+
if inplace is False:
|
|
522
|
+
new_Sumstats_object = copy.deepcopy(self)
|
|
523
|
+
new_Sumstats_object.data = _filter_snp(new_Sumstats_object.data,log=new_Sumstats_object.log,**args)
|
|
524
|
+
return new_Sumstats_object
|
|
525
|
+
else:
|
|
526
|
+
self.data = _filter_snp(self.data,log=self.log,**args)
|
|
527
|
+
def filter_indel(self, inplace=False, **args):
|
|
528
|
+
if inplace is False:
|
|
529
|
+
new_Sumstats_object = copy.deepcopy(self)
|
|
530
|
+
new_Sumstats_object.data = _filter_indel(new_Sumstats_object.data,log=new_Sumstats_object.log,**args)
|
|
531
|
+
return new_Sumstats_object
|
|
532
|
+
else:
|
|
533
|
+
self.data = _filter_indel(self.data,log=self.log,**args)
|
|
534
|
+
|
|
498
535
|
def random_variants(self,inplace=False,n=1,p=None,**args):
|
|
499
536
|
if inplace is True:
|
|
500
537
|
self.data = sampling(self.data,n=n,p=p,log=self.log,**args)
|
|
@@ -502,18 +539,25 @@ class Sumstats():
|
|
|
502
539
|
new_Sumstats_object = copy.deepcopy(self)
|
|
503
540
|
new_Sumstats_object.data = sampling(new_Sumstats_object.data,n=n,p=p,log=new_Sumstats_object.log,**args)
|
|
504
541
|
return new_Sumstats_object
|
|
505
|
-
|
|
542
|
+
|
|
543
|
+
def filter_hapmap3(self, inplace=False, build=None, **args ):
|
|
544
|
+
if build is None:
|
|
545
|
+
build = self.meta["gwaslab"]["genome_build"]
|
|
546
|
+
if inplace is True:
|
|
547
|
+
self.data = gethapmap3(self.data, build=build,log=self.log, **args)
|
|
548
|
+
else:
|
|
549
|
+
new_Sumstats_object = copy.deepcopy(self)
|
|
550
|
+
new_Sumstats_object.data = gethapmap3(new_Sumstats_object.data, build=build,log=self.log, **args)
|
|
551
|
+
return new_Sumstats_object
|
|
506
552
|
######################################################################
|
|
507
553
|
|
|
508
554
|
def check_af(self,ref_infer,**args):
|
|
509
555
|
self.data = parallelecheckaf(self.data,ref_infer=ref_infer,log=self.log,**args)
|
|
510
556
|
self.meta["gwaslab"]["references"]["ref_infer_daf"] = _append_meta_record(self.meta["gwaslab"]["references"]["ref_infer_daf"] , ref_infer)
|
|
511
|
-
|
|
512
557
|
def infer_af(self,ref_infer,**args):
|
|
513
558
|
self.data = paralleleinferaf(self.data,ref_infer=ref_infer,log=self.log,**args)
|
|
514
559
|
self.meta["gwaslab"]["references"]["ref_infer_af"] = ref_infer
|
|
515
560
|
self.meta["gwaslab"]["references"]["ref_infer_af"] = _append_meta_record(self.meta["gwaslab"]["references"]["ref_infer_af"] , ref_infer)
|
|
516
|
-
|
|
517
561
|
def plot_daf(self, **args):
|
|
518
562
|
fig,outliers = plotdaf(self.data, **args)
|
|
519
563
|
return fig, outliers
|
|
@@ -548,8 +592,10 @@ class Sumstats():
|
|
|
548
592
|
|
|
549
593
|
return plot
|
|
550
594
|
|
|
551
|
-
def plot_trumpet(self, **args):
|
|
552
|
-
|
|
595
|
+
def plot_trumpet(self, build=None, **args):
|
|
596
|
+
if build is None:
|
|
597
|
+
build = self.meta["gwaslab"]["genome_build"]
|
|
598
|
+
fig = plottrumpet(self.data,build = build, **args)
|
|
553
599
|
return fig
|
|
554
600
|
|
|
555
601
|
def get_lead(self, build=None, gls=False, **args):
|
|
@@ -617,7 +663,37 @@ class Sumstats():
|
|
|
617
663
|
**args)
|
|
618
664
|
# return sumstats object
|
|
619
665
|
return output
|
|
620
|
-
|
|
666
|
+
|
|
667
|
+
def check_cis(self, **args):
|
|
668
|
+
if "SNPID" in self.data.columns:
|
|
669
|
+
id_to_use = "SNPID"
|
|
670
|
+
else:
|
|
671
|
+
id_to_use = "rsID"
|
|
672
|
+
output = _check_cis(self.data,
|
|
673
|
+
id=id_to_use,
|
|
674
|
+
chrom="CHR",
|
|
675
|
+
pos="POS",
|
|
676
|
+
p="P",
|
|
677
|
+
log=self.log,
|
|
678
|
+
**args)
|
|
679
|
+
# return sumstats object
|
|
680
|
+
return output
|
|
681
|
+
|
|
682
|
+
def check_novel_set(self, **args):
|
|
683
|
+
if "SNPID" in self.data.columns:
|
|
684
|
+
id_to_use = "SNPID"
|
|
685
|
+
else:
|
|
686
|
+
id_to_use = "rsID"
|
|
687
|
+
output = _check_novel_set(self.data,
|
|
688
|
+
id=id_to_use,
|
|
689
|
+
chrom="CHR",
|
|
690
|
+
pos="POS",
|
|
691
|
+
p="P",
|
|
692
|
+
log=self.log,
|
|
693
|
+
**args)
|
|
694
|
+
# return sumstats object
|
|
695
|
+
return output
|
|
696
|
+
|
|
621
697
|
def anno_gene(self, **args):
|
|
622
698
|
if "SNPID" in self.data.columns:
|
|
623
699
|
id_to_use = "SNPID"
|
|
@@ -653,6 +729,18 @@ class Sumstats():
|
|
|
653
729
|
output = lambdaGC(self.data[["CHR",mode]],mode=mode,**args)
|
|
654
730
|
self.meta["Genomic inflation factor"] = output
|
|
655
731
|
return output
|
|
732
|
+
|
|
733
|
+
def estimate_h2_by_ldsc(self, build=None, verbose=True, match_allele=True, **args):
|
|
734
|
+
if build is None:
|
|
735
|
+
build = self.meta["gwaslab"]["genome_build"]
|
|
736
|
+
insumstats = gethapmap3(self.data.copy(), build=build, verbose=verbose , match_allele=True )
|
|
737
|
+
self.ldsc_h2 = _estimate_h2_by_ldsc(insumstats=insumstats, log=self.log, verbose=verbose, **args)
|
|
738
|
+
|
|
739
|
+
def estimate_rg_by_ldsc(self, build=None, verbose=True, match_allele=True, **args):
|
|
740
|
+
if build is None:
|
|
741
|
+
build = self.meta["gwaslab"]["genome_build"]
|
|
742
|
+
insumstats = gethapmap3(self.data.copy(), build=build, verbose=verbose , match_allele=True )
|
|
743
|
+
self.ldsc_rg = _estimate_rg_by_ldsc(insumstats=insumstats, log=self.log, verbose=verbose, **args)
|
|
656
744
|
# external ################################################################################################
|
|
657
745
|
|
|
658
746
|
def to_finemapping(self,**args):
|
|
@@ -670,148 +758,7 @@ class Sumstats():
|
|
|
670
758
|
|
|
671
759
|
# to_format ###############################################################################################
|
|
672
760
|
|
|
673
|
-
def to_format(self,
|
|
674
|
-
path="./sumstats",
|
|
675
|
-
fmt="gwaslab",
|
|
676
|
-
extract=None,
|
|
677
|
-
exclude=None,
|
|
678
|
-
cols=None,
|
|
679
|
-
id_use="rsID",
|
|
680
|
-
hapmap3=False,
|
|
681
|
-
exclude_hla=False,
|
|
682
|
-
hla_range=(25,34),
|
|
683
|
-
build=None,
|
|
684
|
-
n=None,
|
|
685
|
-
verbose=True,
|
|
686
|
-
no_status=False,
|
|
687
|
-
output_log=True,
|
|
688
|
-
to_csvargs=None,
|
|
689
|
-
float_formats=None,
|
|
690
|
-
xymt_number=False,
|
|
691
|
-
xymt=None,
|
|
692
|
-
chr_prefix="",
|
|
693
|
-
ssfmeta=False,
|
|
694
|
-
md5sum=False,
|
|
695
|
-
bgzip=False,
|
|
696
|
-
tabix=False,
|
|
697
|
-
tabix_indexargs={}):
|
|
761
|
+
def to_format(self, path, build=None, **args):
|
|
698
762
|
if build is None:
|
|
699
763
|
build = self.meta["gwaslab"]["genome_build"]
|
|
700
|
-
|
|
701
|
-
if to_csvargs is None:
|
|
702
|
-
to_csvargs = {}
|
|
703
|
-
if float_formats is None:
|
|
704
|
-
float_formats={}
|
|
705
|
-
if cols is None:
|
|
706
|
-
cols=[]
|
|
707
|
-
if xymt is None:
|
|
708
|
-
xymt = ["X","Y","MT"]
|
|
709
|
-
|
|
710
|
-
formatlist= get_formats_list() + ["vep","bed","annovar","vcf"]
|
|
711
|
-
if fmt in formatlist:
|
|
712
|
-
if verbose: onetime_log.write("Start to format the output sumstats in: ",fmt, " format")
|
|
713
|
-
else:
|
|
714
|
-
raise ValueError("Please select a format to output")
|
|
715
|
-
|
|
716
|
-
|
|
717
|
-
#######################################################################################################
|
|
718
|
-
# filter
|
|
719
|
-
output = self.data.copy()
|
|
720
|
-
if extract is not None:
|
|
721
|
-
output = output.loc[output[id_use].isin(extract),:]
|
|
722
|
-
|
|
723
|
-
if exclude is not None:
|
|
724
|
-
output = output.loc[~output[id_use].isin(exclude),:]
|
|
725
|
-
|
|
726
|
-
#hla and hapmap3 #######################################################################################
|
|
727
|
-
suffix=fmt
|
|
728
|
-
|
|
729
|
-
#exclude hla
|
|
730
|
-
if exclude_hla is True:
|
|
731
|
-
if verbose: onetime_log.write(" -Excluding variants in MHC (HLA) region ...")
|
|
732
|
-
before = len(output)
|
|
733
|
-
is_hla = (output["CHR"].astype("string") == "6") & (output["POS"].astype("Int64") > hla_range[0]*1000000) & (output["POS"].astype("Int64") < hla_range[1]*1000000)
|
|
734
|
-
output = output.loc[~is_hla,:]
|
|
735
|
-
after = len(output)
|
|
736
|
-
if verbose: onetime_log.write(" -Exclude "+ str(before - after) + " variants in MHC (HLA) region : {}Mb - {}Mb.".format(hla_range[0], hla_range[1]))
|
|
737
|
-
suffix = "noMHC."+suffix
|
|
738
|
-
|
|
739
|
-
#extract hapmap3 SNPs
|
|
740
|
-
if hapmap3 is True:
|
|
741
|
-
output = gethapmap3(output,build=build,verbose=True)
|
|
742
|
-
after = len(output)
|
|
743
|
-
if verbose: onetime_log.write(" -Extract "+ str(after) + " variants in Hapmap3 datasets for build "+build+".")
|
|
744
|
-
suffix = "hapmap3."+suffix
|
|
745
|
-
|
|
746
|
-
# add a n column
|
|
747
|
-
if n is not None:
|
|
748
|
-
output["N"] = n
|
|
749
|
-
|
|
750
|
-
#######################################################################################################
|
|
751
|
-
#formatting float statistics
|
|
752
|
-
if verbose: onetime_log.write(" -Formatting statistics ...")
|
|
753
|
-
|
|
754
|
-
formats = {'EAF': '{:.4g}',
|
|
755
|
-
'BETA': '{:.4f}',
|
|
756
|
-
'Z': '{:.4f}',
|
|
757
|
-
'CHISQ': '{:.4f}',
|
|
758
|
-
'SE': '{:.4f}',
|
|
759
|
-
'OR': '{:.4f}',
|
|
760
|
-
'OR_95U': '{:.4f}',
|
|
761
|
-
'OR_95L': '{:.4f}',
|
|
762
|
-
'INFO': '{:.4f}',
|
|
763
|
-
'P': '{:.4e}',
|
|
764
|
-
'MLOG10P': '{:.4f}',
|
|
765
|
-
'DAF': '{:.4f}'
|
|
766
|
-
}
|
|
767
|
-
|
|
768
|
-
for col, f in float_formats.items():
|
|
769
|
-
if col in output.columns:
|
|
770
|
-
formats[col]=f
|
|
771
|
-
for col, f in formats.items():
|
|
772
|
-
if col in output.columns:
|
|
773
|
-
if output[col].dtype in ["float64","float32","float16","float"]:
|
|
774
|
-
output[col] = output[col].map(f.format)
|
|
775
|
-
if verbose:
|
|
776
|
-
onetime_log.write(" - Float statistics formats:")
|
|
777
|
-
keys=[]
|
|
778
|
-
values=[]
|
|
779
|
-
for key,value in formats.items():
|
|
780
|
-
if key in output.columns:
|
|
781
|
-
keys.append(key)
|
|
782
|
-
values.append(value)
|
|
783
|
-
onetime_log.write(" - Columns:",keys)
|
|
784
|
-
onetime_log.write(" - Output formats:",values)
|
|
785
|
-
|
|
786
|
-
##########################################################################################################
|
|
787
|
-
# output, mapping column names
|
|
788
|
-
|
|
789
|
-
if fmt in get_formats_list() + ["vep","bed","annovar","vcf"]:
|
|
790
|
-
tofmt(output,
|
|
791
|
-
path=path,
|
|
792
|
-
fmt=fmt,
|
|
793
|
-
cols=cols,
|
|
794
|
-
suffix=suffix,
|
|
795
|
-
build=build,
|
|
796
|
-
verbose=True,
|
|
797
|
-
no_status=no_status,
|
|
798
|
-
log=onetime_log,
|
|
799
|
-
to_csvargs=to_csvargs,
|
|
800
|
-
chr_prefix=chr_prefix,
|
|
801
|
-
meta = self.meta,
|
|
802
|
-
ssfmeta=ssfmeta,
|
|
803
|
-
bgzip=bgzip,
|
|
804
|
-
tabix=tabix,
|
|
805
|
-
tabix_indexargs=tabix_indexargs,
|
|
806
|
-
md5sum=md5sum,
|
|
807
|
-
xymt_number=xymt_number,
|
|
808
|
-
xymt=xymt)
|
|
809
|
-
if output_log is True:
|
|
810
|
-
log_path = path + "."+ suffix + ".log"
|
|
811
|
-
if verbose: onetime_log.write(" -Saving log file to: {}".format(log_path))
|
|
812
|
-
if verbose: onetime_log.write("Finished outputting successfully!")
|
|
813
|
-
try:
|
|
814
|
-
onetime_log.save(log_path, verbose=False)
|
|
815
|
-
except:
|
|
816
|
-
pass
|
|
817
|
-
|
|
764
|
+
_to_format(self.data, path, log=self.log, meta=self.meta, build=build, **args)
|
gwaslab/g_SumstatsPair.py
CHANGED
|
@@ -6,33 +6,40 @@ from gwaslab.util_in_filter_value import filtervalues
|
|
|
6
6
|
from gwaslab.g_Log import Log
|
|
7
7
|
from math import floor
|
|
8
8
|
from gwaslab.g_Sumstats import Sumstats
|
|
9
|
-
from gwaslab.hm_casting import
|
|
9
|
+
from gwaslab.hm_casting import _merge_mold_with_sumstats_by_chrpos
|
|
10
10
|
from gwaslab.hm_casting import _align_with_mold
|
|
11
11
|
from gwaslab.hm_casting import _fill_missing_columns
|
|
12
12
|
from gwaslab.hm_casting import _check_daf
|
|
13
13
|
from gwaslab.hm_casting import _assign_warning_code
|
|
14
14
|
from gwaslab.qc_fix_sumstats import flipallelestats
|
|
15
|
+
from gwaslab.qc_check_datatype import check_datatype
|
|
16
|
+
from gwaslab.qc_check_datatype import check_dataframe_shape
|
|
15
17
|
from gwaslab.hm_casting import _renaming_cols
|
|
16
18
|
from gwaslab.hm_casting import _sort_pair_cols
|
|
17
19
|
from gwaslab.util_ex_calculate_ldmatrix import tofinemapping
|
|
18
20
|
from gwaslab.util_ex_run_coloc import _run_coloc_susie
|
|
19
21
|
from gwaslab.viz_plot_miamiplot2 import plot_miami2
|
|
22
|
+
from gwaslab.viz_plot_compare_af import plotdaf
|
|
20
23
|
from gwaslab.util_ex_run_2samplemr import _run_two_sample_mr
|
|
21
24
|
from gwaslab.util_ex_run_clumping import _clump
|
|
22
25
|
from gwaslab.util_ex_ldproxyfinder import _extract_with_ld_proxy
|
|
23
26
|
|
|
24
27
|
class SumstatsPair( ):
|
|
25
|
-
def __init__(self, sumstatsObject1, sumstatsObject2, study=None, suffixes = ("_1","_2") ):
|
|
28
|
+
def __init__(self, sumstatsObject1, sumstatsObject2, study=None, suffixes = ("_1","_2") ,verbose=True ):
|
|
26
29
|
|
|
27
30
|
if not isinstance(sumstatsObject1, Sumstats):
|
|
28
31
|
raise ValueError("Please provide GWASLab Sumstats Object #1.")
|
|
29
32
|
if not isinstance(sumstatsObject2, Sumstats):
|
|
30
33
|
raise ValueError("Please provide GWASLab Sumstats Object #2.")
|
|
31
|
-
|
|
32
|
-
|
|
34
|
+
if sumstatsObject1.meta["gwaslab"]["study_name"]!=sumstatsObject2.meta["gwaslab"]["study_name"]:
|
|
35
|
+
self.study_name = "{}_{}".format(sumstatsObject1.meta["gwaslab"]["study_name"], sumstatsObject2.meta["gwaslab"]["study_name"])
|
|
36
|
+
else:
|
|
37
|
+
self.study_name = "{}_{}".format("STUDY1", "STUDY2")
|
|
33
38
|
self.snp_info_cols = []
|
|
34
39
|
self.stats_cols =[]
|
|
35
|
-
self.
|
|
40
|
+
self.stats_cols2 =[]
|
|
41
|
+
self.other_cols =[]
|
|
42
|
+
self.other_cols2 =[]
|
|
36
43
|
self.log = Log()
|
|
37
44
|
self.suffixes = suffixes
|
|
38
45
|
self.colocalization=pd.DataFrame()
|
|
@@ -41,26 +48,53 @@ class SumstatsPair( ):
|
|
|
41
48
|
self.mr = {}
|
|
42
49
|
self.clumps ={}
|
|
43
50
|
self.ns = None
|
|
51
|
+
self.to_finemapping_file_path = ""
|
|
52
|
+
self.plink_log = ""
|
|
53
|
+
|
|
54
|
+
self.log.write( "Start to create SumstatsPair object..." )
|
|
55
|
+
|
|
56
|
+
self.log.write( " -Checking sumstats 1..." , verbose=verbose)
|
|
57
|
+
check_datatype(sumstatsObject1.data, log=self.log, verbose=verbose)
|
|
58
|
+
check_dataframe_shape(sumstats=sumstatsObject1.data,
|
|
59
|
+
log=self.log,
|
|
60
|
+
verbose=verbose)
|
|
61
|
+
|
|
62
|
+
self.log.write( " -Checking sumstats 2..." , verbose=verbose)
|
|
63
|
+
check_datatype(sumstatsObject2.data, log=self.log, verbose=verbose)
|
|
64
|
+
check_dataframe_shape(sumstats=sumstatsObject2.data,
|
|
65
|
+
log=self.log,
|
|
66
|
+
verbose=verbose)
|
|
44
67
|
|
|
45
68
|
for i in sumstatsObject1.data.columns:
|
|
46
69
|
if i in ["SNPID","rsID","CHR","POS","EA","NEA","STATUS"]:
|
|
47
70
|
self.snp_info_cols.append(i)
|
|
48
|
-
elif i in ["BETA","SE","P","MLOG10P","N","Z","OR","
|
|
71
|
+
elif i in ["BETA","SE","P","MLOG10P","N","N_CASE","N_CONTROL","Z","T","F","OR","OR_95L","OR_95U","HR","HR_95L","HR_95U","MAF","EAF","BETA_95L","BETA_95U"]:
|
|
49
72
|
self.stats_cols.append(i)
|
|
50
73
|
else:
|
|
51
74
|
self.other_cols.append(i)
|
|
52
|
-
|
|
53
|
-
|
|
54
|
-
|
|
75
|
+
for i in sumstatsObject2.data.columns:
|
|
76
|
+
if i in ["SNPID","rsID","CHR","POS","EA","NEA","STATUS"]:
|
|
77
|
+
continue
|
|
78
|
+
elif i in ["BETA","SE","P","MLOG10P","N","N_CASE","N_CONTROL","Z","T","F","OR","OR_95L","OR_95U","HR","HR_95L","HR_95U","MAF","EAF","BETA_95L","BETA_95U"]:
|
|
79
|
+
self.stats_cols2.append(i)
|
|
80
|
+
else:
|
|
81
|
+
self.other_cols2.append(i)
|
|
82
|
+
|
|
83
|
+
self.log.write( " -Variant Info columns: {}".format(self.snp_info_cols) , verbose=verbose)
|
|
84
|
+
self.log.write( " -Variant statistics columns: {}".format(self.stats_cols) , verbose=verbose)
|
|
85
|
+
self.log.write( " -Sumstats1 other columns: {}".format(self.other_cols) , verbose=verbose)
|
|
86
|
+
self.log.write( " -Sumstats2 other columns: {}".format(self.other_cols2) , verbose=verbose)
|
|
87
|
+
|
|
88
|
+
# extract only info and stats cols
|
|
89
|
+
self.data = sumstatsObject1.data
|
|
90
|
+
|
|
91
|
+
#rename with _1
|
|
55
92
|
self.data = self.data.rename(columns={"EA":"EA_1","NEA":"NEA_1"})
|
|
56
|
-
|
|
57
93
|
self.data = self.data.rename(columns={i:i + suffixes[0] for i in self.stats_cols})
|
|
94
|
+
self.data = self.data.rename(columns={i:i + suffixes[0] for i in self.other_cols})
|
|
58
95
|
|
|
59
96
|
self.data, self.sumstats1 = self._merge_two_sumstats(sumstatsObject2, suffixes=suffixes)
|
|
60
97
|
|
|
61
|
-
self.to_finemapping_file_path = ""
|
|
62
|
-
self.plink_log = ""
|
|
63
|
-
|
|
64
98
|
if "N{}".format(self.suffixes[0]) in self.data.columns and "N{}".format(self.suffixes[1]) in self.data.columns:
|
|
65
99
|
n1 = int(floor(self.data["N{}".format(self.suffixes[0])].mean()))
|
|
66
100
|
n2 = int(floor(self.data["N{}".format(self.suffixes[1])].mean()))
|
|
@@ -70,8 +104,9 @@ class SumstatsPair( ):
|
|
|
70
104
|
|
|
71
105
|
def _merge_two_sumstats(self, sumstatsObject2, threshold=0.2, verbose=True,windowsizeb=10, ref_path=None,suffixes=("_1","_2")):
|
|
72
106
|
|
|
73
|
-
|
|
74
|
-
|
|
107
|
+
# sumstats1 with suffix _1, sumstats2 with no suffix
|
|
108
|
+
molded_sumstats, sumstats1 = _merge_mold_with_sumstats_by_chrpos(mold=self.data,
|
|
109
|
+
sumstats=sumstatsObject2.data,
|
|
75
110
|
log=self.log,
|
|
76
111
|
verbose=verbose,
|
|
77
112
|
suffixes=(suffixes[0],""),
|
|
@@ -79,16 +114,21 @@ class SumstatsPair( ):
|
|
|
79
114
|
|
|
80
115
|
molded_sumstats = _align_with_mold(molded_sumstats, log=self.log, verbose=verbose,suffixes=(suffixes[0],""))
|
|
81
116
|
|
|
117
|
+
# flip sumstats2 statistics
|
|
82
118
|
molded_sumstats = flipallelestats(molded_sumstats, log=self.log, verbose=verbose)
|
|
83
119
|
|
|
120
|
+
# drop sumstats2 EA NEA
|
|
84
121
|
molded_sumstats = molded_sumstats.drop(columns=["EA","NEA"])
|
|
122
|
+
|
|
123
|
+
# rename sumstats1 EA NEA
|
|
85
124
|
molded_sumstats = molded_sumstats.rename(columns={"EA_1":"EA","NEA_1":"NEA"})
|
|
86
125
|
|
|
87
|
-
if not
|
|
88
|
-
cols_to_fill = set(self.stats_cols).difference(set(
|
|
126
|
+
if not set(self.stats_cols2) == set(self.stats_cols):
|
|
127
|
+
cols_to_fill = set(self.stats_cols).difference(set(self.stats_cols2))
|
|
89
128
|
molded_sumstats = _fill_missing_columns(molded_sumstats, cols_to_fill, log=self.log, verbose=verbose)
|
|
90
129
|
|
|
91
|
-
|
|
130
|
+
# rename sumstast2 with _2
|
|
131
|
+
molded_sumstats = _renaming_cols(molded_sumstats, self.stats_cols + self.other_cols2, log=self.log, verbose=verbose, suffixes=suffixes)
|
|
92
132
|
|
|
93
133
|
molded_sumstats = _sort_pair_cols(molded_sumstats, verbose=verbose, log=self.log)
|
|
94
134
|
|
|
@@ -104,13 +144,7 @@ class SumstatsPair( ):
|
|
|
104
144
|
def run_coloc_susie(self,**args):
|
|
105
145
|
|
|
106
146
|
self.colocalization = _run_coloc_susie(self.to_finemapping_file_path,log=self.log,ncols=self.ns,**args)
|
|
107
|
-
|
|
108
|
-
def plot_miami(self,**args):
|
|
109
147
|
|
|
110
|
-
plot_miami2(merged_sumstats=self.data,
|
|
111
|
-
suffixes=self.suffixes,
|
|
112
|
-
**args)
|
|
113
|
-
|
|
114
148
|
def run_two_sample_mr(self, clump=False, **args):
|
|
115
149
|
exposure1 = self.study_name.split("_")[0]
|
|
116
150
|
outcome2 = self.study_name.split("_")[1]
|
|
@@ -126,4 +160,21 @@ class SumstatsPair( ):
|
|
|
126
160
|
return new_Sumstats_object
|
|
127
161
|
else:
|
|
128
162
|
self.data = filtervalues(self.data, expr,log=self.log,**args)
|
|
129
|
-
gc.collect()
|
|
163
|
+
gc.collect()
|
|
164
|
+
|
|
165
|
+
## Visualization #############################################################################################################################################
|
|
166
|
+
def plot_miami(self,**args):
|
|
167
|
+
|
|
168
|
+
plot_miami2(merged_sumstats=self.data,
|
|
169
|
+
suffixes=self.suffixes,
|
|
170
|
+
**args)
|
|
171
|
+
|
|
172
|
+
def compare_af(self, **args):
|
|
173
|
+
|
|
174
|
+
return plotdaf( self.data,
|
|
175
|
+
eaf="EAF_2",
|
|
176
|
+
raf="EAF_1",
|
|
177
|
+
xlabel="Effect Allele Frequency in Sumstats 1",
|
|
178
|
+
ylabel="Effect Allele Frequency in Sumstats 2",
|
|
179
|
+
**args)
|
|
180
|
+
|
gwaslab/g_SumstatsT.py
CHANGED
|
@@ -1,7 +1,7 @@
|
|
|
1
1
|
import pandas as pd
|
|
2
2
|
import numpy as np
|
|
3
3
|
from gwaslab.g_Sumstats import Sumstats
|
|
4
|
-
from gwaslab.hm_casting import
|
|
4
|
+
from gwaslab.hm_casting import _merge_mold_with_sumstats_by_chrpos
|
|
5
5
|
from gwaslab.hm_casting import _align_with_mold
|
|
6
6
|
from gwaslab.hm_casting import _fill_missing_columns
|
|
7
7
|
from gwaslab.hm_casting import _check_daf
|
|
@@ -34,7 +34,7 @@ class SumstatsT( ):
|
|
|
34
34
|
|
|
35
35
|
def cast(self, sumstatsObject, threshold=0.2, verbose=True,windowsizeb=10, ref_path=None):
|
|
36
36
|
|
|
37
|
-
molded_sumstats =
|
|
37
|
+
molded_sumstats = _merge_mold_with_sumstats_by_chrpos(self.snp_info, sumstatsObject.data, log=sumstatsObject.log, verbose=verbose, windowsizeb=windowsizeb,ref_path=ref_path)
|
|
38
38
|
|
|
39
39
|
molded_sumstats = _align_with_mold(molded_sumstats, log=sumstatsObject.log, verbose=verbose)
|
|
40
40
|
|