gwaslab-3.5.3-py3-none-any.whl → gwaslab-3.5.5-py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release: this version of gwaslab might be problematic.
- gwaslab/__init__.py +2 -1
- gwaslab/g_Sumstats.py +29 -8
- gwaslab/g_SumstatsPair.py +16 -12
- gwaslab/g_headers.py +131 -0
- gwaslab/g_meta.py +2 -1
- gwaslab/g_version.py +3 -3
- gwaslab/io_preformat_input.py +29 -7
- gwaslab/io_read_pipcs.py +23 -0
- gwaslab/io_to_formats.py +45 -44
- gwaslab/qc_check_datatype.py +65 -42
- gwaslab/util_ex_ldsc.py +9 -0
- gwaslab/util_ex_run_2samplemr.py +34 -0
- gwaslab/util_ex_run_clumping.py +4 -2
- gwaslab/util_in_fill_data.py +9 -1
- gwaslab/util_in_filter_value.py +15 -1
- gwaslab/viz_plot_credible_sets.py +99 -0
- gwaslab/viz_plot_miamiplot2.py +11 -4
- gwaslab/viz_plot_mqqplot.py +14 -11
- gwaslab/viz_plot_stackedregional.py +64 -33
- {gwaslab-3.5.3.dist-info → gwaslab-3.5.5.dist-info}/METADATA +3 -3
- {gwaslab-3.5.3.dist-info → gwaslab-3.5.5.dist-info}/RECORD +25 -23
- gwaslab/vis_plot_credible sets.py +0 -0
- {gwaslab-3.5.3.dist-info → gwaslab-3.5.5.dist-info}/LICENSE +0 -0
- {gwaslab-3.5.3.dist-info → gwaslab-3.5.5.dist-info}/LICENSE_before_v3.4.39 +0 -0
- {gwaslab-3.5.3.dist-info → gwaslab-3.5.5.dist-info}/WHEEL +0 -0
- {gwaslab-3.5.3.dist-info → gwaslab-3.5.5.dist-info}/top_level.txt +0 -0
gwaslab/__init__.py
CHANGED
@@ -46,4 +46,5 @@ from gwaslab.util_ex_process_h5 import process_vcf_to_hfd5
 from gwaslab.util_ex_run_susie import _run_susie_rss as run_susie_rss
 from gwaslab.io_read_tabular import _read_tabular as read_tabular
 from gwaslab.util_in_meta import meta_analyze
-from gwaslab.viz_plot_scatter_with_reg import scatter
+from gwaslab.viz_plot_scatter_with_reg import scatter
+from gwaslab.util_in_fill_data import rank_based_int
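The only functional addition in __init__.py is that rank_based_int from util_in_fill_data is now re-exported at the package top level. A minimal, hypothetical sketch of what that exposes; the call signature (a numeric pandas Series in, a transformed Series out) is an assumption, only the re-export itself is confirmed by the diff:

# hypothetical usage sketch: rank-based inverse-normal transformation of a numeric vector;
# the Series argument and the return value are assumptions, only gl.rank_based_int
# being importable is confirmed by the __init__.py change above
import pandas as pd
import gwaslab as gl

values = pd.Series([1.2, 3.4, 2.2, 5.1])
transformed = gl.rank_based_int(values)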
gwaslab/g_Sumstats.py
CHANGED
@@ -80,6 +80,8 @@ from gwaslab.util_ex_ldsc import _estimate_partitioned_h2_by_ldsc
 from gwaslab.bd_get_hapmap3 import gethapmap3
 from gwaslab.util_abf_finemapping import abf_finemapping
 from gwaslab.util_abf_finemapping import make_cs
+from gwaslab.io_read_pipcs import _read_pipcs
+from gwaslab.viz_plot_credible_sets import _plot_cs
 import gc
 from gwaslab.viz_plot_phe_heatmap import _gwheatmap
 
@@ -88,6 +90,7 @@ class Sumstats():
     def __init__(self,
                  sumstats,
                  fmt=None,
+                 tab_fmt="tsv",
                  snpid=None,
                  rsid=None,
                  chrom=None,
@@ -154,10 +157,17 @@ class Sumstats():
         self.meta["gwaslab"]["species"] = species
 
         # initialize attributes for clumping and finmapping
-        self.to_finemapping_file_path = ""
-        self.to_finemapping_file = pd.DataFrame()
-        self.plink_log = ""
-
+        #self.to_finemapping_file_path = ""
+        #self.to_finemapping_file = pd.DataFrame()
+        #self.plink_log = ""
+
+        # path / file / plink_log
+        self.finemapping = dict()
+
+        # clumps / clumps_raw / plink_log
+        self.clumps = dict()
+
+        #
         self.pipcs = pd.DataFrame()
 
         # print gwaslab version information
@@ -167,6 +177,7 @@ class Sumstats():
         self.data = preformat(
             sumstats=sumstats,
             fmt=fmt,
+            tab_fmt = tab_fmt,
             snpid=snpid,
             rsid=rsid,
             chrom=chrom,
@@ -822,21 +833,31 @@ class Sumstats():
     # external ################################################################################################
 
     def calculate_ld_matrix(self,**kwargs):
-        self.to_finemapping_file_path, self.to_finemapping_file, self.plink_log = tofinemapping(self.data,study = self.meta["gwaslab"]["study_name"],**kwargs)
+        self.finemapping["path"],self.finemapping["file"],self.finemapping["plink_log"]= tofinemapping(self.data,study = self.meta["gwaslab"]["study_name"],**kwargs)
+        #self.to_finemapping_file_path, self.to_finemapping_file, self.plink_log = tofinemapping(self.data,study = self.meta["gwaslab"]["study_name"],**kwargs)
 
     def run_susie_rss(self,**kwargs):
-        self.pipcs=_run_susie_rss(self.to_finemapping_file_path,**kwargs)
+        self.pipcs=_run_susie_rss(self.finemapping["path"],**kwargs)
+        #self.pipcs=_run_susie_rss(self.to_finemapping_file_path,**kwargs)
 
     def clump(self,**kwargs):
-        self.clumps,self.plink_log = _clump(self.data, log=self.log, study = self.meta["gwaslab"]["study_name"], **kwargs)
+        self.clumps["clumps"], self.clumps["clumps_raw"], self.clumps["plink_log"] = _clump(self.data, log=self.log, study = self.meta["gwaslab"]["study_name"], **kwargs)
 
     def calculate_prs(self,**kwargs):
         combined_results_summary = _calculate_prs(self.data, log=self.log, study = self.meta["gwaslab"]["study_name"], **kwargs)
         return combined_results_summary
-
+
+    # loading aux data
+    def read_pipcs(self,prefix,**kwargs):
+        self.pipcs = _read_pipcs(self.data[["SNPID","CHR","POS"]],prefix, **kwargs)
+
+    def plot_pipcs(self, region,**kwargs):
+        _plot_cs(self.pipcs, region, **kwargs)
     # to_format ###############################################################################################
 
     def to_format(self, path, build=None, verbose=True, **kwargs):
         if build is None:
             build = self.meta["gwaslab"]["genome_build"]
         _to_format(self.data, path, log=self.log, verbose=verbose, meta=self.meta, build=build, **kwargs)
+
+
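Taken together, the Sumstats changes add a tab_fmt option to the constructor, move clumping and finemapping bookkeeping from flat attributes into the clumps and finemapping dictionaries, and add read_pipcs/plot_pipcs for credible-set results. A hedged usage sketch follows; the input path, the empty kwargs, and the region format are illustrative assumptions (real clump/calculate_ld_matrix runs need PLINK and reference-panel arguments passed through **kwargs), only the attribute and method names come from the diff above:

import gwaslab as gl

mysumstats = gl.Sumstats("sumstats.tsv.gz", fmt="gwaslab", tab_fmt="tsv")  # tab_fmt is new in 3.5.5

# clumping results now land in a dict instead of separate attributes
mysumstats.clump()                                   # plus PLINK/reference kwargs in practice
clumps = mysumstats.clumps["clumps"]                 # also "clumps_raw" and "plink_log"

# finemapping inputs/outputs are tracked in a dict as well
mysumstats.calculate_ld_matrix()                     # fills finemapping["path"/"file"/"plink_log"]
mysumstats.run_susie_rss()                           # reads finemapping["path"], fills .pipcs

# reload PIP / CREDIBLE_SET_INDEX written by a previous run and plot them
mysumstats.read_pipcs("output_prefix")               # expects output_prefix.pipcs
mysumstats.plot_pipcs(region=(1, 100000, 200000))    # region format is an assumption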
gwaslab/g_SumstatsPair.py
CHANGED
@@ -23,6 +23,7 @@ from gwaslab.viz_plot_compare_af import plotdaf
 from gwaslab.util_ex_run_2samplemr import _run_two_sample_mr
 from gwaslab.util_ex_run_clumping import _clump
 from gwaslab.util_ex_ldproxyfinder import _extract_with_ld_proxy
+from gwaslab.g_headers import _get_headers
 
 class SumstatsPair( ):
     def __init__(self, sumstatsObject1, sumstatsObject2, study=None, suffixes = ("_1","_2") ,verbose=True ):
@@ -35,6 +36,7 @@ class SumstatsPair( ):
             self.study_name = "{}_{}".format(sumstatsObject1.meta["gwaslab"]["study_name"], sumstatsObject2.meta["gwaslab"]["study_name"])
         else:
             self.study_name = "{}_{}".format("STUDY1", "STUDY2")
+
         self.snp_info_cols = []
         self.stats_cols =[]
         self.stats_cols2 =[]
@@ -45,11 +47,13 @@ class SumstatsPair( ):
         self.colocalization=pd.DataFrame()
         self.sumstats1 = pd.DataFrame()
         self.sumstats2 = pd.DataFrame()
-
-        self.
+
+        self.mr =dict()
+        self.clumps =dict()
         self.ns = None
-        self.to_finemapping_file_path = ""
-        self.plink_log = ""
+        self.finemapping = dict()
+        #self.to_finemapping_file_path = ""
+        #self.plink_log = ""
 
         self.log.write( "Start to create SumstatsPair object..." )
 
@@ -66,16 +70,17 @@ class SumstatsPair( ):
                                  verbose=verbose)
 
         for i in sumstatsObject1.data.columns:
-            if i in
+            if i in _get_headers(mode="info"):
+                # extract SNP info columns from sumstats1
                 self.snp_info_cols.append(i)
-            elif i in
+            elif i in _get_headers(mode="stats"):
                 self.stats_cols.append(i)
             else:
                 self.other_cols.append(i)
         for i in sumstatsObject2.data.columns:
-            if i in
+            if i in _get_headers(mode="info"):
                 continue
-            elif i in
+            elif i in _get_headers(mode="stats"):
                 self.stats_cols2.append(i)
             else:
                 self.other_cols2.append(i)
@@ -136,14 +141,13 @@ class SumstatsPair( ):
 
 
     def clump(self,**kwargs):
-        self.clumps["clumps"],
+        self.clumps["clumps"],self.clumps["clumps_raw"],self.clumps["plink_log"] = _clump(self.data, log=self.log, p="P_1",mlog10p="MLOG10P_1", study = self.study_name, **kwargs)
 
     def to_coloc(self,**kwargs):
-        self.
+        self.finemapping["path"],self.finemapping["file"],self.finemapping["plink_log"] = tofinemapping(self.data,study=self.study_name,suffixes=self.suffixes,log=self.log,**kwargs)
 
     def run_coloc_susie(self,**kwargs):
-
-        self.colocalization = _run_coloc_susie(self.to_finemapping_file_path,log=self.log,ncols=self.ns,**kwargs)
+        self.colocalization = _run_coloc_susie(self.finemapping["path"],log=self.log,ncols=self.ns,**kwargs)
 
     def run_two_sample_mr(self, clump=False, **kwargs):
         exposure1 = self.study_name.split("_")[0]
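The SumstatsPair changes mirror the Sumstats ones: column classification now goes through _get_headers, and clumping/colocalization state moves into the clumps, finemapping, and mr dictionaries. A hedged sketch; the constructor inputs and empty kwargs are placeholders, and the external tools (PLINK, R coloc/susieR) still have to be supplied through **kwargs:

import gwaslab as gl

ss1 = gl.Sumstats("study1.tsv.gz", fmt="gwaslab")
ss2 = gl.Sumstats("study2.tsv.gz", fmt="gwaslab")
pair = gl.SumstatsPair(ss1, ss2, suffixes=("_1", "_2"))   # assumes SumstatsPair is exported at package level

pair.clump()             # results in pair.clumps["clumps"] / ["clumps_raw"] / ["plink_log"]
pair.to_coloc()          # writes coloc inputs; the path is kept in pair.finemapping["path"]
pair.run_coloc_susie()   # fills pair.colocalization from pair.finemapping["path"]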
gwaslab/g_headers.py
ADDED
@@ -0,0 +1,131 @@
+dtype_dic={
+    'SNPID' : 'string' ,
+    'rsID' : 'string' ,
+    'CHR' : 'Int64' ,
+    'POS' : 'Int64' ,
+    'EA' : 'category' ,
+    'NEA' : 'category' ,
+    'STATUS' : 'category' ,
+    'REF' : 'category' ,
+    'ALT' : 'category' ,
+    'EAF' : 'float64' ,
+    'NEAF' : 'float64' ,
+    'MAF' : 'float64' ,
+    'INFO' : 'float32' ,
+    'BETA' : 'float64' ,
+    'SE' : 'float64' ,
+    'BETA_95U' : 'float64' ,
+    'BETA_95L' : 'float64' ,
+    'OR' : 'float64' ,
+    'OR_95U' : 'float64' ,
+    'OR_95L' : 'float64' ,
+    'HR' : 'float64' ,
+    'HR_95U' : 'float64' ,
+    'HR_95L' : 'float64' ,
+    'CHISQ' : 'float64' ,
+    'Z' : 'float64' ,
+    'T' : 'float64' ,
+    'F' : 'float64' ,
+    'P' : 'float64' ,
+    'P_MANTISSA' : 'float64' ,
+    'P_EXPONENT' : 'float64' ,
+    'MLOG10P' : 'float64' ,
+    'SNPR2' : 'float64' ,
+    'DOF' : 'Int64' ,
+    'P_HET' : 'float64' ,
+    'I2_HET' : 'float64' ,
+    'DENSITY' : 'Int64' ,
+    'N' : 'Int64' ,
+    'N_CASE' : 'Int64' ,
+    'N_CONTROL' : 'Int64' ,
+    'GENENAME' : 'string' ,
+    'CIS/TRANS' : 'string' ,
+    'DISTANCE_TO_KNOWN' : 'Int64' ,
+    'LOCATION_OF_KNOWN' : 'string' ,
+    'KNOWN_ID' : 'string' ,
+    'KNOWN_PUBMED_ID' : 'string' ,
+    'KNOWN_AUTHOR' : 'string' ,
+    'KNOWN_SET_VARIANT' : 'string' ,
+    'KNOWN_VARIANT' : 'string' ,
+    'KNOWN_SET' : 'string' ,
+    'NOVEL' : 'string' ,
+    'PIP' : 'float64' ,
+    'CREDIBLE_SET_INDEX': 'Int64' ,
+    'N_SNP' : 'Int64' ,
+    'LOCUS' : 'string' ,
+    'STUDY' : 'string' ,
+    }
+
+
+description_dic={
+    'SNPID' : 'variant ID (CHR:POS:NEA:EA)',
+    'rsID' : 'dbSNP rsID',
+    'CHR' : 'chromosome number (X 23, Y 24, MT 25)',
+    'POS' : 'base pair position',
+    'EA' : 'effect allele',
+    'NEA' : 'non-effect allele',
+    'STATUS' : 'variant standardization & harmonization status',
+    'REF' : 'reference allele in reference genome',
+    'ALT' : 'alternative allele',
+    'EAF' : 'effect allele frequency',
+    'NEAF' : 'non-effect allele frequency',
+    'MAF' : 'minor allele frequency',
+    'INFO' : 'imputation INFO/RSQ',
+    'BETA' : 'effect size beta',
+    'SE' : 'standard error of beta',
+    'BETA_95U' : 'upper bound of beta 95% condidence interval',
+    'BETA_95L' : 'lower bound of beta 95% condidence interval',
+    'OR' : 'odds ratio',
+    'OR_95U' : 'upper bound of OR 95% condidence interval',
+    'OR_95L' : 'lower bound of OR 95% condidence interval',
+    'HR' : 'hazard ratio',
+    'HR_95U' : 'upper bound of HR 95% condidence interval',
+    'HR_95L' : 'lower bound of HR 95% condidence interval',
+    'CHISQ' : 'chi square',
+    'Z' : 'z score',
+    'T' : 't statistics',
+    'F' : 'F statistics',
+    'P' : 'P value',
+    'P_MANTISSA' : 'P mantissa',
+    'P_EXPONENT' : 'P exponent',
+    'MLOG10P' : '$-log_{10}(P)$',
+    'SNPR2' : 'per variant R2',
+    'DOF' : 'degree of freedom',
+    'P_HET' : 'heterogeneity test P value',
+    'I2_HET' : 'heterogeneity I2',
+    'DENSITY' : 'signal density',
+    'N' : 'total sample size',
+    'N_CASE' : 'number of cases',
+    'N_CONTROL' : 'number of controls',
+    'GENENAME' : 'nearest gene symbol',
+    'CIS/TRANS' : 'whether the variant is in cis or trans region',
+    'DISTANCE_TO_KNOWN' : 'distance to nearest known variants',
+    'LOCATION_OF_KNOWN' : 'relative location to nearest known variants',
+    'KNOWN_ID' : 'nearest known variant ID',
+    'KNOWN_PUBMED_ID' : 'pubmed ID of the known variant',
+    'KNOWN_AUTHOR' : 'author of the study',
+    'KNOWN_SET_VARIANT' : 'known set and overlapping variant',
+    'KNOWN_VARIANT' : 'known variant overlapping with the variant',
+    'KNOWN_SET' : 'variant set of the known variant',
+    'PIP' : 'Posterior Inclusion Probability',
+    'CREDIBLE_SET_INDEX': 'credible sets index',
+    'N_SNP' : 'number of variants included in this locus for finemapping',
+    'LOCUS' : 'locus name, usually the lead variant of the locus',
+    'STUDY' : 'study name'}
+
+def _get_headers(mode="all"):
+    if mode=="info":
+        return ["SNPID","rsID","CHR","POS","EA","NEA","STATUS"]
+    elif mode=="stats":
+        return ["BETA","SE","P","MLOG10P","N","N_CASE","N_CONTROL","Z","T","F","OR","OR_95L","OR_95U","HR","HR_95L","HR_95U","MAF","EAF","BETA_95L","BETA_95U"]
+    else:
+        return description_dic.keys()
+
+def _check_overlap_with_reserved_keys(other):
+    overlapped=[]
+    for i in other:
+        if i in _get_headers():
+            overlapped.append(i)
+    return overlapped
+
+
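The new g_headers module centralizes the reserved column names, their dtypes, and their descriptions; SumstatsPair and preformat now query it instead of hard-coded lists. A short sketch of the two helpers it defines:

from gwaslab.g_headers import _get_headers, _check_overlap_with_reserved_keys

_get_headers(mode="info")    # ['SNPID', 'rsID', 'CHR', 'POS', 'EA', 'NEA', 'STATUS']
_get_headers(mode="stats")   # BETA, SE, P, MLOG10P, N, ... (association statistics columns)
_get_headers()               # every reserved keyword (the keys of description_dic)

# flag user-supplied "other" columns that collide with reserved keywords
_check_overlap_with_reserved_keys(["BETA", "MY_ANNOTATION"])   # -> ['BETA']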
gwaslab/g_meta.py
CHANGED
gwaslab/g_version.py
CHANGED
@@ -6,7 +6,7 @@ import numpy as np
 def _show_version(log=Log(), verbose=True):
     # show version when loading sumstats
     log.write("GWASLab v{} https://cloufield.github.io/gwaslab/".format(gwaslab_info()["version"]),verbose=verbose)
-    log.write("(C) 2022-
+    log.write("(C) 2022-2025, Yunye He, Kamatani Lab, GPL-3.0 license, gwaslab@gmail.com",verbose=verbose)
 
 def _get_version():
     # return short version string like v3.4.33
@@ -15,8 +15,8 @@ def _get_version():
 def gwaslab_info():
     # version meta information
     dic={
-        "version":"3.5.3",
-        "release_date":"
+        "version":"3.5.5",
+        "release_date":"20250102"
     }
     return dic
 
gwaslab/io_preformat_input.py
CHANGED
@@ -8,11 +8,13 @@ from gwaslab.bd_common_data import get_format_dict
 from gwaslab.qc_fix_sumstats import sortcolumn
 from gwaslab.qc_fix_sumstats import _process_build
 from gwaslab.qc_check_datatype import check_datatype
+from gwaslab.qc_check_datatype import quick_convert_datatype
 from gwaslab.qc_check_datatype import check_dataframe_memory_usage
-
+from gwaslab.g_headers import _check_overlap_with_reserved_keys
 #20221030
 def preformat(sumstats,
               fmt=None,
+              tab_fmt="tsv",
               snpid=None,
               rsid=None,
               chrom=None,
@@ -66,12 +68,21 @@ def preformat(sumstats,
     rename_dictionary = {}
     usecols = []
     dtype_dictionary ={}
-
+    if readargs is None:
+        readargs={}
     #######################################################################################################################################################
     # workflow:
     # 1. formatbook
     # 2. user specified header
     # 3. usekeys
+    if tab_fmt=="parquet":
+        if type(sumstats) is str:
+            log.write("Start to load data from parquet file....",verbose=verbose)
+            log.write(" -path: {}".format(sumstats),verbose=verbose)
+            sumstats = pd.read_parquet(sumstats,**readargs)
+            log.write("Finished loading parquet file into pd.DataFrame....",verbose=verbose)
+        else:
+            raise ValueError("Please input a path for parquet file.")
 
     if fmt is not None:
         # loading format parameters
@@ -145,9 +156,11 @@ def preformat(sumstats,
             if key in raw_cols:
                 usecols.append(key)
                 if value in ["EA","NEA"]:
-                    dtype_dictionary[
-                if value in ["
-                    dtype_dictionary[
+                    dtype_dictionary[key]="category"
+                if value in ["STATUS"]:
+                    dtype_dictionary[key]="string"
+                if value in ["CHR"]:
+                    dtype_dictionary[key]="string"
 
     except ValueError:
         raise ValueError("Please input a path or a pd.DataFrame, and make sure the separator is correct and the columns you specified are in the file.")
@@ -276,6 +289,8 @@ def preformat(sumstats,
         rename_dictionary[status]="STATUS"
         dtype_dictionary[status]="string"
     if other:
+        overlapped = _check_overlap_with_reserved_keys(other)
+        log.warning("Columns with headers overlapping with GWASLab reserved keywords:{}".format(overlapped),verbose=verbose)
         usecols = usecols + other
         for i in other:
             rename_dictionary[i] = i
@@ -359,8 +374,13 @@ def preformat(sumstats,
         sumstats = sumstats[usecols].copy()
         for key,value in dtype_dictionary.items():
             if key in usecols:
-
-
+                astype = value
+                if rename_dictionary[key]=="CHR":
+                    astype ="Int64"
+                try:
+                    sumstats[key] = sumstats[key].astype(astype)
+                except:
+                    sumstats[key] = sumstats[key].astype("string")
     except ValueError:
         raise ValueError("Please input a path or a pd.DataFrame, and make sure it contain the columns.")
 
@@ -400,6 +420,8 @@ def preformat(sumstats,
 
     ## reodering ###################################################################################################
     sumstats = sortcolumn(sumstats=sumstats,log=log,verbose=verbose)
+    sumstats = quick_convert_datatype(sumstats,log=log,verbose=verbose)
+
    check_datatype(sumstats,log=log,verbose=verbose)
    gc.collect()
    check_dataframe_memory_usage(sumstats,log=log,verbose=verbose)
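With the new tab_fmt argument, preformat can ingest a parquet file directly (a path is required; anything else raises ValueError) and readargs are forwarded to pd.read_parquet. A hedged sketch via the Sumstats constructor, which now forwards tab_fmt; the file name and fmt choice are illustrative:

import gwaslab as gl

# load summary statistics stored as parquet; header mapping still follows fmt
mysumstats = gl.Sumstats("sumstats.parquet",
                         fmt="gwaslab",
                         tab_fmt="parquet")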
gwaslab/io_read_pipcs.py
ADDED
@@ -0,0 +1,23 @@
+import pandas as pd
+from gwaslab.g_Log import Log
+from gwaslab.qc_check_datatype import check_datatype
+from gwaslab.qc_check_datatype import check_dataframe_memory_usage
+
+def _read_pipcs(data, output_prefix, log=Log(),verbose=True):
+    log.write("Start to load PIP and CREDIBLE_SET_INDEX from file...",verbose=verbose)
+    log.write(" -File:{}.pipcs".format(output_prefix),verbose=verbose)
+
+    pipcs = pd.read_csv("{}.pipcs".format(output_prefix))
+
+    log.write(" -Merging CHR and POS from main dataframe...",verbose=verbose)
+    pipcs = _merge_chrpos(data,pipcs)
+
+    log.write(" -Current pipcs Dataframe shape :",len(pipcs)," x ", len(pipcs.columns),verbose=verbose)
+    check_datatype(pipcs,log=log,verbose=verbose)
+    check_dataframe_memory_usage(pipcs,log=log,verbose=verbose)
+    log.write("Finished loading PIP and CREDIBLE_SET_INDEX from file!",verbose=verbose)
+    return pipcs
+
+def _merge_chrpos(data,pipcs):
+    df = pd.merge(pipcs, data,on="SNPID",how="left")
+    return df
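A small, self-contained sketch of what _read_pipcs expects: a "<prefix>.pipcs" CSV keyed by SNPID, which it merges back onto the supplied sumstats columns. The PIP and CREDIBLE_SET_INDEX column names follow g_headers.py; the toy values are made up:

import pandas as pd
from gwaslab.io_read_pipcs import _read_pipcs

main = pd.DataFrame({"SNPID": ["1:100:A:G", "1:200:C:T"],
                     "CHR": [1, 1],
                     "POS": [100, 200]})

# write a toy credible-set file named demo.pipcs
pd.DataFrame({"SNPID": ["1:100:A:G"],
              "PIP": [0.93],
              "CREDIBLE_SET_INDEX": [1]}).to_csv("demo.pipcs", index=False)

pipcs = _read_pipcs(main, "demo")   # reads demo.pipcs and merges CHR/POS on SNPID
print(pipcs)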
gwaslab/io_to_formats.py
CHANGED
@@ -114,48 +114,49 @@ def _to_format(sumstats,
 
     #######################################################################################################
     #formatting float statistics
-    onetime_log.write(" -Formatting statistics ...",verbose=verbose)
 
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
+    if tab_fmt!="parquet":
+        onetime_log.write(" -Formatting statistics ...",verbose=verbose)
+        formats = {
+            'EAF': '{:.4g}',
+            'MAF': '{:.4g}',
+            'BETA': '{:.4f}',
+            'SE': '{:.4f}',
+            'BETA_95U': '{:.4f}',
+            'BETA_95L': '{:.4f}',
+            'Z': '{:.4f}',
+            'CHISQ': '{:.4f}',
+            'F': '{:.4f}',
+            'OR': '{:.4f}',
+            'OR_95U': '{:.4f}',
+            'OR_95L': '{:.4f}',
+            'HR': '{:.4f}',
+            'HR_95U': '{:.4f}',
+            'HR_95L': '{:.4f}',
+            'INFO': '{:.4f}',
+            'P': '{:.4e}',
+            'MLOG10P': '{:.4f}',
+            'DAF': '{:.4f}'}
+
+        for col, f in float_formats.items():
+            if col in output.columns:
+                formats[col]=f
+
+        for col, f in formats.items():
+            if col in output.columns:
+                if str(output[col].dtype) in ["Float32","Float64","float64","float32","float16","float"]:
+                    output[col] = output[col].map(f.format)
+
+        onetime_log.write(" -Float statistics formats:",verbose=verbose)
+        keys=[]
+        values=[]
+        for key,value in formats.items():
+            if key in output.columns:
+                keys.append(key)
+                values.append(value)
+
+        onetime_log.write(" - Columns :",keys,verbose=verbose)
+        onetime_log.write(" - Output formats:",values,verbose=verbose)
 
     ##########################################################################################################
     # output, mapping column names
@@ -233,7 +234,7 @@ def tofmt(sumstats,
     if xymt_number is False and pd.api.types.is_integer_dtype(sumstats["CHR"]):
         sumstats["CHR"]= sumstats["CHR"].map(get_number_to_chr(xymt=xymt,prefix=chr_prefix))
     # add prefix to CHR
-    elif chr_prefix
+    elif len(chr_prefix)>0:
        sumstats["CHR"]= chr_prefix + sumstats["CHR"].astype("string")
 
    ####################################################################################################################
@@ -409,7 +410,7 @@ def _write_tabular(sumstats,rename_dictionary, path, tab_fmt, to_csvargs, to_tab
         log.write(f" -@ detected: writing each chromosome to a single file...",verbose=verbose)
         log.write(" -Chromosomes:{}...".format(list(sumstats["CHR"].unique())),verbose=verbose)
         for single_chr in list(sumstats["CHR"].unique()):
-            single_path = path.replace("@",single_chr)
+            single_path = path.replace("@","{}".format(single_chr))
 
             fast_to_csv(sumstats.loc[sumstats[chr_header]==single_chr,:],
                         single_path,
@@ -422,7 +423,7 @@ def _write_tabular(sumstats,rename_dictionary, path, tab_fmt, to_csvargs, to_tab
         log.write(f" -@ detected: writing each chromosome to a single file...",verbose=verbose)
         log.write(" -Chromosomes:{}...".format(list(sumstats["CHR"].unique())),verbose=verbose)
         for single_chr in list(sumstats["CHR"].unique()):
-            single_path = path.replace("@",single_chr)
+            single_path = path.replace("@","{}".format(single_chr))
 
             sumstats.loc[sumstats[chr_header]==single_chr,:].to_csv(path, index=None, **to_csvargs)
     else: