gwaslab 3.5.7__py3-none-any.whl → 3.5.8__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of gwaslab might be problematic.
- gwaslab/__init__.py +2 -0
- gwaslab/bd_common_data.py +1 -0
- gwaslab/bd_get_hapmap3.py +0 -1
- gwaslab/data/formatbook.json +78 -0
- gwaslab/g_Sumstats.py +98 -24
- gwaslab/g_SumstatsMulti.py +287 -0
- gwaslab/g_SumstatsPair.py +101 -16
- gwaslab/g_Sumstats_polars.py +245 -0
- gwaslab/g_headers.py +12 -3
- gwaslab/g_meta.py +123 -47
- gwaslab/g_meta_update.py +48 -0
- gwaslab/g_vchange_status_polars.py +44 -0
- gwaslab/g_version.py +2 -2
- gwaslab/hm_casting.py +169 -110
- gwaslab/hm_casting_polars.py +202 -0
- gwaslab/hm_harmonize_sumstats.py +19 -8
- gwaslab/io_load_ld.py +529 -0
- gwaslab/io_preformat_input.py +11 -0
- gwaslab/io_preformat_input_polars.py +632 -0
- gwaslab/io_process_args.py +25 -1
- gwaslab/io_read_ldsc.py +34 -3
- gwaslab/io_read_pipcs.py +62 -6
- gwaslab/prscs_gigrnd.py +122 -0
- gwaslab/prscs_mcmc_gtb.py +136 -0
- gwaslab/prscs_parse_genet.py +98 -0
- gwaslab/qc_build.py +53 -0
- gwaslab/qc_check_datatype.py +10 -8
- gwaslab/qc_check_datatype_polars.py +128 -0
- gwaslab/qc_fix_sumstats.py +25 -23
- gwaslab/qc_fix_sumstats_polars.py +193 -0
- gwaslab/util_ex_calculate_ldmatrix.py +49 -19
- gwaslab/util_ex_gwascatalog.py +71 -28
- gwaslab/util_ex_ldsc.py +67 -21
- gwaslab/util_ex_match_ldmatrix.py +396 -0
- gwaslab/util_ex_run_2samplemr.py +0 -2
- gwaslab/util_ex_run_ccgwas.py +155 -0
- gwaslab/util_ex_run_coloc.py +1 -1
- gwaslab/util_ex_run_hyprcoloc.py +117 -0
- gwaslab/util_ex_run_mesusie.py +155 -0
- gwaslab/util_ex_run_mtag.py +92 -0
- gwaslab/util_ex_run_prscs.py +85 -0
- gwaslab/util_ex_run_susie.py +40 -9
- gwaslab/util_in_estimate_ess.py +18 -0
- gwaslab/util_in_fill_data.py +20 -1
- gwaslab/util_in_filter_value.py +10 -5
- gwaslab/util_in_get_sig.py +71 -13
- gwaslab/util_in_meta.py +168 -4
- gwaslab/util_in_meta_polars.py +174 -0
- gwaslab/viz_plot_compare_effect.py +87 -23
- gwaslab/viz_plot_credible_sets.py +55 -11
- gwaslab/viz_plot_effect.py +22 -12
- gwaslab/viz_plot_miamiplot2.py +3 -2
- gwaslab/viz_plot_mqqplot.py +84 -81
- gwaslab/viz_plot_qqplot.py +6 -6
- gwaslab/viz_plot_regional2.py +2 -1
- gwaslab/viz_plot_stackedregional.py +4 -1
- {gwaslab-3.5.7.dist-info → gwaslab-3.5.8.dist-info}/METADATA +8 -6
- gwaslab-3.5.8.dist-info/RECORD +117 -0
- {gwaslab-3.5.7.dist-info → gwaslab-3.5.8.dist-info}/WHEEL +1 -1
- gwaslab-3.5.7.dist-info/RECORD +0 -96
- {gwaslab-3.5.7.dist-info → gwaslab-3.5.8.dist-info/licenses}/LICENSE +0 -0
- {gwaslab-3.5.7.dist-info → gwaslab-3.5.8.dist-info/licenses}/LICENSE_before_v3.4.39 +0 -0
- {gwaslab-3.5.7.dist-info → gwaslab-3.5.8.dist-info}/top_level.txt +0 -0
gwaslab/io_preformat_input_polars.py
@@ -0,0 +1,632 @@
+import pandas as pd
+import polars as pl
+import numpy as np
+import scipy.stats as ss
+import gzip
+import os
+import gc
+from gwaslab.bd_common_data import get_format_dict
+from gwaslab.qc_fix_sumstats import sortcolumn
+from gwaslab.qc_fix_sumstats import _process_build
+from gwaslab.qc_check_datatype_polars import check_datatype
+from gwaslab.qc_check_datatype_polars import quick_convert_datatype
+from gwaslab.qc_check_datatype_polars import check_dataframe_memory_usage
+from gwaslab.g_headers import _check_overlap_with_reserved_keys
+#20221030
+def preformatp(sumstats,
+               fmt=None,
+               tab_fmt="tsv",
+               snpid=None,
+               rsid=None,
+               chrom=None,
+               pos=None,
+               ea=None,
+               nea=None,
+               ref=None,
+               alt=None,
+               eaf=None,
+               neaf=None,
+               maf=None,
+               n=None,
+               beta=None,
+               se=None,
+               chisq=None,
+               z=None,
+               f=None,
+               t=None,
+               p=None,
+               q=None,
+               mlog10p=None,
+               test=None,
+               info=None,
+               OR=None,
+               OR_95L=None,
+               OR_95U=None,
+               beta_95L=None,
+               beta_95U=None,
+               HR=None,
+               HR_95L=None,
+               HR_95U=None,
+               i2=None,
+               snpr2=None,
+               phet=None,
+               dof=None,
+               ncase=None,
+               ncontrol=None,
+               neff=None,
+               direction=None,
+               status=None,
+               study=None,
+               trait=None,
+               build=None,
+               other=[],
+               usekeys=None,
+               chrom_pat=None,
+               snpid_pat=None,
+               verbose=False,
+               readargs=None,
+               log=None):
+
+    #renaming dictionary
+    rename_dictionary = {}
+    usecols = []
+    dtype_dictionary ={}
+    if readargs is None:
+        readargs={}
+    #######################################################################################################################################################
+    # workflow:
+    # 1. formatbook
+    # 2. user specified header
+    # 3. usekeys
+    if tab_fmt=="parquet":
+        if type(sumstats) is str:
+            log.write("Start to load data from parquet file....",verbose=verbose)
+            log.write(" -path: {}".format(sumstats),verbose=verbose)
+            sumstats = pd.read_parquet(sumstats,**readargs)
+            log.write("Finished loading parquet file into pd.DataFrame....",verbose=verbose)
+        else:
+            raise ValueError("Please input a path for parquet file.")
+
+    if fmt is not None:
+        # loading format parameters
+        log.write("Start to load format from formatbook....",verbose=verbose)
+
+        # load format data
+        meta_data,rename_dictionary = get_format_dict(fmt)
+
+        ########## print format information################################################
+        print_format_info(fmt=fmt, meta_data=meta_data,rename_dictionary=rename_dictionary,verbose=verbose, log=log)
+
+        if "format_separator" in meta_data.keys():
+            if "separator" not in readargs.keys():
+                readargs["separator"] = meta_data["format_separator"]
+            else:
+                if readargs["separator"] != meta_data["format_separator"]:
+                    log.write(' - format_separator will be changed to: "{}"'.format(readargs["separator"]),verbose=verbose)
+
+        if "format_na" in meta_data.keys():
+            readargs["null_values"] = meta_data["format_na"]
+
+        if "format_comment" in meta_data.keys():
+            readargs["comment_prefix"] = meta_data["format_comment"]
+
+        if "format_other_cols" in meta_data.keys():
+            other += meta_data["format_other_cols"]
+
+    if "sep" not in readargs.keys():
+        readargs["separator"] = "\t"
+
+    #########################################################################################################################################################
+
+    # check chr-separated path / vcf / then print header.
+    try:
+        if type(sumstats) is str:
+            ## loading data from path #################################################
+            inpath = sumstats
+            ###load sumstats by each chromosome #################################################
+            if "@" in inpath:
+                log.write(" -Detected @ in path: load sumstats by each chromosome...",verbose=verbose)
+                inpath_chr_list=[]
+                inpath_chr_num_list=[]
+                for chromosome in list(range(1,26))+["x","y","X","Y","MT","mt","m","M"]:
+                    inpath_chr = inpath.replace("@",str(chromosome))
+                    if isfile_casesensitive(inpath_chr):
+                        inpath_chr_num_list.append(str(chromosome))
+                        inpath_chr_list.append(inpath_chr)
+                log.write(" -Chromosomes detected:",",".join(inpath_chr_num_list),verbose=verbose)
+                readargs_header = get_readargs_header(inpath = inpath_chr_list[0], readargs = readargs)
+                row_one = pl.read_csv(inpath_chr_list[0],**readargs_header)
+                # columns in the sumstats
+                raw_cols = row_one.columns
+            else:
+                ##### loading data from tabular file#################################################
+                readargs_header = get_readargs_header(inpath = inpath, readargs = readargs)
+                row_one = pl.read_csv(inpath,**readargs_header)
+                raw_cols = row_one.columns
+
+            if fmt=="vcf":
+                # expanded
+                format_cols = list(row_one["FORMAT"].str.split(":"))[0]
+                # fixed + study1 + expanded
+                raw_cols = meta_data["format_fixed"] + [raw_cols[9]] + format_cols
+
+        ######################################################################################
+        elif type(sumstats) is pd.DataFrame:
+            ## loading data from dataframe
+            raw_cols = sumstats.columns
+
+        ################################################
+        for key,value in rename_dictionary.items():
+            # check avaiable keys key->raw header
+            # usecols : a list of raw headers to load from file/DataFrame
+            if key in raw_cols:
+                usecols.append(key)
+                if value in ["EA","NEA"]:
+                    dtype_dictionary[key]=pl.String()
+                if value in ["STATUS"]:
+                    dtype_dictionary[key]=pl.String()
+                if value in ["CHR"]:
+                    dtype_dictionary[key]=pl.String()
+
+    except ValueError:
+        raise ValueError("Please input a path or a pd.DataFrame, and make sure the separator is correct and the columns you specified are in the file.")
+
+    ###################################################################################################################################################
+    ## check columns/datatype to use
+    if snpid:
+        usecols.append(snpid)
+        rename_dictionary[snpid]= "SNPID"
+    if rsid:
+        usecols.append(rsid)
+        rename_dictionary[rsid]= "rsID"
+    if chrom:
+        usecols.append(chrom)
+        rename_dictionary[chrom]= "CHR"
+        dtype_dictionary[chrom]=pl.String()
+    if pos:
+        usecols.append(pos)
+        rename_dictionary[pos]= "POS"
+        dtype_dictionary[pos]=pl.Float64()
+    if ea:
+        usecols.append(ea)
+        rename_dictionary[ea]= "EA"
+        dtype_dictionary[ea]=pl.String()
+    if nea:
+        usecols.append(nea)
+        rename_dictionary[nea]= "NEA"
+        dtype_dictionary[nea]=pl.String()
+    if ref:
+        usecols.append(ref)
+        rename_dictionary[ref]= "REF"
+        dtype_dictionary[ref]=pl.String()
+    if alt:
+        usecols.append(alt)
+        rename_dictionary[alt]= "ALT"
+        dtype_dictionary[alt]=pl.String()
+    if eaf:
+        usecols.append(eaf)
+        rename_dictionary[eaf]= "EAF"
+    elif neaf:
+        # neaf will be converted to eaf
+        usecols.append(neaf)
+        rename_dictionary[neaf]= "EAF"
+    if maf:
+        usecols.append(maf)
+        rename_dictionary[maf]= "MAF"
+    if n and (type(n) is str):
+        usecols.append(n)
+        rename_dictionary[n]= "N"
+    if ncase and (type(ncase) is str):
+        usecols.append(ncase)
+        rename_dictionary[ncase]= "N_CASE"
+    if ncontrol and (type(ncontrol) is str):
+        usecols.append(ncontrol)
+        rename_dictionary[ncontrol]= "N_CONTROL"
+    if neff and (type(neff) is str):
+        usecols.append(neff)
+        rename_dictionary[neff]= "N_EFF"
+    if beta:
+        usecols.append(beta)
+        rename_dictionary[beta]= "BETA"
+    if beta_95L:
+        usecols.append(beta_95L)
+        rename_dictionary[beta_95L]= "BETA_95L"
+    if beta_95U:
+        usecols.append(beta_95U)
+        rename_dictionary[beta_95U]= "BETA_95U"
+    if se:
+        usecols.append(se)
+        rename_dictionary[se]= "SE"
+    if chisq:
+        usecols.append(chisq)
+        rename_dictionary[chisq]="CHISQ"
+    if z:
+        usecols.append(z)
+        rename_dictionary[z]= "Z"
+    if q:
+        usecols.append(q)
+        rename_dictionary[q]= "Q"
+    if p:
+        usecols.append(p)
+        rename_dictionary[p]= "P"
+    if t:
+        usecols.append(t)
+        rename_dictionary[t]= "T"
+    if f:
+        usecols.append(f)
+        rename_dictionary[f]= "F"
+    if mlog10p:
+        usecols.append(mlog10p)
+        rename_dictionary[mlog10p]= "MLOG10P"
+    if test:
+        usecols.append(test)
+        rename_dictionary[test]= "TEST"
+    if info:
+        usecols.append(info)
+        rename_dictionary[info]= "INFO"
+    if OR:
+        usecols.append(OR)
+        rename_dictionary[OR]= "OR"
+    if OR_95L:
+        usecols.append(OR_95L)
+        rename_dictionary[OR_95L]= "OR_95L"
+    if OR_95U:
+        usecols.append(OR_95U)
+        rename_dictionary[OR_95U]= "OR_95U"
+    if HR:
+        usecols.append(HR)
+        rename_dictionary[HR]= "HR"
+    if HR_95L:
+        usecols.append(HR_95L)
+        rename_dictionary[HR_95L]= "HR_95L"
+    if HR_95U:
+        usecols.append(HR_95U)
+        rename_dictionary[HR_95U]= "HR_95U"
+    if phet:
+        usecols.append(phet)
+        rename_dictionary[phet]= "P_HET"
+    if i2:
+        usecols.append(i2)
+        rename_dictionary[i2]= "I2"
+    if snpr2:
+        usecols.append(snpr2)
+        rename_dictionary[snpr2]= "SNPR2"
+    if dof:
+        usecols.append(dof)
+        rename_dictionary[dof]= "DOF"
+    if direction:
+        usecols.append(direction)
+        rename_dictionary[direction]="DIRECTION"
+    if status:
+        usecols.append(status)
+        rename_dictionary[status]="STATUS"
+        dtype_dictionary[status]=pl.String()
+    if other:
+        overlapped = _check_overlap_with_reserved_keys(other)
+        log.warning("Columns with headers overlapping with GWASLab reserved keywords:{}".format(overlapped),verbose=verbose)
+        usecols = usecols + other
+        for i in other:
+            rename_dictionary[i] = i
+    if fmt=="vcf":
+        # store the final column list
+        vcf_usecols = usecols.copy()
+        # loading the fixed columns + study
+        usecols = meta_data["format_fixed"]
+        if study is not None:
+            usecols = usecols + [study]
+        else:
+            study = raw_cols[9]
+            usecols = usecols + [study]
+
+    if usekeys is not None:
+        # extract only specified keys
+        usecols_new =[]
+        for i in usekeys:
+            for k, v in rename_dictionary.items():
+                if i == v:
+                    usecols_new.append(k)
+        usecols_valid =[]
+        for i in usecols_new:
+            if i in usecols:
+                usecols_valid.append(i)
+        usecols = usecols_valid
+
+    usecols = list(set(usecols))
+
+    #loading data ##########################################################################################################
+
+    try:
+        if type(sumstats) is str:
+            ## loading data from path
+            inpath = sumstats
+            if "@" in inpath:
+                log.write("Start to initialize gl.Sumstats from files with pattern :" + inpath,verbose=verbose)
+                sumstats_chr_list=[]
+                for i in inpath_chr_list:
+                    log.write(" -Loading:" + i)
+                    skip_rows = get_skip_rows(i)
+                    readargs["skip_rows"] = skip_rows
+                    sumstats_chr = pl.read_csv(i,
+                                               columns = usecols,
+                                               schema_overrides=dtype_dictionary,
+                                               **readargs)
+                    sumstats_chr_list.append(sumstats_chr)
+                log.write(" -Merging sumstats for chromosomes:",",".join(inpath_chr_num_list),verbose=verbose)
+                sumstats = pl.concat(sumstats_chr_list, rechunk=True)
+                del(sumstats_chr_list)
+                gc.collect()
+            else:
+                skip_rows = get_skip_rows(inpath)
+                readargs["skip_rows"] = skip_rows
+                log.write("Start to initialize gl.Sumstats from file :" + inpath,verbose=verbose)
+
+                sumstats = pl.read_csv(inpath,
+                                       columns =usecols,
+                                       schema_overrides=dtype_dictionary,
+                                       **readargs)
+
+        elif type(sumstats) is pd.DataFrame:
+            ## loading data from dataframe
+            log.write("Start to initialize gl.Sumstats from pandas DataFrame ...",verbose=verbose)
+            sumstats = sumstats[usecols].copy()
+            for key,value in dtype_dictionary.items():
+                if key in usecols:
+                    astype = value
+                    if rename_dictionary[key]=="CHR":
+                        astype ="Int64"
+                    try:
+                        sumstats[key] = sumstats[key].astype(astype)
+                    except:
+                        sumstats[key] = sumstats[key].astype("string")
+    except ValueError:
+        raise ValueError("Please input a path or a pd.DataFrame, and make sure it contain the columns.")
+
+    if chrom_pat is not None:
+        sumstats = _load_single_chr(sumstats,
+                                    usecols=usecols,
+                                    rename_dictionary=rename_dictionary,
+                                    chrom_pat=chrom_pat,
+                                    log=log,
+                                    verbose=verbose)
+    elif snpid_pat is not None:
+        sumstats = _load_variants_with_pattern(sumstats,
+                                               usecols=usecols,
+                                               rename_dictionary=rename_dictionary,
+                                               snpid_pat=snpid_pat,
+                                               log=log,
+                                               verbose=verbose)
+    ## renaming columns ###############################################################################################
+    if fmt == "vcf":
+        sumstats = parse_vcf_study(sumstats,format_cols,study,vcf_usecols,log=log,verbose=verbose)
+        usecols = vcf_usecols
+
+    converted_columns = list(map(lambda x: rename_dictionary[x], set(usecols)))
+
+    ## renaming log
+    log.write(" -Reading columns :", ",".join(set(usecols)),verbose=verbose)
+    log.write(" -Renaming columns to :", ",".join(converted_columns),verbose=verbose)
+    log.write(" -Current Dataframe shape :",len(sumstats)," x ", len(sumstats.columns),verbose=verbose)
+
+    ## renaming #####################################################################################
+    sumstats = sumstats.rename(rename_dictionary)
+
+    ## if n was provided as int #####################################################################################
+    if type(n) is int:
+        sumstats["N"] = n
+    if type(ncase) is int:
+        sumstats["N_CASE"] = ncase
+    if type(ncontrol) is int:
+        sumstats["N_CONTROL"] = ncontrol
+
+    ### status ######################################################################################################
+    if status is None:
+        sumstats = process_status(sumstats=sumstats,build=build,log=log,verbose=verbose)
+
+    ## ea/nea, ref/alt ##############################################################################################
+    sumstats = process_allele(sumstats=sumstats,log=log,verbose=verbose)
+
+    ## NEAF to EAF ###########################################################################################################
+    if neaf is not None :
+        sumstats = process_neaf(sumstats=sumstats,log=log,verbose=verbose)
+
+    ## reodering ###################################################################################################
+    #sumstats = sortcolumn(sumstats=sumstats,log=log,verbose=verbose)
+    sumstats = quick_convert_datatype(sumstats,log=log,verbose=verbose)
+
+    check_datatype(sumstats,log=log,verbose=verbose)
+    #gc.collect()
+    check_dataframe_memory_usage(sumstats,log=log,verbose=verbose)
+
+    log.write("Finished loading data successfully!",verbose=verbose)
+    return sumstats
+
+
+#### helper #######################################################################
+def isfile_casesensitive(path):
+    if not os.path.isfile(path):
+        return False # exit early
+    directory, filename = os.path.split(path)
+    return filename in os.listdir(directory)
+
+def get_readargs_header(inpath,readargs):
+    if "vcf.gz" in inpath:
+        with gzip.open(inpath,'r') as file:
+            skip=0
+            for line in file:
+                if line.decode('utf-8').startswith('##'):
+                    skip+=1
+                else:
+                    readargs["skip_rows"]=skip
+                    readargs["separator"]="\t"
+                    break
+    readargs_header = readargs.copy()
+    readargs_header["n_rows"]=1
+    #readargs_header["dtype"]="string"
+    readargs_header["infer_schema"] = False
+    return readargs_header
+
+def get_skip_rows(inpath):
+    if "vcf.gz" in inpath:
+        with gzip.open(inpath,'r') as file:
+            skip=0
+            for line in file:
+                if line.decode('utf-8').startswith('##'):
+                    skip+=1
+                else:
+                    return skip
+    else:
+        return 0
+
+def parse_vcf_study(sumstats,format_cols,study,vcf_usecols,log,verbose=True):
+    log.write(" -Parsing based on FORMAT: ", format_cols,verbose=verbose)
+    log.write(" -Parsing vcf study : ", study,verbose=verbose)
+    #sumstats[format_cols] = sumstats[study].str.split(":",expand=True).values
+    sumstats = sumstats.drop(["FORMAT",study])
+    sumstats = sumstats[vcf_usecols]
+    gc.collect()
+    return sumstats
+
+def print_format_info(fmt,meta_data, rename_dictionary, verbose, log,output=False, skip_meta_records=None):
+    log.write(" -"+fmt+" format meta info:",verbose=verbose)
+    if skip_meta_records is None:
+        skip_meta_records =[]
+    for key,value in meta_data.items():
+        if key in skip_meta_records:
+            continue
+        if value is None:
+            continue
+        if type(value) is str:
+            if "\n" in value:
+                value_first_line=value.split("\n")[0]
+                log.write(" -",key," : "+value_first_line.strip()+"...",verbose=verbose)
+            elif value==" ":
+                log.write(' -',key,' : \\s ',verbose=verbose)
+            elif value=="\t":
+                log.write(' -',key,' : \\t',verbose=verbose)
+            else:
+                log.write(" -",key," : "+value.strip(),verbose=verbose)
+        elif type(value) is list:
+            log.write(" -",key," : "+','.join(value),verbose=verbose)
+        else:
+            log.write(" -",key," : ",value,verbose=verbose)
+    keys=[]
+    values=[]
+    for key,value in rename_dictionary.items():
+        keys.append(key)
+        values.append(value)
+    if fmt!="gwaslab":
+        if output == False:
+            if fmt!="auto":
+                log.write(" -"+fmt+" to gwaslab format dictionary:",verbose=verbose)
+                log.write(" - "+fmt+" keys:",",".join(keys),verbose=verbose)
+                log.write(" - gwaslab values:",",".join(values),verbose=verbose)
+            else:
+                log.write(" - Auto-detection mode. Note: auto-detection assumes A1=EA; Alt=EA and Frq=EAF...",verbose=verbose)
+                log.write(" - Header conversion source: https://github.com/Cloufield/formatbook/blob/main/formats/auto.json",verbose=verbose)
+        else:
+            log.write(" -gwaslab to "+fmt+" format dictionary:",verbose=verbose)
+            keys=[]
+            values=[]
+            for key,value in rename_dictionary.items():
+                keys.append(key)
+                values.append(value)
+            log.write(" - gwaslab keys:", ','.join(keys),verbose=verbose)
+            log.write(" - "+fmt+" values:" , ','.join(values),verbose=verbose)

+def process_neaf(sumstats,log,verbose):
+    log.write(" -NEAF is specified...",verbose=verbose)
+    pre_number=len(sumstats)
+    log.write(" -Checking if 0<= NEAF <=1 ...",verbose=verbose)
+
+    sumstats["EAF"] = pd.to_numeric(sumstats["EAF"], errors='coerce')
+
+    sumstats = sumstats.filter(pl.col("EAF")>=0 & pl.col("EAF")<=1)
+    sumstats = sumstats.with_columns(
+        EAF = 1- pl.col("EAF")
+    )
+    log.write(" -Converted NEAF to EAF.",verbose=verbose)
+
+    after_number=len(sumstats)
+
+    log.write(" -Removed "+str(pre_number - after_number)+" variants with bad NEAF.",verbose=verbose)
+
+    return sumstats
+
+def process_allele(sumstats,log,verbose):
+
+    if "EA" in sumstats.columns:
+
+        if "REF" in sumstats.columns and "ALT" in sumstats.columns:
+
+            if "NEA" not in sumstats.columns:
+                log.write(" NEA not available: assigning REF to NEA...",verbose=verbose)
+
+                sumstats = sumstats.with_columns(NEA = pl.col("REF"))
+
+            log.write(" -EA,REF and ALT columns are available: assigning NEA...",verbose=verbose)
+            ea_alt = sumstats["EA"]==sumstats["ALT"]
+
+            log.write(" -For variants with EA == ALT : assigning REF to NEA ...",verbose=verbose)
+            sumstats.loc[ea_alt,"NEA"] = sumstats.loc[ea_alt,"REF"]
+
+            sumstats = sumstats.with_columns(
+                pl.when(ea_alt)
+                .then(pl.col("REF"))
+                .otherwise(pl.col("NEA"))
+                .alias("NEA")
+            )
+
+            ea_not_alt = sumstats["EA"]!=sumstats["ALT"]
+            log.write(" -For variants with EA != ALT : assigning ALT to NEA ...",verbose=verbose)
+            sumstats = sumstats.with_columns(
+                pl.when(ea_not_alt)
+                .then(pl.col("ALT"))
+                .otherwise(pl.col("NEA"))
+                .alias("NEA")
+            )
+
+    return sumstats
+
+def process_status(sumstats,build,log,verbose):
+    log.write(" -Initiating a status column: STATUS ...",verbose=verbose)
+    #sumstats["STATUS"] = int(build)*(10**5) +99999
+    build = _process_build(build,log,verbose)
+    sumstats = sumstats.with_columns(
+        STATUS = pl.lit(build +"99999")
+    )
+    return sumstats
+
+
+def _load_single_chr(sumstats,usecols, rename_dictionary,chrom_pat,log,verbose):
+
+
+    # get chr
+    for k,v in rename_dictionary.items():
+        if v=="CHR":
+            if k in sumstats.columns:
+                log.write(" -Columns used to filter variants: {}".format(k),verbose=verbose)
+                chunk_chrom = k
+                break
+
+    log.write(" -Loading only variants on chromosome with pattern : {} ...".format(chrom_pat),verbose=verbose)
+
+    sumstats_filtered = sumstats.filter(pl.col(chunk_chrom).str.contains(chrom_pat))
+
+    log.write(" -Loaded {} variants on chromosome with pattern :{} ...".format(len(sumstats_filtered), chrom_pat),verbose=verbose)
+    return sumstats_filtered
+
+def _load_variants_with_pattern(sumstats,usecols, rename_dictionary,snpid_pat,log,verbose):
+
+    # get chr
+    for k,v in rename_dictionary.items():
+        if v=="SNPID":
+            if k in sumstats.columns:
+                log.write(" -Columns used to filter variants: {}".format(k),verbose=verbose)
+                chunk_snpid = k
+                break
+
+    log.write(" -Loading only variants with pattern : {} ...".format(snpid_pat),verbose=verbose)
+    sumstats_filtered = sumstats.filter(pl.col(chunk_snpid).str.contains(snpid_pat))
+
+    log.write(" -Loaded {} variants with pattern : {} ...".format(len(sumstats_filtered), snpid_pat),verbose=verbose)
+    return sumstats_filtered
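Note: the new polars-based loader keeps the same keyword interface as the existing pandas loader. A minimal usage sketch follows (illustrative only, not part of the release: the input path and column headers are made up, and the tiny logger class merely stands in for the Log object that gwaslab's own Sumstats classes would normally pass via log=):

from gwaslab.io_preformat_input_polars import preformatp

class SimpleLog:
    # Stand-in logger: preformatp only calls .write()/.warning() with a
    # verbose= keyword, so printing is enough for this sketch.
    def write(self, *args, verbose=True, **kwargs):
        if verbose:
            print(*args)
    warning = write

# Hypothetical tab-separated sumstats file with these column headers.
sumstats_pl = preformatp("my_sumstats.tsv.gz",
                         snpid="SNP", chrom="CHR", pos="POS",
                         ea="A1", nea="A2", eaf="FRQ",
                         beta="BETA", se="SE", p="P",
                         build="19",
                         verbose=True,
                         log=SimpleLog())
# Expected result: a polars DataFrame with gwaslab-standard headers
# (SNPID, CHR, POS, EA, NEA, EAF, BETA, SE, P, STATUS).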
gwaslab/io_process_args.py
CHANGED
@@ -37,4 +37,28 @@ def _merge_and_sync_dic(list_of_dics:list, default:dict) -> dict:
     for dic in list_of_dics:
         if isinstance(dic, dict):
             temp.update(dic)
-    return temp
+    return temp
+
+def _update_args(args=None, default_args=None):
+
+    if default_args is None:
+        default_args={}
+
+    if args is None:
+        # if None, return default dict
+        return default_args
+    else:
+        # if not None, update default dict
+        for key,value in args.items():
+            default_args[key] = value
+        return default_args
+
+
+
+def _update_arg(arg=None, default_arg=None):
+    if arg is None:
+        # if None, return default
+        return default_arg
+    else:
+        # if not None, return arg
+        return arg
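Note: a small sketch of how the two new helpers behave (the example dictionaries and values below are made up for illustration):

from gwaslab.io_process_args import _update_args, _update_arg

defaults = {"separator": "\t", "null_values": "NA"}

# User-supplied keys override the defaults; the default dict is updated
# in place and then returned.
print(_update_args({"separator": ","}, defaults))   # {'separator': ',', 'null_values': 'NA'}
print(_update_args(None, {"threads": 1}))           # {'threads': 1} (defaults returned as-is)

print(_update_arg(None, default_arg=0.05))          # 0.05 (fall back to the default)
print(_update_arg(0.01, default_arg=0.05))          # 0.01 (explicit value wins)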