gwaslab 3.4.36__py3-none-any.whl → 3.4.38__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of gwaslab might be problematic.
- gwaslab/__init__.py +1 -1
- gwaslab/data/formatbook.json +722 -721
- gwaslab/g_Log.py +8 -0
- gwaslab/g_Sumstats.py +80 -178
- gwaslab/g_SumstatsPair.py +6 -2
- gwaslab/g_Sumstats_summary.py +3 -3
- gwaslab/g_meta.py +13 -3
- gwaslab/g_version.py +2 -2
- gwaslab/hm_casting.py +29 -15
- gwaslab/hm_harmonize_sumstats.py +312 -159
- gwaslab/hm_rsid_to_chrpos.py +1 -1
- gwaslab/io_preformat_input.py +46 -37
- gwaslab/io_to_formats.py +428 -295
- gwaslab/qc_check_datatype.py +15 -1
- gwaslab/qc_fix_sumstats.py +956 -719
- gwaslab/util_ex_calculate_ldmatrix.py +29 -11
- gwaslab/util_ex_gwascatalog.py +1 -1
- gwaslab/util_ex_ldproxyfinder.py +1 -1
- gwaslab/util_ex_process_h5.py +26 -17
- gwaslab/util_ex_process_ref.py +3 -3
- gwaslab/util_ex_run_coloc.py +26 -4
- gwaslab/util_in_convert_h2.py +1 -1
- gwaslab/util_in_fill_data.py +44 -5
- gwaslab/util_in_filter_value.py +122 -34
- gwaslab/util_in_get_density.py +2 -2
- gwaslab/util_in_get_sig.py +41 -9
- gwaslab/viz_aux_quickfix.py +26 -21
- gwaslab/viz_aux_reposition_text.py +7 -4
- gwaslab/viz_aux_save_figure.py +6 -5
- gwaslab/viz_plot_compare_af.py +5 -5
- gwaslab/viz_plot_compare_effect.py +22 -5
- gwaslab/viz_plot_miamiplot2.py +28 -20
- gwaslab/viz_plot_mqqplot.py +214 -98
- gwaslab/viz_plot_qqplot.py +11 -8
- gwaslab/viz_plot_regionalplot.py +16 -9
- gwaslab/viz_plot_trumpetplot.py +15 -6
- {gwaslab-3.4.36.dist-info → gwaslab-3.4.38.dist-info}/METADATA +3 -3
- gwaslab-3.4.38.dist-info/RECORD +72 -0
- gwaslab-3.4.36.dist-info/RECORD +0 -72
- {gwaslab-3.4.36.dist-info → gwaslab-3.4.38.dist-info}/LICENSE +0 -0
- {gwaslab-3.4.36.dist-info → gwaslab-3.4.38.dist-info}/WHEEL +0 -0
- {gwaslab-3.4.36.dist-info → gwaslab-3.4.38.dist-info}/top_level.txt +0 -0
gwaslab/io_to_formats.py
CHANGED
@@ -1,6 +1,7 @@
 import pandas as pd
 import yaml
 import hashlib
+import copy
 from pysam import tabix_compress
 from pysam import tabix_index
 from datetime import datetime
@@ -11,6 +12,8 @@ from gwaslab.g_Log import Log
 from gwaslab.bd_common_data import get_format_dict
 from gwaslab.bd_common_data import get_number_to_chr
 from gwaslab.g_version import gwaslab_info
+from gwaslab.bd_get_hapmap3 import gethapmap3
+
 # to vcf
 # to fmt
 ## vcf
@@ -19,7 +22,168 @@ from gwaslab.g_version import gwaslab_info
 ## annovar
 ## general : ldsc, plink, plink2, saige, regenie
 ###################################################################################################################################################
+def _to_format(sumstats,
+               path="./sumstats",
+               fmt="gwaslab",
+               extract=None,
+               exclude=None,
+               cols=None,
+               id_use="rsID",
+               hapmap3=False,
+               exclude_hla=False,
+               hla_range=(25,34),
+               build=None,
+               n=None,
+               no_status=False,
+               output_log=True,
+               to_csvargs=None,
+               float_formats=None,
+               xymt_number=False,
+               xymt=None,
+               chr_prefix="",
+               meta=None,
+               ssfmeta=False,
+               md5sum=False,
+               bgzip=False,
+               tabix=False,
+               tabix_indexargs={},
+               log=Log(),
+               verbose=True):
+
+    if to_csvargs is None:
+        to_csvargs = {}
+    if float_formats is None:
+        float_formats={}
+    if cols is None:
+        cols=[]
+    if xymt is None:
+        xymt = ["X","Y","MT"]
+    onetime_log = copy.deepcopy(log)
+
+    #######################################################################################################
+
+    formatlist= get_formats_list() + ["vep","bed","annovar","vcf"]
+    if fmt in formatlist:
+        onetime_log.write("Start to convert the output sumstats in: ",fmt, " format",verbose=verbose)
+    else:
+        raise ValueError("Please select a format to output")
+    suffix=fmt
+
+    #######################################################################################################
+    # filter
+    output = sumstats.copy()
+
+    if extract is not None:
+        onetime_log.write(" -Extracting {} variants from the main DataFrame...".format(len(extract)),verbose=verbose)
+        output = output.loc[output[id_use].isin(extract),:]
+        onetime_log.write(" -Extracted {} variants from the main DataFrame...".format(len(output)),verbose=verbose)
+
+    if exclude is not None:
+        onetime_log.write(" -Excluding {} variants from the main DataFrame...".format(len(exclude)),verbose=verbose)
+        output = output.loc[~output[id_use].isin(exclude),:]
+        onetime_log.write(" -Excluded {} variants from the main DataFrame...".format(len(output)),verbose=verbose)
+
+    #hla and hapmap3 #######################################################################################
+
+    #exclude hla
+    if exclude_hla==True:
+        onetime_log.write(" -Excluding variants in MHC (HLA) region ...",verbose=verbose)
+        before = len(output)
+        is_hla = (output["CHR"].astype("string") == "6") & (output["POS"].astype("Int64") > hla_range[0]*1000000) & (output["POS"].astype("Int64") < hla_range[1]*1000000)
+        output = output.loc[~is_hla,:]
+        after = len(output)
+        onetime_log.write(" -Exclude "+ str(before - after) + " variants in MHC (HLA) region : {}Mb - {}Mb.".format(hla_range[0], hla_range[1]),verbose=verbose)
+        suffix = "noMHC."+suffix
+
+    #extract hapmap3 SNPs
+    if hapmap3==True:
+        output = gethapmap3(output,build=build,verbose=verbose)
+        after = len(output)
+        onetime_log.write(" -Extract {} variants in Hapmap3 datasets for build {}.".format(after, build ),verbose=verbose)
+        suffix = "hapmap3."+suffix
+
+    # add a n column
+    if n is not None:
+        output["N"] = n
+
+    #######################################################################################################
+    #formatting float statistics
+    onetime_log.write(" -Formatting statistics ...",verbose=verbose)
+
+    formats = {
+        'EAF': '{:.4g}',
+        'MAF': '{:.4g}',
+        'BETA': '{:.4f}',
+        'SE': '{:.4f}',
+        'BETA_95U': '{:.4f}',
+        'BETA_95L': '{:.4f}',
+        'Z': '{:.4f}',
+        'CHISQ': '{:.4f}',
+        'F': '{:.4f}',
+        'OR': '{:.4f}',
+        'OR_95U': '{:.4f}',
+        'OR_95L': '{:.4f}',
+        'HR': '{:.4f}',
+        'HR_95U': '{:.4f}',
+        'HR_95L': '{:.4f}',
+        'INFO': '{:.4f}',
+        'P': '{:.4e}',
+        'MLOG10P': '{:.4f}',
+        'DAF': '{:.4f}'}
+
+    for col, f in float_formats.items():
+        if col in output.columns:
+            formats[col]=f
+
+    for col, f in formats.items():
+        if col in output.columns:
+            if str(output[col].dtype) in ["Float32","Float64","float64","float32","float16","float"]:
+                output[col] = output[col].map(f.format)
 
+    onetime_log.write(" -Float statistics formats:",verbose=verbose)
+    keys=[]
+    values=[]
+    for key,value in formats.items():
+        if key in output.columns:
+            keys.append(key)
+            values.append(value)
+
+    onetime_log.write(" - Columns :",keys,verbose=verbose)
+    onetime_log.write(" - Output formats:",values,verbose=verbose)
+
+    ##########################################################################################################
+    # output, mapping column names
+
+    if fmt in get_formats_list() + ["vep","bed","annovar","vcf"]:
+        tofmt(output,
+              path=path,
+              fmt=fmt,
+              cols=cols,
+              suffix=suffix,
+              build=build,
+              verbose=verbose,
+              no_status=no_status,
+              log=onetime_log,
+              to_csvargs=to_csvargs,
+              chr_prefix=chr_prefix,
+              meta=meta,
+              ssfmeta=ssfmeta,
+              bgzip=bgzip,
+              tabix=tabix,
+              tabix_indexargs=tabix_indexargs,
+              md5sum=md5sum,
+              xymt_number=xymt_number,
+              xymt=xymt)
+
+    if output_log is True:
+        log_path = path + "."+ suffix + ".log"
+        onetime_log.write(" -Saving log file to: {}".format(log_path),verbose=verbose)
+        onetime_log.write("Finished outputting successfully!",verbose=verbose)
+        try:
+            onetime_log.save(log_path, verbose=False)
+        except:
+            pass
+
 ###################################################################################################################################################
 def tofmt(sumstats,
           meta,
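The new _to_format wrapper gathers the filtering options (extract/exclude lists, MHC exclusion, HapMap3 subsetting) in one place before dispatching to tofmt. For reference, a minimal sketch, on toy data and outside gwaslab, of the chr6 25-34 Mb mask it builds when exclude_hla=True:

    # Illustration only (toy data, not gwaslab's API): the chr6 25-34 Mb mask that
    # _to_format builds above when exclude_hla=True.
    import pandas as pd

    toy = pd.DataFrame({"CHR": [1, 6, 6], "POS": [100000, 30000000, 40000000]})
    hla_range = (25, 34)
    is_hla = (
        (toy["CHR"].astype("string") == "6")
        & (toy["POS"].astype("Int64") > hla_range[0] * 1000000)
        & (toy["POS"].astype("Int64") < hla_range[1] * 1000000)
    )
    print(toy.loc[~is_hla, :])  # keeps chr1:100000 and chr6:40000000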
@@ -47,229 +211,74 @@ def tofmt(sumstats,
     if fmt in ["ssf"]:
         xymt_number=True
     if "SNPID" in sumstats.columns:
-
+        log.write(' -Replacing SNPID separator from ":" to "_"...')
         sumstats["SNPID"] = sumstats["SNPID"].str.replace(":","_")
-
+    log.write(" -Start outputting sumstats in "+fmt+" format...")
 
     if "CHR" in sumstats.columns:
         if xymt_number is False and pd.api.types.is_integer_dtype(sumstats["CHR"]):
             sumstats["CHR"]= sumstats["CHR"].map(get_number_to_chr(xymt=xymt,prefix=chr_prefix))
         elif chr_prefix is not None:
             sumstats["CHR"]= chr_prefix + sumstats["CHR"].astype("string")
-
-    ### calculate meta data
-    if "EAF" in sumstats.columns:
-        min_maf = sumstats["EAF"].min()
-    else:
-        min_maf = "Unknown"
-
-    if "N" in sumstats.columns:
-        n_median = sumstats["N"].median()
-        n_max = sumstats["N"].max()
-        n_min = sumstats["N"].min()
-    else:
-        n_median = "Unknown"
-        n_max = "Unknown"
-        n_min = "Unknown"
-
 
+    ####################################################################################################################
     if fmt=="bed":
         # bed-like format, 0-based,
         # first 3 columns : chromosome, start, end
         # https://genome.ucsc.edu/FAQ/FAQformat.html#format1
-        is_snp = (sumstats
-
-
+        is_snp,is_insert,is_delete = _check_indel(sumstats,log,verbose)
+        log.write(" -formatting to 0-based bed-like file...")
+        log.write(" -format description: {}".format("https://genome.ucsc.edu/FAQ/FAQformat.html#format1"))
 
-
-        if verbose: log.write(" -Number of Insertions :",sum(is_insert))
-        if verbose: log.write(" -Number of Deletions :",sum(is_delete))
-
-        if verbose: log.write(" -formatting to 0-based bed-like file...")
-        # for snp
-        # start = pos - 1 ; end = pos
-        # A/G
-        # AT/CG
-        sumstats.loc[is_snp,"START"] = sumstats.loc[is_snp,"POS"]-1
-        sumstats.loc[is_snp,"END"] = sumstats.loc[is_snp,"POS"]-1 + sumstats.loc[is_snp,"NEA"].str.len()
-        sumstats.loc[is_snp,"NEA/EA"] = sumstats.loc[is_snp,"NEA"].astype("string")+"/"+sumstats.loc[is_snp,"EA"].astype("string")
-
-        # for insertion
-        # start = pos : end = pos
-        # A/ATC -> -/TC
-        sumstats.loc[is_insert,"START"] = sumstats.loc[is_insert,"POS"]
-        sumstats.loc[is_insert,"END"] = sumstats.loc[is_insert,"POS"]
-        sumstats.loc[is_insert,"NEA/EA"] = "-/"+sumstats.loc[is_insert,"EA"].str.slice(start=1)
-
-        # for deletion
-        # start = pos - 1 +1; end = pos -1 +1+ len(Ref)
-        # ATC/A -> TC/-
-        sumstats.loc[is_delete,"START"] = sumstats.loc[is_delete,"POS"]
-        sumstats.loc[is_delete,"END"] = sumstats.loc[is_delete,"POS"] + sumstats.loc[is_delete,"NEA"].str.len() - 1
-        sumstats.loc[is_delete,"NEA/EA"] = sumstats.loc[is_delete,"NEA"].str.slice(start=1)+"/-"
-
-        sumstats["STRAND"]="+"
+        sumstats = _adjust_position(sumstats, fmt, is_snp, is_insert, is_delete, log, verbose )
 
-        sumstats["START"] = sumstats["START"].astype("Int64")
-        sumstats["END"] = sumstats["END"].astype("Int64")
         ouput_cols=["CHR","START","END","NEA/EA","STRAND","SNPID"] + cols
 
-        sumstats
-        path = path + "."+suffix
-        if verbose: log.write(" -Output columns:",sumstats.columns)
-        if verbose: log.write(" -Output path:",path)
-
-        sumstats.to_csv(path,sep="\t",index=None,header=None,**to_csvargs)
-        #tabix_compress
-        #tabix_index
-        if bgzip is True:
-            if verbose: log.write(" -bgzip compressing ...")
-            tabix_compress(path, path+".gz",force=True)
-        if tabix is True:
-            if verbose: log.write(" -tabix indexing...")
-            if "preset" not in tabix_indexargs:
-                tabix_indexargs["preset"] = "bed"
-            if "force" not in tabix_indexargs:
-                tabix_indexargs["force"] = True
-
-            tabix_index(path+".gz", **tabix_indexargs)
+        _output_bed_like(sumstats, path, "bed", suffix, ouput_cols,to_csvargs,bgzip, tabix, tabix_indexargs, md5sum, log, verbose)
     ####################################################################################################################
     elif fmt=="vep":
         # bed-like format, 1-based
         # first 6 columns : chromosome, start, end, allele, strand, identifier
         # https://asia.ensembl.org/info/docs/tools/vep/vep_formats.html
 
-        is_snp = (sumstats
-        is_insert = (sumstats["EA"].str.len()>1) &(sumstats["NEA"].str.len()==1)
-        is_delete = (sumstats["EA"].str.len()==1) &(sumstats["NEA"].str.len()>1)
-
-        if verbose: log.write(" -Number of SNPs :",sum(is_snp))
-        if verbose: log.write(" -Number of Insertions :",sum(is_insert))
-        if verbose: log.write(" -Number of Deletions :",sum(is_delete))
+        is_snp,is_insert,is_delete = _check_indel(sumstats,log,verbose)
 
-
-
-
-        sumstats.loc[is_snp,"START"] = sumstats.loc[is_snp,"POS"] + (sumstats.loc[is_snp,"NEA"].str.len() - 1 )
-        sumstats.loc[is_snp,"END"] = sumstats.loc[is_snp,"POS"] + (sumstats.loc[is_snp,"NEA"].str.len() - 1 )
-        sumstats.loc[is_snp,"NEA/EA"] = sumstats.loc[is_snp,"NEA"].astype("string")+"/"+sumstats.loc[is_snp,"EA"].astype("string")
-
-        # for insertion
-        # start = pos+1 ; end = pos
-        # A/ATC -> -/TC
-        sumstats.loc[is_insert,"START"] = sumstats.loc[is_insert,"POS"] + 1
-        sumstats.loc[is_insert,"END"] = sumstats.loc[is_insert,"POS"]
-        sumstats.loc[is_insert,"NEA/EA"] = "-/" + sumstats.loc[is_insert,"EA"].str.slice(start=1)
-
-        # for deletion
-        # start = pos ; end = pos + len(Ref) -1
-        # ATC/A -> TC/-
-        sumstats.loc[is_delete,"START"] = sumstats.loc[is_delete,"POS"] + 1
-        sumstats.loc[is_delete,"END"] = sumstats.loc[is_delete,"POS"] + (sumstats.loc[is_delete,"NEA"].str.len() -1)
-        sumstats.loc[is_delete,"NEA/EA"] = sumstats.loc[is_delete,"NEA"].str.slice(start=1)+"/-"
-
-        sumstats["STRAND"]="+"
-
-        sumstats["START"] = sumstats["START"].astype("Int64")
-        sumstats["END"] = sumstats["END"].astype("Int64")
+        log.write(" -formatting to 1-based bed-like file (for vep)...")
+        log.write(" -format description: {}".format("http://asia.ensembl.org/info/docs/tools/vep/vep_formats.html"))
+        sumstats = _adjust_position(sumstats, fmt, is_snp, is_insert, is_delete , log, verbose)
 
         ouput_cols=["CHR","START","END","NEA/EA","STRAND","SNPID"]+ cols
-        sumstats = sumstats.loc[:,ouput_cols]
-        path = path + "."+suffix+".gz"
-        if verbose: log.write(" -Output columns:",sumstats.columns)
-        if verbose: log.write(" -Output path:",path)
 
-        sumstats
-
+        _output_bed_like(sumstats, path,"vep", suffix, ouput_cols,to_csvargs,bgzip, tabix, tabix_indexargs, md5sum, log, verbose)
+
     ####################################################################################################################
     elif fmt=="annovar":
         # bed-like format, 1-based,
         # first 3 columns : Chromosome ("chr" prefix is optional), Start, End, Reference Allelel, Alternative Allele
         # https://annovar.openbioinformatics.org/en/latest/user-guide/input/
-        is_snp = (sumstats
-
-
-
-        if verbose: log.write(" -Number of SNPs :",sum(is_snp))
-        if verbose: log.write(" -Number of Insertions :",sum(is_insert))
-        if verbose: log.write(" -Number of Deletions :",sum(is_delete))
-
-        if verbose: log.write(" -formatting to 1-based bed-like file...")
-        # for snp
-        # start = pos ; end = pos
-        # A/G
-        # AT/CG
-        sumstats.loc[is_snp,"START"] = sumstats.loc[is_snp,"POS"]
-        sumstats.loc[is_snp,"END"] = sumstats.loc[is_snp,"POS"]-1 + sumstats.loc[is_snp,"NEA"].str.len()
-        sumstats.loc[is_snp,"NEA_out"] = sumstats.loc[is_snp,"NEA"].astype("string")
-        sumstats.loc[is_snp,"EA_out"] = sumstats.loc[is_snp,"EA"].astype("string")
-
-        # for insertion
-        # start = pos : end = pos
-        # A/ATC -> -/TC
-        sumstats.loc[is_insert,"START"] = sumstats.loc[is_insert,"POS"]+1
-        sumstats.loc[is_insert,"END"] = sumstats.loc[is_insert,"POS"]+1
-        sumstats.loc[is_insert,"NEA_out"] = "-"
-        sumstats.loc[is_insert,"EA_out"] = sumstats.loc[is_insert,"EA"].str.slice(start=1)
-
-        # for deletion
-        # start = pos - 1 +1; end = pos -1 +1+ len(Ref)
-        # ATC/A -> TC/-
-        sumstats.loc[is_delete,"START"] = sumstats.loc[is_delete,"POS"]
-        sumstats.loc[is_delete,"END"] = sumstats.loc[is_delete,"POS"]- 1 + sumstats.loc[is_delete,"NEA"].str.len()
-        sumstats.loc[is_delete,"NEA_out"] = sumstats.loc[is_delete,"NEA"].str.slice(start=1)
-        sumstats.loc[is_delete,"EA_out"] = "-"
+        is_snp,is_insert,is_delete = _check_indel(sumstats,log,verbose)
+
+        log.write(" -formatting to 1-based bed-like file...")
+        log.write(" -format description: {}".format("https://annovar.openbioinformatics.org/en/latest/user-guide/input/"))
 
-        sumstats
-        sumstats["END"] = sumstats["END"].astype("Int64")
+        sumstats = _adjust_position(sumstats, fmt, is_snp, is_insert, is_delete, log, verbose )
 
         ouput_cols=["CHR","START","END","NEA_out","EA_out","SNPID"]+ cols
-        sumstats = sumstats.loc[:,ouput_cols]
-        path = path + "."+suffix
-        if verbose: log.write(" -Output columns:",sumstats.columns)
-        if verbose: log.write(" -Output path:",path)
 
-        sumstats
-
-        #tabix_index
-        if bgzip is True:
-            if verbose: log.write(" -bgzip compressing ...")
-            tabix_compress(path, path+".gz",force=True)
-        if md5sum is True: md5sum_file(path+".gz",log,verbose)
-        if tabix is True:
-            if verbose: log.write(" -tabix indexing...")
-            if "preset" not in tabix_indexargs:
-                tabix_indexargs["preset"] = "bed"
-            if "force" not in tabix_indexargs:
-                tabix_indexargs["force"] = True
-            tabix_index(path+".gz", **tabix_indexargs)
+        _output_bed_like(sumstats, path, fmt, suffix, ouput_cols,to_csvargs,bgzip, tabix, tabix_indexargs, md5sum, log, verbose)
+
     ####################################################################################################################
     elif fmt=="vcf":
-
+        # GWAS-VCF
+        log.write(" -"+fmt+" format will be loaded...",verbose=verbose)
         meta_data,rename_dictionary = get_format_dict(fmt,inverse=True)
-
-        if verbose:
-            log.write(" -"+fmt+" format meta info:")
-            for key,value in meta_data.items():
-                if key not in ["format_fixed_header","format_contig_19","format_contig_38"]:
-                    log.write(" -",key," : ",value)
+        print_format_info(fmt=fmt, meta_data=meta_data,rename_dictionary=rename_dictionary,verbose=verbose, log=log, output=True, skip_meta_records=["format_fixed_header","format_contig_19","format_contig_38"])
 
         # determine which ID to use
         if "rsID" in sumstats.columns:
             rename_dictionary["rsID"]="ID"
         else:
             rename_dictionary["SNPID"]="ID"
-
-        # logging
-        if verbose:
-            log.write(" -gwaslab to "+fmt+" format dictionary:")
-            keys=[]
-            values=[]
-            for key,value in rename_dictionary.items():
-                keys.append(key)
-                values.append(value)
-            log.write(" - gwaslab keys:",','.join(keys))
-            log.write(" - "+fmt+" values:",','.join(values))
 
         # get the columns to output
         ouput_cols=[]
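The per-format branches above now delegate SNP/indel classification to the new _check_indel helper instead of repeating the allele-length comparisons inline. A rough standalone illustration of that classification (toy data, column names as in the diff):

    # Illustration only: the EA/NEA length comparison performed by the new
    # _check_indel helper, on a made-up three-variant DataFrame.
    import pandas as pd

    toy = pd.DataFrame({"EA": ["A", "ATC", "A"], "NEA": ["G", "A", "ATC"]})
    is_snp = toy["EA"].str.len() == toy["NEA"].str.len()
    is_insert = (toy["EA"].str.len() > 1) & (toy["NEA"].str.len() == 1)
    is_delete = (toy["EA"].str.len() == 1) & (toy["NEA"].str.len() > 1)
    print(sum(is_snp), sum(is_insert), sum(is_delete))  # 1 1 1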
@@ -277,12 +286,10 @@ def tofmt(sumstats,
             if i in rename_dictionary.keys():
                 ouput_cols.append(i)
         ouput_cols = ouput_cols +["STATUS"]+ cols
-        sumstats = sumstats
+        sumstats = sumstats[ouput_cols]
         sumstats = sumstats.rename(columns=rename_dictionary)
 
-        #
-        harmonised = sum(sumstats["STATUS"].str.match( r"\w\w\w[0][0123][012][01234]", case=False, flags=0, na=False ) )
-        switchedalleles = sum(sumstats["STATUS"].str.match( r"\w\w\w[0][0123][12][24]", case=False, flags=0, na=False ) )
+        # replace : with _
         sumstats["ID"] = sumstats["ID"].str.replace(":","_")
 
         # process Allele frequency data
@@ -297,35 +304,21 @@ def tofmt(sumstats,
             if i in meta_data["format_format"]:
                 output_format.append(i)
 
-        # Create vcf header
-        vcf_header= meta_data["format_fixed_header"] +"\n"+ meta_data["format_contig_"+str(build)]+"\n"
-        # Create sample header
-        vcf_header+="##SAMPLE=<ID={},TotalVariants={},VariantsNotRead=0,HarmonisedVariants={},VariantsNotHarmonised={},SwitchedAlleles={},StudyType={}>\n".format(meta["gwaslab"]["study_name"],len(sumstats),harmonised,len(sumstats)-harmonised,switchedalleles,meta["gwaslab"]["study_type"])
-        vcf_header+="##gwaslab_version="+gwaslab_info()["version"]+"\n"
-
-
-        #StudyID=meta["Name"]
-        #otalVariants = len(sumstats)
-        #HarmonisedVariants =
-        #VariantsNotHarmonised =
-        #StudyType=
-        ##SAMPLE=<ID=IEU-b-1,TotalVariants=9851866,VariantsNotRead=0,HarmonisedVariants=9851866,VariantsNotHarmonised=0,SwitchedAlleles=9851866,StudyType=Continuous>
-
-
         # determine path
         path = path + "."+suffix
-        if verbose: log.write(" -Output path:",path)
-        if verbose: log.write(" -vcf header contig build:"+str(build))
 
+
+        vcf_header = _process_vcf_header(sumstats, meta, meta_data, build, log, verbose)
+
+        log.write(" -Writing sumstats to: {}...".format(path),verbose=verbose)
         # output header
         with open(path,"w") as file:
             file.write(vcf_header)
 
         with open(path,"a") as file:
-
+            log.write(" -Output columns:"," ".join(meta_data["format_fixed"]+[meta["gwaslab"]["study_name"]]))
             file.write("\t".join(meta_data["format_fixed"]+[meta["gwaslab"]["study_name"]])+"\n")
-
-            counter=0
+            log.write(" -Outputing data...")
             QUAL="."
             FILTER="PASS"
             for index,row in sumstats.iterrows():
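The VCF branch keeps the same two-step write pattern: the header is written with mode "w", then the column line and data rows are appended with mode "a"; header construction itself moves into the new _process_vcf_header helper (see the last hunk). A minimal, hypothetical example of the pattern, independent of gwaslab:

    # Hypothetical minimal example of the same write pattern (not gwaslab code):
    # header with mode "w", then column line and rows appended with mode "a".
    path = "mysumstats.vcf"
    vcf_header = "##fileformat=VCFv4.2\n"
    with open(path, "w") as file:
        file.write(vcf_header)
    with open(path, "a") as file:
        file.write("\t".join(["#CHROM", "POS", "ID", "REF", "ALT", "QUAL",
                              "FILTER", "INFO", "FORMAT", "STUDY1"]) + "\n")
        file.write("{}\t{}\t{}\t{}\t{}\t{}\t{}\t{}\t{}\t{}\n".format(
            "1", 100, "rs123", "A", "G", ".", "PASS", ".", "ES:SE", "0.1:0.01"))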
@@ -337,112 +330,153 @@ def tofmt(sumstats,
                 INFO=str(row["INFO"])
                 FORMAT=":".join(output_format)
                 DATA=":".join(row[output_format].astype("string"))
-                file.write(
-
-
-        if verbose: log.write(" -bgzip compressing ...")
-        tabix_compress(path, path+".gz",force=True)
-        if md5sum is True: md5sum_file(path+".gz",log,verbose)
-        if tabix==True:
-            if verbose: log.write(" -tabix indexing...")
-            if "preset" not in tabix_indexargs:
-                tabix_indexargs["preset"] = "vcf"
-            if "force" not in tabix_indexargs:
-                tabix_indexargs["force"] = True
-            tabix_index(path+".gz", **tabix_indexargs)
+                file.write("{}\t{}\t{}\t{}\t{}\t{}\t{}\t{}\t{}\t{}\n".format(CHROM, POS, ID, REF, ALT, QUAL, FILTER, INFO, FORMAT, DATA))
+        _bgzip_tabix_md5sum(path, fmt, bgzip, md5sum, tabix, tabix_indexargs, log, verbose)
+
     ####################################################################################################################
-    elif fmt in get_formats_list():
-
+    elif fmt in get_formats_list():
+        # tabular
+        log.write(" -"+fmt+" format will be loaded...",verbose=verbose)
         meta_data,rename_dictionary = get_format_dict(fmt,inverse=True)
         print_format_info(fmt=fmt, meta_data=meta_data,rename_dictionary=rename_dictionary,verbose=verbose, log=log, output=True)
-        #if verbose:
-        #    log.write(" -"+fmt+" format meta info:")
-        #    for key,value in meta_data.items():
-        #        if type(value) is list:
-        #            log.write(" -",key," : ",','.join(value))
-        #        else:
-        #            log.write(" -",key," : ",value)
-        #if verbose:
-        #    log.write(" -gwaslab to "+fmt+" format dictionary:",)
-        #    keys=[]
-        #    values=[]
-        #    for key,value in rename_dictionary.items():
-        #        keys.append(key)
-        #        values.append(value)
-        #    log.write(" - gwaslab keys:", ','.join(keys))
-        #    log.write(" - "+fmt+" values:" , ','.join(values))
-
-        # grab format cols that exist in sumstats
-        ouput_cols=[]
-        for i in sumstats.columns:
-            if i in rename_dictionary.keys():
-                ouput_cols.append(i)
-        # + additional cols
-        ouput_cols = ouput_cols + cols
-        try:
-            if no_status == True:
-                ouput_cols.remove("STATUS")
-        except:
-            pass
-        sumstats = sumstats.loc[:,ouput_cols]
-        sumstats = sumstats.rename(columns=rename_dictionary)
 
-
         ymal_path = path + "."+suffix+".tsv-meta.ymal"
         path = path + "."+suffix+".tsv.gz"
+        log.write(" -Output path:",path, verbose=verbose)
 
-
+        sumstats,to_csvargs = _configure_output_cols_and_args(sumstats, rename_dictionary, cols, no_status, path, meta_data, to_csvargs, log, verbose)
 
-
-
-
-
-        to_csvargs["sep"]="\t"
-        if "format_na" in meta_data.keys():
-            to_csvargs["na_rep"] = meta_data["format_na"]
-        if "format_col_order" in meta_data.keys():
-            fixed_col =[]
-            other_col=[]
-            for i in meta_data["format_col_order"]:
-                if i in sumstats.columns:
-                    fixed_col.append(i)
-            for i in sumstats.columns:
-                if i not in meta_data["format_col_order"]:
-                    other_col.append(i)
-
-            sumstats = sumstats.loc[:,fixed_col + other_col]
-            if verbose: log.write(" -Reordering columns...")
-
-        if verbose: log.write(" -Output columns:",','.join(sumstats.columns))
-        sumstats.to_csv(path, index=None,**to_csvargs)
-
-        if md5sum is True:
+        log.write(" -Writing sumstats to: {}...".format(path),verbose=verbose)
+        sumstats.to_csv(path, index=None,**to_csvargs)
+
+        if md5sum == True:
             md5_value = md5sum_file(path,log,verbose)
         else:
             md5_value = calculate_md5sum_file(path)
 
         ## update ssf-style meta data and export to yaml file
-
-
-        if "format_cite_name" in meta_data.keys():
-            meta_copy["file_type"] = meta_data["format_cite_name"]
-        else:
-            meta_copy["file_type"] = fmt
-        meta_copy["minor_allele_freq_lower_limit"] = min_maf
-        meta_copy["data_file_name"] = path
-        meta_copy["data_file_md5sum"] = md5_value
-        meta_copy["date_last_modified"] = get_format_date_and_time()
-        meta_copy["samples"]["sample_size"] = n_max
-        meta_copy["gwaslab"]["samples"]["sample_size_min"] = n_min
-        meta_copy["gwaslab"]["samples"]["sample_size_median"] = n_median
-        meta_copy["gwaslab"]["variants"]["variant_number"] = len(sumstats)
-        if verbose: log.write(" -Exporting SSF-style meta data to {}".format(ymal_path))
-        with open(ymal_path, 'w') as outfile:
-            yaml.dump(meta_copy, outfile)
+        _configure_ssf_meta(sumstats, fmt, ssfmeta, meta, meta_data, path, md5_value, ymal_path, log, verbose)
+
     return sumstats
+####################################################################################################################
+def _configure_output_cols_and_args(sumstats, rename_dictionary, cols, no_status, path, meta_data, to_csvargs, log, verbose):
+    # grab format cols that exist in sumstats
+    ouput_cols=[]
+    for i in sumstats.columns:
+        if i in rename_dictionary.keys():
+            ouput_cols.append(i)
+
+    # + additional cols and remove duplicated
+    ouput_cols = list(set(ouput_cols + cols))
 
+    # remove STATUS
+    try:
+        if no_status == True:
+            ouput_cols.remove("STATUS")
+    except:
+        pass
+
+    #filter and rename to target fromat headers
+    sumstats = sumstats[ouput_cols]
+    sumstats = sumstats.rename(columns=rename_dictionary)
+
+    # configure target format args and reorder columns
+    if "format_separator" in meta_data.keys():
+        to_csvargs["sep"] = meta_data["format_separator"]
+    else:
+        to_csvargs["sep"]="\t"
+    if "format_na" in meta_data.keys():
+        to_csvargs["na_rep"] = meta_data["format_na"]
+    if "format_col_order" in meta_data.keys():
+        fixed_col =[]
+        other_col=[]
+        for i in meta_data["format_col_order"]:
+            if i in sumstats.columns:
+                fixed_col.append(i)
+        for i in sumstats.columns:
+            if i not in meta_data["format_col_order"]:
+                other_col.append(i)
+        sumstats = sumstats[fixed_col + other_col]
+    log.write(" -Output columns: {}".format(",".join(sumstats.columns)),verbose=verbose)
+    return sumstats, to_csvargs
+
+
+def _configure_ssf_meta(sumstats, fmt, ssfmeta, meta, meta_data, path, md5_value, ymal_path, log, verbose):
+    ### calculate meta data
+    if "EAF" in sumstats.columns:
+        min_maf = sumstats["EAF"].min()
+    else:
+        min_maf = "Unknown"
+
+    if "N" in sumstats.columns:
+        n_median = sumstats["N"].median()
+        n_max = sumstats["N"].max()
+        n_min = sumstats["N"].min()
+    else:
+        n_median = "Unknown"
+        n_max = "Unknown"
+        n_min = "Unknown"
+
+    if ssfmeta==True:
+        sumstats_meta_copy = meta.copy()
+        if "format_cite_name" in meta_data.keys():
+            sumstats_meta_copy["file_type"] = meta_data["format_cite_name"]
+        else:
+            sumstats_meta_copy["file_type"] = fmt
+        sumstats_meta_copy["minor_allele_freq_lower_limit"] = min_maf
+        sumstats_meta_copy["data_file_name"] = path
+        sumstats_meta_copy["data_file_md5sum"] = md5_value
+        sumstats_meta_copy["date_last_modified"] = get_format_date_and_time()
+        sumstats_meta_copy["samples"]["sample_size"] = n_max
+        sumstats_meta_copy["gwaslab"]["samples"]["sample_size_min"] = n_min
+        sumstats_meta_copy["gwaslab"]["samples"]["sample_size_median"] = n_median
+        sumstats_meta_copy["gwaslab"]["variants"]["variant_number"] = len(sumstats)
+        log.write(" -Exporting SSF-style meta data to {}".format(ymal_path),verbose=verbose)
+        with open(ymal_path, 'w') as outfile:
+            yaml.dump(sumstats_meta_copy, outfile)
+
+
+
+def _output_bed_like(sumstats, path, fmt, suffix, ouput_cols,to_csvargs,bgzip, tabix, tabix_indexargs, md5sum, log, verbose):
+    sumstats = sumstats[ouput_cols]
+    path = path + "."+suffix
+    log.write(" -Output columns: {}".format(",".join(sumstats.columns)),verbose=verbose)
+    log.write(" -Writing sumstats to: {}...".format(path),verbose=verbose)
+    sumstats.to_csv(path,sep="\t",index=None,header=None,**to_csvargs)
+    _bgzip_tabix_md5sum(path, fmt, bgzip, md5sum, tabix, tabix_indexargs, log, verbose)
+
+
+def _bgzip_tabix_md5sum(path, fmt, bgzip, md5sum, tabix, tabix_indexargs, log, verbose):
+    if bgzip == True:
+        log.write(" -bgzip compressing : {}...".format(path+".gz"),verbose=verbose)
+        tabix_compress(path, path+".gz",force=True)
+    if md5sum == True:
+        if bgzip == True:
+            md5sum_file(path+".gz",log,verbose)
+        else:
+            md5sum_file(path,log,verbose)
+    if tabix == True and bgzip == True:
+        log.write(" -tabix indexing : : {}...".format(path+".gz.tbi"),verbose=verbose)
+        if "preset" not in tabix_indexargs:
+            tabix_indexargs["preset"] = fmt
+        if "force" not in tabix_indexargs:
+            tabix_indexargs["force"] = True
+        tabix_index(path+".gz", **tabix_indexargs)
+
+
+def _check_indel(sumstats,log,verbose):
+    is_snp = (sumstats["EA"].str.len() == sumstats["NEA"].str.len())
+    is_insert = (sumstats["EA"].str.len()>1) &(sumstats["NEA"].str.len()==1)
+    is_delete = (sumstats["EA"].str.len()==1) &(sumstats["NEA"].str.len()>1)
+
+    log.write(" -Number of SNPs :",sum(is_snp))
+    log.write(" -Number of Insertions :",sum(is_insert))
+    log.write(" -Number of Deletions :",sum(is_delete))
+    return is_snp,is_insert,is_delete
+
+
 def md5sum_file(filename,log,verbose):
-
+    log.write(" -md5sum hashing for the file:",filename,verbose=verbose)
     md5_hash = hashlib.md5()
     with open(filename,"rb") as f:
         # Read and update hash in chunks
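The new _bgzip_tabix_md5sum helper centralises the pysam compression and indexing calls that were previously repeated per format. A small sketch of those two pysam calls (the path and preset below are made up for illustration):

    # Sketch of the two pysam calls wrapped by _bgzip_tabix_md5sum; the path and
    # preset here are illustrative, not taken from the diff.
    from pysam import tabix_compress, tabix_index

    path = "sumstats.bed"                                # plain-text file written earlier
    tabix_compress(path, path + ".gz", force=True)       # bgzip-compress to sumstats.bed.gz
    tabix_index(path + ".gz", preset="bed", force=True)  # write sumstats.bed.gz.tbi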
@@ -451,7 +485,8 @@ def md5sum_file(filename,log,verbose):
     with open(filename+".md5sum","w") as f:
         out = str(md5_hash.hexdigest())
         f.write(out+"\n")
-
+    log.write(" -md5sum path:",filename+".md5sum",verbose=verbose)
+    log.write(" -md5sum: {}".format(out),verbose=verbose)
     return out
 
 def calculate_md5sum_file(filename):
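md5sum_file now also logs the .md5sum path and the digest itself. The underlying chunked-hash pattern it relies on is the standard hashlib idiom, sketched below with a hypothetical file name and chunk size:

    # Standard hashlib chunked-digest idiom behind md5sum_file; the file name and
    # 4 KB chunk size are illustrative.
    import hashlib

    md5_hash = hashlib.md5()
    with open("sumstats.ssf.tsv.gz", "rb") as f:
        for chunk in iter(lambda: f.read(4096), b""):
            md5_hash.update(chunk)
    print(md5_hash.hexdigest())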
@@ -466,4 +501,102 @@ def calculate_md5sum_file(filename):
 def get_format_date_and_time():
     now = datetime.now()
     dt_string = now.strftime("%Y-%m-%d-%H:%M:%S")
-    return dt_string
+    return dt_string
+
+
+def _adjust_position(sumstats, fmt,is_snp, is_insert, is_delete, log, verbose):
+    log.write(" -Adjusting positions in format-specific manner..",verbose=verbose)
+    if fmt=="bed":
+        sumstats.loc[is_snp,"START"] = sumstats.loc[is_snp,"POS"]-1
+        sumstats.loc[is_snp,"END"] = sumstats.loc[is_snp,"POS"]-1 + sumstats.loc[is_snp,"NEA"].str.len()
+        sumstats.loc[is_snp,"NEA/EA"] = sumstats.loc[is_snp,"NEA"].astype("string")+"/"+sumstats.loc[is_snp,"EA"].astype("string")
+
+        # for insertion
+        # start = pos : end = pos
+        # A/ATC -> -/TC
+        sumstats.loc[is_insert,"START"] = sumstats.loc[is_insert,"POS"]
+        sumstats.loc[is_insert,"END"] = sumstats.loc[is_insert,"POS"]
+        sumstats.loc[is_insert,"NEA/EA"] = "-/"+sumstats.loc[is_insert,"EA"].str.slice(start=1)
+
+        # for deletion
+        # start = pos - 1 +1; end = pos -1 +1+ len(Ref)
+        # ATC/A -> TC/-
+        sumstats.loc[is_delete,"START"] = sumstats.loc[is_delete,"POS"]
+        sumstats.loc[is_delete,"END"] = sumstats.loc[is_delete,"POS"] + sumstats.loc[is_delete,"NEA"].str.len() - 1
+        sumstats.loc[is_delete,"NEA/EA"] = sumstats.loc[is_delete,"NEA"].str.slice(start=1)+"/-"
+        sumstats["STRAND"]="+"
+    elif fmt=="vep":
+        sumstats.loc[is_snp,"START"] = sumstats.loc[is_snp,"POS"] + (sumstats.loc[is_snp,"NEA"].str.len() - 1 )
+        sumstats.loc[is_snp,"END"] = sumstats.loc[is_snp,"POS"] + (sumstats.loc[is_snp,"NEA"].str.len() - 1 )
+        sumstats.loc[is_snp,"NEA/EA"] = sumstats.loc[is_snp,"NEA"].astype("string")+"/"+sumstats.loc[is_snp,"EA"].astype("string")
+
+        # for insertion
+        # start = pos+1 ; end = pos
+        # A/ATC -> -/TC
+        sumstats.loc[is_insert,"START"] = sumstats.loc[is_insert,"POS"] + 1
+        sumstats.loc[is_insert,"END"] = sumstats.loc[is_insert,"POS"]
+        sumstats.loc[is_insert,"NEA/EA"] = "-/" + sumstats.loc[is_insert,"EA"].str.slice(start=1)
+
+        # for deletion
+        # start = pos ; end = pos + len(Ref) -1
+        # ATC/A -> TC/-
+        sumstats.loc[is_delete,"START"] = sumstats.loc[is_delete,"POS"] + 1
+        sumstats.loc[is_delete,"END"] = sumstats.loc[is_delete,"POS"] + (sumstats.loc[is_delete,"NEA"].str.len() -1)
+        sumstats.loc[is_delete,"NEA/EA"] = sumstats.loc[is_delete,"NEA"].str.slice(start=1)+"/-"
+        sumstats["STRAND"]="+"
+    elif fmt=="annovar":
+        # for snp
+        # start = pos ; end = pos
+        # A/G
+        # AT/CG
+        sumstats.loc[is_snp,"START"] = sumstats.loc[is_snp,"POS"]
+        sumstats.loc[is_snp,"END"] = sumstats.loc[is_snp,"POS"]-1 + sumstats.loc[is_snp,"NEA"].str.len()
+        sumstats.loc[is_snp,"NEA_out"] = sumstats.loc[is_snp,"NEA"].astype("string")
+        sumstats.loc[is_snp,"EA_out"] = sumstats.loc[is_snp,"EA"].astype("string")
+        # for insertion
+        # start = pos : end = pos
+        # A/ATC -> -/TC
+        sumstats.loc[is_insert,"START"] = sumstats.loc[is_insert,"POS"]+1
+        sumstats.loc[is_insert,"END"] = sumstats.loc[is_insert,"POS"]+1
+        sumstats.loc[is_insert,"NEA_out"] = "-"
+        sumstats.loc[is_insert,"EA_out"] = sumstats.loc[is_insert,"EA"].str.slice(start=1)
+
+        # for deletion
+        # start = pos - 1 +1; end = pos -1 +1+ len(Ref)
+        # ATC/A -> TC/-
+        sumstats.loc[is_delete,"START"] = sumstats.loc[is_delete,"POS"]
+        sumstats.loc[is_delete,"END"] = sumstats.loc[is_delete,"POS"]- 1 + sumstats.loc[is_delete,"NEA"].str.len()
+        sumstats.loc[is_delete,"NEA_out"] = sumstats.loc[is_delete,"NEA"].str.slice(start=1)
+        sumstats.loc[is_delete,"EA_out"] = "-"
+
+
+    sumstats["START"] = sumstats["START"].astype("Int64")
+    sumstats["END"] = sumstats["END"].astype("Int64")
+    return sumstats
+
+def _process_vcf_header(sumstats, meta, meta_data, build, log, verbose):
+
+    log.write(" -Creating VCF file header...",verbose=verbose)
+    log.write(" -VCF header contig build:"+str(build),verbose=verbose)
+
+    # calculate meta data
+    harmonised = sum(sumstats["STATUS"].str.match( r"\w\w\w[0][0123][012][01234]", case=False, flags=0, na=False ) )
+    switchedalleles = sum(sumstats["STATUS"].str.match( r"\w\w\w[0][0123][12][24]", case=False, flags=0, na=False ) )
+
+    # Create vcf header
+    vcf_header = meta_data["format_fixed_header"] +"\n"+ meta_data["format_contig_"+str(build)]+"\n"
+
+    # Create sample header
+    vcf_header+="##SAMPLE=<ID={},TotalVariants={},VariantsNotRead=0,HarmonisedVariants={},VariantsNotHarmonised={},SwitchedAlleles={},StudyType={}>\n".format(
+        meta["gwaslab"]["study_name"], len(sumstats), harmonised, len(sumstats)-harmonised, switchedalleles, meta["gwaslab"]["study_type"])
+    vcf_header+="##gwaslab_version="+gwaslab_info()["version"]+"\n"
+
+    log.write(" -ID:{}".format( meta["gwaslab"]["study_name"]),verbose=verbose)
+    log.write(" -StudyType:{}".format(meta["gwaslab"]["study_type"]),verbose=verbose)
+    log.write(" -TotalVariants:{}".format(len(sumstats)),verbose=verbose)
+    log.write(" -HarmonisedVariants:{}".format(harmonised),verbose=verbose)
+    log.write(" -VariantsNotHarmonised:{}".format(len(sumstats)-harmonised),verbose=verbose)
+    log.write(" -SwitchedAlleles:{}".format(switchedalleles),verbose=verbose)
+
+    return vcf_header
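The coordinate rules applied by the new _adjust_position helper differ by target format (bed is 0-based; vep and annovar are 1-based). As a quick sanity check only, the deletion case NEA="ATC", EA="A" at POS=100 works out as follows under the formulas above:

    # Toy check (not from the diff) of the deletion NEA="ATC", EA="A" at POS=100,
    # using the per-format formulas in _adjust_position.
    pos, nea = 100, "ATC"
    print("bed:", pos, pos + len(nea) - 1)      # START=100, END=102, alleles "TC/-"
    print("vep:", pos + 1, pos + len(nea) - 1)  # START=101, END=102, alleles "TC/-"
    print("annovar:", pos, pos - 1 + len(nea))  # START=100, END=102, NEA_out="TC", EA_out="-"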