gwaslab 3.4.37__py3-none-any.whl → 3.4.39__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of gwaslab might be problematic. Click here for more details.

Files changed (57)
  1. gwaslab/bd_common_data.py +6 -3
  2. gwaslab/bd_download.py +9 -9
  3. gwaslab/bd_get_hapmap3.py +43 -9
  4. gwaslab/data/formatbook.json +722 -721
  5. gwaslab/g_Log.py +22 -5
  6. gwaslab/g_Sumstats.py +110 -163
  7. gwaslab/g_SumstatsPair.py +76 -25
  8. gwaslab/g_SumstatsT.py +2 -2
  9. gwaslab/g_Sumstats_summary.py +3 -3
  10. gwaslab/g_version.py +10 -10
  11. gwaslab/hm_casting.py +36 -17
  12. gwaslab/hm_harmonize_sumstats.py +354 -221
  13. gwaslab/hm_rsid_to_chrpos.py +1 -1
  14. gwaslab/io_preformat_input.py +49 -43
  15. gwaslab/io_read_ldsc.py +49 -1
  16. gwaslab/io_to_formats.py +428 -295
  17. gwaslab/ldsc_irwls.py +198 -0
  18. gwaslab/ldsc_jackknife.py +514 -0
  19. gwaslab/ldsc_ldscore.py +417 -0
  20. gwaslab/ldsc_parse.py +294 -0
  21. gwaslab/ldsc_regressions.py +747 -0
  22. gwaslab/ldsc_sumstats.py +629 -0
  23. gwaslab/qc_check_datatype.py +3 -3
  24. gwaslab/qc_fix_sumstats.py +891 -778
  25. gwaslab/util_ex_calculate_ldmatrix.py +31 -13
  26. gwaslab/util_ex_gwascatalog.py +25 -25
  27. gwaslab/util_ex_ldproxyfinder.py +10 -10
  28. gwaslab/util_ex_ldsc.py +189 -0
  29. gwaslab/util_ex_process_ref.py +3 -3
  30. gwaslab/util_ex_run_coloc.py +26 -4
  31. gwaslab/util_in_calculate_gc.py +6 -6
  32. gwaslab/util_in_calculate_power.py +42 -43
  33. gwaslab/util_in_convert_h2.py +8 -8
  34. gwaslab/util_in_fill_data.py +30 -30
  35. gwaslab/util_in_filter_value.py +201 -74
  36. gwaslab/util_in_get_density.py +10 -10
  37. gwaslab/util_in_get_sig.py +445 -71
  38. gwaslab/viz_aux_annotate_plot.py +12 -12
  39. gwaslab/viz_aux_quickfix.py +42 -37
  40. gwaslab/viz_aux_reposition_text.py +10 -7
  41. gwaslab/viz_aux_save_figure.py +18 -8
  42. gwaslab/viz_plot_compare_af.py +32 -33
  43. gwaslab/viz_plot_compare_effect.py +63 -71
  44. gwaslab/viz_plot_miamiplot2.py +34 -26
  45. gwaslab/viz_plot_mqqplot.py +126 -75
  46. gwaslab/viz_plot_qqplot.py +11 -8
  47. gwaslab/viz_plot_regionalplot.py +36 -33
  48. gwaslab/viz_plot_rg_heatmap.py +28 -26
  49. gwaslab/viz_plot_stackedregional.py +40 -21
  50. gwaslab/viz_plot_trumpetplot.py +65 -61
  51. gwaslab-3.4.39.dist-info/LICENSE +674 -0
  52. {gwaslab-3.4.37.dist-info → gwaslab-3.4.39.dist-info}/METADATA +5 -4
  53. gwaslab-3.4.39.dist-info/RECORD +80 -0
  54. gwaslab-3.4.37.dist-info/RECORD +0 -72
  55. /gwaslab-3.4.37.dist-info/LICENSE → /gwaslab-3.4.39.dist-info/LICENSE_before_v3.4.39 +0 -0
  56. {gwaslab-3.4.37.dist-info → gwaslab-3.4.39.dist-info}/WHEEL +0 -0
  57. {gwaslab-3.4.37.dist-info → gwaslab-3.4.39.dist-info}/top_level.txt +0 -0
gwaslab/io_to_formats.py CHANGED
@@ -1,6 +1,7 @@
1
1
  import pandas as pd
2
2
  import yaml
3
3
  import hashlib
4
+ import copy
4
5
  from pysam import tabix_compress
5
6
  from pysam import tabix_index
6
7
  from datetime import datetime
@@ -11,6 +12,8 @@ from gwaslab.g_Log import Log
11
12
  from gwaslab.bd_common_data import get_format_dict
12
13
  from gwaslab.bd_common_data import get_number_to_chr
13
14
  from gwaslab.g_version import gwaslab_info
15
+ from gwaslab.bd_get_hapmap3 import gethapmap3
16
+
14
17
  # to vcf
15
18
  # to fmt
16
19
  ## vcf
@@ -19,7 +22,168 @@ from gwaslab.g_version import gwaslab_info
19
22
  ## annovar
20
23
  ## general : ldsc, plink, plink2, saige, regenie
21
24
  ###################################################################################################################################################
25
+ def _to_format(sumstats,
26
+ path="./sumstats",
27
+ fmt="gwaslab",
28
+ extract=None,
29
+ exclude=None,
30
+ cols=None,
31
+ id_use="rsID",
32
+ hapmap3=False,
33
+ exclude_hla=False,
34
+ hla_range=(25,34),
35
+ build=None,
36
+ n=None,
37
+ no_status=False,
38
+ output_log=True,
39
+ to_csvargs=None,
40
+ float_formats=None,
41
+ xymt_number=False,
42
+ xymt=None,
43
+ chr_prefix="",
44
+ meta=None,
45
+ ssfmeta=False,
46
+ md5sum=False,
47
+ bgzip=False,
48
+ tabix=False,
49
+ tabix_indexargs={},
50
+ log=Log(),
51
+ verbose=True):
52
+
53
+ if to_csvargs is None:
54
+ to_csvargs = {}
55
+ if float_formats is None:
56
+ float_formats={}
57
+ if cols is None:
58
+ cols=[]
59
+ if xymt is None:
60
+ xymt = ["X","Y","MT"]
61
+ onetime_log = copy.deepcopy(log)
62
+
63
+ #######################################################################################################
64
+
65
+ formatlist= get_formats_list() + ["vep","bed","annovar","vcf"]
66
+ if fmt in formatlist:
67
+ onetime_log.write("Start to convert the output sumstats in: ",fmt, " format",verbose=verbose)
68
+ else:
69
+ raise ValueError("Please select a format to output")
70
+ suffix=fmt
71
+
72
+ #######################################################################################################
73
+ # filter
74
+ output = sumstats.copy()
75
+
76
+ if extract is not None:
77
+ onetime_log.write(" -Extracting {} variants from the main DataFrame...".format(len(extract)),verbose=verbose)
78
+ output = output.loc[output[id_use].isin(extract),:]
79
+ onetime_log.write(" -Extracted {} variants from the main DataFrame...".format(len(output)),verbose=verbose)
80
+
81
+ if exclude is not None:
82
+ onetime_log.write(" -Excluding {} variants from the main DataFrame...".format(len(exclude)),verbose=verbose)
83
+ output = output.loc[~output[id_use].isin(exclude),:]
84
+ onetime_log.write(" -Excluded {} variants from the main DataFrame...".format(len(output)),verbose=verbose)
85
+
86
+ #hla and hapmap3 #######################################################################################
87
+
88
+ #exclude hla
89
+ if exclude_hla==True:
90
+ onetime_log.write(" -Excluding variants in MHC (HLA) region ...",verbose=verbose)
91
+ before = len(output)
92
+ is_hla = (output["CHR"].astype("string") == "6") & (output["POS"].astype("Int64") > hla_range[0]*1000000) & (output["POS"].astype("Int64") < hla_range[1]*1000000)
93
+ output = output.loc[~is_hla,:]
94
+ after = len(output)
95
+ onetime_log.write(" -Exclude "+ str(before - after) + " variants in MHC (HLA) region : {}Mb - {}Mb.".format(hla_range[0], hla_range[1]),verbose=verbose)
96
+ suffix = "noMHC."+suffix
97
+
98
+ #extract hapmap3 SNPs
99
+ if hapmap3==True:
100
+ output = gethapmap3(output,build=build,verbose=verbose)
101
+ after = len(output)
102
+ onetime_log.write(" -Extract {} variants in Hapmap3 datasets for build {}.".format(after, build ),verbose=verbose)
103
+ suffix = "hapmap3."+suffix
104
+
105
+ # add a n column
106
+ if n is not None:
107
+ output["N"] = n
108
+
109
+ #######################################################################################################
110
+ #formatting float statistics
111
+ onetime_log.write(" -Formatting statistics ...",verbose=verbose)
112
+
113
+ formats = {
114
+ 'EAF': '{:.4g}',
115
+ 'MAF': '{:.4g}',
116
+ 'BETA': '{:.4f}',
117
+ 'SE': '{:.4f}',
118
+ 'BETA_95U': '{:.4f}',
119
+ 'BETA_95L': '{:.4f}',
120
+ 'Z': '{:.4f}',
121
+ 'CHISQ': '{:.4f}',
122
+ 'F': '{:.4f}',
123
+ 'OR': '{:.4f}',
124
+ 'OR_95U': '{:.4f}',
125
+ 'OR_95L': '{:.4f}',
126
+ 'HR': '{:.4f}',
127
+ 'HR_95U': '{:.4f}',
128
+ 'HR_95L': '{:.4f}',
129
+ 'INFO': '{:.4f}',
130
+ 'P': '{:.4e}',
131
+ 'MLOG10P': '{:.4f}',
132
+ 'DAF': '{:.4f}'}
133
+
134
+ for col, f in float_formats.items():
135
+ if col in output.columns:
136
+ formats[col]=f
137
+
138
+ for col, f in formats.items():
139
+ if col in output.columns:
140
+ if str(output[col].dtype) in ["Float32","Float64","float64","float32","float16","float"]:
141
+ output[col] = output[col].map(f.format)
22
142
 
143
+ onetime_log.write(" -Float statistics formats:",verbose=verbose)
144
+ keys=[]
145
+ values=[]
146
+ for key,value in formats.items():
147
+ if key in output.columns:
148
+ keys.append(key)
149
+ values.append(value)
150
+
151
+ onetime_log.write(" - Columns :",keys,verbose=verbose)
152
+ onetime_log.write(" - Output formats:",values,verbose=verbose)
153
+
154
+ ##########################################################################################################
155
+ # output, mapping column names
156
+
157
+ if fmt in get_formats_list() + ["vep","bed","annovar","vcf"]:
158
+ tofmt(output,
159
+ path=path,
160
+ fmt=fmt,
161
+ cols=cols,
162
+ suffix=suffix,
163
+ build=build,
164
+ verbose=verbose,
165
+ no_status=no_status,
166
+ log=onetime_log,
167
+ to_csvargs=to_csvargs,
168
+ chr_prefix=chr_prefix,
169
+ meta=meta,
170
+ ssfmeta=ssfmeta,
171
+ bgzip=bgzip,
172
+ tabix=tabix,
173
+ tabix_indexargs=tabix_indexargs,
174
+ md5sum=md5sum,
175
+ xymt_number=xymt_number,
176
+ xymt=xymt)
177
+
178
+ if output_log is True:
179
+ log_path = path + "."+ suffix + ".log"
180
+ onetime_log.write(" -Saving log file to: {}".format(log_path),verbose=verbose)
181
+ onetime_log.write("Finished outputting successfully!",verbose=verbose)
182
+ try:
183
+ onetime_log.save(log_path, verbose=False)
184
+ except:
185
+ pass
186
+
23
187
  ###################################################################################################################################################
24
188
  def tofmt(sumstats,
25
189
  meta,
@@ -47,229 +211,74 @@ def tofmt(sumstats,
47
211
  if fmt in ["ssf"]:
48
212
  xymt_number=True
49
213
  if "SNPID" in sumstats.columns:
50
- if verbose: log.write(' - Replacing SNPID separator from ":" to "_"...')
214
+ log.write(' -Replacing SNPID separator from ":" to "_"...')
51
215
  sumstats["SNPID"] = sumstats["SNPID"].str.replace(":","_")
52
- if verbose: log.write(" - Start outputting sumstats in "+fmt+" format...")
216
+ log.write(" -Start outputting sumstats in "+fmt+" format...")
53
217
 
54
218
  if "CHR" in sumstats.columns:
55
219
  if xymt_number is False and pd.api.types.is_integer_dtype(sumstats["CHR"]):
56
220
  sumstats["CHR"]= sumstats["CHR"].map(get_number_to_chr(xymt=xymt,prefix=chr_prefix))
57
221
  elif chr_prefix is not None:
58
222
  sumstats["CHR"]= chr_prefix + sumstats["CHR"].astype("string")
59
-
60
- ### calculate meta data
61
- if "EAF" in sumstats.columns:
62
- min_maf = sumstats["EAF"].min()
63
- else:
64
- min_maf = "Unknown"
65
-
66
- if "N" in sumstats.columns:
67
- n_median = sumstats["N"].median()
68
- n_max = sumstats["N"].max()
69
- n_min = sumstats["N"].min()
70
- else:
71
- n_median = "Unknown"
72
- n_max = "Unknown"
73
- n_min = "Unknown"
74
-
75
223
 
224
+ ####################################################################################################################
76
225
  if fmt=="bed":
77
226
  # bed-like format, 0-based,
78
227
  # first 3 columns : chromosome, start, end
79
228
  # https://genome.ucsc.edu/FAQ/FAQformat.html#format1
80
- is_snp = (sumstats["EA"].str.len() == sumstats["NEA"].str.len())
81
- is_insert = (sumstats["EA"].str.len()>1) &(sumstats["NEA"].str.len()==1)
82
- is_delete = (sumstats["EA"].str.len()==1) &(sumstats["NEA"].str.len()>1)
229
+ is_snp,is_insert,is_delete = _check_indel(sumstats,log,verbose)
230
+ log.write(" -formatting to 0-based bed-like file...")
231
+ log.write(" -format description: {}".format("https://genome.ucsc.edu/FAQ/FAQformat.html#format1"))
83
232
 
84
- if verbose: log.write(" -Number of SNPs :",sum(is_snp))
85
- if verbose: log.write(" -Number of Insertions :",sum(is_insert))
86
- if verbose: log.write(" -Number of Deletions :",sum(is_delete))
87
-
88
- if verbose: log.write(" -formatting to 0-based bed-like file...")
89
- # for snp
90
- # start = pos - 1 ; end = pos
91
- # A/G
92
- # AT/CG
93
- sumstats.loc[is_snp,"START"] = sumstats.loc[is_snp,"POS"]-1
94
- sumstats.loc[is_snp,"END"] = sumstats.loc[is_snp,"POS"]-1 + sumstats.loc[is_snp,"NEA"].str.len()
95
- sumstats.loc[is_snp,"NEA/EA"] = sumstats.loc[is_snp,"NEA"].astype("string")+"/"+sumstats.loc[is_snp,"EA"].astype("string")
96
-
97
- # for insertion
98
- # start = pos : end = pos
99
- # A/ATC -> -/TC
100
- sumstats.loc[is_insert,"START"] = sumstats.loc[is_insert,"POS"]
101
- sumstats.loc[is_insert,"END"] = sumstats.loc[is_insert,"POS"]
102
- sumstats.loc[is_insert,"NEA/EA"] = "-/"+sumstats.loc[is_insert,"EA"].str.slice(start=1)
103
-
104
- # for deletion
105
- # start = pos - 1 +1; end = pos -1 +1+ len(Ref)
106
- # ATC/A -> TC/-
107
- sumstats.loc[is_delete,"START"] = sumstats.loc[is_delete,"POS"]
108
- sumstats.loc[is_delete,"END"] = sumstats.loc[is_delete,"POS"] + sumstats.loc[is_delete,"NEA"].str.len() - 1
109
- sumstats.loc[is_delete,"NEA/EA"] = sumstats.loc[is_delete,"NEA"].str.slice(start=1)+"/-"
110
-
111
- sumstats["STRAND"]="+"
233
+ sumstats = _adjust_position(sumstats, fmt, is_snp, is_insert, is_delete, log, verbose )
112
234
 
113
- sumstats["START"] = sumstats["START"].astype("Int64")
114
- sumstats["END"] = sumstats["END"].astype("Int64")
115
235
  ouput_cols=["CHR","START","END","NEA/EA","STRAND","SNPID"] + cols
116
236
 
117
- sumstats = sumstats.loc[:,ouput_cols ]
118
- path = path + "."+suffix
119
- if verbose: log.write(" -Output columns:",sumstats.columns)
120
- if verbose: log.write(" -Output path:",path)
121
-
122
- sumstats.to_csv(path,sep="\t",index=None,header=None,**to_csvargs)
123
- #tabix_compress
124
- #tabix_index
125
- if bgzip is True:
126
- if verbose: log.write(" -bgzip compressing ...")
127
- tabix_compress(path, path+".gz",force=True)
128
- if tabix is True:
129
- if verbose: log.write(" -tabix indexing...")
130
- if "preset" not in tabix_indexargs:
131
- tabix_indexargs["preset"] = "bed"
132
- if "force" not in tabix_indexargs:
133
- tabix_indexargs["force"] = True
134
-
135
- tabix_index(path+".gz", **tabix_indexargs)
237
+ _output_bed_like(sumstats, path, "bed", suffix, ouput_cols,to_csvargs,bgzip, tabix, tabix_indexargs, md5sum, log, verbose)
136
238
  ####################################################################################################################
137
239
  elif fmt=="vep":
138
240
  # bed-like format, 1-based
139
241
  # first 6 columns : chromosome, start, end, allele, strand, identifier
140
242
  # https://asia.ensembl.org/info/docs/tools/vep/vep_formats.html
141
243
 
142
- is_snp = (sumstats["EA"].str.len() == sumstats["NEA"].str.len())
143
- is_insert = (sumstats["EA"].str.len()>1) &(sumstats["NEA"].str.len()==1)
144
- is_delete = (sumstats["EA"].str.len()==1) &(sumstats["NEA"].str.len()>1)
145
-
146
- if verbose: log.write(" -Number of SNPs :",sum(is_snp))
147
- if verbose: log.write(" -Number of Insertions :",sum(is_insert))
148
- if verbose: log.write(" -Number of Deletions :",sum(is_delete))
244
+ is_snp,is_insert,is_delete = _check_indel(sumstats,log,verbose)
149
245
 
150
- if verbose: log.write(" -formatting to 1-based bed-like file (for vep)...")
151
- # for snp
152
- # start = pos ; end = pos
153
- sumstats.loc[is_snp,"START"] = sumstats.loc[is_snp,"POS"] + (sumstats.loc[is_snp,"NEA"].str.len() - 1 )
154
- sumstats.loc[is_snp,"END"] = sumstats.loc[is_snp,"POS"] + (sumstats.loc[is_snp,"NEA"].str.len() - 1 )
155
- sumstats.loc[is_snp,"NEA/EA"] = sumstats.loc[is_snp,"NEA"].astype("string")+"/"+sumstats.loc[is_snp,"EA"].astype("string")
156
-
157
- # for insertion
158
- # start = pos+1 ; end = pos
159
- # A/ATC -> -/TC
160
- sumstats.loc[is_insert,"START"] = sumstats.loc[is_insert,"POS"] + 1
161
- sumstats.loc[is_insert,"END"] = sumstats.loc[is_insert,"POS"]
162
- sumstats.loc[is_insert,"NEA/EA"] = "-/" + sumstats.loc[is_insert,"EA"].str.slice(start=1)
163
-
164
- # for deletion
165
- # start = pos ; end = pos + len(Ref) -1
166
- # ATC/A -> TC/-
167
- sumstats.loc[is_delete,"START"] = sumstats.loc[is_delete,"POS"] + 1
168
- sumstats.loc[is_delete,"END"] = sumstats.loc[is_delete,"POS"] + (sumstats.loc[is_delete,"NEA"].str.len() -1)
169
- sumstats.loc[is_delete,"NEA/EA"] = sumstats.loc[is_delete,"NEA"].str.slice(start=1)+"/-"
170
-
171
- sumstats["STRAND"]="+"
172
-
173
- sumstats["START"] = sumstats["START"].astype("Int64")
174
- sumstats["END"] = sumstats["END"].astype("Int64")
246
+ log.write(" -formatting to 1-based bed-like file (for vep)...")
247
+ log.write(" -format description: {}".format("http://asia.ensembl.org/info/docs/tools/vep/vep_formats.html"))
248
+ sumstats = _adjust_position(sumstats, fmt, is_snp, is_insert, is_delete , log, verbose)
175
249
 
176
250
  ouput_cols=["CHR","START","END","NEA/EA","STRAND","SNPID"]+ cols
177
- sumstats = sumstats.loc[:,ouput_cols]
178
- path = path + "."+suffix+".gz"
179
- if verbose: log.write(" -Output columns:",sumstats.columns)
180
- if verbose: log.write(" -Output path:",path)
181
251
 
182
- sumstats.to_csv(path,sep="\t",index=None,header=None,**to_csvargs)
183
- if md5sum is True: md5sum_file(path,log,verbose)
252
+ _output_bed_like(sumstats, path,"vep", suffix, ouput_cols,to_csvargs,bgzip, tabix, tabix_indexargs, md5sum, log, verbose)
253
+
184
254
  ####################################################################################################################
185
255
  elif fmt=="annovar":
186
256
  # bed-like format, 1-based,
187
257
  # first 3 columns : Chromosome ("chr" prefix is optional), Start, End, Reference Allelel, Alternative Allele
188
258
  # https://annovar.openbioinformatics.org/en/latest/user-guide/input/
189
- is_snp = (sumstats["EA"].str.len() == sumstats["NEA"].str.len())
190
- is_insert = (sumstats["EA"].str.len()>1) &(sumstats["NEA"].str.len()==1)
191
- is_delete = (sumstats["EA"].str.len()==1) &(sumstats["NEA"].str.len()>1)
192
-
193
- if verbose: log.write(" -Number of SNPs :",sum(is_snp))
194
- if verbose: log.write(" -Number of Insertions :",sum(is_insert))
195
- if verbose: log.write(" -Number of Deletions :",sum(is_delete))
196
-
197
- if verbose: log.write(" -formatting to 1-based bed-like file...")
198
- # for snp
199
- # start = pos ; end = pos
200
- # A/G
201
- # AT/CG
202
- sumstats.loc[is_snp,"START"] = sumstats.loc[is_snp,"POS"]
203
- sumstats.loc[is_snp,"END"] = sumstats.loc[is_snp,"POS"]-1 + sumstats.loc[is_snp,"NEA"].str.len()
204
- sumstats.loc[is_snp,"NEA_out"] = sumstats.loc[is_snp,"NEA"].astype("string")
205
- sumstats.loc[is_snp,"EA_out"] = sumstats.loc[is_snp,"EA"].astype("string")
206
-
207
- # for insertion
208
- # start = pos : end = pos
209
- # A/ATC -> -/TC
210
- sumstats.loc[is_insert,"START"] = sumstats.loc[is_insert,"POS"]+1
211
- sumstats.loc[is_insert,"END"] = sumstats.loc[is_insert,"POS"]+1
212
- sumstats.loc[is_insert,"NEA_out"] = "-"
213
- sumstats.loc[is_insert,"EA_out"] = sumstats.loc[is_insert,"EA"].str.slice(start=1)
214
-
215
- # for deletion
216
- # start = pos - 1 +1; end = pos -1 +1+ len(Ref)
217
- # ATC/A -> TC/-
218
- sumstats.loc[is_delete,"START"] = sumstats.loc[is_delete,"POS"]
219
- sumstats.loc[is_delete,"END"] = sumstats.loc[is_delete,"POS"]- 1 + sumstats.loc[is_delete,"NEA"].str.len()
220
- sumstats.loc[is_delete,"NEA_out"] = sumstats.loc[is_delete,"NEA"].str.slice(start=1)
221
- sumstats.loc[is_delete,"EA_out"] = "-"
259
+ is_snp,is_insert,is_delete = _check_indel(sumstats,log,verbose)
260
+
261
+ log.write(" -formatting to 1-based bed-like file...")
262
+ log.write(" -format description: {}".format("https://annovar.openbioinformatics.org/en/latest/user-guide/input/"))
222
263
 
223
- sumstats["START"] = sumstats["START"].astype("Int64")
224
- sumstats["END"] = sumstats["END"].astype("Int64")
264
+ sumstats = _adjust_position(sumstats, fmt, is_snp, is_insert, is_delete, log, verbose )
225
265
 
226
266
  ouput_cols=["CHR","START","END","NEA_out","EA_out","SNPID"]+ cols
227
- sumstats = sumstats.loc[:,ouput_cols]
228
- path = path + "."+suffix
229
- if verbose: log.write(" -Output columns:",sumstats.columns)
230
- if verbose: log.write(" -Output path:",path)
231
267
 
232
- sumstats.to_csv(path,sep="\t",index=None,header=None,**to_csvargs)
233
- #tabix_compress
234
- #tabix_index
235
- if bgzip is True:
236
- if verbose: log.write(" -bgzip compressing ...")
237
- tabix_compress(path, path+".gz",force=True)
238
- if md5sum is True: md5sum_file(path+".gz",log,verbose)
239
- if tabix is True:
240
- if verbose: log.write(" -tabix indexing...")
241
- if "preset" not in tabix_indexargs:
242
- tabix_indexargs["preset"] = "bed"
243
- if "force" not in tabix_indexargs:
244
- tabix_indexargs["force"] = True
245
- tabix_index(path+".gz", **tabix_indexargs)
268
+ _output_bed_like(sumstats, path, fmt, suffix, ouput_cols,to_csvargs,bgzip, tabix, tabix_indexargs, md5sum, log, verbose)
269
+
246
270
  ####################################################################################################################
247
271
  elif fmt=="vcf":
248
- if verbose: log.write(" -"+fmt+" format will be loaded...")
272
+ # GWAS-VCF
273
+ log.write(" -"+fmt+" format will be loaded...",verbose=verbose)
249
274
  meta_data,rename_dictionary = get_format_dict(fmt,inverse=True)
250
- #loading format data
251
- if verbose:
252
- log.write(" -"+fmt+" format meta info:")
253
- for key,value in meta_data.items():
254
- if key not in ["format_fixed_header","format_contig_19","format_contig_38"]:
255
- log.write(" -",key," : ",value)
275
+ print_format_info(fmt=fmt, meta_data=meta_data,rename_dictionary=rename_dictionary,verbose=verbose, log=log, output=True, skip_meta_records=["format_fixed_header","format_contig_19","format_contig_38"])
256
276
 
257
277
  # determine which ID to use
258
278
  if "rsID" in sumstats.columns:
259
279
  rename_dictionary["rsID"]="ID"
260
280
  else:
261
281
  rename_dictionary["SNPID"]="ID"
262
-
263
- # logging
264
- if verbose:
265
- log.write(" -gwaslab to "+fmt+" format dictionary:")
266
- keys=[]
267
- values=[]
268
- for key,value in rename_dictionary.items():
269
- keys.append(key)
270
- values.append(value)
271
- log.write(" - gwaslab keys:",','.join(keys))
272
- log.write(" - "+fmt+" values:",','.join(values))
273
282
 
274
283
  # get the columns to output
275
284
  ouput_cols=[]
@@ -277,12 +286,10 @@ def tofmt(sumstats,
277
286
  if i in rename_dictionary.keys():
278
287
  ouput_cols.append(i)
279
288
  ouput_cols = ouput_cols +["STATUS"]+ cols
280
- sumstats = sumstats.loc[:,ouput_cols]
289
+ sumstats = sumstats[ouput_cols]
281
290
  sumstats = sumstats.rename(columns=rename_dictionary)
282
291
 
283
- # calculate meta data
284
- harmonised = sum(sumstats["STATUS"].str.match( r"\w\w\w[0][0123][012][01234]", case=False, flags=0, na=False ) )
285
- switchedalleles = sum(sumstats["STATUS"].str.match( r"\w\w\w[0][0123][12][24]", case=False, flags=0, na=False ) )
292
+ # replace : with _
286
293
  sumstats["ID"] = sumstats["ID"].str.replace(":","_")
287
294
 
288
295
  # process Allele frequency data
@@ -297,35 +304,21 @@ def tofmt(sumstats,
297
304
  if i in meta_data["format_format"]:
298
305
  output_format.append(i)
299
306
 
300
- # Create vcf header
301
- vcf_header= meta_data["format_fixed_header"] +"\n"+ meta_data["format_contig_"+str(build)]+"\n"
302
- # Create sample header
303
- vcf_header+="##SAMPLE=<ID={},TotalVariants={},VariantsNotRead=0,HarmonisedVariants={},VariantsNotHarmonised={},SwitchedAlleles={},StudyType={}>\n".format(meta["gwaslab"]["study_name"],len(sumstats),harmonised,len(sumstats)-harmonised,switchedalleles,meta["gwaslab"]["study_type"])
304
- vcf_header+="##gwaslab_version="+gwaslab_info()["version"]+"\n"
305
-
306
-
307
- #StudyID=meta["Name"]
308
- #otalVariants = len(sumstats)
309
- #HarmonisedVariants =
310
- #VariantsNotHarmonised =
311
- #StudyType=
312
- ##SAMPLE=<ID=IEU-b-1,TotalVariants=9851866,VariantsNotRead=0,HarmonisedVariants=9851866,VariantsNotHarmonised=0,SwitchedAlleles=9851866,StudyType=Continuous>
313
-
314
-
315
307
  # determine path
316
308
  path = path + "."+suffix
317
- if verbose: log.write(" -Output path:",path)
318
- if verbose: log.write(" -vcf header contig build:"+str(build))
319
309
 
310
+
311
+ vcf_header = _process_vcf_header(sumstats, meta, meta_data, build, log, verbose)
312
+
313
+ log.write(" -Writing sumstats to: {}...".format(path),verbose=verbose)
320
314
  # output header
321
315
  with open(path,"w") as file:
322
316
  file.write(vcf_header)
323
317
 
324
318
  with open(path,"a") as file:
325
- if verbose: log.write(" -Output columns:"," ".join(meta_data["format_fixed"]+[meta["gwaslab"]["study_name"]]))
319
+ log.write(" -Output columns:"," ".join(meta_data["format_fixed"]+[meta["gwaslab"]["study_name"]]))
326
320
  file.write("\t".join(meta_data["format_fixed"]+[meta["gwaslab"]["study_name"]])+"\n")
327
- if verbose: log.write(" -Outputing data...")
328
- counter=0
321
+ log.write(" -Outputing data...")
329
322
  QUAL="."
330
323
  FILTER="PASS"
331
324
  for index,row in sumstats.iterrows():
@@ -337,112 +330,153 @@ def tofmt(sumstats,
337
330
  INFO=str(row["INFO"])
338
331
  FORMAT=":".join(output_format)
339
332
  DATA=":".join(row[output_format].astype("string"))
340
- file.write(CHROM+"\t"+POS+"\t"+ID+"\t"+REF+"\t"+ALT+"\t"+QUAL+"\t"+FILTER+"\t"+INFO+"\t"+FORMAT+"\t"+DATA+"\n")
341
-
342
- if bgzip==True:
343
- if verbose: log.write(" -bgzip compressing ...")
344
- tabix_compress(path, path+".gz",force=True)
345
- if md5sum is True: md5sum_file(path+".gz",log,verbose)
346
- if tabix==True:
347
- if verbose: log.write(" -tabix indexing...")
348
- if "preset" not in tabix_indexargs:
349
- tabix_indexargs["preset"] = "vcf"
350
- if "force" not in tabix_indexargs:
351
- tabix_indexargs["force"] = True
352
- tabix_index(path+".gz", **tabix_indexargs)
333
+ file.write("{}\t{}\t{}\t{}\t{}\t{}\t{}\t{}\t{}\t{}\n".format(CHROM, POS, ID, REF, ALT, QUAL, FILTER, INFO, FORMAT, DATA))
334
+ _bgzip_tabix_md5sum(path, fmt, bgzip, md5sum, tabix, tabix_indexargs, log, verbose)
335
+
353
336
  ####################################################################################################################
354
- elif fmt in get_formats_list():
355
- if verbose: log.write(" -"+fmt+" format will be loaded...")
337
+ elif fmt in get_formats_list():
338
+ # tabular
339
+ log.write(" -"+fmt+" format will be loaded...",verbose=verbose)
356
340
  meta_data,rename_dictionary = get_format_dict(fmt,inverse=True)
357
341
  print_format_info(fmt=fmt, meta_data=meta_data,rename_dictionary=rename_dictionary,verbose=verbose, log=log, output=True)
358
- #if verbose:
359
- # log.write(" -"+fmt+" format meta info:")
360
- # for key,value in meta_data.items():
361
- # if type(value) is list:
362
- # log.write(" -",key," : ",','.join(value))
363
- # else:
364
- # log.write(" -",key," : ",value)
365
- #if verbose:
366
- # log.write(" -gwaslab to "+fmt+" format dictionary:",)
367
- # keys=[]
368
- # values=[]
369
- # for key,value in rename_dictionary.items():
370
- # keys.append(key)
371
- # values.append(value)
372
- # log.write(" - gwaslab keys:", ','.join(keys))
373
- # log.write(" - "+fmt+" values:" , ','.join(values))
374
-
375
- # grab format cols that exist in sumstats
376
- ouput_cols=[]
377
- for i in sumstats.columns:
378
- if i in rename_dictionary.keys():
379
- ouput_cols.append(i)
380
- # + additional cols
381
- ouput_cols = ouput_cols + cols
382
- try:
383
- if no_status == True:
384
- ouput_cols.remove("STATUS")
385
- except:
386
- pass
387
- sumstats = sumstats.loc[:,ouput_cols]
388
- sumstats = sumstats.rename(columns=rename_dictionary)
389
342
 
390
-
391
343
  ymal_path = path + "."+suffix+".tsv-meta.ymal"
392
344
  path = path + "."+suffix+".tsv.gz"
345
+ log.write(" -Output path:",path, verbose=verbose)
393
346
 
394
- if verbose: log.write(" -Output path:",path)
347
+ sumstats,to_csvargs = _configure_output_cols_and_args(sumstats, rename_dictionary, cols, no_status, path, meta_data, to_csvargs, log, verbose)
395
348
 
396
- if path is not None:
397
- if "format_separator" in meta_data.keys():
398
- to_csvargs["sep"] = meta_data["format_separator"]
399
- else:
400
- to_csvargs["sep"]="\t"
401
- if "format_na" in meta_data.keys():
402
- to_csvargs["na_rep"] = meta_data["format_na"]
403
- if "format_col_order" in meta_data.keys():
404
- fixed_col =[]
405
- other_col=[]
406
- for i in meta_data["format_col_order"]:
407
- if i in sumstats.columns:
408
- fixed_col.append(i)
409
- for i in sumstats.columns:
410
- if i not in meta_data["format_col_order"]:
411
- other_col.append(i)
412
-
413
- sumstats = sumstats.loc[:,fixed_col + other_col]
414
- if verbose: log.write(" -Reordering columns...")
415
-
416
- if verbose: log.write(" -Output columns:",','.join(sumstats.columns))
417
- sumstats.to_csv(path, index=None,**to_csvargs)
418
-
419
- if md5sum is True:
349
+ log.write(" -Writing sumstats to: {}...".format(path),verbose=verbose)
350
+ sumstats.to_csv(path, index=None,**to_csvargs)
351
+
352
+ if md5sum == True:
420
353
  md5_value = md5sum_file(path,log,verbose)
421
354
  else:
422
355
  md5_value = calculate_md5sum_file(path)
423
356
 
424
357
  ## update ssf-style meta data and export to yaml file
425
- if ssfmeta==True:
426
- meta_copy = meta.copy()
427
- if "format_cite_name" in meta_data.keys():
428
- meta_copy["file_type"] = meta_data["format_cite_name"]
429
- else:
430
- meta_copy["file_type"] = fmt
431
- meta_copy["minor_allele_freq_lower_limit"] = min_maf
432
- meta_copy["data_file_name"] = path
433
- meta_copy["data_file_md5sum"] = md5_value
434
- meta_copy["date_last_modified"] = get_format_date_and_time()
435
- meta_copy["samples"]["sample_size"] = n_max
436
- meta_copy["gwaslab"]["samples"]["sample_size_min"] = n_min
437
- meta_copy["gwaslab"]["samples"]["sample_size_median"] = n_median
438
- meta_copy["gwaslab"]["variants"]["variant_number"] = len(sumstats)
439
- if verbose: log.write(" -Exporting SSF-style meta data to {}".format(ymal_path))
440
- with open(ymal_path, 'w') as outfile:
441
- yaml.dump(meta_copy, outfile)
358
+ _configure_ssf_meta(sumstats, fmt, ssfmeta, meta, meta_data, path, md5_value, ymal_path, log, verbose)
359
+
442
360
  return sumstats
361
+ ####################################################################################################################
362
+ def _configure_output_cols_and_args(sumstats, rename_dictionary, cols, no_status, path, meta_data, to_csvargs, log, verbose):
363
+ # grab format cols that exist in sumstats
364
+ ouput_cols=[]
365
+ for i in sumstats.columns:
366
+ if i in rename_dictionary.keys():
367
+ ouput_cols.append(i)
368
+
369
+ # + additional cols and remove duplicated
370
+ ouput_cols = list(set(ouput_cols + cols))
443
371
 
372
+ # remove STATUS
373
+ try:
374
+ if no_status == True:
375
+ ouput_cols.remove("STATUS")
376
+ except:
377
+ pass
378
+
379
+ #filter and rename to target fromat headers
380
+ sumstats = sumstats[ouput_cols]
381
+ sumstats = sumstats.rename(columns=rename_dictionary)
382
+
383
+ # configure target format args and reorder columns
384
+ if "format_separator" in meta_data.keys():
385
+ to_csvargs["sep"] = meta_data["format_separator"]
386
+ else:
387
+ to_csvargs["sep"]="\t"
388
+ if "format_na" in meta_data.keys():
389
+ to_csvargs["na_rep"] = meta_data["format_na"]
390
+ if "format_col_order" in meta_data.keys():
391
+ fixed_col =[]
392
+ other_col=[]
393
+ for i in meta_data["format_col_order"]:
394
+ if i in sumstats.columns:
395
+ fixed_col.append(i)
396
+ for i in sumstats.columns:
397
+ if i not in meta_data["format_col_order"]:
398
+ other_col.append(i)
399
+ sumstats = sumstats[fixed_col + other_col]
400
+ log.write(" -Output columns: {}".format(",".join(sumstats.columns)),verbose=verbose)
401
+ return sumstats, to_csvargs
402
+
403
+
404
+ def _configure_ssf_meta(sumstats, fmt, ssfmeta, meta, meta_data, path, md5_value, ymal_path, log, verbose):
405
+ ### calculate meta data
406
+ if "EAF" in sumstats.columns:
407
+ min_maf = sumstats["EAF"].min()
408
+ else:
409
+ min_maf = "Unknown"
410
+
411
+ if "N" in sumstats.columns:
412
+ n_median = sumstats["N"].median()
413
+ n_max = sumstats["N"].max()
414
+ n_min = sumstats["N"].min()
415
+ else:
416
+ n_median = "Unknown"
417
+ n_max = "Unknown"
418
+ n_min = "Unknown"
419
+
420
+ if ssfmeta==True:
421
+ sumstats_meta_copy = meta.copy()
422
+ if "format_cite_name" in meta_data.keys():
423
+ sumstats_meta_copy["file_type"] = meta_data["format_cite_name"]
424
+ else:
425
+ sumstats_meta_copy["file_type"] = fmt
426
+ sumstats_meta_copy["minor_allele_freq_lower_limit"] = min_maf
427
+ sumstats_meta_copy["data_file_name"] = path
428
+ sumstats_meta_copy["data_file_md5sum"] = md5_value
429
+ sumstats_meta_copy["date_last_modified"] = get_format_date_and_time()
430
+ sumstats_meta_copy["samples"]["sample_size"] = n_max
431
+ sumstats_meta_copy["gwaslab"]["samples"]["sample_size_min"] = n_min
432
+ sumstats_meta_copy["gwaslab"]["samples"]["sample_size_median"] = n_median
433
+ sumstats_meta_copy["gwaslab"]["variants"]["variant_number"] = len(sumstats)
434
+ log.write(" -Exporting SSF-style meta data to {}".format(ymal_path),verbose=verbose)
435
+ with open(ymal_path, 'w') as outfile:
436
+ yaml.dump(sumstats_meta_copy, outfile)
437
+
438
+
439
+
440
+ def _output_bed_like(sumstats, path, fmt, suffix, ouput_cols,to_csvargs,bgzip, tabix, tabix_indexargs, md5sum, log, verbose):
441
+ sumstats = sumstats[ouput_cols]
442
+ path = path + "."+suffix
443
+ log.write(" -Output columns: {}".format(",".join(sumstats.columns)),verbose=verbose)
444
+ log.write(" -Writing sumstats to: {}...".format(path),verbose=verbose)
445
+ sumstats.to_csv(path,sep="\t",index=None,header=None,**to_csvargs)
446
+ _bgzip_tabix_md5sum(path, fmt, bgzip, md5sum, tabix, tabix_indexargs, log, verbose)
447
+
448
+
449
+ def _bgzip_tabix_md5sum(path, fmt, bgzip, md5sum, tabix, tabix_indexargs, log, verbose):
450
+ if bgzip == True:
451
+ log.write(" -bgzip compressing : {}...".format(path+".gz"),verbose=verbose)
452
+ tabix_compress(path, path+".gz",force=True)
453
+ if md5sum == True:
454
+ if bgzip == True:
455
+ md5sum_file(path+".gz",log,verbose)
456
+ else:
457
+ md5sum_file(path,log,verbose)
458
+ if tabix == True and bgzip == True:
459
+ log.write(" -tabix indexing : : {}...".format(path+".gz.tbi"),verbose=verbose)
460
+ if "preset" not in tabix_indexargs:
461
+ tabix_indexargs["preset"] = fmt
462
+ if "force" not in tabix_indexargs:
463
+ tabix_indexargs["force"] = True
464
+ tabix_index(path+".gz", **tabix_indexargs)
465
+
466
+
467
+ def _check_indel(sumstats,log,verbose):
468
+ is_snp = (sumstats["EA"].str.len() == sumstats["NEA"].str.len())
469
+ is_insert = (sumstats["EA"].str.len()>1) &(sumstats["NEA"].str.len()==1)
470
+ is_delete = (sumstats["EA"].str.len()==1) &(sumstats["NEA"].str.len()>1)
471
+
472
+ log.write(" -Number of SNPs :",sum(is_snp))
473
+ log.write(" -Number of Insertions :",sum(is_insert))
474
+ log.write(" -Number of Deletions :",sum(is_delete))
475
+ return is_snp,is_insert,is_delete
476
+
477
+
444
478
  def md5sum_file(filename,log,verbose):
445
- if verbose: log.write(" -md5sum hashing for the file:",filename)
479
+ log.write(" -md5sum hashing for the file:",filename,verbose=verbose)
446
480
  md5_hash = hashlib.md5()
447
481
  with open(filename,"rb") as f:
448
482
  # Read and update hash in chunks
@@ -451,7 +485,8 @@ def md5sum_file(filename,log,verbose):
451
485
  with open(filename+".md5sum","w") as f:
452
486
  out = str(md5_hash.hexdigest())
453
487
  f.write(out+"\n")
454
- if verbose: log.write(" -md5sum path:",filename+".md5sum")
488
+ log.write(" -md5sum path:",filename+".md5sum",verbose=verbose)
489
+ log.write(" -md5sum: {}".format(out),verbose=verbose)
455
490
  return out
456
491
 
457
492
  def calculate_md5sum_file(filename):
@@ -466,4 +501,102 @@ def calculate_md5sum_file(filename):
466
501
def get_format_date_and_time():
    """Return the current local time formatted as ``YYYY-MM-DD-HH:MM:SS``."""
    return datetime.now().strftime("%Y-%m-%d-%H:%M:%S")
505
+
506
+
507
def _adjust_position(sumstats, fmt,is_snp, is_insert, is_delete, log, verbose):
    """Compute format-specific coordinates and allele strings in place.

    BED, VEP and ANNOVAR each use different coordinate conventions (0- vs
    1-based, and different start/end anchoring for indels), so START/END and
    the output allele representation are derived per format and per variant
    class. Indels here are assumed to share a leading anchor base between EA
    and NEA (e.g. A/ATC), which is stripped in the output
    (A/ATC -> -/TC) — TODO confirm upstream normalization guarantees this.

    Parameters:
        sumstats: pandas.DataFrame with "POS", "EA", "NEA" columns; gains
            "START", "END", plus "NEA/EA"+"STRAND" (bed/vep) or
            "NEA_out"/"EA_out" (annovar).
        fmt: one of "bed", "vep", "annovar"; any other value only triggers
            the final Int64 cast of START/END (which will raise if the
            columns were never created).
        is_snp, is_insert, is_delete: boolean masks from _check_indel
            (is_snp also covers multi-base equal-length variants).
        log, verbose: gwaslab logger and verbosity flag.

    Returns:
        The same sumstats frame (also modified in place).
    """
    log.write(" -Adjusting positions in format-specific manner..",verbose=verbose)
    if fmt=="bed":
        # BED is 0-based half-open: start = pos-1, end covers len(NEA) bases.
        sumstats.loc[is_snp,"START"] = sumstats.loc[is_snp,"POS"]-1
        sumstats.loc[is_snp,"END"] = sumstats.loc[is_snp,"POS"]-1 + sumstats.loc[is_snp,"NEA"].str.len()
        sumstats.loc[is_snp,"NEA/EA"] = sumstats.loc[is_snp,"NEA"].astype("string")+"/"+sumstats.loc[is_snp,"EA"].astype("string")

        # for insertion
        # start = pos : end = pos
        # A/ATC -> -/TC
        sumstats.loc[is_insert,"START"] = sumstats.loc[is_insert,"POS"]
        sumstats.loc[is_insert,"END"] = sumstats.loc[is_insert,"POS"]
        sumstats.loc[is_insert,"NEA/EA"] = "-/"+sumstats.loc[is_insert,"EA"].str.slice(start=1)

        # for deletion
        # start = pos - 1 +1; end = pos -1 +1+ len(Ref)
        # ATC/A -> TC/-
        sumstats.loc[is_delete,"START"] = sumstats.loc[is_delete,"POS"]
        sumstats.loc[is_delete,"END"] = sumstats.loc[is_delete,"POS"] + sumstats.loc[is_delete,"NEA"].str.len() - 1
        sumstats.loc[is_delete,"NEA/EA"] = sumstats.loc[is_delete,"NEA"].str.slice(start=1)+"/-"
        # All variants reported on the forward strand.
        sumstats["STRAND"]="+"
    elif fmt=="vep":
        # VEP default input: 1-based, end-anchored on the reference span.
        # NOTE(review): START here equals END (both POS + len(NEA) - 1);
        # for 1-bp SNPs this reduces to POS — confirm intent for MNVs.
        sumstats.loc[is_snp,"START"] = sumstats.loc[is_snp,"POS"] + (sumstats.loc[is_snp,"NEA"].str.len() - 1 )
        sumstats.loc[is_snp,"END"] = sumstats.loc[is_snp,"POS"] + (sumstats.loc[is_snp,"NEA"].str.len() - 1 )
        sumstats.loc[is_snp,"NEA/EA"] = sumstats.loc[is_snp,"NEA"].astype("string")+"/"+sumstats.loc[is_snp,"EA"].astype("string")

        # for insertion
        # start = pos+1 ; end = pos
        # A/ATC -> -/TC
        # (VEP marks insertions with start = end + 1.)
        sumstats.loc[is_insert,"START"] = sumstats.loc[is_insert,"POS"] + 1
        sumstats.loc[is_insert,"END"] = sumstats.loc[is_insert,"POS"]
        sumstats.loc[is_insert,"NEA/EA"] = "-/" + sumstats.loc[is_insert,"EA"].str.slice(start=1)

        # for deletion
        # start = pos ; end = pos + len(Ref) -1
        # ATC/A -> TC/-
        sumstats.loc[is_delete,"START"] = sumstats.loc[is_delete,"POS"] + 1
        sumstats.loc[is_delete,"END"] = sumstats.loc[is_delete,"POS"] + (sumstats.loc[is_delete,"NEA"].str.len() -1)
        sumstats.loc[is_delete,"NEA/EA"] = sumstats.loc[is_delete,"NEA"].str.slice(start=1)+"/-"
        # All variants reported on the forward strand.
        sumstats["STRAND"]="+"
    elif fmt=="annovar":
        # ANNOVAR avinput: 1-based inclusive, alleles in separate columns.
        # for snp
        # start = pos ; end = pos
        # A/G
        # AT/CG
        sumstats.loc[is_snp,"START"] = sumstats.loc[is_snp,"POS"]
        sumstats.loc[is_snp,"END"] = sumstats.loc[is_snp,"POS"]-1 + sumstats.loc[is_snp,"NEA"].str.len()
        sumstats.loc[is_snp,"NEA_out"] = sumstats.loc[is_snp,"NEA"].astype("string")
        sumstats.loc[is_snp,"EA_out"] = sumstats.loc[is_snp,"EA"].astype("string")
        # for insertion
        # start = pos : end = pos
        # A/ATC -> -/TC
        # (ANNOVAR represents the missing reference allele as "-".)
        sumstats.loc[is_insert,"START"] = sumstats.loc[is_insert,"POS"]+1
        sumstats.loc[is_insert,"END"] = sumstats.loc[is_insert,"POS"]+1
        sumstats.loc[is_insert,"NEA_out"] = "-"
        sumstats.loc[is_insert,"EA_out"] = sumstats.loc[is_insert,"EA"].str.slice(start=1)

        # for deletion
        # start = pos - 1 +1; end = pos -1 +1+ len(Ref)
        # ATC/A -> TC/-
        sumstats.loc[is_delete,"START"] = sumstats.loc[is_delete,"POS"]
        sumstats.loc[is_delete,"END"] = sumstats.loc[is_delete,"POS"]- 1 + sumstats.loc[is_delete,"NEA"].str.len()
        sumstats.loc[is_delete,"NEA_out"] = sumstats.loc[is_delete,"NEA"].str.slice(start=1)
        sumstats.loc[is_delete,"EA_out"] = "-"


    # Nullable Int64 keeps integer display even when some rows were untouched (NA).
    sumstats["START"] = sumstats["START"].astype("Int64")
    sumstats["END"] = sumstats["END"].astype("Int64")
    return sumstats
576
+
577
def _process_vcf_header(sumstats, meta, meta_data, build, log, verbose):
    """Assemble the GWAS-VCF header text for the given sumstats.

    Concatenates the fixed header lines and the contig block for the requested
    genome build, appends a ##SAMPLE line with per-study harmonisation counts
    derived from the STATUS codes, and records the gwaslab version.

    Parameters:
        sumstats: pandas.DataFrame with a string "STATUS" column.
        meta: study metadata dict; meta["gwaslab"]["study_name"] and
            meta["gwaslab"]["study_type"] are used.
        meta_data: format dict providing "format_fixed_header" and
            "format_contig_<build>".
        build: genome build code used to pick the contig block.
        log, verbose: gwaslab logger and verbosity flag.

    Returns:
        The full header as a newline-terminated string.
    """
    log.write(" -Creating VCF file header...",verbose=verbose)
    log.write(" -VCF header contig build:"+str(build),verbose=verbose)

    # Count variants by STATUS code pattern (harmonised, and the subset
    # whose alleles were switched during harmonisation).
    status = sumstats["STATUS"].str
    n_total = len(sumstats)
    n_harmonised = status.match(r"\w\w\w[0][0123][012][01234]", case=False, flags=0, na=False).sum()
    n_switched = status.match(r"\w\w\w[0][0123][12][24]", case=False, flags=0, na=False).sum()

    study_id = meta["gwaslab"]["study_name"]
    study_type = meta["gwaslab"]["study_type"]

    # Fixed header + build-specific contigs, then the per-sample summary line.
    pieces = [
        meta_data["format_fixed_header"],
        meta_data["format_contig_"+str(build)],
        "##SAMPLE=<ID={},TotalVariants={},VariantsNotRead=0,HarmonisedVariants={},VariantsNotHarmonised={},SwitchedAlleles={},StudyType={}>".format(
            study_id, n_total, n_harmonised, n_total - n_harmonised, n_switched, study_type),
        "##gwaslab_version="+gwaslab_info()["version"],
    ]
    vcf_header = "\n".join(pieces) + "\n"

    log.write(" -ID:{}".format(study_id),verbose=verbose)
    log.write(" -StudyType:{}".format(study_type),verbose=verbose)
    log.write(" -TotalVariants:{}".format(n_total),verbose=verbose)
    log.write(" -HarmonisedVariants:{}".format(n_harmonised),verbose=verbose)
    log.write(" -VariantsNotHarmonised:{}".format(n_total - n_harmonised),verbose=verbose)
    log.write(" -SwitchedAlleles:{}".format(n_switched),verbose=verbose)

    return vcf_header
602
+