gwaslab 3.5.6__py3-none-any.whl → 3.5.8__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of gwaslab might be problematic. Click here for more details.

Files changed (64) hide show
  1. gwaslab/__init__.py +2 -0
  2. gwaslab/bd_common_data.py +1 -0
  3. gwaslab/bd_get_hapmap3.py +0 -1
  4. gwaslab/data/formatbook.json +78 -0
  5. gwaslab/g_Sumstats.py +98 -24
  6. gwaslab/g_SumstatsMulti.py +287 -0
  7. gwaslab/g_SumstatsPair.py +101 -16
  8. gwaslab/g_Sumstats_polars.py +245 -0
  9. gwaslab/g_headers.py +12 -3
  10. gwaslab/g_meta.py +123 -47
  11. gwaslab/g_meta_update.py +48 -0
  12. gwaslab/g_vchange_status_polars.py +44 -0
  13. gwaslab/g_version.py +2 -2
  14. gwaslab/hm_casting.py +169 -110
  15. gwaslab/hm_casting_polars.py +202 -0
  16. gwaslab/hm_harmonize_sumstats.py +19 -8
  17. gwaslab/io_load_ld.py +529 -0
  18. gwaslab/io_preformat_input.py +11 -0
  19. gwaslab/io_preformat_input_polars.py +632 -0
  20. gwaslab/io_process_args.py +25 -1
  21. gwaslab/io_read_ldsc.py +34 -3
  22. gwaslab/io_read_pipcs.py +62 -6
  23. gwaslab/prscs_gigrnd.py +122 -0
  24. gwaslab/prscs_mcmc_gtb.py +136 -0
  25. gwaslab/prscs_parse_genet.py +98 -0
  26. gwaslab/qc_build.py +53 -0
  27. gwaslab/qc_check_datatype.py +10 -8
  28. gwaslab/qc_check_datatype_polars.py +128 -0
  29. gwaslab/qc_fix_sumstats.py +25 -23
  30. gwaslab/qc_fix_sumstats_polars.py +193 -0
  31. gwaslab/util_ex_calculate_ldmatrix.py +49 -19
  32. gwaslab/util_ex_gwascatalog.py +71 -28
  33. gwaslab/util_ex_ldsc.py +67 -21
  34. gwaslab/util_ex_match_ldmatrix.py +396 -0
  35. gwaslab/util_ex_run_2samplemr.py +0 -2
  36. gwaslab/util_ex_run_ccgwas.py +155 -0
  37. gwaslab/util_ex_run_coloc.py +1 -1
  38. gwaslab/util_ex_run_hyprcoloc.py +117 -0
  39. gwaslab/util_ex_run_mesusie.py +155 -0
  40. gwaslab/util_ex_run_mtag.py +92 -0
  41. gwaslab/util_ex_run_prscs.py +85 -0
  42. gwaslab/util_ex_run_susie.py +40 -9
  43. gwaslab/util_in_estimate_ess.py +18 -0
  44. gwaslab/util_in_fill_data.py +20 -1
  45. gwaslab/util_in_filter_value.py +10 -5
  46. gwaslab/util_in_get_sig.py +71 -13
  47. gwaslab/util_in_meta.py +168 -4
  48. gwaslab/util_in_meta_polars.py +174 -0
  49. gwaslab/viz_plot_compare_effect.py +87 -23
  50. gwaslab/viz_plot_credible_sets.py +55 -11
  51. gwaslab/viz_plot_effect.py +22 -12
  52. gwaslab/viz_plot_miamiplot2.py +3 -2
  53. gwaslab/viz_plot_mqqplot.py +165 -141
  54. gwaslab/viz_plot_qqplot.py +6 -6
  55. gwaslab/viz_plot_regional2.py +5 -13
  56. gwaslab/viz_plot_rg_heatmap.py +6 -1
  57. gwaslab/viz_plot_stackedregional.py +21 -6
  58. {gwaslab-3.5.6.dist-info → gwaslab-3.5.8.dist-info}/METADATA +9 -7
  59. gwaslab-3.5.8.dist-info/RECORD +117 -0
  60. {gwaslab-3.5.6.dist-info → gwaslab-3.5.8.dist-info}/WHEEL +1 -1
  61. gwaslab-3.5.6.dist-info/RECORD +0 -96
  62. {gwaslab-3.5.6.dist-info → gwaslab-3.5.8.dist-info/licenses}/LICENSE +0 -0
  63. {gwaslab-3.5.6.dist-info → gwaslab-3.5.8.dist-info/licenses}/LICENSE_before_v3.4.39 +0 -0
  64. {gwaslab-3.5.6.dist-info → gwaslab-3.5.8.dist-info}/top_level.txt +0 -0
@@ -0,0 +1,632 @@
1
+ import pandas as pd
2
+ import polars as pl
3
+ import numpy as np
4
+ import scipy.stats as ss
5
+ import gzip
6
+ import os
7
+ import gc
8
+ from gwaslab.bd_common_data import get_format_dict
9
+ from gwaslab.qc_fix_sumstats import sortcolumn
10
+ from gwaslab.qc_fix_sumstats import _process_build
11
+ from gwaslab.qc_check_datatype_polars import check_datatype
12
+ from gwaslab.qc_check_datatype_polars import quick_convert_datatype
13
+ from gwaslab.qc_check_datatype_polars import check_dataframe_memory_usage
14
+ from gwaslab.g_headers import _check_overlap_with_reserved_keys
15
+ #20221030
16
def _set_constant_column(sumstats, name, value):
    # Assign a constant-valued column, supporting both polars and pandas frames
    # (polars DataFrames do not support item assignment).
    if isinstance(sumstats, pl.DataFrame):
        return sumstats.with_columns(pl.lit(value).alias(name))
    sumstats[name] = value
    return sumstats

def preformatp(sumstats,
               fmt=None,
               tab_fmt="tsv",
               snpid=None,
               rsid=None,
               chrom=None,
               pos=None,
               ea=None,
               nea=None,
               ref=None,
               alt=None,
               eaf=None,
               neaf=None,
               maf=None,
               n=None,
               beta=None,
               se=None,
               chisq=None,
               z=None,
               f=None,
               t=None,
               p=None,
               q=None,
               mlog10p=None,
               test=None,
               info=None,
               OR=None,
               OR_95L=None,
               OR_95U=None,
               beta_95L=None,
               beta_95U=None,
               HR=None,
               HR_95L=None,
               HR_95U=None,
               i2=None,
               snpr2=None,
               phet=None,
               dof=None,
               ncase=None,
               ncontrol=None,
               neff=None,
               direction=None,
               status=None,
               study=None,
               trait=None,
               build=None,
               other=None,
               usekeys=None,
               chrom_pat=None,
               snpid_pat=None,
               verbose=False,
               readargs=None,
               log=None):
    """Load GWAS summary statistics into a DataFrame with gwaslab headers (polars loader).

    Parameters
    ----------
    sumstats : str or pd.DataFrame
        Path to a tabular file, an '@'-patterned per-chromosome path, a vcf.gz,
        a parquet file (when ``tab_fmt=="parquet"``), or a pandas DataFrame.
    fmt : str
        Formatbook format name used to build the header-conversion dictionary.
    snpid ... trait :
        Raw header names (or int for n/ncase/ncontrol) that map individual
        columns to gwaslab standard headers; they extend/override ``fmt``.
    other : list
        Extra raw columns to load unchanged.
    usekeys : list
        Restrict loading to these gwaslab header names.
    chrom_pat / snpid_pat : str
        Regex patterns used to keep only matching variants after loading.
    build : str
        Genome build code used to initialise STATUS when ``status`` is None.
    readargs : dict
        Extra keyword arguments passed to ``pl.read_csv``.

    Returns the renamed, datatype-checked sumstats DataFrame.

    Fixes vs the original implementation:
    - ``other`` no longer uses a mutable default (``other=[]`` was mutated by
      ``other += format_other_cols`` and leaked between calls);
    - the default-separator fallback now checks the polars key "separator"
      (the old check of the pandas key "sep" always overwrote a separator
      taken from the formatbook with "\\t");
    - renaming only passes keys that exist as columns (polars raises on
      unknown rename keys, and rename_dictionary holds the whole mapping);
    - int-valued n/ncase/ncontrol are assigned via with_columns on polars.
    """
    # mapping: raw header -> gwaslab header
    rename_dictionary = {}
    # raw headers to read from the file/DataFrame
    usecols = []
    # raw header -> polars dtype override
    dtype_dictionary = {}
    if readargs is None:
        readargs = {}
    if other is None:
        other = []
    else:
        # copy so the formatbook addition below never mutates the caller's list
        other = list(other)

    #######################################################################
    # workflow:
    # 1. formatbook
    # 2. user specified header
    # 3. usekeys
    if tab_fmt == "parquet":
        if type(sumstats) is str:
            log.write("Start to load data from parquet file....", verbose=verbose)
            log.write(" -path: {}".format(sumstats), verbose=verbose)
            sumstats = pd.read_parquet(sumstats, **readargs)
            log.write("Finished loading parquet file into pd.DataFrame....", verbose=verbose)
        else:
            raise ValueError("Please input a path for parquet file.")

    if fmt is not None:
        # loading format parameters from the formatbook
        log.write("Start to load format from formatbook....", verbose=verbose)
        meta_data, rename_dictionary = get_format_dict(fmt)
        # print format information
        print_format_info(fmt=fmt, meta_data=meta_data, rename_dictionary=rename_dictionary, verbose=verbose, log=log)

        if "format_separator" in meta_data.keys():
            if "separator" not in readargs.keys():
                readargs["separator"] = meta_data["format_separator"]
            else:
                if readargs["separator"] != meta_data["format_separator"]:
                    log.write(' - format_separator will be changed to: "{}"'.format(readargs["separator"]), verbose=verbose)
        if "format_na" in meta_data.keys():
            readargs["null_values"] = meta_data["format_na"]
        if "format_comment" in meta_data.keys():
            readargs["comment_prefix"] = meta_data["format_comment"]
        if "format_other_cols" in meta_data.keys():
            other += meta_data["format_other_cols"]

    # BUGFIX: the original tested the pandas key "sep" here while setting the
    # polars key "separator", so any separator set from the formatbook above
    # was unconditionally overwritten with "\t".
    if "separator" not in readargs.keys():
        readargs["separator"] = "\t"

    #######################################################################
    # check chr-separated path / vcf / then read the header row.
    try:
        if type(sumstats) is str:
            # loading data from a path
            inpath = sumstats
            if "@" in inpath:
                # load sumstats chromosome by chromosome
                log.write(" -Detected @ in path: load sumstats by each chromosome...", verbose=verbose)
                inpath_chr_list = []
                inpath_chr_num_list = []
                for chromosome in list(range(1, 26)) + ["x", "y", "X", "Y", "MT", "mt", "m", "M"]:
                    inpath_chr = inpath.replace("@", str(chromosome))
                    if isfile_casesensitive(inpath_chr):
                        inpath_chr_num_list.append(str(chromosome))
                        inpath_chr_list.append(inpath_chr)
                log.write(" -Chromosomes detected:", ",".join(inpath_chr_num_list), verbose=verbose)
                readargs_header = get_readargs_header(inpath=inpath_chr_list[0], readargs=readargs)
                row_one = pl.read_csv(inpath_chr_list[0], **readargs_header)
                # columns in the sumstats
                raw_cols = row_one.columns
            else:
                # loading data from a single tabular file: read one row for the header
                readargs_header = get_readargs_header(inpath=inpath, readargs=readargs)
                row_one = pl.read_csv(inpath, **readargs_header)
                raw_cols = row_one.columns

            if fmt == "vcf":
                # FORMAT declares the per-study subfields (expanded)
                format_cols = list(row_one["FORMAT"].str.split(":"))[0]
                # fixed VCF columns + first study column + expanded subfields
                raw_cols = meta_data["format_fixed"] + [raw_cols[9]] + format_cols
        elif type(sumstats) is pd.DataFrame:
            # loading data from a DataFrame
            raw_cols = sumstats.columns

        for key, value in rename_dictionary.items():
            # check available keys (key -> raw header);
            # usecols: raw headers to load from file/DataFrame
            if key in raw_cols:
                usecols.append(key)
                # force text dtype for allele / status / chromosome columns
                if value in ["EA", "NEA"]:
                    dtype_dictionary[key] = pl.String()
                if value in ["STATUS"]:
                    dtype_dictionary[key] = pl.String()
                if value in ["CHR"]:
                    dtype_dictionary[key] = pl.String()
    except ValueError:
        raise ValueError("Please input a path or a pd.DataFrame, and make sure the separator is correct and the columns you specified are in the file.")

    #######################################################################
    # user-specified headers override/extend the formatbook mapping
    if snpid:
        usecols.append(snpid)
        rename_dictionary[snpid] = "SNPID"
    if rsid:
        usecols.append(rsid)
        rename_dictionary[rsid] = "rsID"
    if chrom:
        usecols.append(chrom)
        rename_dictionary[chrom] = "CHR"
        dtype_dictionary[chrom] = pl.String()
    if pos:
        usecols.append(pos)
        rename_dictionary[pos] = "POS"
        dtype_dictionary[pos] = pl.Float64()
    if ea:
        usecols.append(ea)
        rename_dictionary[ea] = "EA"
        dtype_dictionary[ea] = pl.String()
    if nea:
        usecols.append(nea)
        rename_dictionary[nea] = "NEA"
        dtype_dictionary[nea] = pl.String()
    if ref:
        usecols.append(ref)
        rename_dictionary[ref] = "REF"
        dtype_dictionary[ref] = pl.String()
    if alt:
        usecols.append(alt)
        rename_dictionary[alt] = "ALT"
        dtype_dictionary[alt] = pl.String()
    if eaf:
        usecols.append(eaf)
        rename_dictionary[eaf] = "EAF"
    elif neaf:
        # neaf will be converted to eaf later (process_neaf)
        usecols.append(neaf)
        rename_dictionary[neaf] = "EAF"
    if maf:
        usecols.append(maf)
        rename_dictionary[maf] = "MAF"
    if n and (type(n) is str):
        usecols.append(n)
        rename_dictionary[n] = "N"
    if ncase and (type(ncase) is str):
        usecols.append(ncase)
        rename_dictionary[ncase] = "N_CASE"
    if ncontrol and (type(ncontrol) is str):
        usecols.append(ncontrol)
        rename_dictionary[ncontrol] = "N_CONTROL"
    if neff and (type(neff) is str):
        usecols.append(neff)
        rename_dictionary[neff] = "N_EFF"
    if beta:
        usecols.append(beta)
        rename_dictionary[beta] = "BETA"
    if beta_95L:
        usecols.append(beta_95L)
        rename_dictionary[beta_95L] = "BETA_95L"
    if beta_95U:
        usecols.append(beta_95U)
        rename_dictionary[beta_95U] = "BETA_95U"
    if se:
        usecols.append(se)
        rename_dictionary[se] = "SE"
    if chisq:
        usecols.append(chisq)
        rename_dictionary[chisq] = "CHISQ"
    if z:
        usecols.append(z)
        rename_dictionary[z] = "Z"
    if q:
        usecols.append(q)
        rename_dictionary[q] = "Q"
    if p:
        usecols.append(p)
        rename_dictionary[p] = "P"
    if t:
        usecols.append(t)
        rename_dictionary[t] = "T"
    if f:
        usecols.append(f)
        rename_dictionary[f] = "F"
    if mlog10p:
        usecols.append(mlog10p)
        rename_dictionary[mlog10p] = "MLOG10P"
    if test:
        usecols.append(test)
        rename_dictionary[test] = "TEST"
    if info:
        usecols.append(info)
        rename_dictionary[info] = "INFO"
    if OR:
        usecols.append(OR)
        rename_dictionary[OR] = "OR"
    if OR_95L:
        usecols.append(OR_95L)
        rename_dictionary[OR_95L] = "OR_95L"
    if OR_95U:
        usecols.append(OR_95U)
        rename_dictionary[OR_95U] = "OR_95U"
    if HR:
        usecols.append(HR)
        rename_dictionary[HR] = "HR"
    if HR_95L:
        usecols.append(HR_95L)
        rename_dictionary[HR_95L] = "HR_95L"
    if HR_95U:
        usecols.append(HR_95U)
        rename_dictionary[HR_95U] = "HR_95U"
    if phet:
        usecols.append(phet)
        rename_dictionary[phet] = "P_HET"
    if i2:
        usecols.append(i2)
        rename_dictionary[i2] = "I2"
    if snpr2:
        usecols.append(snpr2)
        rename_dictionary[snpr2] = "SNPR2"
    if dof:
        usecols.append(dof)
        rename_dictionary[dof] = "DOF"
    if direction:
        usecols.append(direction)
        rename_dictionary[direction] = "DIRECTION"
    if status:
        usecols.append(status)
        rename_dictionary[status] = "STATUS"
        dtype_dictionary[status] = pl.String()
    if other:
        # extra columns are loaded unchanged (identity rename)
        overlapped = _check_overlap_with_reserved_keys(other)
        log.warning("Columns with headers overlapping with GWASLab reserved keywords:{}".format(overlapped), verbose=verbose)
        usecols = usecols + other
        for i in other:
            rename_dictionary[i] = i

    if fmt == "vcf":
        # store the final column list, then read only the fixed columns + study
        vcf_usecols = usecols.copy()
        usecols = meta_data["format_fixed"]
        if study is not None:
            usecols = usecols + [study]
        else:
            # default to the first study column of the VCF
            study = raw_cols[9]
            usecols = usecols + [study]

    if usekeys is not None:
        # keep only the raw headers whose gwaslab names appear in usekeys
        usecols_new = []
        for i in usekeys:
            for k, v in rename_dictionary.items():
                if i == v:
                    usecols_new.append(k)
        usecols_valid = []
        for i in usecols_new:
            if i in usecols:
                usecols_valid.append(i)
        usecols = usecols_valid

    usecols = list(set(usecols))

    # loading data ########################################################
    try:
        if type(sumstats) is str:
            # loading data from a path
            inpath = sumstats
            if "@" in inpath:
                log.write("Start to initialize gl.Sumstats from files with pattern :" + inpath, verbose=verbose)
                sumstats_chr_list = []
                for i in inpath_chr_list:
                    # CONSISTENCY FIX: pass verbose like every other log call
                    log.write(" -Loading:" + i, verbose=verbose)
                    skip_rows = get_skip_rows(i)
                    readargs["skip_rows"] = skip_rows
                    sumstats_chr = pl.read_csv(i,
                                               columns=usecols,
                                               schema_overrides=dtype_dictionary,
                                               **readargs)
                    sumstats_chr_list.append(sumstats_chr)
                log.write(" -Merging sumstats for chromosomes:", ",".join(inpath_chr_num_list), verbose=verbose)
                sumstats = pl.concat(sumstats_chr_list, rechunk=True)
                del sumstats_chr_list
                gc.collect()
            else:
                skip_rows = get_skip_rows(inpath)
                readargs["skip_rows"] = skip_rows
                log.write("Start to initialize gl.Sumstats from file :" + inpath, verbose=verbose)
                sumstats = pl.read_csv(inpath,
                                       columns=usecols,
                                       schema_overrides=dtype_dictionary,
                                       **readargs)
        elif type(sumstats) is pd.DataFrame:
            # loading data from a pandas DataFrame
            log.write("Start to initialize gl.Sumstats from pandas DataFrame ...", verbose=verbose)
            sumstats = sumstats[usecols].copy()
            for key, value in dtype_dictionary.items():
                if key in usecols:
                    astype = value
                    if rename_dictionary[key] == "CHR":
                        astype = "Int64"
                    try:
                        # polars dtypes are not valid pandas dtypes; fall back
                        # to pandas "string" when astype rejects them
                        sumstats[key] = sumstats[key].astype(astype)
                    except:
                        sumstats[key] = sumstats[key].astype("string")
    except ValueError:
        raise ValueError("Please input a path or a pd.DataFrame, and make sure it contain the columns.")

    # variant-level filters ##############################################
    if chrom_pat is not None:
        sumstats = _load_single_chr(sumstats,
                                    usecols=usecols,
                                    rename_dictionary=rename_dictionary,
                                    chrom_pat=chrom_pat,
                                    log=log,
                                    verbose=verbose)
    elif snpid_pat is not None:
        sumstats = _load_variants_with_pattern(sumstats,
                                               usecols=usecols,
                                               rename_dictionary=rename_dictionary,
                                               snpid_pat=snpid_pat,
                                               log=log,
                                               verbose=verbose)

    # renaming columns ###################################################
    if fmt == "vcf":
        sumstats = parse_vcf_study(sumstats, format_cols, study, vcf_usecols, log=log, verbose=verbose)
        usecols = vcf_usecols

    converted_columns = list(map(lambda x: rename_dictionary[x], set(usecols)))

    # renaming log
    log.write(" -Reading columns :", ",".join(set(usecols)), verbose=verbose)
    log.write(" -Renaming columns to :", ",".join(converted_columns), verbose=verbose)
    log.write(" -Current Dataframe shape :", len(sumstats), " x ", len(sumstats.columns), verbose=verbose)

    # BUGFIX: only rename keys that are actual columns; polars raises on
    # unknown keys, and rename_dictionary holds the entire format mapping,
    # most of which was filtered out of usecols above.
    present_renames = {k: v for k, v in rename_dictionary.items() if k in sumstats.columns}
    sumstats = sumstats.rename(present_renames)

    # if sample sizes were provided as int ###############################
    if type(n) is int:
        sumstats = _set_constant_column(sumstats, "N", n)
    if type(ncase) is int:
        sumstats = _set_constant_column(sumstats, "N_CASE", ncase)
    if type(ncontrol) is int:
        sumstats = _set_constant_column(sumstats, "N_CONTROL", ncontrol)

    # status #############################################################
    if status is None:
        sumstats = process_status(sumstats=sumstats, build=build, log=log, verbose=verbose)

    # ea/nea, ref/alt ####################################################
    sumstats = process_allele(sumstats=sumstats, log=log, verbose=verbose)

    # NEAF to EAF ########################################################
    if neaf is not None:
        sumstats = process_neaf(sumstats=sumstats, log=log, verbose=verbose)

    # reordering / datatype checks #######################################
    #sumstats = sortcolumn(sumstats=sumstats,log=log,verbose=verbose)
    sumstats = quick_convert_datatype(sumstats, log=log, verbose=verbose)
    check_datatype(sumstats, log=log, verbose=verbose)
    #gc.collect()
    check_dataframe_memory_usage(sumstats, log=log, verbose=verbose)

    log.write("Finished loading data successfully!", verbose=verbose)
    return sumstats
442
+
443
+
444
+ #### helper #######################################################################
445
def isfile_casesensitive(path):
    """Return True only if *path* exists and its basename matches the on-disk
    name case-sensitively (relevant on case-insensitive filesystems)."""
    if not os.path.isfile(path):
        # fast exit: nothing at this path at all
        return False
    parent, base = os.path.split(path)
    # a case-sensitive membership check against the directory listing
    return base in os.listdir(parent)
450
+
451
def get_readargs_header(inpath, readargs):
    """Build read-args for a header-only read: a copy of *readargs* with a
    single row and no schema inference.

    For a gzipped VCF ('vcf.gz' in the path) the number of leading '##' meta
    lines and a tab separator are also written into *readargs* itself, so that
    subsequent full reads reuse them.
    """
    if "vcf.gz" in inpath:
        with gzip.open(inpath, 'r') as fh:
            meta_count = 0
            for raw_line in fh:
                if not raw_line.decode('utf-8').startswith('##'):
                    # first non-meta line reached: record skip/separator in place
                    readargs["skip_rows"] = meta_count
                    readargs["separator"] = "\t"
                    break
                meta_count += 1
    header_args = readargs.copy()
    header_args["n_rows"] = 1
    #header_args["dtype"]="string"
    header_args["infer_schema"] = False
    return header_args
467
+
468
def get_skip_rows(inpath):
    """Return the number of leading '##' meta lines in a gzipped VCF (rows to
    skip before the header); 0 for any non-VCF path."""
    if "vcf.gz" not in inpath:
        return 0
    with gzip.open(inpath, 'r') as fh:
        meta_count = 0
        for raw_line in fh:
            if not raw_line.decode('utf-8').startswith('##'):
                return meta_count
            meta_count += 1
479
+
480
def parse_vcf_study(sumstats,format_cols,study,vcf_usecols,log,verbose=True):
    # Expand the per-study fields of a GWAS-VCF, then drop the raw
    # FORMAT/<study> columns and keep only vcf_usecols.
    log.write(" -Parsing based on FORMAT: ", format_cols,verbose=verbose)
    log.write(" -Parsing vcf study : ", study,verbose=verbose)
    # NOTE(review): the actual split of the <study> column into format_cols is
    # commented out below, so those per-study columns are never created here;
    # selecting vcf_usecols afterwards will fail if it includes format_cols.
    # TODO confirm whether the split is performed elsewhere before this call.
    #sumstats[format_cols] = sumstats[study].str.split(":",expand=True).values
    sumstats = sumstats.drop(["FORMAT",study])
    sumstats = sumstats[vcf_usecols]
    gc.collect()
    return sumstats
488
+
489
def print_format_info(fmt, meta_data, rename_dictionary, verbose, log, output=False, skip_meta_records=None):
    """Log the formatbook metadata for *fmt* and the header-conversion
    dictionary (raw keys -> gwaslab values, or the reverse when output=True)."""
    log.write(" -" + fmt + " format meta info:", verbose=verbose)
    skipped = [] if skip_meta_records is None else skip_meta_records
    for meta_key, meta_value in meta_data.items():
        # skip suppressed or empty records
        if meta_key in skipped:
            continue
        if meta_value is None:
            continue
        if type(meta_value) is str:
            if "\n" in meta_value:
                # multi-line value: show only the first line, elided
                first_line = meta_value.split("\n")[0]
                log.write(" -", meta_key, " : " + first_line.strip() + "...", verbose=verbose)
            elif meta_value == " ":
                # make whitespace separators visible in the log
                log.write(' -', meta_key, ' : \\s ', verbose=verbose)
            elif meta_value == "\t":
                log.write(' -', meta_key, ' : \\t', verbose=verbose)
            else:
                log.write(" -", meta_key, " : " + meta_value.strip(), verbose=verbose)
        elif type(meta_value) is list:
            log.write(" -", meta_key, " : " + ','.join(meta_value), verbose=verbose)
        else:
            log.write(" -", meta_key, " : ", meta_value, verbose=verbose)
    dict_keys = list(rename_dictionary.keys())
    dict_values = list(rename_dictionary.values())
    if fmt != "gwaslab":
        if output == False:
            if fmt != "auto":
                log.write(" -" + fmt + " to gwaslab format dictionary:", verbose=verbose)
                log.write(" - " + fmt + " keys:", ",".join(dict_keys), verbose=verbose)
                log.write(" - gwaslab values:", ",".join(dict_values), verbose=verbose)
            else:
                log.write(" - Auto-detection mode. Note: auto-detection assumes A1=EA; Alt=EA and Frq=EAF...", verbose=verbose)
                log.write(" - Header conversion source: https://github.com/Cloufield/formatbook/blob/main/formats/auto.json", verbose=verbose)
        else:
            # reverse direction: exporting gwaslab headers to fmt
            log.write(" -gwaslab to " + fmt + " format dictionary:", verbose=verbose)
            out_keys = list(rename_dictionary.keys())
            out_values = list(rename_dictionary.values())
            log.write(" - gwaslab keys:", ','.join(out_keys), verbose=verbose)
            log.write(" - " + fmt + " values:", ','.join(out_values), verbose=verbose)
535
+
536
def process_neaf(sumstats, log, verbose):
    """Convert a NEAF column (loaded under the header EAF) into EAF = 1 - NEAF,
    dropping variants whose NEAF falls outside [0, 1].

    Returns the filtered, converted DataFrame.
    """
    log.write(" -NEAF is specified...", verbose=verbose)
    pre_number = len(sumstats)
    log.write(" -Checking if 0<= NEAF <=1 ...", verbose=verbose)

    # BUGFIX: the original applied pandas `pd.to_numeric` via item assignment,
    # which is not supported on a polars DataFrame; a non-strict cast gives the
    # same coerce-to-null behavior.
    sumstats = sumstats.with_columns(pl.col("EAF").cast(pl.Float64, strict=False))

    # BUGFIX: the original wrote `pl.col("EAF")>=0 & pl.col("EAF")<=1`;
    # `&` binds tighter than the comparisons and produces a chained comparison
    # that polars expressions cannot evaluate. Each comparison must be
    # parenthesized before combining with `&`.
    sumstats = sumstats.filter((pl.col("EAF") >= 0) & (pl.col("EAF") <= 1))
    sumstats = sumstats.with_columns(
        EAF=1 - pl.col("EAF")
    )
    log.write(" -Converted NEAF to EAF.", verbose=verbose)

    after_number = len(sumstats)

    log.write(" -Removed " + str(pre_number - after_number) + " variants with bad NEAF.", verbose=verbose)

    return sumstats
554
+
555
def process_allele(sumstats, log, verbose):
    """Derive NEA from REF/ALT when EA, REF and ALT are all present.

    For each variant: if EA == ALT, NEA is set to REF; otherwise NEA is set to
    ALT. When no NEA column exists it is first initialised from REF.
    Returns the (possibly updated) DataFrame unchanged when EA/REF/ALT are not
    all available.
    """
    if "EA" in sumstats.columns:

        if "REF" in sumstats.columns and "ALT" in sumstats.columns:

            if "NEA" not in sumstats.columns:
                log.write("  NEA not available: assigning REF to NEA...", verbose=verbose)
                sumstats = sumstats.with_columns(NEA=pl.col("REF"))

            log.write(" -EA,REF and ALT columns are available: assigning NEA...", verbose=verbose)
            ea_alt = sumstats["EA"] == sumstats["ALT"]

            log.write(" -For variants with EA == ALT : assigning REF to NEA ...", verbose=verbose)
            # BUGFIX: removed the pandas-style `sumstats.loc[ea_alt,"NEA"] = ...`
            # line; polars DataFrames have no `.loc`, so it raised at runtime —
            # and the with_columns below already performs the same assignment.
            sumstats = sumstats.with_columns(
                pl.when(ea_alt)
                .then(pl.col("REF"))
                .otherwise(pl.col("NEA"))
                .alias("NEA")
            )

            ea_not_alt = sumstats["EA"] != sumstats["ALT"]
            log.write(" -For variants with EA != ALT : assigning ALT to NEA ...", verbose=verbose)
            sumstats = sumstats.with_columns(
                pl.when(ea_not_alt)
                .then(pl.col("ALT"))
                .otherwise(pl.col("NEA"))
                .alias("NEA")
            )

    return sumstats
589
+
590
def process_status(sumstats, build, log, verbose):
    """Add a STATUS column initialised from the genome build code with
    '99999' appended (the initial per-variant status string)."""
    log.write(" -Initiating a status column: STATUS ...", verbose=verbose)
    #sumstats["STATUS"] = int(build)*(10**5) +99999
    # normalise the build code first, then stamp every row with build+"99999"
    checked_build = _process_build(build, log, verbose)
    return sumstats.with_columns(
        pl.lit(checked_build + "99999").alias("STATUS")
    )
598
+
599
+
600
def _load_single_chr(sumstats, usecols, rename_dictionary, chrom_pat, log, verbose):
    """Keep only variants whose raw chromosome column matches the regex
    *chrom_pat*.

    The raw CHR column is located by searching rename_dictionary for the key
    mapped to "CHR" that is present in the DataFrame.
    Raises ValueError when no such column exists.
    """
    # locate the raw header mapped to CHR
    chunk_chrom = None
    for k, v in rename_dictionary.items():
        if v == "CHR":
            if k in sumstats.columns:
                log.write(" -Columns used to filter variants: {}".format(k), verbose=verbose)
                chunk_chrom = k
                break

    # BUGFIX: the original left chunk_chrom unbound when no CHR column was
    # mapped, raising a bare NameError below; fail with a clear message instead.
    if chunk_chrom is None:
        raise ValueError("No CHR column was detected; cannot filter variants with chrom_pat={}".format(chrom_pat))

    log.write(" -Loading only variants on chromosome with pattern : {} ...".format(chrom_pat), verbose=verbose)

    sumstats_filtered = sumstats.filter(pl.col(chunk_chrom).str.contains(chrom_pat))

    log.write(" -Loaded {} variants on chromosome with pattern :{} ...".format(len(sumstats_filtered), chrom_pat), verbose=verbose)
    return sumstats_filtered
617
+
618
def _load_variants_with_pattern(sumstats, usecols, rename_dictionary, snpid_pat, log, verbose):
    """Keep only variants whose raw SNPID column matches the regex *snpid_pat*.

    The raw SNPID column is located by searching rename_dictionary for the key
    mapped to "SNPID" that is present in the DataFrame.
    Raises ValueError when no such column exists.
    """
    # locate the raw header mapped to SNPID
    chunk_snpid = None
    for k, v in rename_dictionary.items():
        if v == "SNPID":
            if k in sumstats.columns:
                log.write(" -Columns used to filter variants: {}".format(k), verbose=verbose)
                chunk_snpid = k
                break

    # BUGFIX: the original left chunk_snpid unbound when no SNPID column was
    # mapped, raising a bare NameError below; fail with a clear message instead.
    if chunk_snpid is None:
        raise ValueError("No SNPID column was detected; cannot filter variants with snpid_pat={}".format(snpid_pat))

    log.write(" -Loading only variants with pattern : {} ...".format(snpid_pat), verbose=verbose)
    sumstats_filtered = sumstats.filter(pl.col(chunk_snpid).str.contains(snpid_pat))

    log.write(" -Loaded {} variants with pattern : {} ...".format(len(sumstats_filtered), snpid_pat), verbose=verbose)
    return sumstats_filtered
@@ -37,4 +37,28 @@ def _merge_and_sync_dic(list_of_dics:list, default:dict) -> dict:
37
37
  for dic in list_of_dics:
38
38
  if isinstance(dic, dict):
39
39
  temp.update(dic)
40
- return temp
40
+ return temp
41
+
42
+ def _update_args(args=None, default_args=None):
43
+
44
+ if default_args is None:
45
+ default_args={}
46
+
47
+ if args is None:
48
+ # if None, return default dict
49
+ return default_args
50
+ else:
51
+ # if not None, update default dict
52
+ for key,value in args.items():
53
+ default_args[key] = value
54
+ return default_args
55
+
56
+
57
+
58
+ def _update_arg(arg=None, default_arg=None):
59
+ if arg is None:
60
+ # if None, return default
61
+ return default_arg
62
+ else:
63
+ # if not None, return arg
64
+ return arg