gwaslab 3.4.36__tar.gz → 3.4.37__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of gwaslab might be problematic. Click here for more details.

Files changed (77)
  1. {gwaslab-3.4.36/src/gwaslab.egg-info → gwaslab-3.4.37}/PKG-INFO +2 -2
  2. {gwaslab-3.4.36 → gwaslab-3.4.37}/pyproject.toml +2 -2
  3. {gwaslab-3.4.36 → gwaslab-3.4.37}/src/gwaslab/__init__.py +1 -1
  4. {gwaslab-3.4.36 → gwaslab-3.4.37}/src/gwaslab/g_Sumstats.py +54 -31
  5. {gwaslab-3.4.36 → gwaslab-3.4.37}/src/gwaslab/g_meta.py +13 -3
  6. {gwaslab-3.4.36 → gwaslab-3.4.37}/src/gwaslab/g_version.py +2 -2
  7. {gwaslab-3.4.36 → gwaslab-3.4.37}/src/gwaslab/hm_harmonize_sumstats.py +43 -18
  8. {gwaslab-3.4.36 → gwaslab-3.4.37}/src/gwaslab/io_preformat_input.py +3 -0
  9. {gwaslab-3.4.36 → gwaslab-3.4.37}/src/gwaslab/qc_check_datatype.py +14 -0
  10. {gwaslab-3.4.36 → gwaslab-3.4.37}/src/gwaslab/qc_fix_sumstats.py +217 -91
  11. {gwaslab-3.4.36 → gwaslab-3.4.37}/src/gwaslab/util_ex_process_h5.py +26 -17
  12. {gwaslab-3.4.36 → gwaslab-3.4.37}/src/gwaslab/util_in_fill_data.py +42 -3
  13. {gwaslab-3.4.36 → gwaslab-3.4.37}/src/gwaslab/viz_aux_quickfix.py +2 -2
  14. {gwaslab-3.4.36 → gwaslab-3.4.37}/src/gwaslab/viz_plot_compare_effect.py +22 -5
  15. {gwaslab-3.4.36 → gwaslab-3.4.37}/src/gwaslab/viz_plot_mqqplot.py +127 -48
  16. {gwaslab-3.4.36 → gwaslab-3.4.37}/src/gwaslab/viz_plot_regionalplot.py +13 -8
  17. {gwaslab-3.4.36 → gwaslab-3.4.37/src/gwaslab.egg-info}/PKG-INFO +2 -2
  18. {gwaslab-3.4.36 → gwaslab-3.4.37}/LICENSE +0 -0
  19. {gwaslab-3.4.36 → gwaslab-3.4.37}/README.md +0 -0
  20. {gwaslab-3.4.36 → gwaslab-3.4.37}/setup.cfg +0 -0
  21. {gwaslab-3.4.36 → gwaslab-3.4.37}/src/gwaslab/bd_common_data.py +0 -0
  22. {gwaslab-3.4.36 → gwaslab-3.4.37}/src/gwaslab/bd_config.py +0 -0
  23. {gwaslab-3.4.36 → gwaslab-3.4.37}/src/gwaslab/bd_download.py +0 -0
  24. {gwaslab-3.4.36 → gwaslab-3.4.37}/src/gwaslab/bd_get_hapmap3.py +0 -0
  25. {gwaslab-3.4.36 → gwaslab-3.4.37}/src/gwaslab/data/chrx_par/chrx_par_hg19.bed.gz +0 -0
  26. {gwaslab-3.4.36 → gwaslab-3.4.37}/src/gwaslab/data/chrx_par/chrx_par_hg38.bed.gz +0 -0
  27. {gwaslab-3.4.36 → gwaslab-3.4.37}/src/gwaslab/data/formatbook.json +0 -0
  28. {gwaslab-3.4.36 → gwaslab-3.4.37}/src/gwaslab/data/hapmap3_SNPs/hapmap3_db150_hg19.snplist.gz +0 -0
  29. {gwaslab-3.4.36 → gwaslab-3.4.37}/src/gwaslab/data/hapmap3_SNPs/hapmap3_db151_hg38.snplist.gz +0 -0
  30. {gwaslab-3.4.36 → gwaslab-3.4.37}/src/gwaslab/data/high_ld/high_ld_hla_hg19.bed.gz +0 -0
  31. {gwaslab-3.4.36 → gwaslab-3.4.37}/src/gwaslab/data/high_ld/high_ld_hla_hg38.bed.gz +0 -0
  32. {gwaslab-3.4.36 → gwaslab-3.4.37}/src/gwaslab/data/reference.json +0 -0
  33. {gwaslab-3.4.36 → gwaslab-3.4.37}/src/gwaslab/g_Log.py +0 -0
  34. {gwaslab-3.4.36 → gwaslab-3.4.37}/src/gwaslab/g_Phenotypes.py +0 -0
  35. {gwaslab-3.4.36 → gwaslab-3.4.37}/src/gwaslab/g_SumstatsPair.py +0 -0
  36. {gwaslab-3.4.36 → gwaslab-3.4.37}/src/gwaslab/g_SumstatsT.py +0 -0
  37. {gwaslab-3.4.36 → gwaslab-3.4.37}/src/gwaslab/g_Sumstats_summary.py +0 -0
  38. {gwaslab-3.4.36 → gwaslab-3.4.37}/src/gwaslab/g_vchange_status.py +0 -0
  39. {gwaslab-3.4.36 → gwaslab-3.4.37}/src/gwaslab/hm_casting.py +0 -0
  40. {gwaslab-3.4.36 → gwaslab-3.4.37}/src/gwaslab/hm_rsid_to_chrpos.py +0 -0
  41. {gwaslab-3.4.36 → gwaslab-3.4.37}/src/gwaslab/io_read_ldsc.py +0 -0
  42. {gwaslab-3.4.36 → gwaslab-3.4.37}/src/gwaslab/io_read_tabular.py +0 -0
  43. {gwaslab-3.4.36 → gwaslab-3.4.37}/src/gwaslab/io_to_formats.py +0 -0
  44. {gwaslab-3.4.36 → gwaslab-3.4.37}/src/gwaslab/io_to_pickle.py +0 -0
  45. {gwaslab-3.4.36 → gwaslab-3.4.37}/src/gwaslab/run_script.py +0 -0
  46. {gwaslab-3.4.36 → gwaslab-3.4.37}/src/gwaslab/util_ex_calculate_ldmatrix.py +0 -0
  47. {gwaslab-3.4.36 → gwaslab-3.4.37}/src/gwaslab/util_ex_calculate_prs.py +0 -0
  48. {gwaslab-3.4.36 → gwaslab-3.4.37}/src/gwaslab/util_ex_gwascatalog.py +0 -0
  49. {gwaslab-3.4.36 → gwaslab-3.4.37}/src/gwaslab/util_ex_ldproxyfinder.py +0 -0
  50. {gwaslab-3.4.36 → gwaslab-3.4.37}/src/gwaslab/util_ex_plink_filter.py +0 -0
  51. {gwaslab-3.4.36 → gwaslab-3.4.37}/src/gwaslab/util_ex_process_ref.py +0 -0
  52. {gwaslab-3.4.36 → gwaslab-3.4.37}/src/gwaslab/util_ex_run_2samplemr.py +0 -0
  53. {gwaslab-3.4.36 → gwaslab-3.4.37}/src/gwaslab/util_ex_run_clumping.py +0 -0
  54. {gwaslab-3.4.36 → gwaslab-3.4.37}/src/gwaslab/util_ex_run_coloc.py +0 -0
  55. {gwaslab-3.4.36 → gwaslab-3.4.37}/src/gwaslab/util_ex_run_susie.py +0 -0
  56. {gwaslab-3.4.36 → gwaslab-3.4.37}/src/gwaslab/util_in_calculate_gc.py +0 -0
  57. {gwaslab-3.4.36 → gwaslab-3.4.37}/src/gwaslab/util_in_calculate_power.py +0 -0
  58. {gwaslab-3.4.36 → gwaslab-3.4.37}/src/gwaslab/util_in_convert_h2.py +0 -0
  59. {gwaslab-3.4.36 → gwaslab-3.4.37}/src/gwaslab/util_in_correct_winnerscurse.py +0 -0
  60. {gwaslab-3.4.36 → gwaslab-3.4.37}/src/gwaslab/util_in_filter_value.py +0 -0
  61. {gwaslab-3.4.36 → gwaslab-3.4.37}/src/gwaslab/util_in_get_density.py +0 -0
  62. {gwaslab-3.4.36 → gwaslab-3.4.37}/src/gwaslab/util_in_get_sig.py +0 -0
  63. {gwaslab-3.4.36 → gwaslab-3.4.37}/src/gwaslab/viz_aux_annotate_plot.py +0 -0
  64. {gwaslab-3.4.36 → gwaslab-3.4.37}/src/gwaslab/viz_aux_reposition_text.py +0 -0
  65. {gwaslab-3.4.36 → gwaslab-3.4.37}/src/gwaslab/viz_aux_save_figure.py +0 -0
  66. {gwaslab-3.4.36 → gwaslab-3.4.37}/src/gwaslab/viz_plot_compare_af.py +0 -0
  67. {gwaslab-3.4.36 → gwaslab-3.4.37}/src/gwaslab/viz_plot_forestplot.py +0 -0
  68. {gwaslab-3.4.36 → gwaslab-3.4.37}/src/gwaslab/viz_plot_miamiplot.py +0 -0
  69. {gwaslab-3.4.36 → gwaslab-3.4.37}/src/gwaslab/viz_plot_miamiplot2.py +0 -0
  70. {gwaslab-3.4.36 → gwaslab-3.4.37}/src/gwaslab/viz_plot_qqplot.py +0 -0
  71. {gwaslab-3.4.36 → gwaslab-3.4.37}/src/gwaslab/viz_plot_rg_heatmap.py +0 -0
  72. {gwaslab-3.4.36 → gwaslab-3.4.37}/src/gwaslab/viz_plot_stackedregional.py +0 -0
  73. {gwaslab-3.4.36 → gwaslab-3.4.37}/src/gwaslab/viz_plot_trumpetplot.py +0 -0
  74. {gwaslab-3.4.36 → gwaslab-3.4.37}/src/gwaslab.egg-info/SOURCES.txt +0 -0
  75. {gwaslab-3.4.36 → gwaslab-3.4.37}/src/gwaslab.egg-info/dependency_links.txt +0 -0
  76. {gwaslab-3.4.36 → gwaslab-3.4.37}/src/gwaslab.egg-info/requires.txt +0 -0
  77. {gwaslab-3.4.36 → gwaslab-3.4.37}/src/gwaslab.egg-info/top_level.txt +0 -0
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.1
2
2
  Name: gwaslab
3
- Version: 3.4.36
3
+ Version: 3.4.37
4
4
  Summary: A collection of handy tools for GWAS SumStats
5
5
  Author-email: Yunye <yunye@gwaslab.com>
6
6
  Project-URL: Homepage, https://cloufield.github.io/gwaslab/
@@ -8,7 +8,7 @@ Project-URL: Github, https://github.com/Cloufield/gwaslab
8
8
  Classifier: Programming Language :: Python :: 3
9
9
  Classifier: License :: OSI Approved :: MIT License
10
10
  Classifier: Operating System :: OS Independent
11
- Requires-Python: <=3.10,>=3.8
11
+ Requires-Python: <=3.10,>=3.9
12
12
  Description-Content-Type: text/markdown
13
13
  License-File: LICENSE
14
14
  Requires-Dist: pandas!=1.5,>=1.3
@@ -7,7 +7,7 @@ build-backend = "setuptools.build_meta"
7
7
 
8
8
  [project]
9
9
  name = "gwaslab"
10
- version = "3.4.36"
10
+ version = "3.4.37"
11
11
  authors = [
12
12
  { name="Yunye", email="yunye@gwaslab.com" },
13
13
  ]
@@ -30,7 +30,7 @@ dependencies = [
30
30
  "gtfparse==1.3.0"
31
31
  ]
32
32
 
33
- requires-python = ">=3.8,<=3.10"
33
+ requires-python = ">=3.9,<=3.10"
34
34
  classifiers = [
35
35
  "Programming Language :: Python :: 3",
36
36
  "License :: OSI Approved :: MIT License",
@@ -42,6 +42,6 @@ from gwaslab.util_in_calculate_power import get_power
42
42
  from gwaslab.util_in_calculate_power import get_beta
43
43
  from gwaslab.viz_plot_trumpetplot import plot_power
44
44
  from gwaslab.viz_plot_trumpetplot import plot_power_x
45
- from gwaslab.util_ex_process_h5 import process_ref_vcf
45
+ from gwaslab.util_ex_process_h5 import process_vcf_to_hfd5
46
46
  from gwaslab.util_ex_run_susie import _run_susie_rss as run_susie_rss
47
47
  from gwaslab.io_read_tabular import _read_tabular as read_tabular
@@ -52,7 +52,8 @@ from gwaslab.bd_common_data import get_format_dict
52
52
  from gwaslab.bd_common_data import get_formats_list
53
53
  from gwaslab.g_version import _show_version
54
54
  from gwaslab.g_version import gwaslab_info
55
- from gwaslab.g_meta import init_meta
55
+ from gwaslab.g_meta import _init_meta
56
+ from gwaslab.g_meta import _append_meta_record
56
57
  from gwaslab.util_ex_run_clumping import _clump
57
58
  from gwaslab.util_ex_calculate_ldmatrix import tofinemapping
58
59
  from gwaslab.util_ex_calculate_prs import _calculate_prs
@@ -60,6 +61,7 @@ from gwaslab.viz_plot_mqqplot import mqqplot
60
61
  from gwaslab.viz_plot_trumpetplot import plottrumpet
61
62
  from gwaslab.viz_plot_compare_af import plotdaf
62
63
  from gwaslab.util_ex_run_susie import _run_susie_rss
64
+ from gwaslab.qc_fix_sumstats import _check_data_consistency
63
65
  import gc
64
66
 
65
67
  #20220309
@@ -119,10 +121,9 @@ class Sumstats():
119
121
  self.log = Log()
120
122
 
121
123
  # meta information
122
- self.meta = init_meta()
124
+ self.meta = _init_meta()
123
125
  self.build = build
124
- self.meta["gwaslab"]["study_name"] = study
125
- #self.meta["gwaslab"]["genome_build"] = build
126
+ self.meta["gwaslab"]["study_name"] = study
126
127
  self.meta["gwaslab"]["species"] = species
127
128
 
128
129
  # initialize attributes for clumping and finmapping
@@ -217,8 +218,22 @@ class Sumstats():
217
218
  return lookupstatus(self.data[status])
218
219
 
219
220
  def set_build(self, build, verbose=True):
220
- self.data = _set_build(self.data, build=build, log=self.log,verbose=verbose)
221
+ self.data, self.meta["gwaslab"]["genome_build"] = _set_build(self.data, build=build, log=self.log,verbose=verbose)
221
222
  gc.collect()
223
+
224
+ def infer_build(self,**args):
225
+ self.data, self.meta["gwaslab"]["genome_build"] = inferbuild(self.data,**args)
226
+
227
+ def liftover(self,to_build, from_build=None,**args):
228
+ if from_build is None:
229
+ if self.meta["gwaslab"]["genome_build"]=="99":
230
+ self.data, self.meta["gwaslab"]["genome_build"] = inferbuild(self.data,**args)
231
+ from_build = self.meta["gwaslab"]["genome_build"]
232
+ self.data = parallelizeliftovervariant(self.data,from_build=from_build, to_build=to_build, log=self.log,**args)
233
+ self.meta["is_sorted"] = False
234
+ self.meta["is_harmonised"] = False
235
+ self.meta["gwaslab"]["genome_build"]=to_build
236
+
222
237
  # QC ######################################################################################
223
238
  #clean the sumstats with one line
224
239
  def basic_check(self,
@@ -231,6 +246,7 @@ class Sumstats():
231
246
  fixpos_args={},
232
247
  fixallele_args={},
233
248
  sanitycheckstats_args={},
249
+ consistencycheck_args={},
234
250
  normalize=True,
235
251
  normalizeallele_args={},
236
252
  verbose=True):
@@ -241,6 +257,8 @@ class Sumstats():
241
257
  self.data = fixpos(self.data,log=self.log,remove=remove,verbose=verbose,**fixpos_args)
242
258
  self.data = fixallele(self.data,log=self.log,remove=remove,verbose=verbose,**fixallele_args)
243
259
  self.data = sanitycheckstats(self.data,log=self.log,verbose=verbose,**sanitycheckstats_args)
260
+ _check_data_consistency(self.data,log=self.log,verbose=verbose,**consistencycheck_args)
261
+
244
262
  if normalize is True:
245
263
  self.data = parallelnormalizeallele(self.data,n_cores=n_cores,verbose=verbose,log=self.log,**normalizeallele_args)
246
264
  if remove_dup is True:
@@ -329,9 +347,9 @@ class Sumstats():
329
347
 
330
348
  self.data= parallelinferstrand(self.data,ref_infer = ref_infer,ref_alt_freq=ref_alt_freq,maf_threshold=maf_threshold,
331
349
  n_cores=n_cores,log=self.log,**inferstrand_args)
332
-
333
- self.meta["gwaslab"]["references"]["ref_infer"] = ref_infer
334
350
 
351
+ self.meta["gwaslab"]["references"]["ref_infer"] = _append_meta_record(self.meta["gwaslab"]["references"]["ref_infer"] , ref_infer)
352
+
335
353
  self.data =flipallelestats(self.data,log=self.log,**flipallelestats_args)
336
354
 
337
355
  gc.collect()
@@ -341,13 +359,18 @@ class Sumstats():
341
359
 
342
360
  self.data = parallelizeassignrsid(self.data,path=ref_rsid_tsv,ref_mode="tsv",
343
361
  n_cores=n_cores,log=self.log,**assignrsid_args)
362
+
363
+
364
+
344
365
  self.meta["gwaslab"]["references"]["ref_rsid_tsv"] = ref_rsid_tsv
345
366
  gc.collect()
367
+
346
368
  if ref_rsid_vcf is not None:
347
-
348
369
  self.data = parallelizeassignrsid(self.data,path=ref_rsid_vcf,ref_mode="vcf",
349
370
  n_cores=n_cores,log=self.log,**assignrsid_args)
350
- self.meta["gwaslab"]["references"]["ref_rsid_vcf"] = ref_rsid_vcf
371
+
372
+ self.meta["gwaslab"]["references"]["ref_rsid_vcf"] = _append_meta_record(self.meta["gwaslab"]["references"]["ref_rsid_vcf"] , ref_rsid_vcf)
373
+
351
374
  gc.collect()
352
375
  ######################################################
353
376
  if remove is True:
@@ -376,17 +399,23 @@ class Sumstats():
376
399
  self.data = removedup(self.data,log=self.log,**args)
377
400
  def check_sanity(self,**args):
378
401
  self.data = sanitycheckstats(self.data,log=self.log,**args)
379
- #
402
+ def check_data_consistency(self, **args):
403
+ _check_data_consistency(self.data,log=self.log,**args)
380
404
  def check_id(self,**args):
381
405
  pass
382
- def check_ref(self,**args):
383
- self.data = checkref(self.data,log=self.log,**args)
384
- def infer_strand(self,**args):
385
- self.data = parallelinferstrand(self.data,log=self.log,**args)
406
+
407
+ def check_ref(self,ref_seq,**args):
408
+ self.meta["gwaslab"]["references"]["ref_seq"] = ref_seq
409
+ self.data = checkref(self.data,ref_seq,log=self.log,**args)
410
+ def infer_strand(self,ref_infer,**args):
411
+ self.meta["gwaslab"]["references"]["ref_infer"] = _append_meta_record(self.meta["gwaslab"]["references"]["ref_infer"] , ref_infer)
412
+ self.data = parallelinferstrand(self.data,ref_infer=ref_infer,log=self.log,**args)
413
+
386
414
  def flip_allele_stats(self,**args):
387
415
  self.data = flipallelestats(self.data,log=self.log,**args)
388
416
  def normalize_allele(self,**args):
389
417
  self.data = parallelnormalizeallele(self.data,log=self.log,**args)
418
+
390
419
  def assign_rsid(self,
391
420
  ref_rsid_tsv=None,
392
421
  ref_rsid_vcf=None,
@@ -396,21 +425,15 @@ class Sumstats():
396
425
  self.meta["gwaslab"]["references"]["ref_rsid_tsv"] = ref_rsid_tsv
397
426
  if ref_rsid_vcf is not None:
398
427
  self.data = parallelizeassignrsid(self.data,path=ref_rsid_vcf,ref_mode="vcf",log=self.log,**args)
399
- self.meta["gwaslab"]["references"]["ref_rsid_vcf"] = ref_rsid_vcf
428
+ self.meta["gwaslab"]["references"]["ref_rsid_vcf"] = _append_meta_record(self.meta["gwaslab"]["references"]["ref_rsid_vcf"] , ref_rsid_vcf)
429
+
400
430
  def rsid_to_chrpos(self,**args):
401
431
  self.data = rsidtochrpos(self.data,log=self.log,**args)
432
+
402
433
  def rsid_to_chrpos2(self,**args):
403
434
  self.data = parallelrsidtochrpos(self.data,log=self.log,**args)
404
435
 
405
- def liftover(self,to_build, from_build=None,**args):
406
- if from_build is None:
407
- if self.meta["gwaslab"]["genome_build"]=="99":
408
- self.data, self.meta["gwaslab"]["genome_build"] = inferbuild(self.data,**args)
409
- from_build = self.meta["gwaslab"]["genome_build"]
410
- self.data = parallelizeliftovervariant(self.data,from_build=from_build, to_build=to_build, log=self.log,**args)
411
- self.meta["is_sorted"] = False
412
- self.meta["is_harmonised"] = False
413
- self.meta["gwaslab"]["genome_build"]=to_build
436
+
414
437
  ############################################################################################################
415
438
 
416
439
  def sort_coordinate(self,**sort_args):
@@ -420,11 +443,10 @@ class Sumstats():
420
443
  self.data = sortcolumn(self.data,log=self.log,**args)
421
444
 
422
445
  ############################################################################################################
423
- def fill_data(self, **args):
424
- self.data = filldata(self.data,**args)
425
-
426
- def infer_build(self,**args):
427
- self.data, self.meta["gwaslab"]["genome_build"] = inferbuild(self.data,**args)
446
+ def fill_data(self, verbose=True, **args):
447
+ self.data = filldata(self.data, verbose=verbose, **args)
448
+ self.data = sortcolumn(self.data, verbose=verbose, log=self.log)
449
+
428
450
  # utilities ############################################################################################################
429
451
  # filter series ######################################################################
430
452
  def get_flanking(self, inplace=False,**args):
@@ -485,11 +507,12 @@ class Sumstats():
485
507
 
486
508
  def check_af(self,ref_infer,**args):
487
509
  self.data = parallelecheckaf(self.data,ref_infer=ref_infer,log=self.log,**args)
488
- self.meta["gwaslab"]["references"]["ref_infer_daf"] = ref_infer
489
-
510
+ self.meta["gwaslab"]["references"]["ref_infer_daf"] = _append_meta_record(self.meta["gwaslab"]["references"]["ref_infer_daf"] , ref_infer)
511
+
490
512
  def infer_af(self,ref_infer,**args):
491
513
  self.data = paralleleinferaf(self.data,ref_infer=ref_infer,log=self.log,**args)
492
514
  self.meta["gwaslab"]["references"]["ref_infer_af"] = ref_infer
515
+ self.meta["gwaslab"]["references"]["ref_infer_af"] = _append_meta_record(self.meta["gwaslab"]["references"]["ref_infer_af"] , ref_infer)
493
516
 
494
517
  def plot_daf(self, **args):
495
518
  fig,outliers = plotdaf(self.data, **args)
@@ -1,6 +1,6 @@
1
1
  from gwaslab.g_version import gwaslab_info
2
2
 
3
- def init_meta():
3
+ def _init_meta():
4
4
  metadata = {"gwaslab":{
5
5
  "gwaslab_version": gwaslab_info()["version"],
6
6
  "study_name":"Sumstats_1",
@@ -23,7 +23,11 @@ def init_meta():
23
23
  "ref_rsid_tsv":"Unknown",
24
24
  "ref_rsid_vcf":"Unknown",
25
25
  "ref_seq":"Unknown",
26
- "ref_infer":"Unknown"
26
+ "ref_infer":"Unknown",
27
+ "ref_infer_af":"Unknown",
28
+ "ref_infer_daf":"Unknown",
29
+ "ref_rsid_to_chrpos_tsv":"Unknown",
30
+ "ref_rsid_to_chrpos_vcf":"Unknown"
27
31
  }
28
32
  },
29
33
  "genotyping_technology":"Unknown",
@@ -45,4 +49,10 @@ def init_meta():
45
49
  "coordinate_system":"1-based",
46
50
  "sex": "M|F|combined"
47
51
  }
48
- return metadata.copy()
52
+ return metadata.copy()
53
+
54
+ def _append_meta_record(old, new):
55
+ if old == "Unknown" or old== "Unchecked":
56
+ return new
57
+ else:
58
+ return "{}, {}".format(old, new)
@@ -15,8 +15,8 @@ def _get_version():
15
15
  def gwaslab_info():
16
16
  # version meta information
17
17
  dic={
18
- "version":"3.4.36",
19
- "release_date":"20240123"
18
+ "version":"3.4.37",
19
+ "release_date":"20240129"
20
20
  }
21
21
  return dic
22
22
 
@@ -11,6 +11,8 @@ import gc
11
11
  from gwaslab.g_Log import Log
12
12
  from gwaslab.qc_fix_sumstats import fixchr
13
13
  from gwaslab.qc_fix_sumstats import fixpos
14
+ from gwaslab.qc_fix_sumstats import sortcolumn
15
+ from gwaslab.qc_check_datatype import check_dataframe_shape
14
16
  from gwaslab.bd_common_data import get_number_to_chr
15
17
  from gwaslab.bd_common_data import get_chr_list
16
18
  from gwaslab.bd_common_data import get_chr_to_number
@@ -27,7 +29,7 @@ from gwaslab.g_version import _get_version
27
29
 
28
30
  ###~!!!!
29
31
  def rsidtochrpos(sumstats,
30
- path="", snpid="SNPID",
32
+ path=None, ref_rsid_to_chrpos_tsv=None, snpid="SNPID",
31
33
  rsid="rsID", chrom="CHR",pos="POS",ref_rsid="rsID",ref_chr="CHR",ref_pos="POS", build="19",
32
34
  overwrite=False,remove=False,chunksize=5000000,verbose=True,log=Log()):
33
35
  '''
@@ -35,9 +37,12 @@ def rsidtochrpos(sumstats,
35
37
  '''
36
38
  #########################################################################################################
37
39
  if verbose: log.write("Start to update chromosome and position information based on rsID...{}".format(_get_version()))
38
- if verbose: log.write(" -Current Dataframe shape :",len(sumstats)," x ", len(sumstats.columns))
40
+ check_dataframe_shape(sumstats, log, verbose)
39
41
  if verbose: log.write(" -rsID dictionary file: "+ path)
40
42
 
43
+ if ref_rsid_to_chrpos_tsv is not None:
44
+ path = ref_rsid_to_chrpos_tsv
45
+
41
46
  if snpid in sumstats.columns and sum(sumstats[rsid].isna())>0:
42
47
  if verbose: log.write(" -Filling na in rsID columns with SNPID...")
43
48
  sumstats.loc[sumstats[rsid].isna(),rsid] = sumstats.loc[sumstats[rsid].isna(),snpid]
@@ -75,6 +80,7 @@ def rsidtochrpos(sumstats,
75
80
  if verbose: log.write(" -Updating CHR and POS finished.Start to re-fixing CHR and POS... ")
76
81
  sumstats = fixchr(sumstats,verbose=verbose)
77
82
  sumstats = fixpos(sumstats,verbose=verbose)
83
+ sumstats = sortcolumn(sumstats,verbose=verbose)
78
84
  return sumstats
79
85
  ####################################################################################################
80
86
 
@@ -96,9 +102,19 @@ def merge_chrpos(sumstats_part,all_groups_max,path,build,status):
96
102
  return sumstats_part
97
103
 
98
104
 
99
- def parallelrsidtochrpos(sumstats, rsid="rsID", chrom="CHR",pos="POS", path=None,build="99",status="STATUS",
105
+ def parallelrsidtochrpos(sumstats, rsid="rsID", chrom="CHR",pos="POS", path=None, ref_rsid_to_chrpos_vcf = None, ref_rsid_to_chrpos_hdf5 = None, build="99",status="STATUS",
100
106
  n_cores=4,block_size=20000000,verbose=True,log=Log()):
107
+
108
+ if ref_rsid_to_chrpos_hdf5 is not None:
109
+ path = ref_rsid_to_chrpos_hdf5
110
+ elif ref_rsid_to_chrpos_vcf is not None:
111
+ vcf_file_name = os.path.basename(ref_rsid_to_chrpos_vcf)
112
+ vcf_dir_path = os.path.dirname(ref_rsid_to_chrpos_vcf)
113
+ path = "{}/{}.rsID_CHR_POS_groups_{}.h5".format(vcf_dir_path,vcf_file_name,int(block_size))
114
+
101
115
  if verbose: log.write("Start to assign CHR and POS using rsIDs...{}".format(_get_version()))
116
+ check_dataframe_shape(sumstats, log, verbose)
117
+
102
118
  if path is None:
103
119
  raise ValueError("Please provide path to hdf5 file.")
104
120
 
@@ -164,13 +180,16 @@ def parallelrsidtochrpos(sumstats, rsid="rsID", chrom="CHR",pos="POS", path=None
164
180
  # merge back
165
181
  if verbose: log.write(" -Append data... ")
166
182
  sumstats = pd.concat([sumstats_rs,sumstats_nonrs],ignore_index=True)
183
+
167
184
  del sumstats_rs
168
185
  del sumstats_nonrs
169
186
  gc.collect()
170
187
 
171
188
  # check
172
- sumstats = fixchr(sumstats,verbose=True)
173
- sumstats = fixpos(sumstats,verbose=True)
189
+ sumstats = fixchr(sumstats,verbose=verbose)
190
+ sumstats = fixpos(sumstats,verbose=verbose)
191
+ sumstats = sortcolumn(sumstats,verbose=verbose)
192
+
174
193
  pool.close()
175
194
  pool.join()
176
195
  gc.collect()
@@ -235,7 +254,7 @@ def check_status(row,record):
235
254
 
236
255
  def checkref(sumstats,ref_path,chrom="CHR",pos="POS",ea="EA",nea="NEA",status="STATUS",chr_dict=get_chr_to_number(),remove=False,verbose=True,log=Log()):
237
256
  if verbose: log.write("Start to check if NEA is aligned with reference sequence...{}".format(_get_version()))
238
- if verbose: log.write(" -Current Dataframe shape :",len(sumstats)," x ", len(sumstats.columns))
257
+ check_dataframe_shape(sumstats, log, verbose)
239
258
  if verbose: log.write(" -Reference genome fasta file: "+ ref_path)
240
259
  if verbose: log.write(" -Checking records: ", end="")
241
260
  chromlist = get_chr_list(add_number=True)
@@ -381,7 +400,7 @@ def parallelizeassignrsid(sumstats, path, ref_mode="vcf",snpid="SNPID",rsid="rsI
381
400
  assign rsID based on chr:pos
382
401
  '''
383
402
  if verbose: log.write("Start to annotate rsID based on chromosome and position information...{}".format(_get_version()))
384
- if verbose: log.write(" -Current Dataframe shape :",len(sumstats)," x ", len(sumstats.columns))
403
+ check_dataframe_shape(sumstats, log, verbose)
385
404
  if verbose: log.write(" -SNPID-rsID text file: "+ path)
386
405
 
387
406
  standardized_normalized = sumstats["STATUS"].str.match("\w\w\w[0][01234][0126]\w", case=False, flags=0, na=False)
@@ -517,7 +536,7 @@ def parallelinferstrand(sumstats,ref_infer,ref_alt_freq=None,maf_threshold=0.40,
517
536
  chr="CHR",pos="POS",ref="NEA",alt="EA",eaf="EAF",status="STATUS",
518
537
  chr_dict=None,verbose=True,log=Log()):
519
538
  if verbose: log.write("Start to infer strand for palindromic SNPs...{}".format(_get_version()))
520
- if verbose: log.write(" -Current Dataframe shape :",len(sumstats)," x ", len(sumstats.columns))
539
+ check_dataframe_shape(sumstats, log, verbose)
521
540
  if verbose: log.write(" -Reference vcf file:", ref_infer)
522
541
 
523
542
  chr_dict = auto_check_vcf_chr_dict(ref_infer, chr_dict, verbose, log)
@@ -528,8 +547,8 @@ def parallelinferstrand(sumstats,ref_infer,ref_alt_freq=None,maf_threshold=0.40,
528
547
  if "p" in mode:
529
548
  # ref_alt_freq INFO in vcf was provided
530
549
  if ref_alt_freq is not None:
550
+
531
551
  if verbose: log.write(" -Alternative allele frequency in INFO:", ref_alt_freq)
532
-
533
552
  ## checking \w\w\w\w[0]\w\w -> standardized and normalized snp
534
553
  good_chrpos = sumstats[status].str.match(r'\w\w\w[0][0]\w\w', case=False, flags=0, na=False)
535
554
  palindromic = good_chrpos & is_palindromic(sumstats[[ref,alt]],a1=ref,a2=alt)
@@ -538,22 +557,28 @@ def parallelinferstrand(sumstats,ref_infer,ref_alt_freq=None,maf_threshold=0.40,
538
557
  ##not palindromic : change status
539
558
  sumstats.loc[not_palindromic_snp,status] = vchange_status(sumstats.loc[not_palindromic_snp,status], 7 ,"9","0")
540
559
  if verbose: log.write(" -Identified ", sum(palindromic)," palindromic SNPs...")
541
-
560
+
542
561
  #palindromic but can not infer
543
- maf_can_infer = (sumstats.loc[:,eaf] < maf_threshold) | (sumstats.loc[:,eaf] > 1 - maf_threshold)
562
+ maf_can_infer = (sumstats.loc[:,eaf] < maf_threshold) | (sumstats.loc[:,eaf] > 1 - maf_threshold)
563
+
544
564
  sumstats.loc[palindromic&(~maf_can_infer),status] = vchange_status(sumstats.loc[palindromic&(~maf_can_infer),status],7,"9","7")
565
+
566
+ #palindromic WITH UNKNWON OR UNCHECKED STATUS
567
+ unknow_palindromic = sumstats[status].str.match(r'\w\w\w\w\w[012][89]', case=False, flags=0, na=False)
545
568
 
569
+ unknow_palindromic_to_check = palindromic & maf_can_infer & unknow_palindromic
570
+
571
+ if verbose: log.write(" -After filtering by MAF< {} , {} palindromic SNPs with unknown strand will be inferred...".format(maf_threshold, sum(unknow_palindromic_to_check)))
546
572
 
547
- if verbose: log.write(" -After filtering by MAF< ", maf_threshold ," , the strand of ", sum(palindromic & maf_can_infer)," palindromic SNPs will be inferred...")
548
573
  #########################################################################################
549
- if sum(palindromic & maf_can_infer)>0:
550
- if sum(palindromic & maf_can_infer)<10000:
574
+ if sum(unknow_palindromic_to_check)>0:
575
+ if sum(unknow_palindromic_to_check)<10000:
551
576
  n_cores=1
552
- df_split = np.array_split(sumstats.loc[(palindromic & maf_can_infer),[chr,pos,ref,alt,eaf,status]], n_cores)
577
+ df_split = np.array_split(sumstats.loc[unknow_palindromic_to_check,[chr,pos,ref,alt,eaf,status]], n_cores)
553
578
  pool = Pool(n_cores)
554
579
  map_func = partial(check_strand,chr=chr,pos=pos,ref=ref,alt=alt,eaf=eaf,status=status,ref_infer=ref_infer,ref_alt_freq=ref_alt_freq,chr_dict=chr_dict)
555
580
  status_inferred = pd.concat(pool.map(map_func,df_split))
556
- sumstats.loc[(palindromic & maf_can_infer),status] = status_inferred.values
581
+ sumstats.loc[unknow_palindromic_to_check,status] = status_inferred.values
557
582
  pool.close()
558
583
  pool.join()
559
584
  #########################################################################################
@@ -650,7 +675,7 @@ def parallelinferstrand(sumstats,ref_infer,ref_alt_freq=None,maf_threshold=0.40,
650
675
  def parallelecheckaf(sumstats,ref_infer,ref_alt_freq=None,maf_threshold=0.4,column_name="DAF",suffix="",n_cores=1, chr="CHR",pos="POS",ref="NEA",alt="EA",eaf="EAF",status="STATUS",chr_dict=None,force=False, verbose=True,log=Log()):
651
676
 
652
677
  if verbose: log.write("Start to check the difference between EAF and reference vcf alt frequency ...{}".format(_get_version()))
653
- if verbose: log.write(" -Current Dataframe shape :",len(sumstats)," x ", len(sumstats.columns))
678
+ check_dataframe_shape(sumstats, log, verbose)
654
679
  if verbose: log.write(" -Reference vcf file:", ref_infer)
655
680
  if verbose: log.write(" -CPU Cores to use :",n_cores)
656
681
 
@@ -718,7 +743,7 @@ def check_daf(chr,start,end,ref,alt,eaf,vcf_reader,alt_freq,chr_dict=None):
718
743
  def paralleleinferaf(sumstats,ref_infer,ref_alt_freq=None,n_cores=1, chr="CHR",pos="POS",ref="NEA",alt="EA",eaf="EAF",status="STATUS",chr_dict=None,force=False, verbose=True,log=Log()):
719
744
 
720
745
  if verbose: log.write("Start to infer the AF and reference vcf alt frequency ...{}".format(_get_version()))
721
- if verbose: log.write(" -Current Dataframe shape :",len(sumstats)," x ", len(sumstats.columns))
746
+ check_dataframe_shape(sumstats, log, verbose)
722
747
  if verbose: log.write(" -Reference vcf file:", ref_infer)
723
748
  if verbose: log.write(" -CPU Cores to use :",n_cores)
724
749
 
@@ -8,6 +8,7 @@ from gwaslab.bd_common_data import get_format_dict
8
8
  from gwaslab.qc_fix_sumstats import sortcolumn
9
9
  from gwaslab.qc_fix_sumstats import _process_build
10
10
  from gwaslab.qc_check_datatype import check_datatype
11
+ from gwaslab.qc_check_datatype import check_dataframe_memory_usage
11
12
 
12
13
  #20221030
13
14
  def preformat(sumstats,
@@ -353,6 +354,8 @@ def preformat(sumstats,
353
354
  sumstats = sortcolumn(sumstats=sumstats,log=log,verbose=verbose)
354
355
  check_datatype(sumstats,log=log,verbose=verbose)
355
356
  gc.collect()
357
+ check_dataframe_memory_usage(sumstats,log=log,verbose=verbose)
358
+
356
359
  if verbose: log.write("Finished loading data successfully!")
357
360
  return sumstats
358
361
 
@@ -87,3 +87,17 @@ def verify_datatype(header, dtype):
87
87
  return "F"
88
88
  else:
89
89
  return "NA"
90
+
91
+ def check_dataframe_shape(sumstats, log, verbose):
92
+ memory_in_mb = sumstats.memory_usage().sum()/1024/1024
93
+ try:
94
+ log.write(" -Current Dataframe shape : {} x {} ; Memory usage: {:.2f} MB".format(len(sumstats),len(sumstats.columns),memory_in_mb), verbose=verbose)
95
+ except:
96
+ log.write(" -WARNING! Error: cannot get Dataframe shape...", verbose=verbose)
97
+
98
+ def check_dataframe_memory_usage(sumstats, log, verbose):
99
+ memory_in_mb = sumstats.memory_usage().sum()/1024/1024
100
+ try:
101
+ log.write(" -Current Dataframe memory usage: {:.2f} MB".format(memory_in_mb), verbose=verbose)
102
+ except:
103
+ log.write(" -WARNING! Error: cannot get Memory usage...", verbose=verbose)