gwaslab 3.4.36__py3-none-any.whl → 3.4.38__py3-none-any.whl

This diff shows the contents of publicly available package versions that have been released to one of the supported registries. It is provided for informational purposes only and reflects the changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of gwaslab might be problematic. Click here for more details.

Files changed (42) hide show
  1. gwaslab/__init__.py +1 -1
  2. gwaslab/data/formatbook.json +722 -721
  3. gwaslab/g_Log.py +8 -0
  4. gwaslab/g_Sumstats.py +80 -178
  5. gwaslab/g_SumstatsPair.py +6 -2
  6. gwaslab/g_Sumstats_summary.py +3 -3
  7. gwaslab/g_meta.py +13 -3
  8. gwaslab/g_version.py +2 -2
  9. gwaslab/hm_casting.py +29 -15
  10. gwaslab/hm_harmonize_sumstats.py +312 -159
  11. gwaslab/hm_rsid_to_chrpos.py +1 -1
  12. gwaslab/io_preformat_input.py +46 -37
  13. gwaslab/io_to_formats.py +428 -295
  14. gwaslab/qc_check_datatype.py +15 -1
  15. gwaslab/qc_fix_sumstats.py +956 -719
  16. gwaslab/util_ex_calculate_ldmatrix.py +29 -11
  17. gwaslab/util_ex_gwascatalog.py +1 -1
  18. gwaslab/util_ex_ldproxyfinder.py +1 -1
  19. gwaslab/util_ex_process_h5.py +26 -17
  20. gwaslab/util_ex_process_ref.py +3 -3
  21. gwaslab/util_ex_run_coloc.py +26 -4
  22. gwaslab/util_in_convert_h2.py +1 -1
  23. gwaslab/util_in_fill_data.py +44 -5
  24. gwaslab/util_in_filter_value.py +122 -34
  25. gwaslab/util_in_get_density.py +2 -2
  26. gwaslab/util_in_get_sig.py +41 -9
  27. gwaslab/viz_aux_quickfix.py +26 -21
  28. gwaslab/viz_aux_reposition_text.py +7 -4
  29. gwaslab/viz_aux_save_figure.py +6 -5
  30. gwaslab/viz_plot_compare_af.py +5 -5
  31. gwaslab/viz_plot_compare_effect.py +22 -5
  32. gwaslab/viz_plot_miamiplot2.py +28 -20
  33. gwaslab/viz_plot_mqqplot.py +214 -98
  34. gwaslab/viz_plot_qqplot.py +11 -8
  35. gwaslab/viz_plot_regionalplot.py +16 -9
  36. gwaslab/viz_plot_trumpetplot.py +15 -6
  37. {gwaslab-3.4.36.dist-info → gwaslab-3.4.38.dist-info}/METADATA +3 -3
  38. gwaslab-3.4.38.dist-info/RECORD +72 -0
  39. gwaslab-3.4.36.dist-info/RECORD +0 -72
  40. {gwaslab-3.4.36.dist-info → gwaslab-3.4.38.dist-info}/LICENSE +0 -0
  41. {gwaslab-3.4.36.dist-info → gwaslab-3.4.38.dist-info}/WHEEL +0 -0
  42. {gwaslab-3.4.36.dist-info → gwaslab-3.4.38.dist-info}/top_level.txt +0 -0
gwaslab/g_Log.py CHANGED
@@ -2,6 +2,7 @@ import time
2
2
  class Log():
3
3
  def __init__(self):
4
4
  self.log_text=str(time.ctime(time.time()))+ " " + "Sumstats Object created."+ "\n"
5
+
5
6
  def write(self,*message,end="\n",show_time=True, verbose=True):
6
7
  if show_time is True:
7
8
  if verbose: print(str(time.ctime(time.time())),*message,end=end)
@@ -9,6 +10,13 @@ class Log():
9
10
  else:
10
11
  if verbose: print(*message,end=end)
11
12
  self.log_text = self.log_text + " ".join(map(str,message)) + end
13
+
14
+ def warning(self,*message,end="\n",show_time=True, verbose=True):
15
+ self.write(" #WARNING! {}".format(" ".join(map(str,message))),
16
+ end=end,
17
+ show_time=show_time,
18
+ verbose=verbose)
19
+
12
20
  def show(self):
13
21
  print(self.log_text)
14
22
  def save(self,path,verbose=True):
gwaslab/g_Sumstats.py CHANGED
@@ -5,7 +5,7 @@ import copy
5
5
  from gwaslab.g_Sumstats_summary import summarize
6
6
  from gwaslab.g_Sumstats_summary import lookupstatus
7
7
  from gwaslab.io_preformat_input import preformat
8
- from gwaslab.io_to_formats import tofmt
8
+ from gwaslab.io_to_formats import _to_format
9
9
  from gwaslab.g_Log import Log
10
10
  from gwaslab.qc_fix_sumstats import fixID
11
11
  from gwaslab.qc_fix_sumstats import removedup
@@ -35,6 +35,8 @@ from gwaslab.util_in_filter_value import filterregionout
35
35
  from gwaslab.util_in_filter_value import inferbuild
36
36
  from gwaslab.util_in_filter_value import sampling
37
37
  from gwaslab.util_in_filter_value import _get_flanking
38
+ from gwaslab.util_in_filter_value import _get_flanking_by_chrpos
39
+ from gwaslab.util_in_filter_value import _get_flanking_by_id
38
40
  from gwaslab.util_in_calculate_gc import lambdaGC
39
41
  from gwaslab.util_in_convert_h2 import _get_per_snp_r2
40
42
  from gwaslab.util_in_get_sig import getsig
@@ -52,7 +54,8 @@ from gwaslab.bd_common_data import get_format_dict
52
54
  from gwaslab.bd_common_data import get_formats_list
53
55
  from gwaslab.g_version import _show_version
54
56
  from gwaslab.g_version import gwaslab_info
55
- from gwaslab.g_meta import init_meta
57
+ from gwaslab.g_meta import _init_meta
58
+ from gwaslab.g_meta import _append_meta_record
56
59
  from gwaslab.util_ex_run_clumping import _clump
57
60
  from gwaslab.util_ex_calculate_ldmatrix import tofinemapping
58
61
  from gwaslab.util_ex_calculate_prs import _calculate_prs
@@ -60,6 +63,7 @@ from gwaslab.viz_plot_mqqplot import mqqplot
60
63
  from gwaslab.viz_plot_trumpetplot import plottrumpet
61
64
  from gwaslab.viz_plot_compare_af import plotdaf
62
65
  from gwaslab.util_ex_run_susie import _run_susie_rss
66
+ from gwaslab.qc_fix_sumstats import _check_data_consistency
63
67
  import gc
64
68
 
65
69
  #20220309
@@ -119,10 +123,9 @@ class Sumstats():
119
123
  self.log = Log()
120
124
 
121
125
  # meta information
122
- self.meta = init_meta()
126
+ self.meta = _init_meta()
123
127
  self.build = build
124
- self.meta["gwaslab"]["study_name"] = study
125
- #self.meta["gwaslab"]["genome_build"] = build
128
+ self.meta["gwaslab"]["study_name"] = study
126
129
  self.meta["gwaslab"]["species"] = species
127
130
 
128
131
  # initialize attributes for clumping and finmapping
@@ -217,8 +220,22 @@ class Sumstats():
217
220
  return lookupstatus(self.data[status])
218
221
 
219
222
  def set_build(self, build, verbose=True):
220
- self.data = _set_build(self.data, build=build, log=self.log,verbose=verbose)
223
+ self.data, self.meta["gwaslab"]["genome_build"] = _set_build(self.data, build=build, log=self.log,verbose=verbose)
221
224
  gc.collect()
225
+
226
+ def infer_build(self,**args):
227
+ self.data, self.meta["gwaslab"]["genome_build"] = inferbuild(self.data,**args)
228
+
229
+ def liftover(self,to_build, from_build=None,**args):
230
+ if from_build is None:
231
+ if self.meta["gwaslab"]["genome_build"]=="99":
232
+ self.data, self.meta["gwaslab"]["genome_build"] = inferbuild(self.data,**args)
233
+ from_build = self.meta["gwaslab"]["genome_build"]
234
+ self.data = parallelizeliftovervariant(self.data,from_build=from_build, to_build=to_build, log=self.log,**args)
235
+ self.meta["is_sorted"] = False
236
+ self.meta["is_harmonised"] = False
237
+ self.meta["gwaslab"]["genome_build"]=to_build
238
+
222
239
  # QC ######################################################################################
223
240
  #clean the sumstats with one line
224
241
  def basic_check(self,
@@ -231,6 +248,7 @@ class Sumstats():
231
248
  fixpos_args={},
232
249
  fixallele_args={},
233
250
  sanitycheckstats_args={},
251
+ consistencycheck_args={},
234
252
  normalize=True,
235
253
  normalizeallele_args={},
236
254
  verbose=True):
@@ -241,6 +259,8 @@ class Sumstats():
241
259
  self.data = fixpos(self.data,log=self.log,remove=remove,verbose=verbose,**fixpos_args)
242
260
  self.data = fixallele(self.data,log=self.log,remove=remove,verbose=verbose,**fixallele_args)
243
261
  self.data = sanitycheckstats(self.data,log=self.log,verbose=verbose,**sanitycheckstats_args)
262
+ _check_data_consistency(self.data,log=self.log,verbose=verbose,**consistencycheck_args)
263
+
244
264
  if normalize is True:
245
265
  self.data = parallelnormalizeallele(self.data,n_cores=n_cores,verbose=verbose,log=self.log,**normalizeallele_args)
246
266
  if remove_dup is True:
@@ -329,9 +349,9 @@ class Sumstats():
329
349
 
330
350
  self.data= parallelinferstrand(self.data,ref_infer = ref_infer,ref_alt_freq=ref_alt_freq,maf_threshold=maf_threshold,
331
351
  n_cores=n_cores,log=self.log,**inferstrand_args)
332
-
333
- self.meta["gwaslab"]["references"]["ref_infer"] = ref_infer
334
352
 
353
+ self.meta["gwaslab"]["references"]["ref_infer"] = _append_meta_record(self.meta["gwaslab"]["references"]["ref_infer"] , ref_infer)
354
+
335
355
  self.data =flipallelestats(self.data,log=self.log,**flipallelestats_args)
336
356
 
337
357
  gc.collect()
@@ -341,13 +361,18 @@ class Sumstats():
341
361
 
342
362
  self.data = parallelizeassignrsid(self.data,path=ref_rsid_tsv,ref_mode="tsv",
343
363
  n_cores=n_cores,log=self.log,**assignrsid_args)
364
+
365
+
366
+
344
367
  self.meta["gwaslab"]["references"]["ref_rsid_tsv"] = ref_rsid_tsv
345
368
  gc.collect()
369
+
346
370
  if ref_rsid_vcf is not None:
347
-
348
371
  self.data = parallelizeassignrsid(self.data,path=ref_rsid_vcf,ref_mode="vcf",
349
372
  n_cores=n_cores,log=self.log,**assignrsid_args)
350
- self.meta["gwaslab"]["references"]["ref_rsid_vcf"] = ref_rsid_vcf
373
+
374
+ self.meta["gwaslab"]["references"]["ref_rsid_vcf"] = _append_meta_record(self.meta["gwaslab"]["references"]["ref_rsid_vcf"] , ref_rsid_vcf)
375
+
351
376
  gc.collect()
352
377
  ######################################################
353
378
  if remove is True:
@@ -376,17 +401,23 @@ class Sumstats():
376
401
  self.data = removedup(self.data,log=self.log,**args)
377
402
  def check_sanity(self,**args):
378
403
  self.data = sanitycheckstats(self.data,log=self.log,**args)
379
- #
404
+ def check_data_consistency(self, **args):
405
+ _check_data_consistency(self.data,log=self.log,**args)
380
406
  def check_id(self,**args):
381
407
  pass
382
- def check_ref(self,**args):
383
- self.data = checkref(self.data,log=self.log,**args)
384
- def infer_strand(self,**args):
385
- self.data = parallelinferstrand(self.data,log=self.log,**args)
408
+
409
+ def check_ref(self,ref_seq,**args):
410
+ self.meta["gwaslab"]["references"]["ref_seq"] = ref_seq
411
+ self.data = checkref(self.data,ref_seq,log=self.log,**args)
412
+ def infer_strand(self,ref_infer,**args):
413
+ self.meta["gwaslab"]["references"]["ref_infer"] = _append_meta_record(self.meta["gwaslab"]["references"]["ref_infer"] , ref_infer)
414
+ self.data = parallelinferstrand(self.data,ref_infer=ref_infer,log=self.log,**args)
415
+
386
416
  def flip_allele_stats(self,**args):
387
417
  self.data = flipallelestats(self.data,log=self.log,**args)
388
418
  def normalize_allele(self,**args):
389
419
  self.data = parallelnormalizeallele(self.data,log=self.log,**args)
420
+
390
421
  def assign_rsid(self,
391
422
  ref_rsid_tsv=None,
392
423
  ref_rsid_vcf=None,
@@ -396,21 +427,15 @@ class Sumstats():
396
427
  self.meta["gwaslab"]["references"]["ref_rsid_tsv"] = ref_rsid_tsv
397
428
  if ref_rsid_vcf is not None:
398
429
  self.data = parallelizeassignrsid(self.data,path=ref_rsid_vcf,ref_mode="vcf",log=self.log,**args)
399
- self.meta["gwaslab"]["references"]["ref_rsid_vcf"] = ref_rsid_vcf
430
+ self.meta["gwaslab"]["references"]["ref_rsid_vcf"] = _append_meta_record(self.meta["gwaslab"]["references"]["ref_rsid_vcf"] , ref_rsid_vcf)
431
+
400
432
  def rsid_to_chrpos(self,**args):
401
433
  self.data = rsidtochrpos(self.data,log=self.log,**args)
434
+
402
435
  def rsid_to_chrpos2(self,**args):
403
436
  self.data = parallelrsidtochrpos(self.data,log=self.log,**args)
404
437
 
405
- def liftover(self,to_build, from_build=None,**args):
406
- if from_build is None:
407
- if self.meta["gwaslab"]["genome_build"]=="99":
408
- self.data, self.meta["gwaslab"]["genome_build"] = inferbuild(self.data,**args)
409
- from_build = self.meta["gwaslab"]["genome_build"]
410
- self.data = parallelizeliftovervariant(self.data,from_build=from_build, to_build=to_build, log=self.log,**args)
411
- self.meta["is_sorted"] = False
412
- self.meta["is_harmonised"] = False
413
- self.meta["gwaslab"]["genome_build"]=to_build
438
+
414
439
  ############################################################################################################
415
440
 
416
441
  def sort_coordinate(self,**sort_args):
@@ -420,14 +445,13 @@ class Sumstats():
420
445
  self.data = sortcolumn(self.data,log=self.log,**args)
421
446
 
422
447
  ############################################################################################################
423
- def fill_data(self, **args):
424
- self.data = filldata(self.data,**args)
425
-
426
- def infer_build(self,**args):
427
- self.data, self.meta["gwaslab"]["genome_build"] = inferbuild(self.data,**args)
448
+ def fill_data(self, verbose=True, **args):
449
+ self.data = filldata(self.data, verbose=verbose, **args)
450
+ self.data = sortcolumn(self.data, verbose=verbose, log=self.log)
451
+
428
452
  # utilities ############################################################################################################
429
453
  # filter series ######################################################################
430
- def get_flanking(self, inplace=False,**args):
454
+ def filter_flanking(self, inplace=False,**args):
431
455
  if inplace is False:
432
456
  new_Sumstats_object = copy.deepcopy(self)
433
457
  new_Sumstats_object.data = _get_flanking(new_Sumstats_object.data, **args)
@@ -435,6 +459,22 @@ class Sumstats():
435
459
  else:
436
460
  self.data = _get_flanking(self.data, **args)
437
461
 
462
+ def filter_flanking_by_chrpos(self, chrpos, inplace=False,**args):
463
+ if inplace is False:
464
+ new_Sumstats_object = copy.deepcopy(self)
465
+ new_Sumstats_object.data = _get_flanking_by_chrpos(new_Sumstats_object.data, chrpos, **args)
466
+ return new_Sumstats_object
467
+ else:
468
+ self.data = _get_flanking_by_chrpos(self.data, chrpos,**args)
469
+
470
+ def filter_flanking_by_id(self, snpid, inplace=False,**args):
471
+ if inplace is False:
472
+ new_Sumstats_object = copy.deepcopy(self)
473
+ new_Sumstats_object.data = _get_flanking_by_id(new_Sumstats_object.data, snpid, **args)
474
+ return new_Sumstats_object
475
+ else:
476
+ self.data = _get_flanking_by_id(self.data, snpid, **args)
477
+
438
478
  def filter_value(self, expr, inplace=False, **args):
439
479
  if inplace is False:
440
480
  new_Sumstats_object = copy.deepcopy(self)
@@ -485,11 +525,12 @@ class Sumstats():
485
525
 
486
526
  def check_af(self,ref_infer,**args):
487
527
  self.data = parallelecheckaf(self.data,ref_infer=ref_infer,log=self.log,**args)
488
- self.meta["gwaslab"]["references"]["ref_infer_daf"] = ref_infer
489
-
528
+ self.meta["gwaslab"]["references"]["ref_infer_daf"] = _append_meta_record(self.meta["gwaslab"]["references"]["ref_infer_daf"] , ref_infer)
529
+
490
530
  def infer_af(self,ref_infer,**args):
491
531
  self.data = paralleleinferaf(self.data,ref_infer=ref_infer,log=self.log,**args)
492
532
  self.meta["gwaslab"]["references"]["ref_infer_af"] = ref_infer
533
+ self.meta["gwaslab"]["references"]["ref_infer_af"] = _append_meta_record(self.meta["gwaslab"]["references"]["ref_infer_af"] , ref_infer)
493
534
 
494
535
  def plot_daf(self, **args):
495
536
  fig,outliers = plotdaf(self.data, **args)
@@ -525,8 +566,10 @@ class Sumstats():
525
566
 
526
567
  return plot
527
568
 
528
- def plot_trumpet(self, **args):
529
- fig = plottrumpet(self.data, **args)
569
+ def plot_trumpet(self, build=None, **args):
570
+ if build is None:
571
+ build = self.meta["gwaslab"]["genome_build"]
572
+ fig = plottrumpet(self.data,build = build, **args)
530
573
  return fig
531
574
 
532
575
  def get_lead(self, build=None, gls=False, **args):
@@ -647,148 +690,7 @@ class Sumstats():
647
690
 
648
691
  # to_format ###############################################################################################
649
692
 
650
- def to_format(self,
651
- path="./sumstats",
652
- fmt="gwaslab",
653
- extract=None,
654
- exclude=None,
655
- cols=None,
656
- id_use="rsID",
657
- hapmap3=False,
658
- exclude_hla=False,
659
- hla_range=(25,34),
660
- build=None,
661
- n=None,
662
- verbose=True,
663
- no_status=False,
664
- output_log=True,
665
- to_csvargs=None,
666
- float_formats=None,
667
- xymt_number=False,
668
- xymt=None,
669
- chr_prefix="",
670
- ssfmeta=False,
671
- md5sum=False,
672
- bgzip=False,
673
- tabix=False,
674
- tabix_indexargs={}):
693
+ def to_format(self, path, build=None, **args):
675
694
  if build is None:
676
695
  build = self.meta["gwaslab"]["genome_build"]
677
- onetime_log = copy.deepcopy(self.log)
678
- if to_csvargs is None:
679
- to_csvargs = {}
680
- if float_formats is None:
681
- float_formats={}
682
- if cols is None:
683
- cols=[]
684
- if xymt is None:
685
- xymt = ["X","Y","MT"]
686
-
687
- formatlist= get_formats_list() + ["vep","bed","annovar","vcf"]
688
- if fmt in formatlist:
689
- if verbose: onetime_log.write("Start to format the output sumstats in: ",fmt, " format")
690
- else:
691
- raise ValueError("Please select a format to output")
692
-
693
-
694
- #######################################################################################################
695
- # filter
696
- output = self.data.copy()
697
- if extract is not None:
698
- output = output.loc[output[id_use].isin(extract),:]
699
-
700
- if exclude is not None:
701
- output = output.loc[~output[id_use].isin(exclude),:]
702
-
703
- #hla and hapmap3 #######################################################################################
704
- suffix=fmt
705
-
706
- #exclude hla
707
- if exclude_hla is True:
708
- if verbose: onetime_log.write(" -Excluding variants in MHC (HLA) region ...")
709
- before = len(output)
710
- is_hla = (output["CHR"].astype("string") == "6") & (output["POS"].astype("Int64") > hla_range[0]*1000000) & (output["POS"].astype("Int64") < hla_range[1]*1000000)
711
- output = output.loc[~is_hla,:]
712
- after = len(output)
713
- if verbose: onetime_log.write(" -Exclude "+ str(before - after) + " variants in MHC (HLA) region : {}Mb - {}Mb.".format(hla_range[0], hla_range[1]))
714
- suffix = "noMHC."+suffix
715
-
716
- #extract hapmap3 SNPs
717
- if hapmap3 is True:
718
- output = gethapmap3(output,build=build,verbose=True)
719
- after = len(output)
720
- if verbose: onetime_log.write(" -Extract "+ str(after) + " variants in Hapmap3 datasets for build "+build+".")
721
- suffix = "hapmap3."+suffix
722
-
723
- # add a n column
724
- if n is not None:
725
- output["N"] = n
726
-
727
- #######################################################################################################
728
- #formatting float statistics
729
- if verbose: onetime_log.write(" -Formatting statistics ...")
730
-
731
- formats = {'EAF': '{:.4g}',
732
- 'BETA': '{:.4f}',
733
- 'Z': '{:.4f}',
734
- 'CHISQ': '{:.4f}',
735
- 'SE': '{:.4f}',
736
- 'OR': '{:.4f}',
737
- 'OR_95U': '{:.4f}',
738
- 'OR_95L': '{:.4f}',
739
- 'INFO': '{:.4f}',
740
- 'P': '{:.4e}',
741
- 'MLOG10P': '{:.4f}',
742
- 'DAF': '{:.4f}'
743
- }
744
-
745
- for col, f in float_formats.items():
746
- if col in output.columns:
747
- formats[col]=f
748
- for col, f in formats.items():
749
- if col in output.columns:
750
- if output[col].dtype in ["float64","float32","float16","float"]:
751
- output[col] = output[col].map(f.format)
752
- if verbose:
753
- onetime_log.write(" - Float statistics formats:")
754
- keys=[]
755
- values=[]
756
- for key,value in formats.items():
757
- if key in output.columns:
758
- keys.append(key)
759
- values.append(value)
760
- onetime_log.write(" - Columns:",keys)
761
- onetime_log.write(" - Output formats:",values)
762
-
763
- ##########################################################################################################
764
- # output, mapping column names
765
-
766
- if fmt in get_formats_list() + ["vep","bed","annovar","vcf"]:
767
- tofmt(output,
768
- path=path,
769
- fmt=fmt,
770
- cols=cols,
771
- suffix=suffix,
772
- build=build,
773
- verbose=True,
774
- no_status=no_status,
775
- log=onetime_log,
776
- to_csvargs=to_csvargs,
777
- chr_prefix=chr_prefix,
778
- meta = self.meta,
779
- ssfmeta=ssfmeta,
780
- bgzip=bgzip,
781
- tabix=tabix,
782
- tabix_indexargs=tabix_indexargs,
783
- md5sum=md5sum,
784
- xymt_number=xymt_number,
785
- xymt=xymt)
786
- if output_log is True:
787
- log_path = path + "."+ suffix + ".log"
788
- if verbose: onetime_log.write(" -Saving log file to: {}".format(log_path))
789
- if verbose: onetime_log.write("Finished outputting successfully!")
790
- try:
791
- onetime_log.save(log_path, verbose=False)
792
- except:
793
- pass
794
-
696
+ _to_format(self.data, path, log=self.log, meta=self.meta, build=build, **args)
gwaslab/g_SumstatsPair.py CHANGED
@@ -28,8 +28,10 @@ class SumstatsPair( ):
28
28
  raise ValueError("Please provide GWASLab Sumstats Object #1.")
29
29
  if not isinstance(sumstatsObject2, Sumstats):
30
30
  raise ValueError("Please provide GWASLab Sumstats Object #2.")
31
-
32
- self.study_name = "{}_{}".format(sumstatsObject1.meta["gwaslab"]["study_name"], sumstatsObject2.meta["gwaslab"]["study_name"])
31
+ if sumstatsObject1.meta["gwaslab"]["study_name"]!=sumstatsObject2.meta["gwaslab"]["study_name"]:
32
+ self.study_name = "{}_{}".format(sumstatsObject1.meta["gwaslab"]["study_name"], sumstatsObject2.meta["gwaslab"]["study_name"])
33
+ else:
34
+ self.study_name = "{}_{}".format("STUDY1", "STUDY2")
33
35
  self.snp_info_cols = []
34
36
  self.stats_cols =[]
35
37
  self.other_cols=[]
@@ -42,6 +44,8 @@ class SumstatsPair( ):
42
44
  self.clumps ={}
43
45
  self.ns = None
44
46
 
47
+ self.log.write( "Start to create SumstatsPair object..." )
48
+
45
49
  for i in sumstatsObject1.data.columns:
46
50
  if i in ["SNPID","rsID","CHR","POS","EA","NEA","STATUS"]:
47
51
  self.snp_info_cols.append(i)
@@ -15,7 +15,7 @@ def summarize(insumstats,
15
15
  for i in [snpid,rsid,eaf,p,n,status]:
16
16
  if i in insumstats.columns:
17
17
  cols.append(i)
18
- sumstats= insumstats.loc[:,cols].copy()
18
+ sumstats= insumstats[cols].copy()
19
19
  ###############################################################################
20
20
  numeric_cols=[]
21
21
  output = {}
@@ -68,7 +68,7 @@ def summarize(insumstats,
68
68
  sumstats.drop(columns='uniq_index',inplace=True)
69
69
  status_dic = {}
70
70
  for index,row in status_summary.iterrows():
71
- status_dic[str(index)]=row[0]
71
+ status_dic[str(index)]=row.iloc[0]
72
72
  output["STATUS"]=status_dic
73
73
  numeric_cols.append("STATUS")
74
74
  df = pd.DataFrame.from_dict({(i,j): output[i][j]
@@ -84,7 +84,7 @@ def summarize(insumstats,
84
84
  return df
85
85
 
86
86
  def sum_status(id_to_use, sumstats):
87
- results = sumstats.groupby("STATUS").count()
87
+ results = sumstats.groupby("STATUS",observed=True).count()
88
88
  results = results.loc[results[id_to_use]>0,:].sort_values(id_to_use,ascending=False)
89
89
  return results
90
90
 
gwaslab/g_meta.py CHANGED
@@ -1,6 +1,6 @@
1
1
  from gwaslab.g_version import gwaslab_info
2
2
 
3
- def init_meta():
3
+ def _init_meta():
4
4
  metadata = {"gwaslab":{
5
5
  "gwaslab_version": gwaslab_info()["version"],
6
6
  "study_name":"Sumstats_1",
@@ -23,7 +23,11 @@ def init_meta():
23
23
  "ref_rsid_tsv":"Unknown",
24
24
  "ref_rsid_vcf":"Unknown",
25
25
  "ref_seq":"Unknown",
26
- "ref_infer":"Unknown"
26
+ "ref_infer":"Unknown",
27
+ "ref_infer_af":"Unknown",
28
+ "ref_infer_daf":"Unknown",
29
+ "ref_rsid_to_chrpos_tsv":"Unknown",
30
+ "ref_rsid_to_chrpos_vcf":"Unknown"
27
31
  }
28
32
  },
29
33
  "genotyping_technology":"Unknown",
@@ -45,4 +49,10 @@ def init_meta():
45
49
  "coordinate_system":"1-based",
46
50
  "sex": "M|F|combined"
47
51
  }
48
- return metadata.copy()
52
+ return metadata.copy()
53
+
54
+ def _append_meta_record(old, new):
55
+ if old == "Unknown" or old== "Unchecked":
56
+ return new
57
+ else:
58
+ return "{}, {}".format(old, new)
gwaslab/g_version.py CHANGED
@@ -15,8 +15,8 @@ def _get_version():
15
15
  def gwaslab_info():
16
16
  # version meta information
17
17
  dic={
18
- "version":"3.4.36",
19
- "release_date":"20240123"
18
+ "version":"3.4.38",
19
+ "release_date":"20240203"
20
20
  }
21
21
  return dic
22
22
 
gwaslab/hm_casting.py CHANGED
@@ -14,9 +14,11 @@ def _merge_mold_with_sumstats(mold, sumstats, ref_path=None, windowsizeb=10, log
14
14
  for i in sumstats.columns:
15
15
  if i in ["SNPID","rsID"]:
16
16
  cols_to_drop.append(i)
17
+
18
+ log.write("Start to merge sumstats...", verbose=verbose)
17
19
 
18
20
  if len(cols_to_drop)>0:
19
- log.write("Dropping old IDs:{}".format(cols_to_drop))
21
+ log.write(" -Dropping old IDs:{}".format(cols_to_drop), verbose=verbose)
20
22
  sumstats = sumstats.drop(columns=cols_to_drop)
21
23
 
22
24
  if ref_path is not None :
@@ -30,17 +32,18 @@ def _merge_mold_with_sumstats(mold, sumstats, ref_path=None, windowsizeb=10, log
30
32
  mold["_IDENTIFIER_FOR_VARIANT"] = range(len(mold))
31
33
 
32
34
  mold_sumstats = pd.merge(mold, sumstats, on=["CHR","POS"], how="inner",suffixes=suffixes)
33
- log.write("After merging by CHR and POS:{}".format(len(mold_sumstats)))
35
+ log.write(" -After merging by CHR and POS:{}".format(len(mold_sumstats)), verbose=verbose)
34
36
 
35
37
  mold_sumstats = _keep_variants_with_same_allele_set(mold_sumstats,suffixes=suffixes)
36
- log.write("Matched variants:{}".format(len(mold_sumstats)))
38
+
39
+ log.write(" -Matched variants:{}".format(len(mold_sumstats)), verbose=verbose)
37
40
 
38
- if ref_path is not None:
39
- # match removed sumstats
40
- mold_removed = mold.loc[~mold[index1].isin(mold_sumstats[index1]),:]
41
- iron_removed = sumstats.loc[~sumstats[index2].isin(mold_sumstats[index2]),:]
42
- _match_two_sumstats(mold_removed,iron_removed,ref_path,windowsizeb=windowsizeb)
43
- mold_sumstats.drop(columns=["_INDEX",""])
41
+ #if ref_path is not None:
42
+ # # match removed sumstats
43
+ # mold_removed = mold.loc[~mold[index1].isin(mold_sumstats[index1]),:]
44
+ # iron_removed = sumstats.loc[~sumstats[index2].isin(mold_sumstats[index2]),:]
45
+ # _match_two_sumstats(mold_removed,iron_removed,ref_path,windowsizeb=windowsizeb)
46
+ # mold_sumstats.drop(columns=["_INDEX",""])
44
47
 
45
48
  if return_not_matched_mold == True:
46
49
  sumstats1 = mold.loc[~mold["_IDENTIFIER_FOR_VARIANT"].isin(mold_sumstats["_IDENTIFIER_FOR_VARIANT"]),:]
@@ -59,14 +62,17 @@ def _keep_variants_with_same_allele_set(sumstats, log=Log(),verbose=True,suffixe
59
62
 
60
63
  all_alleles = set(list(sumstats[ea1].unique())+list(sumstats[nea1].unique())+list(sumstats[ea2].unique())+list(sumstats[nea2].unique()))
61
64
  allele_type = CategoricalDtype(categories=all_alleles, ordered=False)
62
- sumstats.loc[:, [nea1,ea1,nea2,ea2]] = sumstats.loc[:, [nea1,ea1,nea2,ea2]].astype(allele_type)
65
+ sumstats[[nea1,ea1,nea2,ea2]] = sumstats[[nea1,ea1,nea2,ea2]].astype(allele_type)
63
66
 
64
67
  is_perfect_match = (sumstats[ea2] == sumstats[ea1]) & (sumstats[nea2] == sumstats[nea1])
65
68
  is_flipped_match = (sumstats[ea2] == sumstats[nea1]) & (sumstats[nea2] == sumstats[ea1])
66
69
  is_allele_set_match = is_flipped_match | is_perfect_match
67
70
 
68
- sumstats.loc[~is_allele_set_match,:]
69
-
71
+ log.write(" -Matching alleles and keeping only variants with same allele set: ", verbose=verbose)
72
+ log.write(" -Perfect match: {}".format(sum(is_perfect_match)), verbose=verbose)
73
+ log.write(" -Flipped match: {}".format(sum(is_flipped_match)), verbose=verbose)
74
+ log.write(" -Unmatched : {}".format(sum(~is_allele_set_match)), verbose=verbose)
75
+
70
76
  return sumstats.loc[is_allele_set_match,:]
71
77
 
72
78
  def _align_with_mold(sumstats, log=Log(),verbose=True, suffixes=("_MOLD","")):
@@ -77,10 +83,18 @@ def _align_with_mold(sumstats, log=Log(),verbose=True, suffixes=("_MOLD","")):
77
83
  nea2="NEA"+suffixes[1]
78
84
  status1="STATUS"+suffixes[0]
79
85
  status2="STATUS"+suffixes[1]
86
+
80
87
  is_perfect_match = (sumstats[ea2] == sumstats[ea1]) & (sumstats[nea2] == sumstats[nea1])
81
88
  is_flipped_match = (sumstats[ea2] == sumstats[nea1]) & (sumstats[nea2] == sumstats[ea1])
82
89
 
90
+ log.write(" -Aligning alleles with reference: ", verbose=verbose)
91
+ log.write(" -Perfect match: {}".format(sum(is_perfect_match)), verbose=verbose)
92
+ log.write(" -Flipped match: {}".format(sum(is_flipped_match)), verbose=verbose)
93
+
94
+ log.write(" -For perfect match: copy STATUS from reference...", verbose=verbose)
83
95
  sumstats.loc[is_perfect_match,status2] = copy_status(sumstats.loc[is_perfect_match,status1], sumstats.loc[is_perfect_match,status2],6)
96
+
97
+ log.write(" -For Flipped match: convert STATUS xxxxx[456789]x to xxxxx3x...", verbose=verbose)
84
98
  sumstats.loc[is_flipped_match,status2] = vchange_status(sumstats.loc[is_flipped_match,status2],6,"456789","333333")
85
99
 
86
100
  return sumstats
@@ -119,9 +133,9 @@ def _sort_pair_cols(molded_sumstats, verbose=True, log=Log(), order=None, stats_
119
133
  if i not in order:
120
134
  output_columns.append(i)
121
135
 
122
- if verbose: log.write(" -Reordering columns to :", ",".join(output_columns))
123
- molded_sumstats = molded_sumstats.loc[:, output_columns]
124
- if verbose: log.write("Finished sorting columns successfully!")
136
+ if verbose: log.write(" -Reordering columns to :", ",".join(output_columns), verbose=verbose)
137
+ molded_sumstats = molded_sumstats[ output_columns]
138
+ if verbose: log.write("Finished sorting columns successfully!", verbose=verbose)
125
139
 
126
140
  return molded_sumstats
127
141