gwaslab-3.4.37-py3-none-any.whl → gwaslab-3.4.39-py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of gwaslab might be problematic. (The original page linked to more details here; the link target is not preserved in this text.)

Files changed (57)
  1. gwaslab/bd_common_data.py +6 -3
  2. gwaslab/bd_download.py +9 -9
  3. gwaslab/bd_get_hapmap3.py +43 -9
  4. gwaslab/data/formatbook.json +722 -721
  5. gwaslab/g_Log.py +22 -5
  6. gwaslab/g_Sumstats.py +110 -163
  7. gwaslab/g_SumstatsPair.py +76 -25
  8. gwaslab/g_SumstatsT.py +2 -2
  9. gwaslab/g_Sumstats_summary.py +3 -3
  10. gwaslab/g_version.py +10 -10
  11. gwaslab/hm_casting.py +36 -17
  12. gwaslab/hm_harmonize_sumstats.py +354 -221
  13. gwaslab/hm_rsid_to_chrpos.py +1 -1
  14. gwaslab/io_preformat_input.py +49 -43
  15. gwaslab/io_read_ldsc.py +49 -1
  16. gwaslab/io_to_formats.py +428 -295
  17. gwaslab/ldsc_irwls.py +198 -0
  18. gwaslab/ldsc_jackknife.py +514 -0
  19. gwaslab/ldsc_ldscore.py +417 -0
  20. gwaslab/ldsc_parse.py +294 -0
  21. gwaslab/ldsc_regressions.py +747 -0
  22. gwaslab/ldsc_sumstats.py +629 -0
  23. gwaslab/qc_check_datatype.py +3 -3
  24. gwaslab/qc_fix_sumstats.py +891 -778
  25. gwaslab/util_ex_calculate_ldmatrix.py +31 -13
  26. gwaslab/util_ex_gwascatalog.py +25 -25
  27. gwaslab/util_ex_ldproxyfinder.py +10 -10
  28. gwaslab/util_ex_ldsc.py +189 -0
  29. gwaslab/util_ex_process_ref.py +3 -3
  30. gwaslab/util_ex_run_coloc.py +26 -4
  31. gwaslab/util_in_calculate_gc.py +6 -6
  32. gwaslab/util_in_calculate_power.py +42 -43
  33. gwaslab/util_in_convert_h2.py +8 -8
  34. gwaslab/util_in_fill_data.py +30 -30
  35. gwaslab/util_in_filter_value.py +201 -74
  36. gwaslab/util_in_get_density.py +10 -10
  37. gwaslab/util_in_get_sig.py +445 -71
  38. gwaslab/viz_aux_annotate_plot.py +12 -12
  39. gwaslab/viz_aux_quickfix.py +42 -37
  40. gwaslab/viz_aux_reposition_text.py +10 -7
  41. gwaslab/viz_aux_save_figure.py +18 -8
  42. gwaslab/viz_plot_compare_af.py +32 -33
  43. gwaslab/viz_plot_compare_effect.py +63 -71
  44. gwaslab/viz_plot_miamiplot2.py +34 -26
  45. gwaslab/viz_plot_mqqplot.py +126 -75
  46. gwaslab/viz_plot_qqplot.py +11 -8
  47. gwaslab/viz_plot_regionalplot.py +36 -33
  48. gwaslab/viz_plot_rg_heatmap.py +28 -26
  49. gwaslab/viz_plot_stackedregional.py +40 -21
  50. gwaslab/viz_plot_trumpetplot.py +65 -61
  51. gwaslab-3.4.39.dist-info/LICENSE +674 -0
  52. {gwaslab-3.4.37.dist-info → gwaslab-3.4.39.dist-info}/METADATA +5 -4
  53. gwaslab-3.4.39.dist-info/RECORD +80 -0
  54. gwaslab-3.4.37.dist-info/RECORD +0 -72
  55. /gwaslab-3.4.37.dist-info/LICENSE → /gwaslab-3.4.39.dist-info/LICENSE_before_v3.4.39 +0 -0
  56. {gwaslab-3.4.37.dist-info → gwaslab-3.4.39.dist-info}/WHEEL +0 -0
  57. {gwaslab-3.4.37.dist-info → gwaslab-3.4.39.dist-info}/top_level.txt +0 -0
gwaslab/g_Log.py CHANGED
@@ -1,17 +1,34 @@
1
1
  import time
2
2
  class Log():
3
3
  def __init__(self):
4
- self.log_text=str(time.ctime(time.time()))+ " " + "Sumstats Object created."+ "\n"
4
+ self.log_text=str(time.strftime('%Y/%m/%d %H:%M:%S'))+ " " + "Sumstats Object created."+ "\n"
5
+
5
6
  def write(self,*message,end="\n",show_time=True, verbose=True):
6
7
  if show_time is True:
7
- if verbose: print(str(time.ctime(time.time())),*message,end=end)
8
- self.log_text = self.log_text + str(time.ctime(time.time())) + " " + " ".join(map(str,message)) + end
8
+ if verbose: print(str(time.strftime('%Y/%m/%d %H:%M:%S')),*message,end=end)
9
+ self.log_text = self.log_text + str(time.strftime('%Y/%m/%d %H:%M:%S')) + " " + " ".join(map(str,message)) + end
9
10
  else:
10
11
  if verbose: print(*message,end=end)
11
12
  self.log_text = self.log_text + " ".join(map(str,message)) + end
13
+
14
+ def warning(self,*message,end="\n",show_time=True, verbose=True):
15
+ self.write(" #WARNING! {}".format(" ".join(map(str,message))),
16
+ end=end,
17
+ show_time=show_time,
18
+ verbose=verbose)
19
+
12
20
  def show(self):
13
21
  print(self.log_text)
14
22
  def save(self,path,verbose=True):
15
23
  with open(path,"w") as f:
16
- if verbose: print(str(time.ctime(time.time())) + " " + " -Save log file to : ", path)
17
- f.write(self.log_text)
24
+ if verbose: print(str(time.strftime('%Y/%m/%d %H:%M:%S')) + " " + " -Save log file to : ", path)
25
+ f.write(self.log_text)
26
+
27
+
28
+ def log(self,*message,end="\n",show_time=True, verbose=True):
29
+ if show_time is True:
30
+ if verbose: print(str(time.strftime('%Y/%m/%d %H:%M:%S')),*message,end=end)
31
+ self.log_text = self.log_text + str(time.strftime('%Y/%m/%d %H:%M:%S')) + " " + " ".join(map(str,message)) + end
32
+ else:
33
+ if verbose: print(*message,end=end)
34
+ self.log_text = self.log_text + " ".join(map(str,message)) + end
gwaslab/g_Sumstats.py CHANGED
@@ -5,7 +5,7 @@ import copy
5
5
  from gwaslab.g_Sumstats_summary import summarize
6
6
  from gwaslab.g_Sumstats_summary import lookupstatus
7
7
  from gwaslab.io_preformat_input import preformat
8
- from gwaslab.io_to_formats import tofmt
8
+ from gwaslab.io_to_formats import _to_format
9
9
  from gwaslab.g_Log import Log
10
10
  from gwaslab.qc_fix_sumstats import fixID
11
11
  from gwaslab.qc_fix_sumstats import removedup
@@ -32,9 +32,14 @@ from gwaslab.util_in_filter_value import filterout
32
32
  from gwaslab.util_in_filter_value import filterin
33
33
  from gwaslab.util_in_filter_value import filterregionin
34
34
  from gwaslab.util_in_filter_value import filterregionout
35
+ from gwaslab.util_in_filter_value import _filter_indel
36
+ from gwaslab.util_in_filter_value import _filter_palindromic
37
+ from gwaslab.util_in_filter_value import _filter_snp
35
38
  from gwaslab.util_in_filter_value import inferbuild
36
39
  from gwaslab.util_in_filter_value import sampling
37
40
  from gwaslab.util_in_filter_value import _get_flanking
41
+ from gwaslab.util_in_filter_value import _get_flanking_by_chrpos
42
+ from gwaslab.util_in_filter_value import _get_flanking_by_id
38
43
  from gwaslab.util_in_calculate_gc import lambdaGC
39
44
  from gwaslab.util_in_convert_h2 import _get_per_snp_r2
40
45
  from gwaslab.util_in_get_sig import getsig
@@ -42,6 +47,8 @@ from gwaslab.util_in_get_density import getsignaldensity
42
47
  from gwaslab.util_in_get_density import assigndensity
43
48
  from gwaslab.util_in_get_sig import annogene
44
49
  from gwaslab.util_in_get_sig import getnovel
50
+ from gwaslab.util_in_get_sig import _check_cis
51
+ from gwaslab.util_in_get_sig import _check_novel_set
45
52
  from gwaslab.util_in_fill_data import filldata
46
53
  from gwaslab.bd_get_hapmap3 import gethapmap3
47
54
  from gwaslab.bd_common_data import get_chr_list
@@ -62,6 +69,9 @@ from gwaslab.viz_plot_trumpetplot import plottrumpet
62
69
  from gwaslab.viz_plot_compare_af import plotdaf
63
70
  from gwaslab.util_ex_run_susie import _run_susie_rss
64
71
  from gwaslab.qc_fix_sumstats import _check_data_consistency
72
+ from gwaslab.util_ex_ldsc import _estimate_h2_by_ldsc
73
+ from gwaslab.util_ex_ldsc import _estimate_rg_by_ldsc
74
+ from gwaslab.bd_get_hapmap3 import gethapmap3
65
75
  import gc
66
76
 
67
77
  #20220309
@@ -119,7 +129,8 @@ class Sumstats():
119
129
  # basic attributes
120
130
  self.data = pd.DataFrame()
121
131
  self.log = Log()
122
-
132
+ self.ldsc_h2 = None
133
+ self.ldsc_rg = None
123
134
  # meta information
124
135
  self.meta = _init_meta()
125
136
  self.build = build
@@ -133,7 +144,7 @@ class Sumstats():
133
144
  self.pipcs = pd.DataFrame()
134
145
 
135
146
  # print gwaslab version information
136
- if verbose: _show_version(self.log)
147
+ _show_version(self.log, verbose=verbose)
137
148
 
138
149
  #preformat the data
139
150
  self.data = preformat(
@@ -403,19 +414,16 @@ class Sumstats():
403
414
  _check_data_consistency(self.data,log=self.log,**args)
404
415
  def check_id(self,**args):
405
416
  pass
406
-
407
417
  def check_ref(self,ref_seq,**args):
408
418
  self.meta["gwaslab"]["references"]["ref_seq"] = ref_seq
409
419
  self.data = checkref(self.data,ref_seq,log=self.log,**args)
410
420
  def infer_strand(self,ref_infer,**args):
411
421
  self.meta["gwaslab"]["references"]["ref_infer"] = _append_meta_record(self.meta["gwaslab"]["references"]["ref_infer"] , ref_infer)
412
422
  self.data = parallelinferstrand(self.data,ref_infer=ref_infer,log=self.log,**args)
413
-
414
423
  def flip_allele_stats(self,**args):
415
424
  self.data = flipallelestats(self.data,log=self.log,**args)
416
425
  def normalize_allele(self,**args):
417
426
  self.data = parallelnormalizeallele(self.data,log=self.log,**args)
418
-
419
427
  def assign_rsid(self,
420
428
  ref_rsid_tsv=None,
421
429
  ref_rsid_vcf=None,
@@ -426,14 +434,11 @@ class Sumstats():
426
434
  if ref_rsid_vcf is not None:
427
435
  self.data = parallelizeassignrsid(self.data,path=ref_rsid_vcf,ref_mode="vcf",log=self.log,**args)
428
436
  self.meta["gwaslab"]["references"]["ref_rsid_vcf"] = _append_meta_record(self.meta["gwaslab"]["references"]["ref_rsid_vcf"] , ref_rsid_vcf)
429
-
430
437
  def rsid_to_chrpos(self,**args):
431
438
  self.data = rsidtochrpos(self.data,log=self.log,**args)
432
-
433
439
  def rsid_to_chrpos2(self,**args):
434
440
  self.data = parallelrsidtochrpos(self.data,log=self.log,**args)
435
441
 
436
-
437
442
  ############################################################################################################
438
443
 
439
444
  def sort_coordinate(self,**sort_args):
@@ -449,14 +454,27 @@ class Sumstats():
449
454
 
450
455
  # utilities ############################################################################################################
451
456
  # filter series ######################################################################
452
- def get_flanking(self, inplace=False,**args):
457
+ def filter_flanking(self, inplace=False,**args):
453
458
  if inplace is False:
454
459
  new_Sumstats_object = copy.deepcopy(self)
455
460
  new_Sumstats_object.data = _get_flanking(new_Sumstats_object.data, **args)
456
461
  return new_Sumstats_object
457
462
  else:
458
463
  self.data = _get_flanking(self.data, **args)
459
-
464
+ def filter_flanking_by_chrpos(self, chrpos, inplace=False,**args):
465
+ if inplace is False:
466
+ new_Sumstats_object = copy.deepcopy(self)
467
+ new_Sumstats_object.data = _get_flanking_by_chrpos(new_Sumstats_object.data, chrpos, **args)
468
+ return new_Sumstats_object
469
+ else:
470
+ self.data = _get_flanking_by_chrpos(self.data, chrpos,**args)
471
+ def filter_flanking_by_id(self, snpid, inplace=False,**args):
472
+ if inplace is False:
473
+ new_Sumstats_object = copy.deepcopy(self)
474
+ new_Sumstats_object.data = _get_flanking_by_id(new_Sumstats_object.data, snpid, **args)
475
+ return new_Sumstats_object
476
+ else:
477
+ self.data = _get_flanking_by_id(self.data, snpid, **args)
460
478
  def filter_value(self, expr, inplace=False, **args):
461
479
  if inplace is False:
462
480
  new_Sumstats_object = copy.deepcopy(self)
@@ -464,7 +482,6 @@ class Sumstats():
464
482
  return new_Sumstats_object
465
483
  else:
466
484
  self.data = filtervalues(self.data, expr,log=self.log,**args)
467
-
468
485
  def filter_out(self, inplace=False, **args):
469
486
  if inplace is False:
470
487
  new_Sumstats_object = copy.deepcopy(self)
@@ -472,7 +489,6 @@ class Sumstats():
472
489
  return new_Sumstats_object
473
490
  else:
474
491
  self.data = filterout(self.data,log=self.log,**args)
475
-
476
492
  def filter_in(self, inplace=False, **args):
477
493
  if inplace is False:
478
494
  new_Sumstats_object = copy.deepcopy(self)
@@ -494,7 +510,28 @@ class Sumstats():
494
510
  return new_Sumstats_object
495
511
  else:
496
512
  self.data = filterregionout(self.data,log=self.log,**args)
497
-
513
+ def filter_palindromic(self, inplace=False, **args):
514
+ if inplace is False:
515
+ new_Sumstats_object = copy.deepcopy(self)
516
+ new_Sumstats_object.data = _filter_palindromic(new_Sumstats_object.data,log=new_Sumstats_object.log,**args)
517
+ return new_Sumstats_object
518
+ else:
519
+ self.data = _filter_palindromic(self.data,log=self.log,**args)
520
+ def filter_snp(self, inplace=False, **args):
521
+ if inplace is False:
522
+ new_Sumstats_object = copy.deepcopy(self)
523
+ new_Sumstats_object.data = _filter_snp(new_Sumstats_object.data,log=new_Sumstats_object.log,**args)
524
+ return new_Sumstats_object
525
+ else:
526
+ self.data = _filter_snp(self.data,log=self.log,**args)
527
+ def filter_indel(self, inplace=False, **args):
528
+ if inplace is False:
529
+ new_Sumstats_object = copy.deepcopy(self)
530
+ new_Sumstats_object.data = _filter_indel(new_Sumstats_object.data,log=new_Sumstats_object.log,**args)
531
+ return new_Sumstats_object
532
+ else:
533
+ self.data = _filter_indel(self.data,log=self.log,**args)
534
+
498
535
  def random_variants(self,inplace=False,n=1,p=None,**args):
499
536
  if inplace is True:
500
537
  self.data = sampling(self.data,n=n,p=p,log=self.log,**args)
@@ -502,18 +539,25 @@ class Sumstats():
502
539
  new_Sumstats_object = copy.deepcopy(self)
503
540
  new_Sumstats_object.data = sampling(new_Sumstats_object.data,n=n,p=p,log=new_Sumstats_object.log,**args)
504
541
  return new_Sumstats_object
505
-
542
+
543
+ def filter_hapmap3(self, inplace=False, build=None, **args ):
544
+ if build is None:
545
+ build = self.meta["gwaslab"]["genome_build"]
546
+ if inplace is True:
547
+ self.data = gethapmap3(self.data, build=build,log=self.log, **args)
548
+ else:
549
+ new_Sumstats_object = copy.deepcopy(self)
550
+ new_Sumstats_object.data = gethapmap3(new_Sumstats_object.data, build=build,log=self.log, **args)
551
+ return new_Sumstats_object
506
552
  ######################################################################
507
553
 
508
554
  def check_af(self,ref_infer,**args):
509
555
  self.data = parallelecheckaf(self.data,ref_infer=ref_infer,log=self.log,**args)
510
556
  self.meta["gwaslab"]["references"]["ref_infer_daf"] = _append_meta_record(self.meta["gwaslab"]["references"]["ref_infer_daf"] , ref_infer)
511
-
512
557
  def infer_af(self,ref_infer,**args):
513
558
  self.data = paralleleinferaf(self.data,ref_infer=ref_infer,log=self.log,**args)
514
559
  self.meta["gwaslab"]["references"]["ref_infer_af"] = ref_infer
515
560
  self.meta["gwaslab"]["references"]["ref_infer_af"] = _append_meta_record(self.meta["gwaslab"]["references"]["ref_infer_af"] , ref_infer)
516
-
517
561
  def plot_daf(self, **args):
518
562
  fig,outliers = plotdaf(self.data, **args)
519
563
  return fig, outliers
@@ -548,8 +592,10 @@ class Sumstats():
548
592
 
549
593
  return plot
550
594
 
551
- def plot_trumpet(self, **args):
552
- fig = plottrumpet(self.data, **args)
595
+ def plot_trumpet(self, build=None, **args):
596
+ if build is None:
597
+ build = self.meta["gwaslab"]["genome_build"]
598
+ fig = plottrumpet(self.data,build = build, **args)
553
599
  return fig
554
600
 
555
601
  def get_lead(self, build=None, gls=False, **args):
@@ -617,7 +663,37 @@ class Sumstats():
617
663
  **args)
618
664
  # return sumstats object
619
665
  return output
620
-
666
+
667
+ def check_cis(self, **args):
668
+ if "SNPID" in self.data.columns:
669
+ id_to_use = "SNPID"
670
+ else:
671
+ id_to_use = "rsID"
672
+ output = _check_cis(self.data,
673
+ id=id_to_use,
674
+ chrom="CHR",
675
+ pos="POS",
676
+ p="P",
677
+ log=self.log,
678
+ **args)
679
+ # return sumstats object
680
+ return output
681
+
682
+ def check_novel_set(self, **args):
683
+ if "SNPID" in self.data.columns:
684
+ id_to_use = "SNPID"
685
+ else:
686
+ id_to_use = "rsID"
687
+ output = _check_novel_set(self.data,
688
+ id=id_to_use,
689
+ chrom="CHR",
690
+ pos="POS",
691
+ p="P",
692
+ log=self.log,
693
+ **args)
694
+ # return sumstats object
695
+ return output
696
+
621
697
  def anno_gene(self, **args):
622
698
  if "SNPID" in self.data.columns:
623
699
  id_to_use = "SNPID"
@@ -653,6 +729,18 @@ class Sumstats():
653
729
  output = lambdaGC(self.data[["CHR",mode]],mode=mode,**args)
654
730
  self.meta["Genomic inflation factor"] = output
655
731
  return output
732
+
733
+ def estimate_h2_by_ldsc(self, build=None, verbose=True, match_allele=True, **args):
734
+ if build is None:
735
+ build = self.meta["gwaslab"]["genome_build"]
736
+ insumstats = gethapmap3(self.data.copy(), build=build, verbose=verbose , match_allele=True )
737
+ self.ldsc_h2 = _estimate_h2_by_ldsc(insumstats=insumstats, log=self.log, verbose=verbose, **args)
738
+
739
+ def estimate_rg_by_ldsc(self, build=None, verbose=True, match_allele=True, **args):
740
+ if build is None:
741
+ build = self.meta["gwaslab"]["genome_build"]
742
+ insumstats = gethapmap3(self.data.copy(), build=build, verbose=verbose , match_allele=True )
743
+ self.ldsc_rg = _estimate_rg_by_ldsc(insumstats=insumstats, log=self.log, verbose=verbose, **args)
656
744
  # external ################################################################################################
657
745
 
658
746
  def to_finemapping(self,**args):
@@ -670,148 +758,7 @@ class Sumstats():
670
758
 
671
759
  # to_format ###############################################################################################
672
760
 
673
- def to_format(self,
674
- path="./sumstats",
675
- fmt="gwaslab",
676
- extract=None,
677
- exclude=None,
678
- cols=None,
679
- id_use="rsID",
680
- hapmap3=False,
681
- exclude_hla=False,
682
- hla_range=(25,34),
683
- build=None,
684
- n=None,
685
- verbose=True,
686
- no_status=False,
687
- output_log=True,
688
- to_csvargs=None,
689
- float_formats=None,
690
- xymt_number=False,
691
- xymt=None,
692
- chr_prefix="",
693
- ssfmeta=False,
694
- md5sum=False,
695
- bgzip=False,
696
- tabix=False,
697
- tabix_indexargs={}):
761
+ def to_format(self, path, build=None, **args):
698
762
  if build is None:
699
763
  build = self.meta["gwaslab"]["genome_build"]
700
- onetime_log = copy.deepcopy(self.log)
701
- if to_csvargs is None:
702
- to_csvargs = {}
703
- if float_formats is None:
704
- float_formats={}
705
- if cols is None:
706
- cols=[]
707
- if xymt is None:
708
- xymt = ["X","Y","MT"]
709
-
710
- formatlist= get_formats_list() + ["vep","bed","annovar","vcf"]
711
- if fmt in formatlist:
712
- if verbose: onetime_log.write("Start to format the output sumstats in: ",fmt, " format")
713
- else:
714
- raise ValueError("Please select a format to output")
715
-
716
-
717
- #######################################################################################################
718
- # filter
719
- output = self.data.copy()
720
- if extract is not None:
721
- output = output.loc[output[id_use].isin(extract),:]
722
-
723
- if exclude is not None:
724
- output = output.loc[~output[id_use].isin(exclude),:]
725
-
726
- #hla and hapmap3 #######################################################################################
727
- suffix=fmt
728
-
729
- #exclude hla
730
- if exclude_hla is True:
731
- if verbose: onetime_log.write(" -Excluding variants in MHC (HLA) region ...")
732
- before = len(output)
733
- is_hla = (output["CHR"].astype("string") == "6") & (output["POS"].astype("Int64") > hla_range[0]*1000000) & (output["POS"].astype("Int64") < hla_range[1]*1000000)
734
- output = output.loc[~is_hla,:]
735
- after = len(output)
736
- if verbose: onetime_log.write(" -Exclude "+ str(before - after) + " variants in MHC (HLA) region : {}Mb - {}Mb.".format(hla_range[0], hla_range[1]))
737
- suffix = "noMHC."+suffix
738
-
739
- #extract hapmap3 SNPs
740
- if hapmap3 is True:
741
- output = gethapmap3(output,build=build,verbose=True)
742
- after = len(output)
743
- if verbose: onetime_log.write(" -Extract "+ str(after) + " variants in Hapmap3 datasets for build "+build+".")
744
- suffix = "hapmap3."+suffix
745
-
746
- # add a n column
747
- if n is not None:
748
- output["N"] = n
749
-
750
- #######################################################################################################
751
- #formatting float statistics
752
- if verbose: onetime_log.write(" -Formatting statistics ...")
753
-
754
- formats = {'EAF': '{:.4g}',
755
- 'BETA': '{:.4f}',
756
- 'Z': '{:.4f}',
757
- 'CHISQ': '{:.4f}',
758
- 'SE': '{:.4f}',
759
- 'OR': '{:.4f}',
760
- 'OR_95U': '{:.4f}',
761
- 'OR_95L': '{:.4f}',
762
- 'INFO': '{:.4f}',
763
- 'P': '{:.4e}',
764
- 'MLOG10P': '{:.4f}',
765
- 'DAF': '{:.4f}'
766
- }
767
-
768
- for col, f in float_formats.items():
769
- if col in output.columns:
770
- formats[col]=f
771
- for col, f in formats.items():
772
- if col in output.columns:
773
- if output[col].dtype in ["float64","float32","float16","float"]:
774
- output[col] = output[col].map(f.format)
775
- if verbose:
776
- onetime_log.write(" - Float statistics formats:")
777
- keys=[]
778
- values=[]
779
- for key,value in formats.items():
780
- if key in output.columns:
781
- keys.append(key)
782
- values.append(value)
783
- onetime_log.write(" - Columns:",keys)
784
- onetime_log.write(" - Output formats:",values)
785
-
786
- ##########################################################################################################
787
- # output, mapping column names
788
-
789
- if fmt in get_formats_list() + ["vep","bed","annovar","vcf"]:
790
- tofmt(output,
791
- path=path,
792
- fmt=fmt,
793
- cols=cols,
794
- suffix=suffix,
795
- build=build,
796
- verbose=True,
797
- no_status=no_status,
798
- log=onetime_log,
799
- to_csvargs=to_csvargs,
800
- chr_prefix=chr_prefix,
801
- meta = self.meta,
802
- ssfmeta=ssfmeta,
803
- bgzip=bgzip,
804
- tabix=tabix,
805
- tabix_indexargs=tabix_indexargs,
806
- md5sum=md5sum,
807
- xymt_number=xymt_number,
808
- xymt=xymt)
809
- if output_log is True:
810
- log_path = path + "."+ suffix + ".log"
811
- if verbose: onetime_log.write(" -Saving log file to: {}".format(log_path))
812
- if verbose: onetime_log.write("Finished outputting successfully!")
813
- try:
814
- onetime_log.save(log_path, verbose=False)
815
- except:
816
- pass
817
-
764
+ _to_format(self.data, path, log=self.log, meta=self.meta, build=build, **args)
gwaslab/g_SumstatsPair.py CHANGED
@@ -6,33 +6,40 @@ from gwaslab.util_in_filter_value import filtervalues
6
6
  from gwaslab.g_Log import Log
7
7
  from math import floor
8
8
  from gwaslab.g_Sumstats import Sumstats
9
- from gwaslab.hm_casting import _merge_mold_with_sumstats
9
+ from gwaslab.hm_casting import _merge_mold_with_sumstats_by_chrpos
10
10
  from gwaslab.hm_casting import _align_with_mold
11
11
  from gwaslab.hm_casting import _fill_missing_columns
12
12
  from gwaslab.hm_casting import _check_daf
13
13
  from gwaslab.hm_casting import _assign_warning_code
14
14
  from gwaslab.qc_fix_sumstats import flipallelestats
15
+ from gwaslab.qc_check_datatype import check_datatype
16
+ from gwaslab.qc_check_datatype import check_dataframe_shape
15
17
  from gwaslab.hm_casting import _renaming_cols
16
18
  from gwaslab.hm_casting import _sort_pair_cols
17
19
  from gwaslab.util_ex_calculate_ldmatrix import tofinemapping
18
20
  from gwaslab.util_ex_run_coloc import _run_coloc_susie
19
21
  from gwaslab.viz_plot_miamiplot2 import plot_miami2
22
+ from gwaslab.viz_plot_compare_af import plotdaf
20
23
  from gwaslab.util_ex_run_2samplemr import _run_two_sample_mr
21
24
  from gwaslab.util_ex_run_clumping import _clump
22
25
  from gwaslab.util_ex_ldproxyfinder import _extract_with_ld_proxy
23
26
 
24
27
  class SumstatsPair( ):
25
- def __init__(self, sumstatsObject1, sumstatsObject2, study=None, suffixes = ("_1","_2") ):
28
+ def __init__(self, sumstatsObject1, sumstatsObject2, study=None, suffixes = ("_1","_2") ,verbose=True ):
26
29
 
27
30
  if not isinstance(sumstatsObject1, Sumstats):
28
31
  raise ValueError("Please provide GWASLab Sumstats Object #1.")
29
32
  if not isinstance(sumstatsObject2, Sumstats):
30
33
  raise ValueError("Please provide GWASLab Sumstats Object #2.")
31
-
32
- self.study_name = "{}_{}".format(sumstatsObject1.meta["gwaslab"]["study_name"], sumstatsObject2.meta["gwaslab"]["study_name"])
34
+ if sumstatsObject1.meta["gwaslab"]["study_name"]!=sumstatsObject2.meta["gwaslab"]["study_name"]:
35
+ self.study_name = "{}_{}".format(sumstatsObject1.meta["gwaslab"]["study_name"], sumstatsObject2.meta["gwaslab"]["study_name"])
36
+ else:
37
+ self.study_name = "{}_{}".format("STUDY1", "STUDY2")
33
38
  self.snp_info_cols = []
34
39
  self.stats_cols =[]
35
- self.other_cols=[]
40
+ self.stats_cols2 =[]
41
+ self.other_cols =[]
42
+ self.other_cols2 =[]
36
43
  self.log = Log()
37
44
  self.suffixes = suffixes
38
45
  self.colocalization=pd.DataFrame()
@@ -41,26 +48,53 @@ class SumstatsPair( ):
41
48
  self.mr = {}
42
49
  self.clumps ={}
43
50
  self.ns = None
51
+ self.to_finemapping_file_path = ""
52
+ self.plink_log = ""
53
+
54
+ self.log.write( "Start to create SumstatsPair object..." )
55
+
56
+ self.log.write( " -Checking sumstats 1..." , verbose=verbose)
57
+ check_datatype(sumstatsObject1.data, log=self.log, verbose=verbose)
58
+ check_dataframe_shape(sumstats=sumstatsObject1.data,
59
+ log=self.log,
60
+ verbose=verbose)
61
+
62
+ self.log.write( " -Checking sumstats 2..." , verbose=verbose)
63
+ check_datatype(sumstatsObject2.data, log=self.log, verbose=verbose)
64
+ check_dataframe_shape(sumstats=sumstatsObject2.data,
65
+ log=self.log,
66
+ verbose=verbose)
44
67
 
45
68
  for i in sumstatsObject1.data.columns:
46
69
  if i in ["SNPID","rsID","CHR","POS","EA","NEA","STATUS"]:
47
70
  self.snp_info_cols.append(i)
48
- elif i in ["BETA","SE","P","MLOG10P","N","Z","OR","OR95L","OR95U","MAF","EAF"]:
71
+ elif i in ["BETA","SE","P","MLOG10P","N","N_CASE","N_CONTROL","Z","T","F","OR","OR_95L","OR_95U","HR","HR_95L","HR_95U","MAF","EAF","BETA_95L","BETA_95U"]:
49
72
  self.stats_cols.append(i)
50
73
  else:
51
74
  self.other_cols.append(i)
52
-
53
- self.data = sumstatsObject1.data.loc[:,self.snp_info_cols + self.stats_cols]
54
-
75
+ for i in sumstatsObject2.data.columns:
76
+ if i in ["SNPID","rsID","CHR","POS","EA","NEA","STATUS"]:
77
+ continue
78
+ elif i in ["BETA","SE","P","MLOG10P","N","N_CASE","N_CONTROL","Z","T","F","OR","OR_95L","OR_95U","HR","HR_95L","HR_95U","MAF","EAF","BETA_95L","BETA_95U"]:
79
+ self.stats_cols2.append(i)
80
+ else:
81
+ self.other_cols2.append(i)
82
+
83
+ self.log.write( " -Variant Info columns: {}".format(self.snp_info_cols) , verbose=verbose)
84
+ self.log.write( " -Variant statistics columns: {}".format(self.stats_cols) , verbose=verbose)
85
+ self.log.write( " -Sumstats1 other columns: {}".format(self.other_cols) , verbose=verbose)
86
+ self.log.write( " -Sumstats2 other columns: {}".format(self.other_cols2) , verbose=verbose)
87
+
88
+ # extract only info and stats cols
89
+ self.data = sumstatsObject1.data
90
+
91
+ #rename with _1
55
92
  self.data = self.data.rename(columns={"EA":"EA_1","NEA":"NEA_1"})
56
-
57
93
  self.data = self.data.rename(columns={i:i + suffixes[0] for i in self.stats_cols})
94
+ self.data = self.data.rename(columns={i:i + suffixes[0] for i in self.other_cols})
58
95
 
59
96
  self.data, self.sumstats1 = self._merge_two_sumstats(sumstatsObject2, suffixes=suffixes)
60
97
 
61
- self.to_finemapping_file_path = ""
62
- self.plink_log = ""
63
-
64
98
  if "N{}".format(self.suffixes[0]) in self.data.columns and "N{}".format(self.suffixes[1]) in self.data.columns:
65
99
  n1 = int(floor(self.data["N{}".format(self.suffixes[0])].mean()))
66
100
  n2 = int(floor(self.data["N{}".format(self.suffixes[1])].mean()))
@@ -70,8 +104,9 @@ class SumstatsPair( ):
70
104
 
71
105
  def _merge_two_sumstats(self, sumstatsObject2, threshold=0.2, verbose=True,windowsizeb=10, ref_path=None,suffixes=("_1","_2")):
72
106
 
73
- molded_sumstats, sumstats1 = _merge_mold_with_sumstats(self.data,
74
- sumstatsObject2.data,
107
+ # sumstats1 with suffix _1, sumstats2 with no suffix
108
+ molded_sumstats, sumstats1 = _merge_mold_with_sumstats_by_chrpos(mold=self.data,
109
+ sumstats=sumstatsObject2.data,
75
110
  log=self.log,
76
111
  verbose=verbose,
77
112
  suffixes=(suffixes[0],""),
@@ -79,16 +114,21 @@ class SumstatsPair( ):
79
114
 
80
115
  molded_sumstats = _align_with_mold(molded_sumstats, log=self.log, verbose=verbose,suffixes=(suffixes[0],""))
81
116
 
117
+ # flip sumstats2 statistics
82
118
  molded_sumstats = flipallelestats(molded_sumstats, log=self.log, verbose=verbose)
83
119
 
120
+ # drop sumstats2 EA NEA
84
121
  molded_sumstats = molded_sumstats.drop(columns=["EA","NEA"])
122
+
123
+ # rename sumstats1 EA NEA
85
124
  molded_sumstats = molded_sumstats.rename(columns={"EA_1":"EA","NEA_1":"NEA"})
86
125
 
87
- if not len(set(self.stats_cols) & set (sumstatsObject2.data.columns)) == len(self.stats_cols):
88
- cols_to_fill = set(self.stats_cols).difference(set(sumstatsObject2.data.columns))
126
+ if not set(self.stats_cols2) == set(self.stats_cols):
127
+ cols_to_fill = set(self.stats_cols).difference(set(self.stats_cols2))
89
128
  molded_sumstats = _fill_missing_columns(molded_sumstats, cols_to_fill, log=self.log, verbose=verbose)
90
129
 
91
- molded_sumstats = _renaming_cols(molded_sumstats, self.stats_cols, log=self.log, verbose=verbose, suffixes=suffixes)
130
+ # rename sumstast2 with _2
131
+ molded_sumstats = _renaming_cols(molded_sumstats, self.stats_cols + self.other_cols2, log=self.log, verbose=verbose, suffixes=suffixes)
92
132
 
93
133
  molded_sumstats = _sort_pair_cols(molded_sumstats, verbose=verbose, log=self.log)
94
134
 
@@ -104,13 +144,7 @@ class SumstatsPair( ):
104
144
  def run_coloc_susie(self,**args):
105
145
 
106
146
  self.colocalization = _run_coloc_susie(self.to_finemapping_file_path,log=self.log,ncols=self.ns,**args)
107
-
108
- def plot_miami(self,**args):
109
147
 
110
- plot_miami2(merged_sumstats=self.data,
111
- suffixes=self.suffixes,
112
- **args)
113
-
114
148
  def run_two_sample_mr(self, clump=False, **args):
115
149
  exposure1 = self.study_name.split("_")[0]
116
150
  outcome2 = self.study_name.split("_")[1]
@@ -126,4 +160,21 @@ class SumstatsPair( ):
126
160
  return new_Sumstats_object
127
161
  else:
128
162
  self.data = filtervalues(self.data, expr,log=self.log,**args)
129
- gc.collect()
163
+ gc.collect()
164
+
165
+ ## Visualization #############################################################################################################################################
166
+ def plot_miami(self,**args):
167
+
168
+ plot_miami2(merged_sumstats=self.data,
169
+ suffixes=self.suffixes,
170
+ **args)
171
+
172
+ def compare_af(self, **args):
173
+
174
+ return plotdaf( self.data,
175
+ eaf="EAF_2",
176
+ raf="EAF_1",
177
+ xlabel="Effect Allele Frequency in Sumstats 1",
178
+ ylabel="Effect Allele Frequency in Sumstats 2",
179
+ **args)
180
+
gwaslab/g_SumstatsT.py CHANGED
@@ -1,7 +1,7 @@
1
1
  import pandas as pd
2
2
  import numpy as np
3
3
  from gwaslab.g_Sumstats import Sumstats
4
- from gwaslab.hm_casting import _merge_mold_with_sumstats
4
+ from gwaslab.hm_casting import _merge_mold_with_sumstats_by_chrpos
5
5
  from gwaslab.hm_casting import _align_with_mold
6
6
  from gwaslab.hm_casting import _fill_missing_columns
7
7
  from gwaslab.hm_casting import _check_daf
@@ -34,7 +34,7 @@ class SumstatsT( ):
34
34
 
35
35
  def cast(self, sumstatsObject, threshold=0.2, verbose=True,windowsizeb=10, ref_path=None):
36
36
 
37
- molded_sumstats = _merge_mold_with_sumstats(self.snp_info, sumstatsObject.data, log=sumstatsObject.log, verbose=verbose, windowsizeb=windowsizeb,ref_path=ref_path)
37
+ molded_sumstats = _merge_mold_with_sumstats_by_chrpos(self.snp_info, sumstatsObject.data, log=sumstatsObject.log, verbose=verbose, windowsizeb=windowsizeb,ref_path=ref_path)
38
38
 
39
39
  molded_sumstats = _align_with_mold(molded_sumstats, log=sumstatsObject.log, verbose=verbose)
40
40