gwaslab-3.4.37-py3-none-any.whl → gwaslab-3.4.38-py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of gwaslab might be problematic. Click here for more details.

Files changed (37)
  1. gwaslab/data/formatbook.json +722 -721
  2. gwaslab/g_Log.py +8 -0
  3. gwaslab/g_Sumstats.py +26 -147
  4. gwaslab/g_SumstatsPair.py +6 -2
  5. gwaslab/g_Sumstats_summary.py +3 -3
  6. gwaslab/g_version.py +2 -2
  7. gwaslab/hm_casting.py +29 -15
  8. gwaslab/hm_harmonize_sumstats.py +291 -163
  9. gwaslab/hm_rsid_to_chrpos.py +1 -1
  10. gwaslab/io_preformat_input.py +43 -37
  11. gwaslab/io_to_formats.py +428 -295
  12. gwaslab/qc_check_datatype.py +3 -3
  13. gwaslab/qc_fix_sumstats.py +793 -682
  14. gwaslab/util_ex_calculate_ldmatrix.py +29 -11
  15. gwaslab/util_ex_gwascatalog.py +1 -1
  16. gwaslab/util_ex_ldproxyfinder.py +1 -1
  17. gwaslab/util_ex_process_ref.py +3 -3
  18. gwaslab/util_ex_run_coloc.py +26 -4
  19. gwaslab/util_in_convert_h2.py +1 -1
  20. gwaslab/util_in_fill_data.py +2 -2
  21. gwaslab/util_in_filter_value.py +122 -34
  22. gwaslab/util_in_get_density.py +2 -2
  23. gwaslab/util_in_get_sig.py +41 -9
  24. gwaslab/viz_aux_quickfix.py +24 -19
  25. gwaslab/viz_aux_reposition_text.py +7 -4
  26. gwaslab/viz_aux_save_figure.py +6 -5
  27. gwaslab/viz_plot_compare_af.py +5 -5
  28. gwaslab/viz_plot_miamiplot2.py +28 -20
  29. gwaslab/viz_plot_mqqplot.py +109 -72
  30. gwaslab/viz_plot_qqplot.py +11 -8
  31. gwaslab/viz_plot_regionalplot.py +3 -1
  32. gwaslab/viz_plot_trumpetplot.py +15 -6
  33. {gwaslab-3.4.37.dist-info → gwaslab-3.4.38.dist-info}/METADATA +2 -2
  34. {gwaslab-3.4.37.dist-info → gwaslab-3.4.38.dist-info}/RECORD +37 -37
  35. {gwaslab-3.4.37.dist-info → gwaslab-3.4.38.dist-info}/LICENSE +0 -0
  36. {gwaslab-3.4.37.dist-info → gwaslab-3.4.38.dist-info}/WHEEL +0 -0
  37. {gwaslab-3.4.37.dist-info → gwaslab-3.4.38.dist-info}/top_level.txt +0 -0
gwaslab/g_Log.py CHANGED
@@ -2,6 +2,7 @@ import time
2
2
  class Log():
3
3
  def __init__(self):
4
4
  self.log_text=str(time.ctime(time.time()))+ " " + "Sumstats Object created."+ "\n"
5
+
5
6
  def write(self,*message,end="\n",show_time=True, verbose=True):
6
7
  if show_time is True:
7
8
  if verbose: print(str(time.ctime(time.time())),*message,end=end)
@@ -9,6 +10,13 @@ class Log():
9
10
  else:
10
11
  if verbose: print(*message,end=end)
11
12
  self.log_text = self.log_text + " ".join(map(str,message)) + end
13
+
14
+ def warning(self,*message,end="\n",show_time=True, verbose=True):
15
+ self.write(" #WARNING! {}".format(" ".join(map(str,message))),
16
+ end=end,
17
+ show_time=show_time,
18
+ verbose=verbose)
19
+
12
20
  def show(self):
13
21
  print(self.log_text)
14
22
  def save(self,path,verbose=True):
gwaslab/g_Sumstats.py CHANGED
@@ -5,7 +5,7 @@ import copy
5
5
  from gwaslab.g_Sumstats_summary import summarize
6
6
  from gwaslab.g_Sumstats_summary import lookupstatus
7
7
  from gwaslab.io_preformat_input import preformat
8
- from gwaslab.io_to_formats import tofmt
8
+ from gwaslab.io_to_formats import _to_format
9
9
  from gwaslab.g_Log import Log
10
10
  from gwaslab.qc_fix_sumstats import fixID
11
11
  from gwaslab.qc_fix_sumstats import removedup
@@ -35,6 +35,8 @@ from gwaslab.util_in_filter_value import filterregionout
35
35
  from gwaslab.util_in_filter_value import inferbuild
36
36
  from gwaslab.util_in_filter_value import sampling
37
37
  from gwaslab.util_in_filter_value import _get_flanking
38
+ from gwaslab.util_in_filter_value import _get_flanking_by_chrpos
39
+ from gwaslab.util_in_filter_value import _get_flanking_by_id
38
40
  from gwaslab.util_in_calculate_gc import lambdaGC
39
41
  from gwaslab.util_in_convert_h2 import _get_per_snp_r2
40
42
  from gwaslab.util_in_get_sig import getsig
@@ -449,7 +451,7 @@ class Sumstats():
449
451
 
450
452
  # utilities ############################################################################################################
451
453
  # filter series ######################################################################
452
- def get_flanking(self, inplace=False,**args):
454
+ def filter_flanking(self, inplace=False,**args):
453
455
  if inplace is False:
454
456
  new_Sumstats_object = copy.deepcopy(self)
455
457
  new_Sumstats_object.data = _get_flanking(new_Sumstats_object.data, **args)
@@ -457,6 +459,22 @@ class Sumstats():
457
459
  else:
458
460
  self.data = _get_flanking(self.data, **args)
459
461
 
462
+ def filter_flanking_by_chrpos(self, chrpos, inplace=False,**args):
463
+ if inplace is False:
464
+ new_Sumstats_object = copy.deepcopy(self)
465
+ new_Sumstats_object.data = _get_flanking_by_chrpos(new_Sumstats_object.data, chrpos, **args)
466
+ return new_Sumstats_object
467
+ else:
468
+ self.data = _get_flanking_by_chrpos(self.data, chrpos,**args)
469
+
470
+ def filter_flanking_by_id(self, snpid, inplace=False,**args):
471
+ if inplace is False:
472
+ new_Sumstats_object = copy.deepcopy(self)
473
+ new_Sumstats_object.data = _get_flanking_by_id(new_Sumstats_object.data, snpid, **args)
474
+ return new_Sumstats_object
475
+ else:
476
+ self.data = _get_flanking_by_id(self.data, snpid, **args)
477
+
460
478
  def filter_value(self, expr, inplace=False, **args):
461
479
  if inplace is False:
462
480
  new_Sumstats_object = copy.deepcopy(self)
@@ -548,8 +566,10 @@ class Sumstats():
548
566
 
549
567
  return plot
550
568
 
551
- def plot_trumpet(self, **args):
552
- fig = plottrumpet(self.data, **args)
569
+ def plot_trumpet(self, build=None, **args):
570
+ if build is None:
571
+ build = self.meta["gwaslab"]["genome_build"]
572
+ fig = plottrumpet(self.data,build = build, **args)
553
573
  return fig
554
574
 
555
575
  def get_lead(self, build=None, gls=False, **args):
@@ -670,148 +690,7 @@ class Sumstats():
670
690
 
671
691
  # to_format ###############################################################################################
672
692
 
673
- def to_format(self,
674
- path="./sumstats",
675
- fmt="gwaslab",
676
- extract=None,
677
- exclude=None,
678
- cols=None,
679
- id_use="rsID",
680
- hapmap3=False,
681
- exclude_hla=False,
682
- hla_range=(25,34),
683
- build=None,
684
- n=None,
685
- verbose=True,
686
- no_status=False,
687
- output_log=True,
688
- to_csvargs=None,
689
- float_formats=None,
690
- xymt_number=False,
691
- xymt=None,
692
- chr_prefix="",
693
- ssfmeta=False,
694
- md5sum=False,
695
- bgzip=False,
696
- tabix=False,
697
- tabix_indexargs={}):
693
+ def to_format(self, path, build=None, **args):
698
694
  if build is None:
699
695
  build = self.meta["gwaslab"]["genome_build"]
700
- onetime_log = copy.deepcopy(self.log)
701
- if to_csvargs is None:
702
- to_csvargs = {}
703
- if float_formats is None:
704
- float_formats={}
705
- if cols is None:
706
- cols=[]
707
- if xymt is None:
708
- xymt = ["X","Y","MT"]
709
-
710
- formatlist= get_formats_list() + ["vep","bed","annovar","vcf"]
711
- if fmt in formatlist:
712
- if verbose: onetime_log.write("Start to format the output sumstats in: ",fmt, " format")
713
- else:
714
- raise ValueError("Please select a format to output")
715
-
716
-
717
- #######################################################################################################
718
- # filter
719
- output = self.data.copy()
720
- if extract is not None:
721
- output = output.loc[output[id_use].isin(extract),:]
722
-
723
- if exclude is not None:
724
- output = output.loc[~output[id_use].isin(exclude),:]
725
-
726
- #hla and hapmap3 #######################################################################################
727
- suffix=fmt
728
-
729
- #exclude hla
730
- if exclude_hla is True:
731
- if verbose: onetime_log.write(" -Excluding variants in MHC (HLA) region ...")
732
- before = len(output)
733
- is_hla = (output["CHR"].astype("string") == "6") & (output["POS"].astype("Int64") > hla_range[0]*1000000) & (output["POS"].astype("Int64") < hla_range[1]*1000000)
734
- output = output.loc[~is_hla,:]
735
- after = len(output)
736
- if verbose: onetime_log.write(" -Exclude "+ str(before - after) + " variants in MHC (HLA) region : {}Mb - {}Mb.".format(hla_range[0], hla_range[1]))
737
- suffix = "noMHC."+suffix
738
-
739
- #extract hapmap3 SNPs
740
- if hapmap3 is True:
741
- output = gethapmap3(output,build=build,verbose=True)
742
- after = len(output)
743
- if verbose: onetime_log.write(" -Extract "+ str(after) + " variants in Hapmap3 datasets for build "+build+".")
744
- suffix = "hapmap3."+suffix
745
-
746
- # add a n column
747
- if n is not None:
748
- output["N"] = n
749
-
750
- #######################################################################################################
751
- #formatting float statistics
752
- if verbose: onetime_log.write(" -Formatting statistics ...")
753
-
754
- formats = {'EAF': '{:.4g}',
755
- 'BETA': '{:.4f}',
756
- 'Z': '{:.4f}',
757
- 'CHISQ': '{:.4f}',
758
- 'SE': '{:.4f}',
759
- 'OR': '{:.4f}',
760
- 'OR_95U': '{:.4f}',
761
- 'OR_95L': '{:.4f}',
762
- 'INFO': '{:.4f}',
763
- 'P': '{:.4e}',
764
- 'MLOG10P': '{:.4f}',
765
- 'DAF': '{:.4f}'
766
- }
767
-
768
- for col, f in float_formats.items():
769
- if col in output.columns:
770
- formats[col]=f
771
- for col, f in formats.items():
772
- if col in output.columns:
773
- if output[col].dtype in ["float64","float32","float16","float"]:
774
- output[col] = output[col].map(f.format)
775
- if verbose:
776
- onetime_log.write(" - Float statistics formats:")
777
- keys=[]
778
- values=[]
779
- for key,value in formats.items():
780
- if key in output.columns:
781
- keys.append(key)
782
- values.append(value)
783
- onetime_log.write(" - Columns:",keys)
784
- onetime_log.write(" - Output formats:",values)
785
-
786
- ##########################################################################################################
787
- # output, mapping column names
788
-
789
- if fmt in get_formats_list() + ["vep","bed","annovar","vcf"]:
790
- tofmt(output,
791
- path=path,
792
- fmt=fmt,
793
- cols=cols,
794
- suffix=suffix,
795
- build=build,
796
- verbose=True,
797
- no_status=no_status,
798
- log=onetime_log,
799
- to_csvargs=to_csvargs,
800
- chr_prefix=chr_prefix,
801
- meta = self.meta,
802
- ssfmeta=ssfmeta,
803
- bgzip=bgzip,
804
- tabix=tabix,
805
- tabix_indexargs=tabix_indexargs,
806
- md5sum=md5sum,
807
- xymt_number=xymt_number,
808
- xymt=xymt)
809
- if output_log is True:
810
- log_path = path + "."+ suffix + ".log"
811
- if verbose: onetime_log.write(" -Saving log file to: {}".format(log_path))
812
- if verbose: onetime_log.write("Finished outputting successfully!")
813
- try:
814
- onetime_log.save(log_path, verbose=False)
815
- except:
816
- pass
817
-
696
+ _to_format(self.data, path, log=self.log, meta=self.meta, build=build, **args)
gwaslab/g_SumstatsPair.py CHANGED
@@ -28,8 +28,10 @@ class SumstatsPair( ):
28
28
  raise ValueError("Please provide GWASLab Sumstats Object #1.")
29
29
  if not isinstance(sumstatsObject2, Sumstats):
30
30
  raise ValueError("Please provide GWASLab Sumstats Object #2.")
31
-
32
- self.study_name = "{}_{}".format(sumstatsObject1.meta["gwaslab"]["study_name"], sumstatsObject2.meta["gwaslab"]["study_name"])
31
+ if sumstatsObject1.meta["gwaslab"]["study_name"]!=sumstatsObject2.meta["gwaslab"]["study_name"]:
32
+ self.study_name = "{}_{}".format(sumstatsObject1.meta["gwaslab"]["study_name"], sumstatsObject2.meta["gwaslab"]["study_name"])
33
+ else:
34
+ self.study_name = "{}_{}".format("STUDY1", "STUDY2")
33
35
  self.snp_info_cols = []
34
36
  self.stats_cols =[]
35
37
  self.other_cols=[]
@@ -42,6 +44,8 @@ class SumstatsPair( ):
42
44
  self.clumps ={}
43
45
  self.ns = None
44
46
 
47
+ self.log.write( "Start to create SumstatsPair object..." )
48
+
45
49
  for i in sumstatsObject1.data.columns:
46
50
  if i in ["SNPID","rsID","CHR","POS","EA","NEA","STATUS"]:
47
51
  self.snp_info_cols.append(i)
gwaslab/g_Sumstats_summary.py CHANGED
@@ -15,7 +15,7 @@ def summarize(insumstats,
15
15
  for i in [snpid,rsid,eaf,p,n,status]:
16
16
  if i in insumstats.columns:
17
17
  cols.append(i)
18
- sumstats= insumstats.loc[:,cols].copy()
18
+ sumstats= insumstats[cols].copy()
19
19
  ###############################################################################
20
20
  numeric_cols=[]
21
21
  output = {}
@@ -68,7 +68,7 @@ def summarize(insumstats,
68
68
  sumstats.drop(columns='uniq_index',inplace=True)
69
69
  status_dic = {}
70
70
  for index,row in status_summary.iterrows():
71
- status_dic[str(index)]=row[0]
71
+ status_dic[str(index)]=row.iloc[0]
72
72
  output["STATUS"]=status_dic
73
73
  numeric_cols.append("STATUS")
74
74
  df = pd.DataFrame.from_dict({(i,j): output[i][j]
@@ -84,7 +84,7 @@ def summarize(insumstats,
84
84
  return df
85
85
 
86
86
  def sum_status(id_to_use, sumstats):
87
- results = sumstats.groupby("STATUS").count()
87
+ results = sumstats.groupby("STATUS",observed=True).count()
88
88
  results = results.loc[results[id_to_use]>0,:].sort_values(id_to_use,ascending=False)
89
89
  return results
90
90
 
gwaslab/g_version.py CHANGED
@@ -15,8 +15,8 @@ def _get_version():
15
15
  def gwaslab_info():
16
16
  # version meta information
17
17
  dic={
18
- "version":"3.4.37",
19
- "release_date":"20240129"
18
+ "version":"3.4.38",
19
+ "release_date":"20240203"
20
20
  }
21
21
  return dic
22
22
 
gwaslab/hm_casting.py CHANGED
@@ -14,9 +14,11 @@ def _merge_mold_with_sumstats(mold, sumstats, ref_path=None, windowsizeb=10, log
14
14
  for i in sumstats.columns:
15
15
  if i in ["SNPID","rsID"]:
16
16
  cols_to_drop.append(i)
17
+
18
+ log.write("Start to merge sumstats...", verbose=verbose)
17
19
 
18
20
  if len(cols_to_drop)>0:
19
- log.write("Dropping old IDs:{}".format(cols_to_drop))
21
+ log.write(" -Dropping old IDs:{}".format(cols_to_drop), verbose=verbose)
20
22
  sumstats = sumstats.drop(columns=cols_to_drop)
21
23
 
22
24
  if ref_path is not None :
@@ -30,17 +32,18 @@ def _merge_mold_with_sumstats(mold, sumstats, ref_path=None, windowsizeb=10, log
30
32
  mold["_IDENTIFIER_FOR_VARIANT"] = range(len(mold))
31
33
 
32
34
  mold_sumstats = pd.merge(mold, sumstats, on=["CHR","POS"], how="inner",suffixes=suffixes)
33
- log.write("After merging by CHR and POS:{}".format(len(mold_sumstats)))
35
+ log.write(" -After merging by CHR and POS:{}".format(len(mold_sumstats)), verbose=verbose)
34
36
 
35
37
  mold_sumstats = _keep_variants_with_same_allele_set(mold_sumstats,suffixes=suffixes)
36
- log.write("Matched variants:{}".format(len(mold_sumstats)))
38
+
39
+ log.write(" -Matched variants:{}".format(len(mold_sumstats)), verbose=verbose)
37
40
 
38
- if ref_path is not None:
39
- # match removed sumstats
40
- mold_removed = mold.loc[~mold[index1].isin(mold_sumstats[index1]),:]
41
- iron_removed = sumstats.loc[~sumstats[index2].isin(mold_sumstats[index2]),:]
42
- _match_two_sumstats(mold_removed,iron_removed,ref_path,windowsizeb=windowsizeb)
43
- mold_sumstats.drop(columns=["_INDEX",""])
41
+ #if ref_path is not None:
42
+ # # match removed sumstats
43
+ # mold_removed = mold.loc[~mold[index1].isin(mold_sumstats[index1]),:]
44
+ # iron_removed = sumstats.loc[~sumstats[index2].isin(mold_sumstats[index2]),:]
45
+ # _match_two_sumstats(mold_removed,iron_removed,ref_path,windowsizeb=windowsizeb)
46
+ # mold_sumstats.drop(columns=["_INDEX",""])
44
47
 
45
48
  if return_not_matched_mold == True:
46
49
  sumstats1 = mold.loc[~mold["_IDENTIFIER_FOR_VARIANT"].isin(mold_sumstats["_IDENTIFIER_FOR_VARIANT"]),:]
@@ -59,14 +62,17 @@ def _keep_variants_with_same_allele_set(sumstats, log=Log(),verbose=True,suffixe
59
62
 
60
63
  all_alleles = set(list(sumstats[ea1].unique())+list(sumstats[nea1].unique())+list(sumstats[ea2].unique())+list(sumstats[nea2].unique()))
61
64
  allele_type = CategoricalDtype(categories=all_alleles, ordered=False)
62
- sumstats.loc[:, [nea1,ea1,nea2,ea2]] = sumstats.loc[:, [nea1,ea1,nea2,ea2]].astype(allele_type)
65
+ sumstats[[nea1,ea1,nea2,ea2]] = sumstats[[nea1,ea1,nea2,ea2]].astype(allele_type)
63
66
 
64
67
  is_perfect_match = (sumstats[ea2] == sumstats[ea1]) & (sumstats[nea2] == sumstats[nea1])
65
68
  is_flipped_match = (sumstats[ea2] == sumstats[nea1]) & (sumstats[nea2] == sumstats[ea1])
66
69
  is_allele_set_match = is_flipped_match | is_perfect_match
67
70
 
68
- sumstats.loc[~is_allele_set_match,:]
69
-
71
+ log.write(" -Matching alleles and keeping only variants with same allele set: ", verbose=verbose)
72
+ log.write(" -Perfect match: {}".format(sum(is_perfect_match)), verbose=verbose)
73
+ log.write(" -Flipped match: {}".format(sum(is_flipped_match)), verbose=verbose)
74
+ log.write(" -Unmatched : {}".format(sum(~is_allele_set_match)), verbose=verbose)
75
+
70
76
  return sumstats.loc[is_allele_set_match,:]
71
77
 
72
78
  def _align_with_mold(sumstats, log=Log(),verbose=True, suffixes=("_MOLD","")):
@@ -77,10 +83,18 @@ def _align_with_mold(sumstats, log=Log(),verbose=True, suffixes=("_MOLD","")):
77
83
  nea2="NEA"+suffixes[1]
78
84
  status1="STATUS"+suffixes[0]
79
85
  status2="STATUS"+suffixes[1]
86
+
80
87
  is_perfect_match = (sumstats[ea2] == sumstats[ea1]) & (sumstats[nea2] == sumstats[nea1])
81
88
  is_flipped_match = (sumstats[ea2] == sumstats[nea1]) & (sumstats[nea2] == sumstats[ea1])
82
89
 
90
+ log.write(" -Aligning alleles with reference: ", verbose=verbose)
91
+ log.write(" -Perfect match: {}".format(sum(is_perfect_match)), verbose=verbose)
92
+ log.write(" -Flipped match: {}".format(sum(is_flipped_match)), verbose=verbose)
93
+
94
+ log.write(" -For perfect match: copy STATUS from reference...", verbose=verbose)
83
95
  sumstats.loc[is_perfect_match,status2] = copy_status(sumstats.loc[is_perfect_match,status1], sumstats.loc[is_perfect_match,status2],6)
96
+
97
+ log.write(" -For Flipped match: convert STATUS xxxxx[456789]x to xxxxx3x...", verbose=verbose)
84
98
  sumstats.loc[is_flipped_match,status2] = vchange_status(sumstats.loc[is_flipped_match,status2],6,"456789","333333")
85
99
 
86
100
  return sumstats
@@ -119,9 +133,9 @@ def _sort_pair_cols(molded_sumstats, verbose=True, log=Log(), order=None, stats_
119
133
  if i not in order:
120
134
  output_columns.append(i)
121
135
 
122
- if verbose: log.write(" -Reordering columns to :", ",".join(output_columns))
123
- molded_sumstats = molded_sumstats.loc[:, output_columns]
124
- if verbose: log.write("Finished sorting columns successfully!")
136
+ if verbose: log.write(" -Reordering columns to :", ",".join(output_columns), verbose=verbose)
137
+ molded_sumstats = molded_sumstats[ output_columns]
138
+ if verbose: log.write("Finished sorting columns successfully!", verbose=verbose)
125
139
 
126
140
  return molded_sumstats
127
141