gwaslab 3.4.47__tar.gz → 3.4.49__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of gwaslab might be problematic. Click here for more details.

Files changed (91)
  1. {gwaslab-3.4.47/src/gwaslab.egg-info → gwaslab-3.4.49}/PKG-INFO +2 -2
  2. {gwaslab-3.4.47 → gwaslab-3.4.49}/README.md +1 -1
  3. {gwaslab-3.4.47 → gwaslab-3.4.49}/pyproject.toml +1 -1
  4. {gwaslab-3.4.47 → gwaslab-3.4.49}/src/gwaslab/bd_common_data.py +3 -1
  5. {gwaslab-3.4.47 → gwaslab-3.4.49}/src/gwaslab/data/reference.json +10 -2
  6. {gwaslab-3.4.47 → gwaslab-3.4.49}/src/gwaslab/g_Sumstats.py +22 -2
  7. {gwaslab-3.4.47 → gwaslab-3.4.49}/src/gwaslab/g_vchange_status.py +1 -1
  8. {gwaslab-3.4.47 → gwaslab-3.4.49}/src/gwaslab/g_version.py +2 -2
  9. {gwaslab-3.4.47 → gwaslab-3.4.49}/src/gwaslab/hm_harmonize_sumstats.py +23 -7
  10. {gwaslab-3.4.47 → gwaslab-3.4.49}/src/gwaslab/io_preformat_input.py +73 -8
  11. {gwaslab-3.4.47 → gwaslab-3.4.49}/src/gwaslab/io_read_ldsc.py +16 -2
  12. {gwaslab-3.4.47 → gwaslab-3.4.49}/src/gwaslab/io_to_formats.py +5 -5
  13. {gwaslab-3.4.47 → gwaslab-3.4.49}/src/gwaslab/qc_fix_sumstats.py +109 -7
  14. gwaslab-3.4.49/src/gwaslab/util_abf_finemapping.py +67 -0
  15. {gwaslab-3.4.47 → gwaslab-3.4.49}/src/gwaslab/util_ex_ldsc.py +8 -1
  16. {gwaslab-3.4.47 → gwaslab-3.4.49}/src/gwaslab/util_ex_run_clumping.py +6 -6
  17. {gwaslab-3.4.47 → gwaslab-3.4.49}/src/gwaslab/util_in_fill_data.py +20 -2
  18. {gwaslab-3.4.47 → gwaslab-3.4.49}/src/gwaslab/viz_aux_annotate_plot.py +2 -1
  19. {gwaslab-3.4.47 → gwaslab-3.4.49}/src/gwaslab/viz_aux_quickfix.py +2 -1
  20. {gwaslab-3.4.47 → gwaslab-3.4.49}/src/gwaslab/viz_plot_compare_effect.py +4 -2
  21. {gwaslab-3.4.47 → gwaslab-3.4.49}/src/gwaslab/viz_plot_miamiplot2.py +10 -9
  22. {gwaslab-3.4.47 → gwaslab-3.4.49}/src/gwaslab/viz_plot_mqqplot.py +42 -21
  23. {gwaslab-3.4.47 → gwaslab-3.4.49}/src/gwaslab/viz_plot_regional2.py +75 -29
  24. {gwaslab-3.4.47 → gwaslab-3.4.49}/src/gwaslab/viz_plot_stackedregional.py +37 -16
  25. {gwaslab-3.4.47 → gwaslab-3.4.49/src/gwaslab.egg-info}/PKG-INFO +2 -2
  26. {gwaslab-3.4.47 → gwaslab-3.4.49}/src/gwaslab.egg-info/SOURCES.txt +1 -0
  27. {gwaslab-3.4.47 → gwaslab-3.4.49}/LICENSE +0 -0
  28. {gwaslab-3.4.47 → gwaslab-3.4.49}/LICENSE_before_v3.4.39 +0 -0
  29. {gwaslab-3.4.47 → gwaslab-3.4.49}/setup.cfg +0 -0
  30. {gwaslab-3.4.47 → gwaslab-3.4.49}/src/gwaslab/__init__.py +0 -0
  31. {gwaslab-3.4.47 → gwaslab-3.4.49}/src/gwaslab/bd_config.py +0 -0
  32. {gwaslab-3.4.47 → gwaslab-3.4.49}/src/gwaslab/bd_download.py +0 -0
  33. {gwaslab-3.4.47 → gwaslab-3.4.49}/src/gwaslab/bd_get_hapmap3.py +0 -0
  34. {gwaslab-3.4.47 → gwaslab-3.4.49}/src/gwaslab/cache_manager.py +0 -0
  35. {gwaslab-3.4.47 → gwaslab-3.4.49}/src/gwaslab/data/chrx_par/chrx_par_hg19.bed.gz +0 -0
  36. {gwaslab-3.4.47 → gwaslab-3.4.49}/src/gwaslab/data/chrx_par/chrx_par_hg38.bed.gz +0 -0
  37. {gwaslab-3.4.47 → gwaslab-3.4.49}/src/gwaslab/data/formatbook.json +0 -0
  38. {gwaslab-3.4.47 → gwaslab-3.4.49}/src/gwaslab/data/hapmap3_SNPs/hapmap3_db150_hg19.snplist.gz +0 -0
  39. {gwaslab-3.4.47 → gwaslab-3.4.49}/src/gwaslab/data/hapmap3_SNPs/hapmap3_db151_hg38.snplist.gz +0 -0
  40. {gwaslab-3.4.47 → gwaslab-3.4.49}/src/gwaslab/data/high_ld/high_ld_hla_hg19.bed.gz +0 -0
  41. {gwaslab-3.4.47 → gwaslab-3.4.49}/src/gwaslab/data/high_ld/high_ld_hla_hg38.bed.gz +0 -0
  42. {gwaslab-3.4.47 → gwaslab-3.4.49}/src/gwaslab/g_Log.py +0 -0
  43. {gwaslab-3.4.47 → gwaslab-3.4.49}/src/gwaslab/g_Phenotypes.py +0 -0
  44. {gwaslab-3.4.47 → gwaslab-3.4.49}/src/gwaslab/g_SumstatsPair.py +0 -0
  45. {gwaslab-3.4.47 → gwaslab-3.4.49}/src/gwaslab/g_SumstatsT.py +0 -0
  46. {gwaslab-3.4.47 → gwaslab-3.4.49}/src/gwaslab/g_Sumstats_summary.py +0 -0
  47. {gwaslab-3.4.47 → gwaslab-3.4.49}/src/gwaslab/g_meta.py +0 -0
  48. {gwaslab-3.4.47 → gwaslab-3.4.49}/src/gwaslab/hm_casting.py +0 -0
  49. {gwaslab-3.4.47 → gwaslab-3.4.49}/src/gwaslab/hm_rsid_to_chrpos.py +0 -0
  50. {gwaslab-3.4.47 → gwaslab-3.4.49}/src/gwaslab/io_read_tabular.py +0 -0
  51. {gwaslab-3.4.47 → gwaslab-3.4.49}/src/gwaslab/io_to_pickle.py +0 -0
  52. {gwaslab-3.4.47 → gwaslab-3.4.49}/src/gwaslab/ldsc_irwls.py +0 -0
  53. {gwaslab-3.4.47 → gwaslab-3.4.49}/src/gwaslab/ldsc_jackknife.py +0 -0
  54. {gwaslab-3.4.47 → gwaslab-3.4.49}/src/gwaslab/ldsc_ldscore.py +0 -0
  55. {gwaslab-3.4.47 → gwaslab-3.4.49}/src/gwaslab/ldsc_parse.py +0 -0
  56. {gwaslab-3.4.47 → gwaslab-3.4.49}/src/gwaslab/ldsc_regressions.py +0 -0
  57. {gwaslab-3.4.47 → gwaslab-3.4.49}/src/gwaslab/ldsc_sumstats.py +0 -0
  58. {gwaslab-3.4.47 → gwaslab-3.4.49}/src/gwaslab/qc_check_datatype.py +0 -0
  59. {gwaslab-3.4.47 → gwaslab-3.4.49}/src/gwaslab/run_script.py +0 -0
  60. {gwaslab-3.4.47 → gwaslab-3.4.49}/src/gwaslab/util_ex_calculate_ldmatrix.py +0 -0
  61. {gwaslab-3.4.47 → gwaslab-3.4.49}/src/gwaslab/util_ex_calculate_prs.py +0 -0
  62. {gwaslab-3.4.47 → gwaslab-3.4.49}/src/gwaslab/util_ex_gwascatalog.py +0 -0
  63. {gwaslab-3.4.47 → gwaslab-3.4.49}/src/gwaslab/util_ex_ldproxyfinder.py +0 -0
  64. {gwaslab-3.4.47 → gwaslab-3.4.49}/src/gwaslab/util_ex_plink_filter.py +0 -0
  65. {gwaslab-3.4.47 → gwaslab-3.4.49}/src/gwaslab/util_ex_process_h5.py +0 -0
  66. {gwaslab-3.4.47 → gwaslab-3.4.49}/src/gwaslab/util_ex_process_ref.py +0 -0
  67. {gwaslab-3.4.47 → gwaslab-3.4.49}/src/gwaslab/util_ex_run_2samplemr.py +0 -0
  68. {gwaslab-3.4.47 → gwaslab-3.4.49}/src/gwaslab/util_ex_run_coloc.py +0 -0
  69. {gwaslab-3.4.47 → gwaslab-3.4.49}/src/gwaslab/util_ex_run_susie.py +0 -0
  70. {gwaslab-3.4.47 → gwaslab-3.4.49}/src/gwaslab/util_in_calculate_gc.py +0 -0
  71. {gwaslab-3.4.47 → gwaslab-3.4.49}/src/gwaslab/util_in_calculate_power.py +0 -0
  72. {gwaslab-3.4.47 → gwaslab-3.4.49}/src/gwaslab/util_in_convert_h2.py +0 -0
  73. {gwaslab-3.4.47 → gwaslab-3.4.49}/src/gwaslab/util_in_correct_winnerscurse.py +0 -0
  74. {gwaslab-3.4.47 → gwaslab-3.4.49}/src/gwaslab/util_in_filter_value.py +0 -0
  75. {gwaslab-3.4.47 → gwaslab-3.4.49}/src/gwaslab/util_in_get_density.py +0 -0
  76. {gwaslab-3.4.47 → gwaslab-3.4.49}/src/gwaslab/util_in_get_sig.py +0 -0
  77. {gwaslab-3.4.47 → gwaslab-3.4.49}/src/gwaslab/util_in_meta.py +0 -0
  78. {gwaslab-3.4.47 → gwaslab-3.4.49}/src/gwaslab/util_in_snphwe.py +0 -0
  79. {gwaslab-3.4.47 → gwaslab-3.4.49}/src/gwaslab/viz_aux_chromatin.py +0 -0
  80. {gwaslab-3.4.47 → gwaslab-3.4.49}/src/gwaslab/viz_aux_reposition_text.py +0 -0
  81. {gwaslab-3.4.47 → gwaslab-3.4.49}/src/gwaslab/viz_aux_save_figure.py +0 -0
  82. {gwaslab-3.4.47 → gwaslab-3.4.49}/src/gwaslab/viz_plot_compare_af.py +0 -0
  83. {gwaslab-3.4.47 → gwaslab-3.4.49}/src/gwaslab/viz_plot_forestplot.py +0 -0
  84. {gwaslab-3.4.47 → gwaslab-3.4.49}/src/gwaslab/viz_plot_miamiplot.py +0 -0
  85. {gwaslab-3.4.47 → gwaslab-3.4.49}/src/gwaslab/viz_plot_qqplot.py +0 -0
  86. {gwaslab-3.4.47 → gwaslab-3.4.49}/src/gwaslab/viz_plot_regionalplot.py +0 -0
  87. {gwaslab-3.4.47 → gwaslab-3.4.49}/src/gwaslab/viz_plot_rg_heatmap.py +0 -0
  88. {gwaslab-3.4.47 → gwaslab-3.4.49}/src/gwaslab/viz_plot_trumpetplot.py +0 -0
  89. {gwaslab-3.4.47 → gwaslab-3.4.49}/src/gwaslab.egg-info/dependency_links.txt +0 -0
  90. {gwaslab-3.4.47 → gwaslab-3.4.49}/src/gwaslab.egg-info/requires.txt +0 -0
  91. {gwaslab-3.4.47 → gwaslab-3.4.49}/src/gwaslab.egg-info/top_level.txt +0 -0
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.1
2
2
  Name: gwaslab
3
- Version: 3.4.47
3
+ Version: 3.4.49
4
4
  Summary: A collection of handy tools for GWAS SumStats
5
5
  Author-email: Yunye <yunye@gwaslab.com>
6
6
  Project-URL: Homepage, https://cloufield.github.io/gwaslab/
@@ -51,7 +51,7 @@ Warning: Known issues of GWASLab are summarized in [https://cloufield.github.io/
51
51
  ### install via pip
52
52
 
53
53
  ```
54
- pip install gwaslab==3.4.45
54
+ pip install gwaslab==3.4.46
55
55
  ```
56
56
 
57
57
  ```python
@@ -23,7 +23,7 @@ Warning: Known issues of GWASLab are summarized in [https://cloufield.github.io/
23
23
  ### install via pip
24
24
 
25
25
  ```
26
- pip install gwaslab==3.4.45
26
+ pip install gwaslab==3.4.46
27
27
  ```
28
28
 
29
29
  ```python
@@ -7,7 +7,7 @@ build-backend = "setuptools.build_meta"
7
7
 
8
8
  [project]
9
9
  name = "gwaslab"
10
- version = "3.4.47"
10
+ version = "3.4.49"
11
11
  authors = [
12
12
  { name="Yunye", email="yunye@gwaslab.com" },
13
13
  ]
@@ -274,7 +274,9 @@ def get_gtf(chrom, build="19",source="ensembl"):
274
274
  gtf = pd.DataFrame(columns=["seqname","start","end","strand","feature","gene_biotype","gene_id","gene_name"])
275
275
  return gtf
276
276
 
277
-
277
+ def get_chain(from_build="19", to_build="38"):
278
+ chain_path = check_and_download("{}to{}".format(from_build, to_build))
279
+ return chain_path
278
280
  ####################################################################################################################
279
281
  def gtf_to_protein_coding(gtfpath,log=Log(),verbose=True):
280
282
  protein_coding_path = gtfpath[:-6]+"protein_coding.gtf.gz"
@@ -90,12 +90,20 @@
90
90
  "1kg_dbsnp151_hg38_x_md5":"48c05eeb1454c0dd4cbee3cb26382e8e",
91
91
  "recombination_hg19":"https://www.dropbox.com/s/wbesl8haxknonuc/recombination_hg19.tar.gz?dl=1",
92
92
  "recombination_hg38":"https://www.dropbox.com/s/vuo8mvqx0fpibzj/recombination_hg38.tar.gz?dl=1",
93
- "ensembl_hg19_gtf":"https://ftp.ensembl.org/pub/grch37/current/gtf/homo_sapiens/Homo_sapiens.GRCh37.87.chr.gtf.gz",
93
+ "ensembl_hg19_gtf":"https://ftp.ensembl.org/pub/grch37/release-87/gtf/homo_sapiens/Homo_sapiens.GRCh37.87.chr.gtf.gz",
94
94
  "ensembl_hg38_gtf":"https://ftp.ensembl.org/pub/release-109/gtf/homo_sapiens//Homo_sapiens.GRCh38.109.chr.gtf.gz",
95
95
  "refseq_hg19_gtf":"https://ftp.ncbi.nlm.nih.gov/refseq/H_sapiens/annotation/GRCh37_latest/refseq_identifiers/GRCh37_latest_genomic.gtf.gz",
96
96
  "refseq_hg38_gtf":"https://ftp.ncbi.nlm.nih.gov/refseq/H_sapiens/annotation/GRCh38_latest/refseq_identifiers/GRCh38_latest_genomic.gtf.gz",
97
97
  "testlink":"https://www.dropbox.com/s/8u7capwge0ihshu/EAS.chr22.split_norm_af.1kgp3v5.vcf.gz?dl=1",
98
- "testlink_tbi":"https://www.dropbox.com/s/hdneg53t6u1j6ib/EAS.chr22.split_norm_af.1kgp3v5.vcf.gz.tbi?dl=1"
98
+ "testlink_tbi":"https://www.dropbox.com/s/hdneg53t6u1j6ib/EAS.chr22.split_norm_af.1kgp3v5.vcf.gz.tbi?dl=1",
99
+ "19to38":"https://hgdownload.soe.ucsc.edu/goldenPath/hg19/liftOver/hg19ToHg38.over.chain.gz",
100
+ "19to13":"https://s3-us-west-2.amazonaws.com/human-pangenomics/T2T/CHM13/assemblies/chain/v1_nflo/hg19-chm13v2.chain",
101
+ "38to19":"https://hgdownload.soe.ucsc.edu/goldenPath/hg38/liftOver/hg38ToHg19.over.chain.gz",
102
+ "38to13":"https://s3-us-west-2.amazonaws.com/human-pangenomics/T2T/CHM13/assemblies/chain/v1_nflo/grch38-chm13v2.chain",
103
+ "13to19":"https://s3-us-west-2.amazonaws.com/human-pangenomics/T2T/CHM13/assemblies/chain/v1_nflo/chm13v2-hg19.chain",
104
+ "13to38":"https://s3-us-west-2.amazonaws.com/human-pangenomics/T2T/CHM13/assemblies/chain/v1_nflo/chm13v2-grch38.chain",
105
+ "18to19":"https://hgdownload.soe.ucsc.edu/goldenPath/hg18/liftOver/hg18ToHg19.over.chain.gz",
106
+ "18to38":"https://hgdownload.soe.ucsc.edu/goldenPath/hg18/liftOver/hg18ToHg38.over.chain.gz"
99
107
  }
100
108
 
101
109
 
@@ -8,6 +8,8 @@ from gwaslab.io_preformat_input import preformat
8
8
  from gwaslab.io_to_formats import _to_format
9
9
  from gwaslab.g_Log import Log
10
10
  from gwaslab.qc_fix_sumstats import fixID
11
+ from gwaslab.qc_fix_sumstats import flipSNPID
12
+ from gwaslab.qc_fix_sumstats import stripSNPID
11
13
  from gwaslab.qc_fix_sumstats import removedup
12
14
  from gwaslab.qc_fix_sumstats import fixchr
13
15
  from gwaslab.qc_fix_sumstats import fixpos
@@ -76,6 +78,8 @@ from gwaslab.util_ex_ldsc import _estimate_rg_by_ldsc
76
78
  from gwaslab.util_ex_ldsc import _estimate_h2_cts_by_ldsc
77
79
  from gwaslab.util_ex_ldsc import _estimate_partitioned_h2_by_ldsc
78
80
  from gwaslab.bd_get_hapmap3 import gethapmap3
81
+ from gwaslab.util_abf_finemapping import abf_finemapping
82
+ from gwaslab.util_abf_finemapping import make_cs
79
83
  import gc
80
84
 
81
85
  #20220309
@@ -121,6 +125,8 @@ class Sumstats():
121
125
  snpr2=None,
122
126
  status=None,
123
127
  other=[],
128
+ chrom_pat=None,
129
+ snpid_pat=None,
124
130
  usekeys=None,
125
131
  direction=None,
126
132
  verbose=True,
@@ -135,6 +141,7 @@ class Sumstats():
135
141
  self.data = pd.DataFrame()
136
142
  self.log = Log()
137
143
  self.ldsc_h2 = None
144
+ self.ldsc_h2_results = None
138
145
  self.ldsc_rg = None
139
146
  self.ldsc_h2_cts = None
140
147
  self.ldsc_partitioned_h2_summary = None
@@ -202,6 +209,8 @@ class Sumstats():
202
209
  status=status,
203
210
  other=other,
204
211
  usekeys=usekeys,
212
+ chrom_pat=chrom_pat,
213
+ snpid_pat=snpid_pat,
205
214
  verbose=verbose,
206
215
  readargs=readargs,
207
216
  log=self.log)
@@ -415,6 +424,10 @@ class Sumstats():
415
424
  #customizable API to build your own QC pipeline
416
425
  def fix_id(self,**kwargs):
417
426
  self.data = fixID(self.data,log=self.log,**kwargs)
427
+ def flip_snpid(self,**kwargs):
428
+ self.data = flipSNPID(self.data,log=self.log,**kwargs)
429
+ def strip_snpid(self,**kwargs):
430
+ self.data = stripSNPID(self.data,log=self.log,**kwargs)
418
431
  def fix_chr(self,**kwargs):
419
432
  self.data = fixchr(self.data,log=self.log,**kwargs)
420
433
  def fix_pos(self,**kwargs):
@@ -756,13 +769,20 @@ class Sumstats():
756
769
  else:
757
770
  output = lambdaGC(self.data[["CHR",mode]],mode=mode,**kwargs)
758
771
  self.meta["Genomic inflation factor"] = output
759
- return output
772
+ return output
773
+
774
+ def abf_finemapping(self, region=None, chrpos=None, snpid=None,**kwargs):
775
+ region_data = abf_finemapping(self.data.copy(),region=region,chrpos=chrpos,snpid=snpid,log=self.log, **kwargs)
776
+ credible_sets = make_cs(region_data,threshold=0.95,log=self.log)
777
+ return region_data, credible_sets
778
+
779
+
760
780
  ## LDSC ##############################################################################################
761
781
  def estimate_h2_by_ldsc(self, build=None, verbose=True, match_allele=True, **kwargs):
762
782
  if build is None:
763
783
  build = self.meta["gwaslab"]["genome_build"]
764
784
  insumstats = gethapmap3(self.data.copy(), build=build, verbose=verbose , match_allele=True, how="right" )
765
- self.ldsc_h2 = _estimate_h2_by_ldsc(insumstats=insumstats, log=self.log, verbose=verbose, **kwargs)
785
+ self.ldsc_h2, self.ldsc_h2_results = _estimate_h2_by_ldsc(insumstats=insumstats, log=self.log, verbose=verbose, **kwargs)
766
786
 
767
787
  def estimate_rg_by_ldsc(self, build=None, verbose=True, match_allele=True, **kwargs):
768
788
  if build is None:
@@ -1,6 +1,6 @@
1
1
  import pandas as pd
2
2
 
3
- CATEGORIES = {str(j+i) for j in [1900000,3800000,9700000,9800000,9900000] for i in range(0,100000)}
3
+ CATEGORIES = {str(j+i) for j in [1300000,1800000,1900000,3800000,9700000,9800000,9900000] for i in range(0,100000)}
4
4
 
5
5
  def vchange_status(status,digit,before,after):
6
6
  dic={}
@@ -15,8 +15,8 @@ def _get_version():
15
15
  def gwaslab_info():
16
16
  # version meta information
17
17
  dic={
18
- "version":"3.4.47",
19
- "release_date":"20240703"
18
+ "version":"3.4.49",
19
+ "release_date":"20241019"
20
20
  }
21
21
  return dic
22
22
 
@@ -21,6 +21,7 @@ from gwaslab.qc_check_datatype import check_dataframe_shape
21
21
  from gwaslab.bd_common_data import get_number_to_chr
22
22
  from gwaslab.bd_common_data import get_chr_list
23
23
  from gwaslab.bd_common_data import get_chr_to_number
24
+ from gwaslab.bd_common_data import get_number_to_NC
24
25
  from gwaslab.bd_common_data import _maketrans
25
26
  from gwaslab.g_vchange_status import vchange_status
26
27
  from gwaslab.g_version import _get_version
@@ -355,7 +356,7 @@ def oldcheckref(sumstats,ref_seq,chrom="CHR",pos="POS",ea="EA",nea="NEA",status=
355
356
 
356
357
  log.write("\n",end="",show_time=False,verbose=verbose)
357
358
 
358
- CATEGORIES = {str(j+i) for j in [1900000,3800000,9700000,9800000,9900000] for i in range(0,100000)}
359
+ CATEGORIES = {str(j+i) for j in [1300000,1800000,1900000,3800000,9700000,9800000,9900000] for i in range(0,100000)}
359
360
  sumstats[status] = pd.Categorical(sumstats[status],categories=CATEGORIES)
360
361
  #sumstats[status] = sumstats[status].astype("string")
361
362
 
@@ -674,7 +675,7 @@ def checkref(sumstats,ref_seq,chrom="CHR",pos="POS",ea="EA",nea="NEA",status="ST
674
675
  sumstats.loc[to_check_ref,status] = check_status(sumstats_to_check, all_records_dict, log=log, verbose=verbose)
675
676
  log.write(" -Finished checking records", verbose=verbose)
676
677
 
677
- CATEGORIES = {str(j+i) for j in [1900000,3800000,9700000,9800000,9900000] for i in range(0,100000)}
678
+ CATEGORIES = {str(j+i) for j in [1300000,1800000,1900000,3800000,9700000,9800000,9900000] for i in range(0,100000)}
678
679
  sumstats[status] = pd.Categorical(sumstats[status],categories=CATEGORIES)
679
680
  #sumstats[status] = sumstats[status].astype("string")
680
681
 
@@ -1496,17 +1497,21 @@ def infer_af(chr,start,end,ref,alt,vcf_reader,alt_freq,chr_dict=None):
1496
1497
  def auto_check_vcf_chr_dict(vcf_path, vcf_chr_dict, verbose, log):
1497
1498
  if vcf_path is not None:
1498
1499
  if vcf_chr_dict is None:
1499
- log.write(" -Checking prefix for chromosomes in vcf files..." ,verbose=verbose)
1500
- prefix = check_vcf_chr_prefix(vcf_path)
1500
+ log.write(" -Checking chromosome notations in VCF/BCF files..." ,verbose=verbose)
1501
+ vcf_chr_dict = check_vcf_chr_NC(vcf_path, log, verbose)
1502
+ if vcf_chr_dict is not None:
1503
+ return vcf_chr_dict
1504
+ log.write(" -Checking prefix for chromosomes in VCF/BCF files..." ,verbose=verbose)
1505
+ prefix = check_vcf_chr_prefix(vcf_path, log,verbose)
1501
1506
  if prefix is not None:
1502
1507
  log.write(" -Prefix for chromosomes: ",prefix)
1503
1508
  vcf_chr_dict = get_number_to_chr(prefix=prefix)
1504
1509
  else:
1505
- log.write(" -No prefix for chromosomes in the VCF files." ,verbose=verbose)
1510
+ log.write(" -No prefix for chromosomes in the VCF/BCF files." ,verbose=verbose)
1506
1511
  vcf_chr_dict = get_number_to_chr()
1507
1512
  return vcf_chr_dict
1508
1513
 
1509
- def check_vcf_chr_prefix(vcf_bcf_path):
1514
+ def check_vcf_chr_prefix(vcf_bcf_path,log,verbose):
1510
1515
  vcf_bcf = VariantFile(vcf_bcf_path)
1511
1516
  for i in list(vcf_bcf.header.contigs):
1512
1517
  m = re.search('(chr|Chr|CHR)([0-9xXyYmM]+)', i)
@@ -1514,5 +1519,16 @@ def check_vcf_chr_prefix(vcf_bcf_path):
1514
1519
  return m.group(1)
1515
1520
  else:
1516
1521
  return None
1517
-
1522
+
1523
+ def check_vcf_chr_NC(vcf_bcf_path,log,verbose):
1524
+ vcf_bcf = VariantFile(vcf_bcf_path)
1525
+ for i in list(vcf_bcf.header.contigs):
1526
+ if i in get_number_to_NC(build="19").values():
1527
+ log.write(" -RefSeq ID detected (hg19) in VCF/BCF...",verbose=verbose)
1528
+ return get_number_to_NC(build="19")
1529
+ elif i in get_number_to_NC(build="38").values():
1530
+ log.write(" -RefSeq ID detected (hg38) in VCF/BCF...",verbose=verbose)
1531
+ return get_number_to_NC(build="38")
1532
+ else:
1533
+ return None
1518
1534
 
@@ -56,6 +56,8 @@ def preformat(sumstats,
56
56
  build=None,
57
57
  other=[],
58
58
  usekeys=None,
59
+ chrom_pat=None,
60
+ snpid_pat=None,
59
61
  verbose=False,
60
62
  readargs=None,
61
63
  log=None):
@@ -84,7 +86,10 @@ def preformat(sumstats,
84
86
  if "format_separator" in meta_data.keys():
85
87
  if "sep" not in readargs.keys():
86
88
  readargs["sep"] = meta_data["format_separator"]
87
-
89
+ else:
90
+ if readargs["sep"] != meta_data["format_separator"]:
91
+ log.write(' - format_separator will be changed to: "{}"'.format(readargs["sep"]),verbose=verbose)
92
+
88
93
  if "format_na" in meta_data.keys():
89
94
  readargs["na_values"] = meta_data["format_na"]
90
95
 
@@ -92,7 +97,7 @@ def preformat(sumstats,
92
97
  readargs["comment"] = meta_data["format_comment"]
93
98
 
94
99
  if "sep" not in readargs.keys():
95
- readargs["sep"] = "\t"
100
+ readargs["sep"] = "\t"
96
101
 
97
102
  #########################################################################################################################################################
98
103
 
@@ -323,10 +328,30 @@ def preformat(sumstats,
323
328
  skip_rows = get_skip_rows(inpath)
324
329
  readargs["skiprows"] = skip_rows
325
330
  log.write("Start to initialize gl.Sumstats from file :" + inpath,verbose=verbose)
326
- sumstats = pd.read_table(inpath,
327
- usecols=set(usecols),
328
- dtype=dtype_dictionary,
329
- **readargs)
331
+ if chrom_pat is not None:
332
+ sumstats = _load_single_chr(inpath,
333
+ usecols,
334
+ dtype_dictionary,
335
+ readargs=readargs,
336
+ rename_dictionary=rename_dictionary,
337
+ chrom_pat=chrom_pat,
338
+ log=log,
339
+ verbose=verbose)
340
+ elif snpid_pat is not None:
341
+
342
+ sumstats = _load_variants_with_pattern(inpath,
343
+ usecols,
344
+ dtype_dictionary,
345
+ readargs=readargs,
346
+ rename_dictionary=rename_dictionary,
347
+ snpid_pat=snpid_pat,
348
+ log=log,
349
+ verbose=verbose)
350
+ else:
351
+ sumstats = pd.read_table(inpath,
352
+ usecols=set(usecols),
353
+ dtype=dtype_dictionary,
354
+ **readargs)
330
355
 
331
356
  elif type(sumstats) is pd.DataFrame:
332
357
  ## loading data from dataframe
@@ -520,9 +545,49 @@ def process_status(sumstats,build,log,verbose):
520
545
  #sumstats["STATUS"] = int(build)*(10**5) +99999
521
546
  build = _process_build(build,log,verbose)
522
547
  sumstats["STATUS"] = build +"99999"
523
- categories = {str(j+i) for j in [1900000,3800000,9700000,9800000,9900000] for i in range(0,100000)}
548
+ categories = {str(j+i) for j in [1300000,1800000,1900000,3800000,9700000,9800000,9900000] for i in range(0,100000)}
524
549
  sumstats["STATUS"] = pd.Categorical(sumstats["STATUS"],categories=categories)
525
550
  return sumstats
526
551
 
527
552
 
528
-
553
+ def _load_single_chr(inpath,usecols,dtype_dictionary,readargs,rename_dictionary,chrom_pat,log,verbose):
554
+
555
+ sumstats_iter = pd.read_table(inpath,
556
+ usecols=set(usecols),
557
+ dtype=dtype_dictionary,
558
+ iterator=True,
559
+ chunksize=500000,
560
+ **readargs)
561
+ # get chr
562
+ for k,v in rename_dictionary.items():
563
+ if v=="CHR":
564
+ if k in usecols:
565
+ log.write(" -Columns used to filter variants: {}".format(k),verbose=verbose)
566
+ chunk_chrom = k
567
+ break
568
+
569
+ log.write(" -Loading only variants on chromosome with pattern : {} ...".format(chrom_pat),verbose=verbose)
570
+ sumstats_filtered = pd.concat([chunk[chunk[chunk_chrom].str.match(chrom_pat, case=False,na=False) ] for chunk in sumstats_iter])
571
+ log.write(" -Loaded {} variants on chromosome with pattern :{} ...".format(len(sumstats_filtered), chrom_pat),verbose=verbose)
572
+ return sumstats_filtered
573
+
574
+ def _load_variants_with_pattern(inpath,usecols,dtype_dictionary,readargs,rename_dictionary,snpid_pat,log,verbose):
575
+
576
+ sumstats_iter = pd.read_table(inpath,
577
+ usecols=set(usecols),
578
+ dtype=dtype_dictionary,
579
+ iterator=True,
580
+ chunksize=500000,
581
+ **readargs)
582
+ # get chr
583
+ for k,v in rename_dictionary.items():
584
+ if v=="SNPID":
585
+ if k in usecols:
586
+ log.write(" -Columns used to filter variants: {}".format(k),verbose=verbose)
587
+ chunk_snpid = k
588
+ break
589
+
590
+ log.write(" -Loading only variants with pattern : {} ...".format(snpid_pat),verbose=verbose)
591
+ sumstats_filtered = pd.concat([chunk[chunk[chunk_snpid].str.match(snpid_pat, case=False,na=False) ] for chunk in sumstats_iter])
592
+ log.write(" -Loaded {} variants with pattern : {} ...".format(len(sumstats_filtered), snpid_pat),verbose=verbose)
593
+ return sumstats_filtered
@@ -198,16 +198,29 @@ def read_greml(filelist=[]):
198
198
  return summary
199
199
 
200
200
  def parse_ldsc_summary(ldsc_summary):
201
- summary = pd.DataFrame(columns = ['h2_obs', 'h2_se','Lambda_gc','Mean_chi2','Intercept','Intercept_se',"Ratio","Ratio_se"])
201
+
202
202
  lines = ldsc_summary.split("\n")
203
+
204
+ columns = ['h2_obs', 'h2_se','Lambda_gc','Mean_chi2','Intercept','Intercept_se',"Ratio","Ratio_se","Catagories"]
205
+
206
+ summary = pd.DataFrame(columns = columns)
207
+
203
208
  row={}
209
+
204
210
  try:
205
211
  objects = re.compile('[a-zA-Z\s\d]+:|[-0-9.]+[e]?[-0-9.]+|NA').findall(lines[0])
206
212
  row["h2_obs"]=objects[1]
207
213
  row["h2_se"]=objects[2]
208
214
 
209
- ##next line lambda gc
215
+ ##check categories
216
+ if len(lines) == 6:
217
+ objects = re.compile(' -Categories:(.+)').findall(lines[1])
218
+ row["Catagories"] = objects[0].strip()
219
+ lines.pop(1)
220
+ else:
221
+ row["Catagories"] = "NA"
210
222
 
223
+ ##next line lambda gc
211
224
  objects = re.compile('[a-zA-Z\s\d]+:|[-0-9.]+[e]?[-0-9.]+|NA').findall(lines[1])
212
225
  row["Lambda_gc"] = objects[1]
213
226
  ##next line Mean_chi2
@@ -240,6 +253,7 @@ def parse_ldsc_summary(ldsc_summary):
240
253
  row["Intercept_se"]="NA"
241
254
  row["Ratio"]="NA"
242
255
  row["Ratio_se"]="NA"
256
+ row["Catagories"] = "NA"
243
257
 
244
258
  #summary = summary.append(row,ignore_index=True)
245
259
  row = pd.DataFrame([row], columns = summary.columns)
@@ -342,7 +342,7 @@ def tofmt(sumstats,
342
342
  meta_data,rename_dictionary = get_format_dict(fmt,inverse=True)
343
343
  print_format_info(fmt=fmt, meta_data=meta_data,rename_dictionary=rename_dictionary,verbose=verbose, log=log, output=True)
344
344
 
345
- ymal_path = path + "."+suffix+".tsv-meta.ymal"
345
+ yaml_path = path + "."+suffix+".tsv-meta.yaml"
346
346
  path = path + "."+suffix+".tsv.gz"
347
347
  log.write(" -Output path:",path, verbose=verbose)
348
348
 
@@ -361,7 +361,7 @@ def tofmt(sumstats,
361
361
  md5_value = calculate_md5sum_file(path)
362
362
 
363
363
  ## update ssf-style meta data and export to yaml file
364
- _configure_ssf_meta(sumstats, fmt, ssfmeta, meta, meta_data, path, md5_value, ymal_path, log, verbose)
364
+ _configure_ssf_meta(sumstats, fmt, ssfmeta, meta, meta_data, path, md5_value, yaml_path, log, verbose)
365
365
 
366
366
  return sumstats
367
367
 
@@ -476,7 +476,7 @@ def _configure_output_cols_and_args(sumstats, rename_dictionary, cols, no_status
476
476
  return sumstats, to_csvargs
477
477
 
478
478
 
479
- def _configure_ssf_meta(sumstats, fmt, ssfmeta, meta, meta_data, path, md5_value, ymal_path, log, verbose):
479
+ def _configure_ssf_meta(sumstats, fmt, ssfmeta, meta, meta_data, path, md5_value, yaml_path, log, verbose):
480
480
  ### calculate meta data
481
481
  if "EAF" in sumstats.columns:
482
482
  min_maf = sumstats["EAF"].min()
@@ -506,8 +506,8 @@ def _configure_ssf_meta(sumstats, fmt, ssfmeta, meta, meta_data, path, md5_value
506
506
  sumstats_meta_copy["gwaslab"]["samples"]["sample_size_min"] = n_min
507
507
  sumstats_meta_copy["gwaslab"]["samples"]["sample_size_median"] = n_median
508
508
  sumstats_meta_copy["gwaslab"]["variants"]["variant_number"] = len(sumstats)
509
- log.write(" -Exporting SSF-style meta data to {}".format(ymal_path),verbose=verbose)
510
- with open(ymal_path, 'w') as outfile:
509
+ log.write(" -Exporting SSF-style meta data to {}".format(yaml_path),verbose=verbose)
510
+ with open(yaml_path, 'w') as outfile:
511
511
  yaml.dump(sumstats_meta_copy, outfile)
512
512
 
513
513
 
@@ -5,6 +5,7 @@ import numpy as np
5
5
  from itertools import repeat
6
6
  from multiprocessing import Pool
7
7
  from liftover import get_lifter
8
+ from liftover import ChainFile
8
9
  from functools import partial
9
10
  from gwaslab.g_vchange_status import vchange_status
10
11
  from gwaslab.g_vchange_status import status_match
@@ -19,6 +20,7 @@ from gwaslab.g_version import _get_version
19
20
  from gwaslab.util_in_fill_data import _convert_betase_to_mlog10p
20
21
  from gwaslab.util_in_fill_data import _convert_betase_to_p
21
22
  from gwaslab.util_in_fill_data import _convert_mlog10p_to_p
23
+ from gwaslab.bd_common_data import get_chain
22
24
  #process build
23
25
  #setbuild
24
26
  #fixID
@@ -43,9 +45,15 @@ def _process_build(build,log,verbose):
43
45
  if str(build).lower() in ["hg19","19","37","b37","grch37"]:
44
46
  log.write(" -Genomic coordinates are based on GRCh37/hg19...", verbose=verbose)
45
47
  final_build = "19"
48
+ elif str(build).lower() in ["hg18","18","36","b36","grch36"]:
49
+ log.write(" -Genomic coordinates are based on GRCh36/hg18...", verbose=verbose)
50
+ final_build = "18"
46
51
  elif str(build).lower() in ["hg38","38","b38","grch38"]:
47
52
  log.write(" -Genomic coordinates are based on GRCh38/hg38...", verbose=verbose)
48
53
  final_build = "38"
54
+ elif str(build).lower() in ["t2t","hs1","chm13","13"]:
55
+ log.write(" -Genomic coordinates are based on T2T-CHM13...", verbose=verbose)
56
+ final_build = "13"
49
57
  else:
50
58
  log.warning("Version of genomic coordinates is unknown...", verbose=verbose)
51
59
  final_build = "99"
@@ -358,6 +366,76 @@ def fixID(sumstats,
358
366
 
359
367
  ""
360
368
 
369
+ def stripSNPID(sumstats,snpid="SNPID",overwrite=False,verbose=True,log=Log()):
370
+ '''
371
+ flip EA and NEA SNPid CHR:POS:EA:NEA -> CHR:POS:NEA:EA
372
+ '''
373
+ ##start function with col checking##########################################################
374
+ _start_line = "strip SNPID"
375
+ _end_line = "stripping SNPID"
376
+ _start_cols =["SNPID"]
377
+ _start_function = ".strip_snpid()"
378
+ _must_args ={}
379
+
380
+ is_enough_info = start_to(sumstats=sumstats,
381
+ log=log,
382
+ verbose=verbose,
383
+ start_line=_start_line,
384
+ end_line=_end_line,
385
+ start_cols=_start_cols,
386
+ start_function=_start_function,
387
+ **_must_args)
388
+ if is_enough_info == False: return sumstats
389
+ log.write(" -Checking if SNPID is (xxx:)CHR:POS:ATCG_Allele:ATCG_Allele(:xxx)...(separator: - ,: , _)",verbose=verbose)
390
+ is_chrposrefalt = sumstats[snpid].str.contains(r'[:_-]?\w+[:_-]\d+[:_-][ATCG]+[:_-][ATCG]+[:_-]?', case=False, flags=0, na=False)
391
+ # check if SNPID is NA
392
+ is_snpid_na = sumstats[snpid].isna()
393
+
394
+ log.write(" -Stripping {} non-NA fixable SNPIDs...".format(sum(is_chrposrefalt)),verbose=verbose)
395
+
396
+ # flip
397
+ sumstats.loc[is_chrposrefalt,snpid] = \
398
+ sumstats.loc[is_chrposrefalt,snpid].str.extract(r'[:_-]?(chr)?(\w+[:_-]\d+[:_-][ATCG]+[:_-][ATCG]+)[:_-]?',flags=re.IGNORECASE|re.ASCII)[1].astype("string")
399
+
400
+ finished(log,verbose,_end_line)
401
+ return sumstats
402
+
403
+ def flipSNPID(sumstats,snpid="SNPID",overwrite=False,verbose=True,log=Log()):
404
+ '''
405
+ flip EA and NEA SNPid CHR:POS:EA:NEA -> CHR:POS:NEA:EA
406
+ '''
407
+ ##start function with col checking##########################################################
408
+ _start_line = "flip SNPID"
409
+ _end_line = "flipping SNPID"
410
+ _start_cols =["SNPID"]
411
+ _start_function = ".flip_snpid()"
412
+ _must_args ={}
413
+
414
+ is_enough_info = start_to(sumstats=sumstats,
415
+ log=log,
416
+ verbose=verbose,
417
+ start_line=_start_line,
418
+ end_line=_end_line,
419
+ start_cols=_start_cols,
420
+ start_function=_start_function,
421
+ **_must_args)
422
+ if is_enough_info == False: return sumstats
423
+ log.warning("This function only flips alleles in SNPID without changing EA, NEA, STATUS or any statistics.")
424
+ log.write(" -Checking if SNPID is CHR:POS:ATCG_Allele:ATCG_Allele...(separator: - ,: , _)",verbose=verbose)
425
+ is_chrposrefalt = sumstats[snpid].str.match(r'^\w+[:_-]\d+[:_-][ATCG]+[:_-][ATCG]+$', case=False, flags=0, na=False)
426
+ # check if SNPID is NA
427
+ is_snpid_na = sumstats[snpid].isna()
428
+
429
+ log.write(" -Flipping {} non-NA fixable SNPIDs...".format(sum(is_chrposrefalt)),verbose=verbose)
430
+
431
+ # flip
432
+ sumstats.loc[is_chrposrefalt,snpid] = \
433
+ sumstats.loc[is_chrposrefalt,snpid].str.extract(r'^(chr)?(\w+[:_-]\d+)[:_-]([ATCG]+)[:_-]([ATCG]+)$',flags=re.IGNORECASE|re.ASCII)[1].astype("string") \
434
+ + ":"+sumstats.loc[is_chrposrefalt,snpid].str.extract(r'^(chr)?(\w+)[:_-](\d+)[:_-]([ATCG]+)[:_-]([ATCG]+)$',flags=re.IGNORECASE|re.ASCII)[4].astype("string") \
435
+ + ":"+sumstats.loc[is_chrposrefalt,snpid].str.extract(r'^(chr)?(\w+)[:_-](\d+)[:_-]([ATCG]+)[:_-]([ATCG]+)$',flags=re.IGNORECASE|re.ASCII)[3].astype("string")
436
+
437
+ finished(log,verbose,_end_line)
438
+ return sumstats
361
439
 
362
440
  ###############################################################################################################
363
441
  # 20230128
@@ -1041,7 +1119,7 @@ def check_range(sumstats, var_range, header, coltocheck, cols_to_check, log, ver
1041
1119
  cols_to_check.append(header)
1042
1120
  if header=="STATUS":
1043
1121
  log.write(" -Checking STATUS and converting STATUS to categories....", verbose=verbose)
1044
- categories = {str(j+i) for j in [1900000,3800000,9700000,9800000,9900000] for i in range(0,100000)}
1122
+ categories = {str(j+i) for j in [1300000,1800000,1900000,3800000,9700000,9800000,9900000] for i in range(0,100000)}
1045
1123
  sumstats[header] = pd.Categorical(sumstats[header],categories=categories)
1046
1124
  return sumstats
1047
1125
 
@@ -1496,8 +1574,20 @@ def liftover_variant(sumstats,
1496
1574
  pos="POS",
1497
1575
  status="STATUS",
1498
1576
  from_build="19",
1499
- to_build="38"):
1500
- converter = get_lifter("hg"+from_build,"hg"+to_build)
1577
+ to_build="38",
1578
+ chain=None):
1579
+
1580
+ try:
1581
+ if chain is None:
1582
+ converter = get_lifter(from_build,to_build,one_based=True)
1583
+ else:
1584
+ converter = ChainFile(chain, one_based=True)
1585
+ except:
1586
+ if chain is None:
1587
+ converter = get_lifter(from_build,to_build)
1588
+ else:
1589
+ converter = ChainFile(chain)
1590
+
1501
1591
  dic= get_number_to_chr(in_chr=False,xymt=["X","Y","M"])
1502
1592
  dic2= get_chr_to_number(out_chr=False)
1503
1593
  for i in sumstats[chrom].unique():
@@ -1509,7 +1599,7 @@ def liftover_variant(sumstats,
1509
1599
  sumstats.loc[variants_on_chrom_to_convert,chrom] = lifted.str[0].map(dic2).astype("Int64")
1510
1600
  return sumstats
1511
1601
 
1512
- def parallelizeliftovervariant(sumstats,n_cores=1,chrom="CHR", pos="POS", from_build="19", to_build="38",status="STATUS",remove=True, verbose=True,log=Log()):
1602
+ def parallelizeliftovervariant(sumstats,n_cores=1,chrom="CHR", pos="POS", from_build="19", to_build="38",status="STATUS",remove=True,chain=None, verbose=True,log=Log()):
1513
1603
  ##start function with col checking##########################################################
1514
1604
  _start_line = "perform liftover"
1515
1605
  _end_line = "liftover"
@@ -1528,8 +1618,21 @@ def parallelizeliftovervariant(sumstats,n_cores=1,chrom="CHR", pos="POS", from_b
1528
1618
  **_must_args)
1529
1619
  if is_enough_info == False: return sumstats
1530
1620
  ############################################################################################
1621
+
1622
+ lifter_from_build = _process_build(from_build,log=log,verbose=False)
1623
+ lifter_to_build = _process_build(to_build,log=log,verbose=False)
1531
1624
 
1532
- log.write(" -Creating converter : hg" + from_build +" to hg"+ to_build, verbose=verbose)
1625
+ if chain is not None:
1626
+ log.write(" -Creating converter using ChainFile: {}".format(chain), verbose=verbose)
1627
+ else:
1628
+ try:
1629
+ chain = get_chain(from_build=from_build, to_build=to_build)
1630
+ log.write(" -Creating converter using ChainFile: {}".format(chain), verbose=verbose)
1631
+ except:
1632
+ chain = None
1633
+ lifter_from_build=from_build
1634
+ lifter_to_build=to_build
1635
+ log.write(" -Creating converter : {} -> {}".format(lifter_from_build, lifter_to_build), verbose=verbose)
1533
1636
  # valid chr and pos
1534
1637
  pattern = r"\w\w\w0\w\w\w"
1535
1638
  to_lift = sumstats[status].str.match(pattern)
@@ -1545,11 +1648,10 @@ def parallelizeliftovervariant(sumstats,n_cores=1,chrom="CHR", pos="POS", from_b
1545
1648
  pool = Pool(n_cores)
1546
1649
  #df = pd.concat(pool.starmap(func, df_split))
1547
1650
  func=liftover_variant
1548
- sumstats[[chrom,pos,status]] = pd.concat(pool.map(partial(func,chrom=chrom,pos=pos,from_build=from_build,to_build=to_build,status=status),df_split))
1651
+ sumstats[[chrom,pos,status]] = pd.concat(pool.map(partial(func,chrom=chrom,pos=pos,from_build=from_build,to_build=to_build,status=status,chain=chain),df_split))
1549
1652
  pool.close()
1550
1653
  pool.join()
1551
1654
  ############################################################################
1552
-
1553
1655
  unmap_num = len(sumstats.loc[sumstats[pos].isna(),:])
1554
1656
 
1555
1657
  if remove is True: