gwaslab 3.4.44__tar.gz → 3.4.46__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of gwaslab might be problematic. More details are linked from the original diff page.

Files changed (89):
  1. {gwaslab-3.4.44/src/gwaslab.egg-info → gwaslab-3.4.46}/PKG-INFO +7 -7
  2. {gwaslab-3.4.44 → gwaslab-3.4.46}/README.md +2 -2
  3. {gwaslab-3.4.44 → gwaslab-3.4.46}/pyproject.toml +5 -5
  4. {gwaslab-3.4.44 → gwaslab-3.4.46}/src/gwaslab/__init__.py +2 -1
  5. {gwaslab-3.4.44 → gwaslab-3.4.46}/src/gwaslab/bd_common_data.py +22 -0
  6. {gwaslab-3.4.44 → gwaslab-3.4.46}/src/gwaslab/g_Sumstats.py +2 -0
  7. {gwaslab-3.4.44 → gwaslab-3.4.46}/src/gwaslab/g_SumstatsPair.py +1 -1
  8. {gwaslab-3.4.44 → gwaslab-3.4.46}/src/gwaslab/g_vchange_status.py +4 -2
  9. {gwaslab-3.4.44 → gwaslab-3.4.46}/src/gwaslab/g_version.py +2 -2
  10. {gwaslab-3.4.44 → gwaslab-3.4.46}/src/gwaslab/hm_harmonize_sumstats.py +14 -6
  11. {gwaslab-3.4.44 → gwaslab-3.4.46}/src/gwaslab/io_preformat_input.py +22 -1
  12. {gwaslab-3.4.44 → gwaslab-3.4.46}/src/gwaslab/qc_fix_sumstats.py +8 -1
  13. {gwaslab-3.4.44 → gwaslab-3.4.46}/src/gwaslab/util_in_filter_value.py +38 -2
  14. {gwaslab-3.4.44 → gwaslab-3.4.46}/src/gwaslab/util_in_get_sig.py +32 -8
  15. gwaslab-3.4.46/src/gwaslab/util_in_meta.py +234 -0
  16. gwaslab-3.4.46/src/gwaslab/util_in_snphwe.py +58 -0
  17. gwaslab-3.4.46/src/gwaslab/viz_aux_chromatin.py +111 -0
  18. {gwaslab-3.4.44 → gwaslab-3.4.46}/src/gwaslab/viz_plot_compare_effect.py +4 -1
  19. {gwaslab-3.4.44 → gwaslab-3.4.46}/src/gwaslab/viz_plot_mqqplot.py +2 -0
  20. {gwaslab-3.4.44 → gwaslab-3.4.46}/src/gwaslab/viz_plot_regionalplot.py +4 -0
  21. {gwaslab-3.4.44 → gwaslab-3.4.46}/src/gwaslab/viz_plot_stackedregional.py +69 -13
  22. {gwaslab-3.4.44 → gwaslab-3.4.46/src/gwaslab.egg-info}/PKG-INFO +7 -7
  23. {gwaslab-3.4.44 → gwaslab-3.4.46}/src/gwaslab.egg-info/SOURCES.txt +3 -0
  24. {gwaslab-3.4.44 → gwaslab-3.4.46}/src/gwaslab.egg-info/requires.txt +3 -3
  25. {gwaslab-3.4.44 → gwaslab-3.4.46}/LICENSE +0 -0
  26. {gwaslab-3.4.44 → gwaslab-3.4.46}/LICENSE_before_v3.4.39 +0 -0
  27. {gwaslab-3.4.44 → gwaslab-3.4.46}/setup.cfg +0 -0
  28. {gwaslab-3.4.44 → gwaslab-3.4.46}/src/gwaslab/bd_config.py +0 -0
  29. {gwaslab-3.4.44 → gwaslab-3.4.46}/src/gwaslab/bd_download.py +0 -0
  30. {gwaslab-3.4.44 → gwaslab-3.4.46}/src/gwaslab/bd_get_hapmap3.py +0 -0
  31. {gwaslab-3.4.44 → gwaslab-3.4.46}/src/gwaslab/cache_manager.py +0 -0
  32. {gwaslab-3.4.44 → gwaslab-3.4.46}/src/gwaslab/data/chrx_par/chrx_par_hg19.bed.gz +0 -0
  33. {gwaslab-3.4.44 → gwaslab-3.4.46}/src/gwaslab/data/chrx_par/chrx_par_hg38.bed.gz +0 -0
  34. {gwaslab-3.4.44 → gwaslab-3.4.46}/src/gwaslab/data/formatbook.json +0 -0
  35. {gwaslab-3.4.44 → gwaslab-3.4.46}/src/gwaslab/data/hapmap3_SNPs/hapmap3_db150_hg19.snplist.gz +0 -0
  36. {gwaslab-3.4.44 → gwaslab-3.4.46}/src/gwaslab/data/hapmap3_SNPs/hapmap3_db151_hg38.snplist.gz +0 -0
  37. {gwaslab-3.4.44 → gwaslab-3.4.46}/src/gwaslab/data/high_ld/high_ld_hla_hg19.bed.gz +0 -0
  38. {gwaslab-3.4.44 → gwaslab-3.4.46}/src/gwaslab/data/high_ld/high_ld_hla_hg38.bed.gz +0 -0
  39. {gwaslab-3.4.44 → gwaslab-3.4.46}/src/gwaslab/data/reference.json +0 -0
  40. {gwaslab-3.4.44 → gwaslab-3.4.46}/src/gwaslab/g_Log.py +0 -0
  41. {gwaslab-3.4.44 → gwaslab-3.4.46}/src/gwaslab/g_Phenotypes.py +0 -0
  42. {gwaslab-3.4.44 → gwaslab-3.4.46}/src/gwaslab/g_SumstatsT.py +0 -0
  43. {gwaslab-3.4.44 → gwaslab-3.4.46}/src/gwaslab/g_Sumstats_summary.py +0 -0
  44. {gwaslab-3.4.44 → gwaslab-3.4.46}/src/gwaslab/g_meta.py +0 -0
  45. {gwaslab-3.4.44 → gwaslab-3.4.46}/src/gwaslab/hm_casting.py +0 -0
  46. {gwaslab-3.4.44 → gwaslab-3.4.46}/src/gwaslab/hm_rsid_to_chrpos.py +0 -0
  47. {gwaslab-3.4.44 → gwaslab-3.4.46}/src/gwaslab/io_read_ldsc.py +0 -0
  48. {gwaslab-3.4.44 → gwaslab-3.4.46}/src/gwaslab/io_read_tabular.py +0 -0
  49. {gwaslab-3.4.44 → gwaslab-3.4.46}/src/gwaslab/io_to_formats.py +0 -0
  50. {gwaslab-3.4.44 → gwaslab-3.4.46}/src/gwaslab/io_to_pickle.py +0 -0
  51. {gwaslab-3.4.44 → gwaslab-3.4.46}/src/gwaslab/ldsc_irwls.py +0 -0
  52. {gwaslab-3.4.44 → gwaslab-3.4.46}/src/gwaslab/ldsc_jackknife.py +0 -0
  53. {gwaslab-3.4.44 → gwaslab-3.4.46}/src/gwaslab/ldsc_ldscore.py +0 -0
  54. {gwaslab-3.4.44 → gwaslab-3.4.46}/src/gwaslab/ldsc_parse.py +0 -0
  55. {gwaslab-3.4.44 → gwaslab-3.4.46}/src/gwaslab/ldsc_regressions.py +0 -0
  56. {gwaslab-3.4.44 → gwaslab-3.4.46}/src/gwaslab/ldsc_sumstats.py +0 -0
  57. {gwaslab-3.4.44 → gwaslab-3.4.46}/src/gwaslab/qc_check_datatype.py +0 -0
  58. {gwaslab-3.4.44 → gwaslab-3.4.46}/src/gwaslab/run_script.py +0 -0
  59. {gwaslab-3.4.44 → gwaslab-3.4.46}/src/gwaslab/util_ex_calculate_ldmatrix.py +0 -0
  60. {gwaslab-3.4.44 → gwaslab-3.4.46}/src/gwaslab/util_ex_calculate_prs.py +0 -0
  61. {gwaslab-3.4.44 → gwaslab-3.4.46}/src/gwaslab/util_ex_gwascatalog.py +0 -0
  62. {gwaslab-3.4.44 → gwaslab-3.4.46}/src/gwaslab/util_ex_ldproxyfinder.py +0 -0
  63. {gwaslab-3.4.44 → gwaslab-3.4.46}/src/gwaslab/util_ex_ldsc.py +0 -0
  64. {gwaslab-3.4.44 → gwaslab-3.4.46}/src/gwaslab/util_ex_plink_filter.py +0 -0
  65. {gwaslab-3.4.44 → gwaslab-3.4.46}/src/gwaslab/util_ex_process_h5.py +0 -0
  66. {gwaslab-3.4.44 → gwaslab-3.4.46}/src/gwaslab/util_ex_process_ref.py +0 -0
  67. {gwaslab-3.4.44 → gwaslab-3.4.46}/src/gwaslab/util_ex_run_2samplemr.py +0 -0
  68. {gwaslab-3.4.44 → gwaslab-3.4.46}/src/gwaslab/util_ex_run_clumping.py +0 -0
  69. {gwaslab-3.4.44 → gwaslab-3.4.46}/src/gwaslab/util_ex_run_coloc.py +0 -0
  70. {gwaslab-3.4.44 → gwaslab-3.4.46}/src/gwaslab/util_ex_run_susie.py +0 -0
  71. {gwaslab-3.4.44 → gwaslab-3.4.46}/src/gwaslab/util_in_calculate_gc.py +0 -0
  72. {gwaslab-3.4.44 → gwaslab-3.4.46}/src/gwaslab/util_in_calculate_power.py +0 -0
  73. {gwaslab-3.4.44 → gwaslab-3.4.46}/src/gwaslab/util_in_convert_h2.py +0 -0
  74. {gwaslab-3.4.44 → gwaslab-3.4.46}/src/gwaslab/util_in_correct_winnerscurse.py +0 -0
  75. {gwaslab-3.4.44 → gwaslab-3.4.46}/src/gwaslab/util_in_fill_data.py +0 -0
  76. {gwaslab-3.4.44 → gwaslab-3.4.46}/src/gwaslab/util_in_get_density.py +0 -0
  77. {gwaslab-3.4.44 → gwaslab-3.4.46}/src/gwaslab/viz_aux_annotate_plot.py +0 -0
  78. {gwaslab-3.4.44 → gwaslab-3.4.46}/src/gwaslab/viz_aux_quickfix.py +0 -0
  79. {gwaslab-3.4.44 → gwaslab-3.4.46}/src/gwaslab/viz_aux_reposition_text.py +0 -0
  80. {gwaslab-3.4.44 → gwaslab-3.4.46}/src/gwaslab/viz_aux_save_figure.py +0 -0
  81. {gwaslab-3.4.44 → gwaslab-3.4.46}/src/gwaslab/viz_plot_compare_af.py +0 -0
  82. {gwaslab-3.4.44 → gwaslab-3.4.46}/src/gwaslab/viz_plot_forestplot.py +0 -0
  83. {gwaslab-3.4.44 → gwaslab-3.4.46}/src/gwaslab/viz_plot_miamiplot.py +0 -0
  84. {gwaslab-3.4.44 → gwaslab-3.4.46}/src/gwaslab/viz_plot_miamiplot2.py +0 -0
  85. {gwaslab-3.4.44 → gwaslab-3.4.46}/src/gwaslab/viz_plot_qqplot.py +0 -0
  86. {gwaslab-3.4.44 → gwaslab-3.4.46}/src/gwaslab/viz_plot_rg_heatmap.py +0 -0
  87. {gwaslab-3.4.44 → gwaslab-3.4.46}/src/gwaslab/viz_plot_trumpetplot.py +0 -0
  88. {gwaslab-3.4.44 → gwaslab-3.4.46}/src/gwaslab.egg-info/dependency_links.txt +0 -0
  89. {gwaslab-3.4.44 → gwaslab-3.4.46}/src/gwaslab.egg-info/top_level.txt +0 -0
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.1
2
2
  Name: gwaslab
3
- Version: 3.4.44
3
+ Version: 3.4.46
4
4
  Summary: A collection of handy tools for GWAS SumStats
5
5
  Author-email: Yunye <yunye@gwaslab.com>
6
6
  Project-URL: Homepage, https://cloufield.github.io/gwaslab/
@@ -8,16 +8,16 @@ Project-URL: Github, https://github.com/Cloufield/gwaslab
8
8
  Classifier: Programming Language :: Python :: 3
9
9
  Classifier: License :: OSI Approved :: MIT License
10
10
  Classifier: Operating System :: OS Independent
11
- Requires-Python: <=3.10,>=3.9
11
+ Requires-Python: <3.11,>=3.9
12
12
  Description-Content-Type: text/markdown
13
13
  License-File: LICENSE
14
14
  License-File: LICENSE_before_v3.4.39
15
15
  Requires-Dist: pandas!=1.5,>=1.3
16
- Requires-Dist: numpy>=1.21.2
17
- Requires-Dist: matplotlib!=3.7.2,>=3.5
16
+ Requires-Dist: numpy<2,>=1.21.2
17
+ Requires-Dist: matplotlib!=3.7.2,<3.9,>=3.5
18
18
  Requires-Dist: seaborn>=0.12
19
19
  Requires-Dist: scipy>=1.12
20
- Requires-Dist: pySAM<0.20,>=0.18.1
20
+ Requires-Dist: pySAM==0.22.1
21
21
  Requires-Dist: Biopython>=1.79
22
22
  Requires-Dist: adjustText<=0.8,>=0.7.3
23
23
  Requires-Dist: liftover>=1.1.13
@@ -51,7 +51,7 @@ Warning: Known issues of GWASLab are summarized in [https://cloufield.github.io/
51
51
  ### install via pip
52
52
 
53
53
  ```
54
- pip install gwaslab==3.4.43
54
+ pip install gwaslab==3.4.45
55
55
  ```
56
56
 
57
57
  ```python
@@ -90,7 +90,7 @@ Create a Python 3.9 environment and install gwaslab using pip:
90
90
  ```
91
91
  conda env create -n gwaslab_test -c conda-forge python=3.9
92
92
  conda activate gwaslab
93
- pip install gwaslab==3.4.43
93
+ pip install gwaslab==3.4.45
94
94
  ```
95
95
 
96
96
  or create a new environment using yml file [environment_3.4.40.yml](https://github.com/Cloufield/gwaslab/blob/main/environment_3.4.40.yml)
@@ -23,7 +23,7 @@ Warning: Known issues of GWASLab are summarized in [https://cloufield.github.io/
23
23
  ### install via pip
24
24
 
25
25
  ```
26
- pip install gwaslab==3.4.43
26
+ pip install gwaslab==3.4.45
27
27
  ```
28
28
 
29
29
  ```python
@@ -62,7 +62,7 @@ Create a Python 3.9 environment and install gwaslab using pip:
62
62
  ```
63
63
  conda env create -n gwaslab_test -c conda-forge python=3.9
64
64
  conda activate gwaslab
65
- pip install gwaslab==3.4.43
65
+ pip install gwaslab==3.4.45
66
66
  ```
67
67
 
68
68
  or create a new environment using yml file [environment_3.4.40.yml](https://github.com/Cloufield/gwaslab/blob/main/environment_3.4.40.yml)
@@ -7,7 +7,7 @@ build-backend = "setuptools.build_meta"
7
7
 
8
8
  [project]
9
9
  name = "gwaslab"
10
- version = "3.4.44"
10
+ version = "3.4.46"
11
11
  authors = [
12
12
  { name="Yunye", email="yunye@gwaslab.com" },
13
13
  ]
@@ -17,11 +17,11 @@ readme = "README.md"
17
17
 
18
18
  dependencies = [
19
19
  "pandas>=1.3,!=1.5",
20
- "numpy>=1.21.2",
21
- "matplotlib>=3.5,!=3.7.2",
20
+ "numpy>=1.21.2,<2",
21
+ "matplotlib>=3.5,!=3.7.2,<3.9",
22
22
  "seaborn>=0.12",
23
23
  "scipy>=1.12",
24
- "pySAM>=0.18.1,<0.20",
24
+ "pySAM==0.22.1",
25
25
  "Biopython>=1.79",
26
26
  "adjustText>=0.7.3, <=0.8",
27
27
  "liftover>=1.1.13",
@@ -31,7 +31,7 @@ dependencies = [
31
31
  "h5py>=3.10.0"
32
32
  ]
33
33
 
34
- requires-python = ">=3.9,<=3.10"
34
+ requires-python = ">=3.9,<3.11"
35
35
  classifiers = [
36
36
  "Programming Language :: Python :: 3",
37
37
  "License :: OSI Approved :: MIT License",
@@ -44,4 +44,5 @@ from gwaslab.viz_plot_trumpetplot import plot_power
44
44
  from gwaslab.viz_plot_trumpetplot import plot_power_x
45
45
  from gwaslab.util_ex_process_h5 import process_vcf_to_hfd5
46
46
  from gwaslab.util_ex_run_susie import _run_susie_rss as run_susie_rss
47
- from gwaslab.io_read_tabular import _read_tabular as read_tabular
47
+ from gwaslab.io_read_tabular import _read_tabular as read_tabular
48
+ from gwaslab.util_in_meta import meta_analyze
@@ -298,6 +298,28 @@ def gtf_to_protein_coding(gtfpath,log=Log(),verbose=True):
298
298
 
299
299
  return protein_coding_path
300
300
 
301
+ def gtf_to_all_gene(gtfpath,log=Log(),verbose=True):
302
+ all_gene_path = gtfpath[:-6]+"all_genes.gtf.gz"
303
+ # if not existing, extract protein coding records and output to a new file
304
+ if not path.isfile(all_gene_path):
305
+
306
+ # get gene list
307
+ log.write(" - Extracting genes from {}".format(gtfpath),verbose=verbose)
308
+ gtf = read_gtf(gtfpath,usecols=["feature","gene_biotype","gene_id","gene_name"])
309
+ gene_list = gtf.loc[gtf["feature"]=="gene","gene_id"].values
310
+ log.write(" - Loaded {} genes.".format(len(gene_list)),verbose=verbose)
311
+
312
+ # extract entry using csv
313
+ gtf_raw = pd.read_csv(gtfpath,sep="\t",header=None,comment="#",dtype="string")
314
+ gtf_raw["_gene_id"] = gtf_raw[8].str.extract(r'gene_id "([\w\.-]+)"')
315
+ gtf_raw = gtf_raw.loc[ gtf_raw["_gene_id"].isin(gene_list) ,:]
316
+ gtf_raw = gtf_raw.drop("_gene_id",axis=1)
317
+
318
+ log.write(" - Extracted records are saved to : {} ".format(all_gene_path),verbose=verbose)
319
+ gtf_raw.to_csv(all_gene_path, header=None, index=None, sep="\t")
320
+
321
+ return all_gene_path
322
+
301
323
  ####################################################################################################################
302
324
  # From BioPython: https://github.com/biopython/biopython/blob/c5a6b1374267d769b19c1022b4b45472316e78b4/Bio/Seq.py#L36
303
325
  def _maketrans(complement_mapping):
@@ -121,6 +121,7 @@ class Sumstats():
121
121
  snpr2=None,
122
122
  status=None,
123
123
  other=[],
124
+ usekeys=None,
124
125
  direction=None,
125
126
  verbose=True,
126
127
  study="Study_1",
@@ -200,6 +201,7 @@ class Sumstats():
200
201
  trait=trait,
201
202
  status=status,
202
203
  other=other,
204
+ usekeys=usekeys,
203
205
  verbose=verbose,
204
206
  readargs=readargs,
205
207
  log=self.log)
@@ -139,7 +139,7 @@ class SumstatsPair( ):
139
139
  self.clumps["clumps"], self.clumps["plink_log"] = _clump(self.data, log=self.log, p="P_1",mlog10p="MLOG10P_1", study = self.study_name, **kwargs)
140
140
 
141
141
  def to_coloc(self,**kwargs):
142
- self.to_finemapping_file_path, self.plink_log = tofinemapping(self.data,study=self.study_name,suffixes=self.suffixes,log=self.log,**kwargs)
142
+ self.to_finemapping_file_path, output_file_list, self.plink_log = tofinemapping(self.data,study=self.study_name,suffixes=self.suffixes,log=self.log,**kwargs)
143
143
 
144
144
  def run_coloc_susie(self,**kwargs):
145
145
 
@@ -1,13 +1,15 @@
1
1
  import pandas as pd
2
2
 
3
+ CATEGORIES = {str(j+i) for j in [1900000,3800000,9700000,9800000,9900000] for i in range(0,100000)}
4
+
3
5
  def vchange_status(status,digit,before,after):
4
6
  dic={}
5
7
  for i in range(len(before)):
6
8
  dic[before[i]]=after[i]
7
9
  if digit>1:
8
- return status.str[:digit-1]+status.str[digit-1].replace(dic)+status.str[digit:]
10
+ return pd.Categorical(status.str[:digit-1]+status.str[digit-1].replace(dic)+status.str[digit:],categories=CATEGORIES)
9
11
  else:
10
- return status.str[digit-1].replace(dic)+status.str[digit:]
12
+ return pd.Categorical(status.str[digit-1].replace(dic)+status.str[digit:],categories=CATEGORIES)
11
13
 
12
14
  def copy_status(from_status,to_status, digit):
13
15
  if digit>1:
@@ -15,8 +15,8 @@ def _get_version():
15
15
  def gwaslab_info():
16
16
  # version meta information
17
17
  dic={
18
- "version":"3.4.44",
19
- "release_date":"20240424"
18
+ "version":"3.4.46",
19
+ "release_date":"20240624"
20
20
  }
21
21
  return dic
22
22
 
@@ -355,7 +355,11 @@ def oldcheckref(sumstats,ref_seq,chrom="CHR",pos="POS",ea="EA",nea="NEA",status=
355
355
 
356
356
  log.write("\n",end="",show_time=False,verbose=verbose)
357
357
 
358
- sumstats[status] = sumstats[status].astype("string")
358
+ CATEGORIES = {str(j+i) for j in [1900000,3800000,9700000,9800000,9900000] for i in range(0,100000)}
359
+ sumstats[status] = pd.Categorical(sumstats[status],categories=CATEGORIES)
360
+ #sumstats[status] = sumstats[status].astype("string")
361
+
362
+
359
363
  available_to_check =sum( (~sumstats[pos].isna()) & (~sumstats[nea].isna()) & (~sumstats[ea].isna()))
360
364
  status_0=sum(sumstats["STATUS"].str.match("\w\w\w\w\w[0]\w", case=False, flags=0, na=False))
361
365
  status_3=sum(sumstats["STATUS"].str.match("\w\w\w\w\w[3]\w", case=False, flags=0, na=False))
@@ -669,9 +673,11 @@ def checkref(sumstats,ref_seq,chrom="CHR",pos="POS",ea="EA",nea="NEA",status="ST
669
673
  sumstats_to_check = sumstats.loc[to_check_ref,[chrom,pos,ea,nea,status]]
670
674
  sumstats.loc[to_check_ref,status] = check_status(sumstats_to_check, all_records_dict, log=log, verbose=verbose)
671
675
  log.write(" -Finished checking records", verbose=verbose)
672
-
673
- sumstats[status] = sumstats[status].astype("string")
674
-
676
+
677
+ CATEGORIES = {str(j+i) for j in [1900000,3800000,9700000,9800000,9900000] for i in range(0,100000)}
678
+ sumstats[status] = pd.Categorical(sumstats[status],categories=CATEGORIES)
679
+ #sumstats[status] = sumstats[status].astype("string")
680
+
675
681
  available_to_check =sum( (~sumstats[pos].isna()) & (~sumstats[nea].isna()) & (~sumstats[ea].isna()))
676
682
  status_0=sum(sumstats["STATUS"].str.match("\w\w\w\w\w[0]\w", case=False, flags=0, na=False))
677
683
  status_3=sum(sumstats["STATUS"].str.match("\w\w\w\w\w[3]\w", case=False, flags=0, na=False))
@@ -700,6 +706,7 @@ def checkref(sumstats,ref_seq,chrom="CHR",pos="POS",ea="EA",nea="NEA",status="ST
700
706
  if remove is True:
701
707
  sumstats = sumstats.loc[~sumstats["STATUS"].str.match("\w\w\w\w\w[8]\w"),:]
702
708
  log.write(" -Variants not on given reference sequence were removed.",verbose=verbose)
709
+
703
710
 
704
711
  finished(log, verbose, _end_line)
705
712
  return sumstats
@@ -861,8 +868,9 @@ def parallelizeassignrsid(sumstats, path, ref_mode="vcf",snpid="SNPID",rsid="rsI
861
868
  if is_enough_info == False: return sumstats
862
869
  ############################################################################################
863
870
 
864
- standardized_normalized = sumstats["STATUS"].str.match("\w\w\w[0][01234]\w\w", case=False, flags=0, na=False)
865
-
871
+ #standardized_normalized = sumstats["STATUS"].str.match("\w\w\w[0][01234]\w\w", case=False, flags=0, na=False)
872
+ standardized_normalized = sumstats["STATUS"] == sumstats["STATUS"]
873
+
866
874
  if rsid not in sumstats.columns:
867
875
  sumstats[rsid]=pd.Series(dtype="string")
868
876
 
@@ -55,6 +55,7 @@ def preformat(sumstats,
55
55
  trait=None,
56
56
  build=None,
57
57
  other=[],
58
+ usekeys=None,
58
59
  verbose=False,
59
60
  readargs=None,
60
61
  log=None):
@@ -65,6 +66,11 @@ def preformat(sumstats,
65
66
  dtype_dictionary ={}
66
67
 
67
68
  #######################################################################################################################################################
69
+ # workflow:
70
+ # 1. formatbook
71
+ # 2. user specified header
72
+ # 3. usekeys
73
+
68
74
  if fmt is not None:
69
75
  # loading format parameters
70
76
  log.write("Start to load format from formatbook....",verbose=verbose)
@@ -129,6 +135,8 @@ def preformat(sumstats,
129
135
 
130
136
  ################################################
131
137
  for key,value in rename_dictionary.items():
138
+ # check avaiable keys key->raw header
139
+ # usecols : a list of raw headers to load from file/DataFrame
132
140
  if key in raw_cols:
133
141
  usecols.append(key)
134
142
  if value in ["EA","NEA"]:
@@ -137,7 +145,7 @@ def preformat(sumstats,
137
145
  dtype_dictionary[value]="string"
138
146
 
139
147
  except ValueError:
140
- raise ValueError("Please input a path or a pd.DataFrame, and make sure the columns you specified are in the file.")
148
+ raise ValueError("Please input a path or a pd.DataFrame, and make sure the separator is correct and the columns you specified are in the file.")
141
149
 
142
150
  ###################################################################################################################################################
143
151
  ## check columns/datatype to use
@@ -276,6 +284,19 @@ def preformat(sumstats,
276
284
  else:
277
285
  study = raw_cols[9]
278
286
  usecols = usecols + [study]
287
+
288
+ if usekeys is not None:
289
+ # extract only specified keys
290
+ usecols_new =[]
291
+ for i in usekeys:
292
+ for k, v in rename_dictionary.items():
293
+ if i == v:
294
+ usecols_new.append(k)
295
+ usecols_valid =[]
296
+ for i in usecols_new:
297
+ if i in usecols:
298
+ usecols_valid.append(i)
299
+ usecols = usecols_valid
279
300
  #loading data ##########################################################################################################
280
301
 
281
302
  try:
@@ -1061,6 +1061,13 @@ def check_range(sumstats, var_range, header, coltocheck, cols_to_check, log, ver
1061
1061
  if sum(is_low_p) >0:
1062
1062
  log.warning("Extremely low P detected (P=0 or P < minimum positive value of float64) : {}".format(sum(is_low_p)))
1063
1063
  log.warning("Please consider using MLOG10P instead.")
1064
+
1065
+ if header=="INFO":
1066
+ is_high_info = sumstats["INFO"]>1
1067
+ if sum(is_high_info) >0:
1068
+ log.warning("High INFO detected (INFO>1) : {}".format(sum(is_high_info)))
1069
+ log.warning("max(INFO): {}".format(sumstats["INFO"].max()))
1070
+ log.warning("Please check if this is as expected.")
1064
1071
 
1065
1072
  if sum(~is_valid)>0:
1066
1073
  try:
@@ -1102,7 +1109,7 @@ def sanitycheckstats(sumstats,
1102
1109
  HR=(-100,100),
1103
1110
  HR_95L=(0,float("Inf")),
1104
1111
  HR_95U=(0,float("Inf")),
1105
- info=(0,1),
1112
+ info=(0,2),
1106
1113
  float_tolerence = 1e-7,
1107
1114
  verbose=True,
1108
1115
  log=Log()):
@@ -10,6 +10,7 @@ from gwaslab.g_vchange_status import vchange_status
10
10
  from gwaslab.qc_fix_sumstats import sortcoordinate
11
11
  from gwaslab.qc_fix_sumstats import start_to
12
12
  from gwaslab.qc_fix_sumstats import finished
13
+ from gwaslab.qc_fix_sumstats import _process_build
13
14
  from gwaslab.hm_harmonize_sumstats import is_palindromic
14
15
 
15
16
  import gc
@@ -430,8 +431,43 @@ def _filter_snp(sumstats, mode="in", ea="EA",nea="NEA", log=Log(),verbose=True):
430
431
  log.write("Finished filtering SNPs.",verbose=verbose)
431
432
  return snp
432
433
 
433
- def _exclude_hla(sumstats, chrom="CHR", pos="POS", lower=25000000 ,upper=34000000 ,log=Log(), verbose=True):
434
-
434
+ def _exclude_hla(sumstats, chrom="CHR", pos="POS", lower=None ,upper=None, build=None, mode="xmhc", log=Log(), verbose=True):
435
+
436
+ if build is not None:
437
+ build = _process_build(build = build,log = log,verbose = verbose)
438
+ # xMHC : HIST1H2AA ~ 7.6mb ~ RPL12P1
439
+ # reference: Horton, R., Wilming, L., Rand, V., Lovering, R. C., Bruford, E. A., Khodiyar, V. K., ... & Beck, S. (2004). Gene map of the extended human MHC. Nature Reviews Genetics, 5(12), 889-899.
440
+ # hg38: 25,726,063 ~ 33,400,644
441
+ # hg19 : 25,726,291 ~ 33,368,421
442
+
443
+ # HLA : GABBR1 ~ 3.78mb ~ KIFC1
444
+ # reference: Shiina, T., Hosomichi, K., Inoko, H., & Kulski, J. K. (2009). The HLA genomic loci map: expression, interaction, diversity and disease. Journal of human genetics, 54(1), 15-39.
445
+ # hg38: 29,602,238 ~ 33,409,896
446
+ # hg19: 29,570,015 ~ 33,377,673
447
+
448
+ if build == "19":
449
+ if mode =="xmhc":
450
+ lower=25000000
451
+ upper=34000000
452
+ if mode =="hla" or mode =="mhc":
453
+ lower=29500000
454
+ upper=33500000
455
+ if build == "38":
456
+ if mode =="xmhc":
457
+ lower=25000000
458
+ upper=34000000
459
+ if mode =="hla" or mode =="mhc":
460
+ lower=29500000
461
+ upper=33500000
462
+ else:
463
+ # -> 25,000,000 ~ 34,000,000
464
+ if mode =="xmhc":
465
+ lower=25000000
466
+ upper=34000000
467
+ if mode =="hla" or mode =="mhc":
468
+ lower=29500000
469
+ upper=33500000
470
+
435
471
  raw_len = len(sumstats)
436
472
 
437
473
  if str(sumstats[chrom].dtype) == "string":
@@ -11,6 +11,7 @@ from gwaslab.bd_common_data import get_chr_to_number
11
11
  from gwaslab.bd_common_data import get_number_to_chr
12
12
  from gwaslab.bd_common_data import get_chr_to_NC
13
13
  from gwaslab.bd_common_data import gtf_to_protein_coding
14
+ from gwaslab.bd_common_data import gtf_to_all_gene
14
15
  from gwaslab.bd_download import check_and_download
15
16
  from gwaslab.util_ex_gwascatalog import gwascatalog_trait
16
17
  from gwaslab.qc_fix_sumstats import check_dataframe_shape
@@ -38,6 +39,7 @@ def getsig(insumstats,
38
39
  wc_correction=False,
39
40
  build="19",
40
41
  source="ensembl",
42
+ gtf_path=None,
41
43
  verbose=True):
42
44
  """
43
45
  Extract the lead variants using a sliding window. P or MLOG10P will be used and converted to SCALEDP for sorting.
@@ -172,6 +174,7 @@ def getsig(insumstats,
172
174
  xymt=xymt,
173
175
  build=build,
174
176
  source=source,
177
+ gtf_path=gtf_path,
175
178
  verbose=verbose)
176
179
 
177
180
  # drop internal id
@@ -253,6 +256,7 @@ def annogene(
253
256
  xymt=["X","Y","MT"],
254
257
  build="19",
255
258
  source="ensembl",
259
+ gtf_path=None,
256
260
  verbose=True):
257
261
 
258
262
  log.write("Start to annotate variants with nearest gene name(s)...", verbose=verbose)
@@ -267,8 +271,13 @@ def annogene(
267
271
  #| gzip >Homo_sapiens.GRCh37.75.processed.chr.gtf.gz
268
272
 
269
273
  #gtf_path = check_and_download("ensembl_hg19_gtf_protein_coding")
270
- gtf_path = check_and_download("ensembl_hg19_gtf")
271
- gtf_path = gtf_to_protein_coding(gtf_path,log=log,verbose=verbose)
274
+ if gtf_path is None:
275
+ gtf_path = check_and_download("ensembl_hg19_gtf")
276
+ gtf_path = gtf_to_protein_coding(gtf_path,log=log,verbose=verbose)
277
+ else:
278
+ log.write(" -Using user-provided gtf:{}".format(gtf_path))
279
+ gtf_path = gtf_to_all_gene(gtf_path,log=log,verbose=verbose)
280
+
272
281
  gtf_db_path = gtf_path[:-2]+"db"
273
282
 
274
283
  data = Genome(
@@ -283,8 +292,13 @@ def annogene(
283
292
  elif build=="38":
284
293
  log.write(" -Assigning Gene name using ensembl_hg38_gtf for protein coding genes", verbose=verbose)
285
294
  #gtf_path = check_and_download("ensembl_hg38_gtf_protein_coding")
286
- gtf_path = check_and_download("ensembl_hg38_gtf")
287
- gtf_path = gtf_to_protein_coding(gtf_path,log=log,verbose=verbose)
295
+ if gtf_path is None:
296
+ gtf_path = check_and_download("ensembl_hg38_gtf")
297
+ gtf_path = gtf_to_protein_coding(gtf_path,log=log,verbose=verbose)
298
+ else:
299
+ log.write(" -Using user-provided gtf:{}".format(gtf_path))
300
+ gtf_path = gtf_to_all_gene(gtf_path,log=log,verbose=verbose)
301
+
288
302
  gtf_db_path = gtf_path[:-2]+"db"
289
303
  data = Genome(
290
304
  reference_name='GRCh38',
@@ -300,8 +314,13 @@ def annogene(
300
314
  if build=="19":
301
315
  log.write(" -Assigning Gene name using NCBI refseq latest GRCh37 for protein coding genes", verbose=verbose)
302
316
  #gtf_path = check_and_download("refseq_hg19_gtf_protein_coding")
303
- gtf_path = check_and_download("refseq_hg19_gtf")
304
- gtf_path = gtf_to_protein_coding(gtf_path,log=log,verbose=verbose)
317
+ if gtf_path is None:
318
+ gtf_path = check_and_download("refseq_hg19_gtf")
319
+ gtf_path = gtf_to_protein_coding(gtf_path,log=log,verbose=verbose)
320
+ else:
321
+ log.write(" -Using user-provided gtf:{}".format(gtf_path))
322
+ gtf_path = gtf_to_all_gene(gtf_path,log=log,verbose=verbose)
323
+
305
324
  gtf_db_path = gtf_path[:-2]+"db"
306
325
  data = Genome(
307
326
  reference_name='GRCh37',
@@ -315,8 +334,13 @@ def annogene(
315
334
  elif build=="38":
316
335
  log.write(" -Assigning Gene name using NCBI refseq latest GRCh38 for protein coding genes", verbose=verbose)
317
336
  #gtf_path = check_and_download("refseq_hg38_gtf_protein_coding")
318
- gtf_path = check_and_download("refseq_hg38_gtf")
319
- gtf_path = gtf_to_protein_coding(gtf_path,log=log,verbose=verbose)
337
+ if gtf_path is None:
338
+ gtf_path = check_and_download("refseq_hg38_gtf")
339
+ gtf_path = gtf_to_protein_coding(gtf_path,log=log,verbose=verbose)
340
+ else:
341
+ log.write(" -Using user-provided gtf:{}".format(gtf_path))
342
+ gtf_path = gtf_to_all_gene(gtf_path,log=log,verbose=verbose)
343
+
320
344
  gtf_db_path = gtf_path[:-2]+"db"
321
345
  data = Genome(
322
346
  reference_name='GRCh38',