gwaslab 3.4.47__tar.gz → 3.4.49__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of gwaslab might be problematic. Click here for more details.
- {gwaslab-3.4.47/src/gwaslab.egg-info → gwaslab-3.4.49}/PKG-INFO +2 -2
- {gwaslab-3.4.47 → gwaslab-3.4.49}/README.md +1 -1
- {gwaslab-3.4.47 → gwaslab-3.4.49}/pyproject.toml +1 -1
- {gwaslab-3.4.47 → gwaslab-3.4.49}/src/gwaslab/bd_common_data.py +3 -1
- {gwaslab-3.4.47 → gwaslab-3.4.49}/src/gwaslab/data/reference.json +10 -2
- {gwaslab-3.4.47 → gwaslab-3.4.49}/src/gwaslab/g_Sumstats.py +22 -2
- {gwaslab-3.4.47 → gwaslab-3.4.49}/src/gwaslab/g_vchange_status.py +1 -1
- {gwaslab-3.4.47 → gwaslab-3.4.49}/src/gwaslab/g_version.py +2 -2
- {gwaslab-3.4.47 → gwaslab-3.4.49}/src/gwaslab/hm_harmonize_sumstats.py +23 -7
- {gwaslab-3.4.47 → gwaslab-3.4.49}/src/gwaslab/io_preformat_input.py +73 -8
- {gwaslab-3.4.47 → gwaslab-3.4.49}/src/gwaslab/io_read_ldsc.py +16 -2
- {gwaslab-3.4.47 → gwaslab-3.4.49}/src/gwaslab/io_to_formats.py +5 -5
- {gwaslab-3.4.47 → gwaslab-3.4.49}/src/gwaslab/qc_fix_sumstats.py +109 -7
- gwaslab-3.4.49/src/gwaslab/util_abf_finemapping.py +67 -0
- {gwaslab-3.4.47 → gwaslab-3.4.49}/src/gwaslab/util_ex_ldsc.py +8 -1
- {gwaslab-3.4.47 → gwaslab-3.4.49}/src/gwaslab/util_ex_run_clumping.py +6 -6
- {gwaslab-3.4.47 → gwaslab-3.4.49}/src/gwaslab/util_in_fill_data.py +20 -2
- {gwaslab-3.4.47 → gwaslab-3.4.49}/src/gwaslab/viz_aux_annotate_plot.py +2 -1
- {gwaslab-3.4.47 → gwaslab-3.4.49}/src/gwaslab/viz_aux_quickfix.py +2 -1
- {gwaslab-3.4.47 → gwaslab-3.4.49}/src/gwaslab/viz_plot_compare_effect.py +4 -2
- {gwaslab-3.4.47 → gwaslab-3.4.49}/src/gwaslab/viz_plot_miamiplot2.py +10 -9
- {gwaslab-3.4.47 → gwaslab-3.4.49}/src/gwaslab/viz_plot_mqqplot.py +42 -21
- {gwaslab-3.4.47 → gwaslab-3.4.49}/src/gwaslab/viz_plot_regional2.py +75 -29
- {gwaslab-3.4.47 → gwaslab-3.4.49}/src/gwaslab/viz_plot_stackedregional.py +37 -16
- {gwaslab-3.4.47 → gwaslab-3.4.49/src/gwaslab.egg-info}/PKG-INFO +2 -2
- {gwaslab-3.4.47 → gwaslab-3.4.49}/src/gwaslab.egg-info/SOURCES.txt +1 -0
- {gwaslab-3.4.47 → gwaslab-3.4.49}/LICENSE +0 -0
- {gwaslab-3.4.47 → gwaslab-3.4.49}/LICENSE_before_v3.4.39 +0 -0
- {gwaslab-3.4.47 → gwaslab-3.4.49}/setup.cfg +0 -0
- {gwaslab-3.4.47 → gwaslab-3.4.49}/src/gwaslab/__init__.py +0 -0
- {gwaslab-3.4.47 → gwaslab-3.4.49}/src/gwaslab/bd_config.py +0 -0
- {gwaslab-3.4.47 → gwaslab-3.4.49}/src/gwaslab/bd_download.py +0 -0
- {gwaslab-3.4.47 → gwaslab-3.4.49}/src/gwaslab/bd_get_hapmap3.py +0 -0
- {gwaslab-3.4.47 → gwaslab-3.4.49}/src/gwaslab/cache_manager.py +0 -0
- {gwaslab-3.4.47 → gwaslab-3.4.49}/src/gwaslab/data/chrx_par/chrx_par_hg19.bed.gz +0 -0
- {gwaslab-3.4.47 → gwaslab-3.4.49}/src/gwaslab/data/chrx_par/chrx_par_hg38.bed.gz +0 -0
- {gwaslab-3.4.47 → gwaslab-3.4.49}/src/gwaslab/data/formatbook.json +0 -0
- {gwaslab-3.4.47 → gwaslab-3.4.49}/src/gwaslab/data/hapmap3_SNPs/hapmap3_db150_hg19.snplist.gz +0 -0
- {gwaslab-3.4.47 → gwaslab-3.4.49}/src/gwaslab/data/hapmap3_SNPs/hapmap3_db151_hg38.snplist.gz +0 -0
- {gwaslab-3.4.47 → gwaslab-3.4.49}/src/gwaslab/data/high_ld/high_ld_hla_hg19.bed.gz +0 -0
- {gwaslab-3.4.47 → gwaslab-3.4.49}/src/gwaslab/data/high_ld/high_ld_hla_hg38.bed.gz +0 -0
- {gwaslab-3.4.47 → gwaslab-3.4.49}/src/gwaslab/g_Log.py +0 -0
- {gwaslab-3.4.47 → gwaslab-3.4.49}/src/gwaslab/g_Phenotypes.py +0 -0
- {gwaslab-3.4.47 → gwaslab-3.4.49}/src/gwaslab/g_SumstatsPair.py +0 -0
- {gwaslab-3.4.47 → gwaslab-3.4.49}/src/gwaslab/g_SumstatsT.py +0 -0
- {gwaslab-3.4.47 → gwaslab-3.4.49}/src/gwaslab/g_Sumstats_summary.py +0 -0
- {gwaslab-3.4.47 → gwaslab-3.4.49}/src/gwaslab/g_meta.py +0 -0
- {gwaslab-3.4.47 → gwaslab-3.4.49}/src/gwaslab/hm_casting.py +0 -0
- {gwaslab-3.4.47 → gwaslab-3.4.49}/src/gwaslab/hm_rsid_to_chrpos.py +0 -0
- {gwaslab-3.4.47 → gwaslab-3.4.49}/src/gwaslab/io_read_tabular.py +0 -0
- {gwaslab-3.4.47 → gwaslab-3.4.49}/src/gwaslab/io_to_pickle.py +0 -0
- {gwaslab-3.4.47 → gwaslab-3.4.49}/src/gwaslab/ldsc_irwls.py +0 -0
- {gwaslab-3.4.47 → gwaslab-3.4.49}/src/gwaslab/ldsc_jackknife.py +0 -0
- {gwaslab-3.4.47 → gwaslab-3.4.49}/src/gwaslab/ldsc_ldscore.py +0 -0
- {gwaslab-3.4.47 → gwaslab-3.4.49}/src/gwaslab/ldsc_parse.py +0 -0
- {gwaslab-3.4.47 → gwaslab-3.4.49}/src/gwaslab/ldsc_regressions.py +0 -0
- {gwaslab-3.4.47 → gwaslab-3.4.49}/src/gwaslab/ldsc_sumstats.py +0 -0
- {gwaslab-3.4.47 → gwaslab-3.4.49}/src/gwaslab/qc_check_datatype.py +0 -0
- {gwaslab-3.4.47 → gwaslab-3.4.49}/src/gwaslab/run_script.py +0 -0
- {gwaslab-3.4.47 → gwaslab-3.4.49}/src/gwaslab/util_ex_calculate_ldmatrix.py +0 -0
- {gwaslab-3.4.47 → gwaslab-3.4.49}/src/gwaslab/util_ex_calculate_prs.py +0 -0
- {gwaslab-3.4.47 → gwaslab-3.4.49}/src/gwaslab/util_ex_gwascatalog.py +0 -0
- {gwaslab-3.4.47 → gwaslab-3.4.49}/src/gwaslab/util_ex_ldproxyfinder.py +0 -0
- {gwaslab-3.4.47 → gwaslab-3.4.49}/src/gwaslab/util_ex_plink_filter.py +0 -0
- {gwaslab-3.4.47 → gwaslab-3.4.49}/src/gwaslab/util_ex_process_h5.py +0 -0
- {gwaslab-3.4.47 → gwaslab-3.4.49}/src/gwaslab/util_ex_process_ref.py +0 -0
- {gwaslab-3.4.47 → gwaslab-3.4.49}/src/gwaslab/util_ex_run_2samplemr.py +0 -0
- {gwaslab-3.4.47 → gwaslab-3.4.49}/src/gwaslab/util_ex_run_coloc.py +0 -0
- {gwaslab-3.4.47 → gwaslab-3.4.49}/src/gwaslab/util_ex_run_susie.py +0 -0
- {gwaslab-3.4.47 → gwaslab-3.4.49}/src/gwaslab/util_in_calculate_gc.py +0 -0
- {gwaslab-3.4.47 → gwaslab-3.4.49}/src/gwaslab/util_in_calculate_power.py +0 -0
- {gwaslab-3.4.47 → gwaslab-3.4.49}/src/gwaslab/util_in_convert_h2.py +0 -0
- {gwaslab-3.4.47 → gwaslab-3.4.49}/src/gwaslab/util_in_correct_winnerscurse.py +0 -0
- {gwaslab-3.4.47 → gwaslab-3.4.49}/src/gwaslab/util_in_filter_value.py +0 -0
- {gwaslab-3.4.47 → gwaslab-3.4.49}/src/gwaslab/util_in_get_density.py +0 -0
- {gwaslab-3.4.47 → gwaslab-3.4.49}/src/gwaslab/util_in_get_sig.py +0 -0
- {gwaslab-3.4.47 → gwaslab-3.4.49}/src/gwaslab/util_in_meta.py +0 -0
- {gwaslab-3.4.47 → gwaslab-3.4.49}/src/gwaslab/util_in_snphwe.py +0 -0
- {gwaslab-3.4.47 → gwaslab-3.4.49}/src/gwaslab/viz_aux_chromatin.py +0 -0
- {gwaslab-3.4.47 → gwaslab-3.4.49}/src/gwaslab/viz_aux_reposition_text.py +0 -0
- {gwaslab-3.4.47 → gwaslab-3.4.49}/src/gwaslab/viz_aux_save_figure.py +0 -0
- {gwaslab-3.4.47 → gwaslab-3.4.49}/src/gwaslab/viz_plot_compare_af.py +0 -0
- {gwaslab-3.4.47 → gwaslab-3.4.49}/src/gwaslab/viz_plot_forestplot.py +0 -0
- {gwaslab-3.4.47 → gwaslab-3.4.49}/src/gwaslab/viz_plot_miamiplot.py +0 -0
- {gwaslab-3.4.47 → gwaslab-3.4.49}/src/gwaslab/viz_plot_qqplot.py +0 -0
- {gwaslab-3.4.47 → gwaslab-3.4.49}/src/gwaslab/viz_plot_regionalplot.py +0 -0
- {gwaslab-3.4.47 → gwaslab-3.4.49}/src/gwaslab/viz_plot_rg_heatmap.py +0 -0
- {gwaslab-3.4.47 → gwaslab-3.4.49}/src/gwaslab/viz_plot_trumpetplot.py +0 -0
- {gwaslab-3.4.47 → gwaslab-3.4.49}/src/gwaslab.egg-info/dependency_links.txt +0 -0
- {gwaslab-3.4.47 → gwaslab-3.4.49}/src/gwaslab.egg-info/requires.txt +0 -0
- {gwaslab-3.4.47 → gwaslab-3.4.49}/src/gwaslab.egg-info/top_level.txt +0 -0
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
Metadata-Version: 2.1
|
|
2
2
|
Name: gwaslab
|
|
3
|
-
Version: 3.4.47
|
|
3
|
+
Version: 3.4.49
|
|
4
4
|
Summary: A collection of handy tools for GWAS SumStats
|
|
5
5
|
Author-email: Yunye <yunye@gwaslab.com>
|
|
6
6
|
Project-URL: Homepage, https://cloufield.github.io/gwaslab/
|
|
@@ -51,7 +51,7 @@ Warning: Known issues of GWASLab are summarized in [https://cloufield.github.io/
|
|
|
51
51
|
### install via pip
|
|
52
52
|
|
|
53
53
|
```
|
|
54
|
-
pip install gwaslab==3.4.
|
|
54
|
+
pip install gwaslab==3.4.46
|
|
55
55
|
```
|
|
56
56
|
|
|
57
57
|
```python
|
|
@@ -274,7 +274,9 @@ def get_gtf(chrom, build="19",source="ensembl"):
|
|
|
274
274
|
gtf = pd.DataFrame(columns=["seqname","start","end","strand","feature","gene_biotype","gene_id","gene_name"])
|
|
275
275
|
return gtf
|
|
276
276
|
|
|
277
|
-
|
|
277
|
+
def get_chain(from_build="19", to_build="38"):
    """Fetch the chain file for converting between two genome builds.

    The build labels are combined into a reference key of the form
    ``"<from_build>to<to_build>"`` (for example ``"19to38"``) and handed to
    ``check_and_download``; its return value (presumably the local path of
    the chain file) is returned unchanged.
    """
    chain_key = "{}to{}".format(from_build, to_build)
    return check_and_download(chain_key)
|
|
278
280
|
####################################################################################################################
|
|
279
281
|
def gtf_to_protein_coding(gtfpath,log=Log(),verbose=True):
|
|
280
282
|
protein_coding_path = gtfpath[:-6]+"protein_coding.gtf.gz"
|
|
@@ -90,12 +90,20 @@
|
|
|
90
90
|
"1kg_dbsnp151_hg38_x_md5":"48c05eeb1454c0dd4cbee3cb26382e8e",
|
|
91
91
|
"recombination_hg19":"https://www.dropbox.com/s/wbesl8haxknonuc/recombination_hg19.tar.gz?dl=1",
|
|
92
92
|
"recombination_hg38":"https://www.dropbox.com/s/vuo8mvqx0fpibzj/recombination_hg38.tar.gz?dl=1",
|
|
93
|
-
"ensembl_hg19_gtf":"https://ftp.ensembl.org/pub/grch37/
|
|
93
|
+
"ensembl_hg19_gtf":"https://ftp.ensembl.org/pub/grch37/release-87/gtf/homo_sapiens/Homo_sapiens.GRCh37.87.chr.gtf.gz",
|
|
94
94
|
"ensembl_hg38_gtf":"https://ftp.ensembl.org/pub/release-109/gtf/homo_sapiens//Homo_sapiens.GRCh38.109.chr.gtf.gz",
|
|
95
95
|
"refseq_hg19_gtf":"https://ftp.ncbi.nlm.nih.gov/refseq/H_sapiens/annotation/GRCh37_latest/refseq_identifiers/GRCh37_latest_genomic.gtf.gz",
|
|
96
96
|
"refseq_hg38_gtf":"https://ftp.ncbi.nlm.nih.gov/refseq/H_sapiens/annotation/GRCh38_latest/refseq_identifiers/GRCh38_latest_genomic.gtf.gz",
|
|
97
97
|
"testlink":"https://www.dropbox.com/s/8u7capwge0ihshu/EAS.chr22.split_norm_af.1kgp3v5.vcf.gz?dl=1",
|
|
98
|
-
"testlink_tbi":"https://www.dropbox.com/s/hdneg53t6u1j6ib/EAS.chr22.split_norm_af.1kgp3v5.vcf.gz.tbi?dl=1"
|
|
98
|
+
"testlink_tbi":"https://www.dropbox.com/s/hdneg53t6u1j6ib/EAS.chr22.split_norm_af.1kgp3v5.vcf.gz.tbi?dl=1",
|
|
99
|
+
"19to38":"https://hgdownload.soe.ucsc.edu/goldenPath/hg19/liftOver/hg19ToHg38.over.chain.gz",
|
|
100
|
+
"19to13":"https://s3-us-west-2.amazonaws.com/human-pangenomics/T2T/CHM13/assemblies/chain/v1_nflo/hg19-chm13v2.chain",
|
|
101
|
+
"38to19":"https://hgdownload.soe.ucsc.edu/goldenPath/hg38/liftOver/hg38ToHg19.over.chain.gz",
|
|
102
|
+
"38to13":"https://s3-us-west-2.amazonaws.com/human-pangenomics/T2T/CHM13/assemblies/chain/v1_nflo/grch38-chm13v2.chain",
|
|
103
|
+
"13to19":"https://s3-us-west-2.amazonaws.com/human-pangenomics/T2T/CHM13/assemblies/chain/v1_nflo/chm13v2-hg19.chain",
|
|
104
|
+
"13to38":"https://s3-us-west-2.amazonaws.com/human-pangenomics/T2T/CHM13/assemblies/chain/v1_nflo/chm13v2-grch38.chain",
|
|
105
|
+
"18to19":"https://hgdownload.soe.ucsc.edu/goldenPath/hg18/liftOver/hg18ToHg19.over.chain.gz",
|
|
106
|
+
"18to38":"https://hgdownload.soe.ucsc.edu/goldenPath/hg18/liftOver/hg18ToHg38.over.chain.gz"
|
|
99
107
|
}
|
|
100
108
|
|
|
101
109
|
|
|
@@ -8,6 +8,8 @@ from gwaslab.io_preformat_input import preformat
|
|
|
8
8
|
from gwaslab.io_to_formats import _to_format
|
|
9
9
|
from gwaslab.g_Log import Log
|
|
10
10
|
from gwaslab.qc_fix_sumstats import fixID
|
|
11
|
+
from gwaslab.qc_fix_sumstats import flipSNPID
|
|
12
|
+
from gwaslab.qc_fix_sumstats import stripSNPID
|
|
11
13
|
from gwaslab.qc_fix_sumstats import removedup
|
|
12
14
|
from gwaslab.qc_fix_sumstats import fixchr
|
|
13
15
|
from gwaslab.qc_fix_sumstats import fixpos
|
|
@@ -76,6 +78,8 @@ from gwaslab.util_ex_ldsc import _estimate_rg_by_ldsc
|
|
|
76
78
|
from gwaslab.util_ex_ldsc import _estimate_h2_cts_by_ldsc
|
|
77
79
|
from gwaslab.util_ex_ldsc import _estimate_partitioned_h2_by_ldsc
|
|
78
80
|
from gwaslab.bd_get_hapmap3 import gethapmap3
|
|
81
|
+
from gwaslab.util_abf_finemapping import abf_finemapping
|
|
82
|
+
from gwaslab.util_abf_finemapping import make_cs
|
|
79
83
|
import gc
|
|
80
84
|
|
|
81
85
|
#20220309
|
|
@@ -121,6 +125,8 @@ class Sumstats():
|
|
|
121
125
|
snpr2=None,
|
|
122
126
|
status=None,
|
|
123
127
|
other=[],
|
|
128
|
+
chrom_pat=None,
|
|
129
|
+
snpid_pat=None,
|
|
124
130
|
usekeys=None,
|
|
125
131
|
direction=None,
|
|
126
132
|
verbose=True,
|
|
@@ -135,6 +141,7 @@ class Sumstats():
|
|
|
135
141
|
self.data = pd.DataFrame()
|
|
136
142
|
self.log = Log()
|
|
137
143
|
self.ldsc_h2 = None
|
|
144
|
+
self.ldsc_h2_results = None
|
|
138
145
|
self.ldsc_rg = None
|
|
139
146
|
self.ldsc_h2_cts = None
|
|
140
147
|
self.ldsc_partitioned_h2_summary = None
|
|
@@ -202,6 +209,8 @@ class Sumstats():
|
|
|
202
209
|
status=status,
|
|
203
210
|
other=other,
|
|
204
211
|
usekeys=usekeys,
|
|
212
|
+
chrom_pat=chrom_pat,
|
|
213
|
+
snpid_pat=snpid_pat,
|
|
205
214
|
verbose=verbose,
|
|
206
215
|
readargs=readargs,
|
|
207
216
|
log=self.log)
|
|
@@ -415,6 +424,10 @@ class Sumstats():
|
|
|
415
424
|
#customizable API to build your own QC pipeline
|
|
416
425
|
def fix_id(self,**kwargs):
|
|
417
426
|
self.data = fixID(self.data,log=self.log,**kwargs)
|
|
427
|
+
def flip_snpid(self, **kwargs):
    """Run ``flipSNPID`` on ``self.data`` and store the table it returns.

    All keyword arguments are forwarded unchanged; ``self.log`` is passed
    as ``log``.
    """
    flipped = flipSNPID(self.data, log=self.log, **kwargs)
    self.data = flipped
|
|
429
|
+
def strip_snpid(self, **kwargs):
    """Run ``stripSNPID`` on ``self.data`` and store the table it returns.

    All keyword arguments are forwarded unchanged; ``self.log`` is passed
    as ``log``.
    """
    stripped = stripSNPID(self.data, log=self.log, **kwargs)
    self.data = stripped
|
|
418
431
|
def fix_chr(self,**kwargs):
|
|
419
432
|
self.data = fixchr(self.data,log=self.log,**kwargs)
|
|
420
433
|
def fix_pos(self,**kwargs):
|
|
@@ -756,13 +769,20 @@ class Sumstats():
|
|
|
756
769
|
else:
|
|
757
770
|
output = lambdaGC(self.data[["CHR",mode]],mode=mode,**kwargs)
|
|
758
771
|
self.meta["Genomic inflation factor"] = output
|
|
759
|
-
return output
|
|
772
|
+
return output
|
|
773
|
+
|
|
774
|
+
def abf_finemapping(self, region=None, chrpos=None, snpid=None, **kwargs):
    """Run ABF fine-mapping on a copy of the sumstats and build credible sets.

    ``region``, ``chrpos``, ``snpid`` and any extra keyword arguments are
    forwarded to the module-level ``abf_finemapping`` function together with
    ``self.log``. Its result is then passed to ``make_cs`` with a fixed
    threshold of 0.95.

    Returns a tuple ``(region_data, credible_sets)``.
    """
    # The bare name below resolves to the imported module-level function,
    # not to this method, because method names are not in the function scope.
    working_copy = self.data.copy()
    region_data = abf_finemapping(
        working_copy,
        region=region,
        chrpos=chrpos,
        snpid=snpid,
        log=self.log,
        **kwargs
    )
    credible_sets = make_cs(region_data, threshold=0.95, log=self.log)
    return region_data, credible_sets
|
|
778
|
+
|
|
779
|
+
|
|
760
780
|
## LDSC ##############################################################################################
|
|
761
781
|
def estimate_h2_by_ldsc(self, build=None, verbose=True, match_allele=True, **kwargs):
|
|
762
782
|
if build is None:
|
|
763
783
|
build = self.meta["gwaslab"]["genome_build"]
|
|
764
784
|
insumstats = gethapmap3(self.data.copy(), build=build, verbose=verbose , match_allele=True, how="right" )
|
|
765
|
-
self.ldsc_h2 = _estimate_h2_by_ldsc(insumstats=insumstats, log=self.log, verbose=verbose, **kwargs)
|
|
785
|
+
self.ldsc_h2, self.ldsc_h2_results = _estimate_h2_by_ldsc(insumstats=insumstats, log=self.log, verbose=verbose, **kwargs)
|
|
766
786
|
|
|
767
787
|
def estimate_rg_by_ldsc(self, build=None, verbose=True, match_allele=True, **kwargs):
|
|
768
788
|
if build is None:
|
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
import pandas as pd
|
|
2
2
|
|
|
3
|
-
CATEGORIES = {str(j+i) for j in [1900000,3800000,9700000,9800000,9900000] for i in range(0,100000)}
|
|
3
|
+
CATEGORIES = {str(j+i) for j in [1300000,1800000,1900000,3800000,9700000,9800000,9900000] for i in range(0,100000)}
|
|
4
4
|
|
|
5
5
|
def vchange_status(status,digit,before,after):
|
|
6
6
|
dic={}
|
|
@@ -21,6 +21,7 @@ from gwaslab.qc_check_datatype import check_dataframe_shape
|
|
|
21
21
|
from gwaslab.bd_common_data import get_number_to_chr
|
|
22
22
|
from gwaslab.bd_common_data import get_chr_list
|
|
23
23
|
from gwaslab.bd_common_data import get_chr_to_number
|
|
24
|
+
from gwaslab.bd_common_data import get_number_to_NC
|
|
24
25
|
from gwaslab.bd_common_data import _maketrans
|
|
25
26
|
from gwaslab.g_vchange_status import vchange_status
|
|
26
27
|
from gwaslab.g_version import _get_version
|
|
@@ -355,7 +356,7 @@ def oldcheckref(sumstats,ref_seq,chrom="CHR",pos="POS",ea="EA",nea="NEA",status=
|
|
|
355
356
|
|
|
356
357
|
log.write("\n",end="",show_time=False,verbose=verbose)
|
|
357
358
|
|
|
358
|
-
CATEGORIES = {str(j+i) for j in [1900000,3800000,9700000,9800000,9900000] for i in range(0,100000)}
|
|
359
|
+
CATEGORIES = {str(j+i) for j in [1300000,1800000,1900000,3800000,9700000,9800000,9900000] for i in range(0,100000)}
|
|
359
360
|
sumstats[status] = pd.Categorical(sumstats[status],categories=CATEGORIES)
|
|
360
361
|
#sumstats[status] = sumstats[status].astype("string")
|
|
361
362
|
|
|
@@ -674,7 +675,7 @@ def checkref(sumstats,ref_seq,chrom="CHR",pos="POS",ea="EA",nea="NEA",status="ST
|
|
|
674
675
|
sumstats.loc[to_check_ref,status] = check_status(sumstats_to_check, all_records_dict, log=log, verbose=verbose)
|
|
675
676
|
log.write(" -Finished checking records", verbose=verbose)
|
|
676
677
|
|
|
677
|
-
CATEGORIES = {str(j+i) for j in [1900000,3800000,9700000,9800000,9900000] for i in range(0,100000)}
|
|
678
|
+
CATEGORIES = {str(j+i) for j in [1300000,1800000,1900000,3800000,9700000,9800000,9900000] for i in range(0,100000)}
|
|
678
679
|
sumstats[status] = pd.Categorical(sumstats[status],categories=CATEGORIES)
|
|
679
680
|
#sumstats[status] = sumstats[status].astype("string")
|
|
680
681
|
|
|
@@ -1496,17 +1497,21 @@ def infer_af(chr,start,end,ref,alt,vcf_reader,alt_freq,chr_dict=None):
|
|
|
1496
1497
|
def auto_check_vcf_chr_dict(vcf_path, vcf_chr_dict, verbose, log):
|
|
1497
1498
|
if vcf_path is not None:
|
|
1498
1499
|
if vcf_chr_dict is None:
|
|
1499
|
-
log.write(" -Checking
|
|
1500
|
-
|
|
1500
|
+
log.write(" -Checking chromosome notations in VCF/BCF files..." ,verbose=verbose)
|
|
1501
|
+
vcf_chr_dict = check_vcf_chr_NC(vcf_path, log, verbose)
|
|
1502
|
+
if vcf_chr_dict is not None:
|
|
1503
|
+
return vcf_chr_dict
|
|
1504
|
+
log.write(" -Checking prefix for chromosomes in VCF/BCF files..." ,verbose=verbose)
|
|
1505
|
+
prefix = check_vcf_chr_prefix(vcf_path, log,verbose)
|
|
1501
1506
|
if prefix is not None:
|
|
1502
1507
|
log.write(" -Prefix for chromosomes: ",prefix)
|
|
1503
1508
|
vcf_chr_dict = get_number_to_chr(prefix=prefix)
|
|
1504
1509
|
else:
|
|
1505
|
-
log.write(" -No prefix for chromosomes in the VCF files." ,verbose=verbose)
|
|
1510
|
+
log.write(" -No prefix for chromosomes in the VCF/BCF files." ,verbose=verbose)
|
|
1506
1511
|
vcf_chr_dict = get_number_to_chr()
|
|
1507
1512
|
return vcf_chr_dict
|
|
1508
1513
|
|
|
1509
|
-
def check_vcf_chr_prefix(vcf_bcf_path):
|
|
1514
|
+
def check_vcf_chr_prefix(vcf_bcf_path,log,verbose):
|
|
1510
1515
|
vcf_bcf = VariantFile(vcf_bcf_path)
|
|
1511
1516
|
for i in list(vcf_bcf.header.contigs):
|
|
1512
1517
|
m = re.search('(chr|Chr|CHR)([0-9xXyYmM]+)', i)
|
|
@@ -1514,5 +1519,16 @@ def check_vcf_chr_prefix(vcf_bcf_path):
|
|
|
1514
1519
|
return m.group(1)
|
|
1515
1520
|
else:
|
|
1516
1521
|
return None
|
|
1517
|
-
|
|
1522
|
+
|
|
1523
|
+
def check_vcf_chr_NC(vcf_bcf_path,log,verbose):
    """Detect whether a VCF/BCF header names its contigs with RefSeq IDs.

    Each contig name in the file header is compared against the values of
    ``get_number_to_NC(build="19")`` and ``get_number_to_NC(build="38")``.

    Returns the matching ``get_number_to_NC`` mapping for the first contig
    that is recognised (hg19 is tested before hg38), or ``None`` when no
    contig in the header is recognised.
    """
    # Build each lookup table once instead of once per contig.
    nc_dict_hg19 = get_number_to_NC(build="19")
    nc_dict_hg38 = get_number_to_NC(build="38")
    nc_ids_hg19 = list(nc_dict_hg19.values())
    nc_ids_hg38 = list(nc_dict_hg38.values())

    # Only the header is needed; close the file handle as soon as it is read.
    with VariantFile(vcf_bcf_path) as vcf_bcf:
        contigs = list(vcf_bcf.header.contigs)

    for contig in contigs:
        if contig in nc_ids_hg19:
            log.write(" -RefSeq ID detected (hg19) in VCF/BCF...",verbose=verbose)
            return nc_dict_hg19
        if contig in nc_ids_hg38:
            log.write(" -RefSeq ID detected (hg38) in VCF/BCF...",verbose=verbose)
            return nc_dict_hg38

    # Give up only after every contig has been inspected.
    return None
|
|
1518
1534
|
|
|
@@ -56,6 +56,8 @@ def preformat(sumstats,
|
|
|
56
56
|
build=None,
|
|
57
57
|
other=[],
|
|
58
58
|
usekeys=None,
|
|
59
|
+
chrom_pat=None,
|
|
60
|
+
snpid_pat=None,
|
|
59
61
|
verbose=False,
|
|
60
62
|
readargs=None,
|
|
61
63
|
log=None):
|
|
@@ -84,7 +86,10 @@ def preformat(sumstats,
|
|
|
84
86
|
if "format_separator" in meta_data.keys():
|
|
85
87
|
if "sep" not in readargs.keys():
|
|
86
88
|
readargs["sep"] = meta_data["format_separator"]
|
|
87
|
-
|
|
89
|
+
else:
|
|
90
|
+
if readargs["sep"] != meta_data["format_separator"]:
|
|
91
|
+
log.write(' - format_separator will be changed to: "{}"'.format(readargs["sep"]),verbose=verbose)
|
|
92
|
+
|
|
88
93
|
if "format_na" in meta_data.keys():
|
|
89
94
|
readargs["na_values"] = meta_data["format_na"]
|
|
90
95
|
|
|
@@ -92,7 +97,7 @@ def preformat(sumstats,
|
|
|
92
97
|
readargs["comment"] = meta_data["format_comment"]
|
|
93
98
|
|
|
94
99
|
if "sep" not in readargs.keys():
|
|
95
|
-
|
|
100
|
+
readargs["sep"] = "\t"
|
|
96
101
|
|
|
97
102
|
#########################################################################################################################################################
|
|
98
103
|
|
|
@@ -323,10 +328,30 @@ def preformat(sumstats,
|
|
|
323
328
|
skip_rows = get_skip_rows(inpath)
|
|
324
329
|
readargs["skiprows"] = skip_rows
|
|
325
330
|
log.write("Start to initialize gl.Sumstats from file :" + inpath,verbose=verbose)
|
|
326
|
-
|
|
327
|
-
|
|
328
|
-
|
|
329
|
-
|
|
331
|
+
if chrom_pat is not None:
|
|
332
|
+
sumstats = _load_single_chr(inpath,
|
|
333
|
+
usecols,
|
|
334
|
+
dtype_dictionary,
|
|
335
|
+
readargs=readargs,
|
|
336
|
+
rename_dictionary=rename_dictionary,
|
|
337
|
+
chrom_pat=chrom_pat,
|
|
338
|
+
log=log,
|
|
339
|
+
verbose=verbose)
|
|
340
|
+
elif snpid_pat is not None:
|
|
341
|
+
|
|
342
|
+
sumstats = _load_variants_with_pattern(inpath,
|
|
343
|
+
usecols,
|
|
344
|
+
dtype_dictionary,
|
|
345
|
+
readargs=readargs,
|
|
346
|
+
rename_dictionary=rename_dictionary,
|
|
347
|
+
snpid_pat=snpid_pat,
|
|
348
|
+
log=log,
|
|
349
|
+
verbose=verbose)
|
|
350
|
+
else:
|
|
351
|
+
sumstats = pd.read_table(inpath,
|
|
352
|
+
usecols=set(usecols),
|
|
353
|
+
dtype=dtype_dictionary,
|
|
354
|
+
**readargs)
|
|
330
355
|
|
|
331
356
|
elif type(sumstats) is pd.DataFrame:
|
|
332
357
|
## loading data from dataframe
|
|
@@ -520,9 +545,49 @@ def process_status(sumstats,build,log,verbose):
|
|
|
520
545
|
#sumstats["STATUS"] = int(build)*(10**5) +99999
|
|
521
546
|
build = _process_build(build,log,verbose)
|
|
522
547
|
sumstats["STATUS"] = build +"99999"
|
|
523
|
-
categories = {str(j+i) for j in [1900000,3800000,9700000,9800000,9900000] for i in range(0,100000)}
|
|
548
|
+
categories = {str(j+i) for j in [1300000,1800000,1900000,3800000,9700000,9800000,9900000] for i in range(0,100000)}
|
|
524
549
|
sumstats["STATUS"] = pd.Categorical(sumstats["STATUS"],categories=categories)
|
|
525
550
|
return sumstats
|
|
526
551
|
|
|
527
552
|
|
|
528
|
-
|
|
553
|
+
def _load_single_chr(inpath,usecols,dtype_dictionary,readargs,rename_dictionary,chrom_pat,log,verbose):
|
|
554
|
+
|
|
555
|
+
sumstats_iter = pd.read_table(inpath,
|
|
556
|
+
usecols=set(usecols),
|
|
557
|
+
dtype=dtype_dictionary,
|
|
558
|
+
iterator=True,
|
|
559
|
+
chunksize=500000,
|
|
560
|
+
**readargs)
|
|
561
|
+
# get chr
|
|
562
|
+
for k,v in rename_dictionary.items():
|
|
563
|
+
if v=="CHR":
|
|
564
|
+
if k in usecols:
|
|
565
|
+
log.write(" -Columns used to filter variants: {}".format(k),verbose=verbose)
|
|
566
|
+
chunk_chrom = k
|
|
567
|
+
break
|
|
568
|
+
|
|
569
|
+
log.write(" -Loading only variants on chromosome with pattern : {} ...".format(chrom_pat),verbose=verbose)
|
|
570
|
+
sumstats_filtered = pd.concat([chunk[chunk[chunk_chrom].str.match(chrom_pat, case=False,na=False) ] for chunk in sumstats_iter])
|
|
571
|
+
log.write(" -Loaded {} variants on chromosome with pattern :{} ...".format(len(sumstats_filtered), chrom_pat),verbose=verbose)
|
|
572
|
+
return sumstats_filtered
|
|
573
|
+
|
|
574
|
+
def _load_variants_with_pattern(inpath,usecols,dtype_dictionary,readargs,rename_dictionary,snpid_pat,log,verbose):
|
|
575
|
+
|
|
576
|
+
sumstats_iter = pd.read_table(inpath,
|
|
577
|
+
usecols=set(usecols),
|
|
578
|
+
dtype=dtype_dictionary,
|
|
579
|
+
iterator=True,
|
|
580
|
+
chunksize=500000,
|
|
581
|
+
**readargs)
|
|
582
|
+
# get chr
|
|
583
|
+
for k,v in rename_dictionary.items():
|
|
584
|
+
if v=="SNPID":
|
|
585
|
+
if k in usecols:
|
|
586
|
+
log.write(" -Columns used to filter variants: {}".format(k),verbose=verbose)
|
|
587
|
+
chunk_snpid = k
|
|
588
|
+
break
|
|
589
|
+
|
|
590
|
+
log.write(" -Loading only variants with pattern : {} ...".format(snpid_pat),verbose=verbose)
|
|
591
|
+
sumstats_filtered = pd.concat([chunk[chunk[chunk_snpid].str.match(snpid_pat, case=False,na=False) ] for chunk in sumstats_iter])
|
|
592
|
+
log.write(" -Loaded {} variants with pattern : {} ...".format(len(sumstats_filtered), snpid_pat),verbose=verbose)
|
|
593
|
+
return sumstats_filtered
|
|
@@ -198,16 +198,29 @@ def read_greml(filelist=[]):
|
|
|
198
198
|
return summary
|
|
199
199
|
|
|
200
200
|
def parse_ldsc_summary(ldsc_summary):
|
|
201
|
-
|
|
201
|
+
|
|
202
202
|
lines = ldsc_summary.split("\n")
|
|
203
|
+
|
|
204
|
+
columns = ['h2_obs', 'h2_se','Lambda_gc','Mean_chi2','Intercept','Intercept_se',"Ratio","Ratio_se","Catagories"]
|
|
205
|
+
|
|
206
|
+
summary = pd.DataFrame(columns = columns)
|
|
207
|
+
|
|
203
208
|
row={}
|
|
209
|
+
|
|
204
210
|
try:
|
|
205
211
|
objects = re.compile('[a-zA-Z\s\d]+:|[-0-9.]+[e]?[-0-9.]+|NA').findall(lines[0])
|
|
206
212
|
row["h2_obs"]=objects[1]
|
|
207
213
|
row["h2_se"]=objects[2]
|
|
208
214
|
|
|
209
|
-
##
|
|
215
|
+
##check categories
|
|
216
|
+
if len(lines) == 6:
|
|
217
|
+
objects = re.compile(' -Categories:(.+)').findall(lines[1])
|
|
218
|
+
row["Catagories"] = objects[0].strip()
|
|
219
|
+
lines.pop(1)
|
|
220
|
+
else:
|
|
221
|
+
row["Catagories"] = "NA"
|
|
210
222
|
|
|
223
|
+
##next line lambda gc
|
|
211
224
|
objects = re.compile('[a-zA-Z\s\d]+:|[-0-9.]+[e]?[-0-9.]+|NA').findall(lines[1])
|
|
212
225
|
row["Lambda_gc"] = objects[1]
|
|
213
226
|
##next line Mean_chi2
|
|
@@ -240,6 +253,7 @@ def parse_ldsc_summary(ldsc_summary):
|
|
|
240
253
|
row["Intercept_se"]="NA"
|
|
241
254
|
row["Ratio"]="NA"
|
|
242
255
|
row["Ratio_se"]="NA"
|
|
256
|
+
row["Catagories"] = "NA"
|
|
243
257
|
|
|
244
258
|
#summary = summary.append(row,ignore_index=True)
|
|
245
259
|
row = pd.DataFrame([row], columns = summary.columns)
|
|
@@ -342,7 +342,7 @@ def tofmt(sumstats,
|
|
|
342
342
|
meta_data,rename_dictionary = get_format_dict(fmt,inverse=True)
|
|
343
343
|
print_format_info(fmt=fmt, meta_data=meta_data,rename_dictionary=rename_dictionary,verbose=verbose, log=log, output=True)
|
|
344
344
|
|
|
345
|
-
|
|
345
|
+
yaml_path = path + "."+suffix+".tsv-meta.yaml"
|
|
346
346
|
path = path + "."+suffix+".tsv.gz"
|
|
347
347
|
log.write(" -Output path:",path, verbose=verbose)
|
|
348
348
|
|
|
@@ -361,7 +361,7 @@ def tofmt(sumstats,
|
|
|
361
361
|
md5_value = calculate_md5sum_file(path)
|
|
362
362
|
|
|
363
363
|
## update ssf-style meta data and export to yaml file
|
|
364
|
-
_configure_ssf_meta(sumstats, fmt, ssfmeta, meta, meta_data, path, md5_value,
|
|
364
|
+
_configure_ssf_meta(sumstats, fmt, ssfmeta, meta, meta_data, path, md5_value, yaml_path, log, verbose)
|
|
365
365
|
|
|
366
366
|
return sumstats
|
|
367
367
|
|
|
@@ -476,7 +476,7 @@ def _configure_output_cols_and_args(sumstats, rename_dictionary, cols, no_status
|
|
|
476
476
|
return sumstats, to_csvargs
|
|
477
477
|
|
|
478
478
|
|
|
479
|
-
def _configure_ssf_meta(sumstats, fmt, ssfmeta, meta, meta_data, path, md5_value,
|
|
479
|
+
def _configure_ssf_meta(sumstats, fmt, ssfmeta, meta, meta_data, path, md5_value, yaml_path, log, verbose):
|
|
480
480
|
### calculate meta data
|
|
481
481
|
if "EAF" in sumstats.columns:
|
|
482
482
|
min_maf = sumstats["EAF"].min()
|
|
@@ -506,8 +506,8 @@ def _configure_ssf_meta(sumstats, fmt, ssfmeta, meta, meta_data, path, md5_value
|
|
|
506
506
|
sumstats_meta_copy["gwaslab"]["samples"]["sample_size_min"] = n_min
|
|
507
507
|
sumstats_meta_copy["gwaslab"]["samples"]["sample_size_median"] = n_median
|
|
508
508
|
sumstats_meta_copy["gwaslab"]["variants"]["variant_number"] = len(sumstats)
|
|
509
|
-
log.write(" -Exporting SSF-style meta data to {}".format(
|
|
510
|
-
with open(
|
|
509
|
+
log.write(" -Exporting SSF-style meta data to {}".format(yaml_path),verbose=verbose)
|
|
510
|
+
with open(yaml_path, 'w') as outfile:
|
|
511
511
|
yaml.dump(sumstats_meta_copy, outfile)
|
|
512
512
|
|
|
513
513
|
|
|
@@ -5,6 +5,7 @@ import numpy as np
|
|
|
5
5
|
from itertools import repeat
|
|
6
6
|
from multiprocessing import Pool
|
|
7
7
|
from liftover import get_lifter
|
|
8
|
+
from liftover import ChainFile
|
|
8
9
|
from functools import partial
|
|
9
10
|
from gwaslab.g_vchange_status import vchange_status
|
|
10
11
|
from gwaslab.g_vchange_status import status_match
|
|
@@ -19,6 +20,7 @@ from gwaslab.g_version import _get_version
|
|
|
19
20
|
from gwaslab.util_in_fill_data import _convert_betase_to_mlog10p
|
|
20
21
|
from gwaslab.util_in_fill_data import _convert_betase_to_p
|
|
21
22
|
from gwaslab.util_in_fill_data import _convert_mlog10p_to_p
|
|
23
|
+
from gwaslab.bd_common_data import get_chain
|
|
22
24
|
#process build
|
|
23
25
|
#setbuild
|
|
24
26
|
#fixID
|
|
@@ -43,9 +45,15 @@ def _process_build(build,log,verbose):
|
|
|
43
45
|
if str(build).lower() in ["hg19","19","37","b37","grch37"]:
|
|
44
46
|
log.write(" -Genomic coordinates are based on GRCh37/hg19...", verbose=verbose)
|
|
45
47
|
final_build = "19"
|
|
48
|
+
elif str(build).lower() in ["hg18","18","36","b36","grch36"]:
|
|
49
|
+
log.write(" -Genomic coordinates are based on GRCh36/hg18...", verbose=verbose)
|
|
50
|
+
final_build = "18"
|
|
46
51
|
elif str(build).lower() in ["hg38","38","b38","grch38"]:
|
|
47
52
|
log.write(" -Genomic coordinates are based on GRCh38/hg38...", verbose=verbose)
|
|
48
53
|
final_build = "38"
|
|
54
|
+
elif str(build).lower() in ["t2t","hs1","chm13","13"]:
|
|
55
|
+
log.write(" -Genomic coordinates are based on T2T-CHM13...", verbose=verbose)
|
|
56
|
+
final_build = "13"
|
|
49
57
|
else:
|
|
50
58
|
log.warning("Version of genomic coordinates is unknown...", verbose=verbose)
|
|
51
59
|
final_build = "99"
|
|
@@ -358,6 +366,76 @@ def fixID(sumstats,
|
|
|
358
366
|
|
|
359
367
|
""
|
|
360
368
|
|
|
369
|
+
def stripSNPID(sumstats,snpid="SNPID",overwrite=False,verbose=True,log=Log()):
    '''
    Strip flanking text from SNPIDs, keeping only the CHR:POS:Allele:Allele core.

    SNPIDs that contain a (xxx:)CHR:POS:ATCG_Allele:ATCG_Allele(:xxx) pattern
    (separators ":", "_" or "-") are replaced by the matched core; a leading
    "chr" prefix (any case) is dropped, e.g. chr1:123:A:T -> 1:123:A:T.
    SNPIDs that do not contain the pattern, including NA, are left untouched.
    No other column is modified.

    sumstats : DataFrame holding the column named by `snpid`; modified in place and returned.
    snpid : name of the SNPID column.
    overwrite : accepted for interface compatibility; not used by this function.
    verbose, log : verbosity switch and logger.
    '''
    ##start function with col checking##########################################################
    _start_line = "strip SNPID"
    _end_line = "stripping SNPID"
    _start_cols =["SNPID"]
    _start_function = ".strip_snpid()"
    _must_args ={}

    is_enough_info = start_to(sumstats=sumstats,
                              log=log,
                              verbose=verbose,
                              start_line=_start_line,
                              end_line=_end_line,
                              start_cols=_start_cols,
                              start_function=_start_function,
                              **_must_args)
    if is_enough_info == False: return sumstats

    log.write(" -Checking if SNPID is (xxx:)CHR:POS:ATCG_Allele:ATCG_Allele(:xxx)...(separator: - ,: , _)",verbose=verbose)
    # NA SNPIDs evaluate to False here (na=False), so they are never rewritten
    is_chrposrefalt = sumstats[snpid].str.contains(r'[:_-]?\w+[:_-]\d+[:_-][ATCG]+[:_-][ATCG]+[:_-]?', case=False, flags=0, na=False)

    log.write(" -Stripping {} non-NA fixable SNPIDs...".format(is_chrposrefalt.sum()),verbose=verbose)

    # strip: capture group 0 is the optional "chr" prefix, group 1 is the core ID to keep
    stripped = sumstats.loc[is_chrposrefalt,snpid].str.extract(r'[:_-]?(chr)?(\w+[:_-]\d+[:_-][ATCG]+[:_-][ATCG]+)[:_-]?',flags=re.IGNORECASE|re.ASCII)[1].astype("string")
    sumstats.loc[is_chrposrefalt,snpid] = stripped

    finished(log,verbose,_end_line)
    return sumstats
|
|
402
|
+
|
|
403
|
+
def flipSNPID(sumstats,snpid="SNPID",overwrite=False,verbose=True,log=Log()):
    '''
    Flip the two alleles inside SNPID: CHR:POS:EA:NEA -> CHR:POS:NEA:EA

    Only SNPIDs that fully match CHR:POS:ATCG_Allele:ATCG_Allele (separators
    ":", "_" or "-", case-insensitive) are changed. In the result the separator
    between CHR and POS is kept as it was, the two allele separators are
    written as ":", and a leading "chr" prefix is dropped.
    Only the SNPID text is changed; EA, NEA, STATUS and statistics are untouched.

    Parameters
    ----------
    sumstats : pandas.DataFrame holding the SNPID column (modified in place).
    snpid : name of the column to flip.
    overwrite : accepted for interface compatibility; not used by this function.
    verbose : passed to the logger calls.
    log : Log object used for messages.

    Returns
    -------
    The same sumstats object; only rows whose SNPID matches the pattern change.
    '''
    ##start function with col checking##########################################################
    _start_line = "flip SNPID"
    _end_line = "flipping SNPID"
    # NOTE(review): the check uses the literal "SNPID" column, not the `snpid` argument - confirm this is intended.
    _start_cols =["SNPID"]
    _start_function = ".flip_snpid()"
    _must_args ={}

    is_enough_info = start_to(sumstats=sumstats,
                              log=log,
                              verbose=verbose,
                              start_line=_start_line,
                              end_line=_end_line,
                              start_cols=_start_cols,
                              start_function=_start_function,
                              **_must_args)
    if is_enough_info == False: return sumstats
    ############################################################################################

    # pass verbose so the warning follows the caller's setting like the other log calls
    log.warning("This function only flips alleles in SNPID without changing EA, NEA, STATUS or any statistics.",verbose=verbose)
    log.write(" -Checking if SNPID is CHR:POS:ATCG_Allele:ATCG_Allele...(separator: - ,: , _)",verbose=verbose)

    # Parse every SNPID once; columns: 0 = optional "chr", 1 = CHR<sep>POS,
    # 2 = first allele, 3 = second allele. Non-matching / NA rows are all-NA.
    flip_pattern = r'^(chr)?(\w+[:_-]\d+)[:_-]([ATCG]+)[:_-]([ATCG]+)$'
    parts = sumstats[snpid].str.extract(flip_pattern, flags=re.IGNORECASE|re.ASCII)

    # derive the mask from the extraction itself so counted rows == rewritten rows
    is_chrposrefalt = parts[1].notna()

    log.write(" -Flipping {} non-NA fixable SNPIDs...".format(int(is_chrposrefalt.sum())),verbose=verbose)

    # flip: CHR<sep>POS + ":" + second allele + ":" + first allele
    matched = parts.loc[is_chrposrefalt,:]
    sumstats.loc[is_chrposrefalt,snpid] = \
        matched[1].astype("string") \
        + ":" + matched[3].astype("string") \
        + ":" + matched[2].astype("string")

    finished(log,verbose,_end_line)
    return sumstats
|
|
361
439
|
|
|
362
440
|
###############################################################################################################
|
|
363
441
|
# 20230128
|
|
@@ -1041,7 +1119,7 @@ def check_range(sumstats, var_range, header, coltocheck, cols_to_check, log, ver
|
|
|
1041
1119
|
cols_to_check.append(header)
|
|
1042
1120
|
if header=="STATUS":
|
|
1043
1121
|
log.write(" -Checking STATUS and converting STATUS to categories....", verbose=verbose)
|
|
1044
|
-
categories = {str(j+i) for j in [1900000,3800000,9700000,9800000,9900000] for i in range(0,100000)}
|
|
1122
|
+
categories = {str(j+i) for j in [1300000,1800000,1900000,3800000,9700000,9800000,9900000] for i in range(0,100000)}
|
|
1045
1123
|
sumstats[header] = pd.Categorical(sumstats[header],categories=categories)
|
|
1046
1124
|
return sumstats
|
|
1047
1125
|
|
|
@@ -1496,8 +1574,20 @@ def liftover_variant(sumstats,
|
|
|
1496
1574
|
pos="POS",
|
|
1497
1575
|
status="STATUS",
|
|
1498
1576
|
from_build="19",
|
|
1499
|
-
to_build="38"
|
|
1500
|
-
|
|
1577
|
+
to_build="38",
|
|
1578
|
+
chain=None):
|
|
1579
|
+
|
|
1580
|
+
try:
|
|
1581
|
+
if chain is None:
|
|
1582
|
+
converter = get_lifter(from_build,to_build,one_based=True)
|
|
1583
|
+
else:
|
|
1584
|
+
converter = ChainFile(chain, one_based=True)
|
|
1585
|
+
except:
|
|
1586
|
+
if chain is None:
|
|
1587
|
+
converter = get_lifter(from_build,to_build)
|
|
1588
|
+
else:
|
|
1589
|
+
converter = ChainFile(chain)
|
|
1590
|
+
|
|
1501
1591
|
dic= get_number_to_chr(in_chr=False,xymt=["X","Y","M"])
|
|
1502
1592
|
dic2= get_chr_to_number(out_chr=False)
|
|
1503
1593
|
for i in sumstats[chrom].unique():
|
|
@@ -1509,7 +1599,7 @@ def liftover_variant(sumstats,
|
|
|
1509
1599
|
sumstats.loc[variants_on_chrom_to_convert,chrom] = lifted.str[0].map(dic2).astype("Int64")
|
|
1510
1600
|
return sumstats
|
|
1511
1601
|
|
|
1512
|
-
def parallelizeliftovervariant(sumstats,n_cores=1,chrom="CHR", pos="POS", from_build="19", to_build="38",status="STATUS",remove=True, verbose=True,log=Log()):
|
|
1602
|
+
def parallelizeliftovervariant(sumstats,n_cores=1,chrom="CHR", pos="POS", from_build="19", to_build="38",status="STATUS",remove=True,chain=None, verbose=True,log=Log()):
|
|
1513
1603
|
##start function with col checking##########################################################
|
|
1514
1604
|
_start_line = "perform liftover"
|
|
1515
1605
|
_end_line = "liftover"
|
|
@@ -1528,8 +1618,21 @@ def parallelizeliftovervariant(sumstats,n_cores=1,chrom="CHR", pos="POS", from_b
|
|
|
1528
1618
|
**_must_args)
|
|
1529
1619
|
if is_enough_info == False: return sumstats
|
|
1530
1620
|
############################################################################################
|
|
1621
|
+
|
|
1622
|
+
lifter_from_build = _process_build(from_build,log=log,verbose=False)
|
|
1623
|
+
lifter_to_build = _process_build(to_build,log=log,verbose=False)
|
|
1531
1624
|
|
|
1532
|
-
|
|
1625
|
+
if chain is not None:
|
|
1626
|
+
log.write(" -Creating converter using ChainFile: {}".format(chain), verbose=verbose)
|
|
1627
|
+
else:
|
|
1628
|
+
try:
|
|
1629
|
+
chain = get_chain(from_build=from_build, to_build=to_build)
|
|
1630
|
+
log.write(" -Creating converter using ChainFile: {}".format(chain), verbose=verbose)
|
|
1631
|
+
except:
|
|
1632
|
+
chain = None
|
|
1633
|
+
lifter_from_build=from_build
|
|
1634
|
+
lifter_to_build=to_build
|
|
1635
|
+
log.write(" -Creating converter : {} -> {}".format(lifter_from_build, lifter_to_build), verbose=verbose)
|
|
1533
1636
|
# valid chr and pos
|
|
1534
1637
|
pattern = r"\w\w\w0\w\w\w"
|
|
1535
1638
|
to_lift = sumstats[status].str.match(pattern)
|
|
@@ -1545,11 +1648,10 @@ def parallelizeliftovervariant(sumstats,n_cores=1,chrom="CHR", pos="POS", from_b
|
|
|
1545
1648
|
pool = Pool(n_cores)
|
|
1546
1649
|
#df = pd.concat(pool.starmap(func, df_split))
|
|
1547
1650
|
func=liftover_variant
|
|
1548
|
-
sumstats[[chrom,pos,status]] = pd.concat(pool.map(partial(func,chrom=chrom,pos=pos,from_build=from_build,to_build=to_build,status=status),df_split))
|
|
1651
|
+
sumstats[[chrom,pos,status]] = pd.concat(pool.map(partial(func,chrom=chrom,pos=pos,from_build=from_build,to_build=to_build,status=status,chain=chain),df_split))
|
|
1549
1652
|
pool.close()
|
|
1550
1653
|
pool.join()
|
|
1551
1654
|
############################################################################
|
|
1552
|
-
|
|
1553
1655
|
unmap_num = len(sumstats.loc[sumstats[pos].isna(),:])
|
|
1554
1656
|
|
|
1555
1657
|
if remove is True:
|