PyPI - gwaslab - Versions diffs - 3.4.47__tar.gz → 3.4.49__tar.gz - Mend

gwaslab 3.4.47__tar.gz → 3.4.49__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.

This version of gwaslab might be problematic. Click here for more details.

Files changed (91)

{gwaslab-3.4.47/src/gwaslab.egg-info → gwaslab-3.4.49}/PKG-INFO RENAMED Viewed

@@ -1,6 +1,6 @@
 Metadata-Version: 2.1
 Name: gwaslab
-Version: 3.4.47
+Version: 3.4.49
 Summary: A collection of handy tools for GWAS SumStats
 Author-email: Yunye <yunye@gwaslab.com>
 Project-URL: Homepage, https://cloufield.github.io/gwaslab/
@@ -51,7 +51,7 @@ Warning: Known issues of GWASLab are summarized in [https://cloufield.github.io/
 ### install via pip
 ```
-pip install gwaslab==3.4.45
+pip install gwaslab==3.4.46
 ```
 ```python

{gwaslab-3.4.47 → gwaslab-3.4.49}/README.md RENAMED Viewed

@@ -23,7 +23,7 @@ Warning: Known issues of GWASLab are summarized in [https://cloufield.github.io/
 ### install via pip
 ```
-pip install gwaslab==3.4.45
+pip install gwaslab==3.4.46
 ```
 ```python

{gwaslab-3.4.47 → gwaslab-3.4.49}/pyproject.toml RENAMED Viewed

@@ -7,7 +7,7 @@ build-backend = "setuptools.build_meta"
 [project]
 name = "gwaslab"
-version = "3.4.47"
+version = "3.4.49"
 authors = [
   { name="Yunye", email="yunye@gwaslab.com" },
 ]

{gwaslab-3.4.47 → gwaslab-3.4.49}/src/gwaslab/bd_common_data.py RENAMED Viewed

@@ -274,7 +274,9 @@ def get_gtf(chrom, build="19",source="ensembl"):
         gtf = pd.DataFrame(columns=["seqname","start","end","strand","feature","gene_biotype","gene_id","gene_name"])
     return gtf
+def get_chain(from_build="19", to_build="38"):
+    chain_path = check_and_download("{}to{}".format(from_build, to_build))
+    return chain_path
 ####################################################################################################################
 def gtf_to_protein_coding(gtfpath,log=Log(),verbose=True):
     protein_coding_path = gtfpath[:-6]+"protein_coding.gtf.gz"

{gwaslab-3.4.47 → gwaslab-3.4.49}/src/gwaslab/data/reference.json RENAMED Viewed

@@ -90,12 +90,20 @@
     "1kg_dbsnp151_hg38_x_md5":"48c05eeb1454c0dd4cbee3cb26382e8e",
     "recombination_hg19":"https://www.dropbox.com/s/wbesl8haxknonuc/recombination_hg19.tar.gz?dl=1",
     "recombination_hg38":"https://www.dropbox.com/s/vuo8mvqx0fpibzj/recombination_hg38.tar.gz?dl=1",
-    "ensembl_hg19_gtf":"https://ftp.ensembl.org/pub/grch37/current/gtf/homo_sapiens/Homo_sapiens.GRCh37.87.chr.gtf.gz",
+    "ensembl_hg19_gtf":"https://ftp.ensembl.org/pub/grch37/release-87/gtf/homo_sapiens/Homo_sapiens.GRCh37.87.chr.gtf.gz",
     "ensembl_hg38_gtf":"https://ftp.ensembl.org/pub/release-109/gtf/homo_sapiens//Homo_sapiens.GRCh38.109.chr.gtf.gz",
     "refseq_hg19_gtf":"https://ftp.ncbi.nlm.nih.gov/refseq/H_sapiens/annotation/GRCh37_latest/refseq_identifiers/GRCh37_latest_genomic.gtf.gz",
     "refseq_hg38_gtf":"https://ftp.ncbi.nlm.nih.gov/refseq/H_sapiens/annotation/GRCh38_latest/refseq_identifiers/GRCh38_latest_genomic.gtf.gz",
     "testlink":"https://www.dropbox.com/s/8u7capwge0ihshu/EAS.chr22.split_norm_af.1kgp3v5.vcf.gz?dl=1",
-    "testlink_tbi":"https://www.dropbox.com/s/hdneg53t6u1j6ib/EAS.chr22.split_norm_af.1kgp3v5.vcf.gz.tbi?dl=1"
+    "testlink_tbi":"https://www.dropbox.com/s/hdneg53t6u1j6ib/EAS.chr22.split_norm_af.1kgp3v5.vcf.gz.tbi?dl=1",
+    "19to38":"https://hgdownload.soe.ucsc.edu/goldenPath/hg19/liftOver/hg19ToHg38.over.chain.gz",
+    "19to13":"https://s3-us-west-2.amazonaws.com/human-pangenomics/T2T/CHM13/assemblies/chain/v1_nflo/hg19-chm13v2.chain",
+    "38to19":"https://hgdownload.soe.ucsc.edu/goldenPath/hg38/liftOver/hg38ToHg19.over.chain.gz",
+    "38to13":"https://s3-us-west-2.amazonaws.com/human-pangenomics/T2T/CHM13/assemblies/chain/v1_nflo/grch38-chm13v2.chain",
+    "13to19":"https://s3-us-west-2.amazonaws.com/human-pangenomics/T2T/CHM13/assemblies/chain/v1_nflo/chm13v2-hg19.chain",
+    "13to38":"https://s3-us-west-2.amazonaws.com/human-pangenomics/T2T/CHM13/assemblies/chain/v1_nflo/chm13v2-grch38.chain",
+    "18to19":"https://hgdownload.soe.ucsc.edu/goldenPath/hg18/liftOver/hg18ToHg19.over.chain.gz",
+    "18to38":"https://hgdownload.soe.ucsc.edu/goldenPath/hg18/liftOver/hg18ToHg38.over.chain.gz"
 }

{gwaslab-3.4.47 → gwaslab-3.4.49}/src/gwaslab/g_Sumstats.py RENAMED Viewed

@@ -8,6 +8,8 @@ from gwaslab.io_preformat_input import preformat
 from gwaslab.io_to_formats import _to_format
 from gwaslab.g_Log import Log
 from gwaslab.qc_fix_sumstats import fixID
+from gwaslab.qc_fix_sumstats import flipSNPID
+from gwaslab.qc_fix_sumstats import stripSNPID
 from gwaslab.qc_fix_sumstats import removedup
 from gwaslab.qc_fix_sumstats import fixchr
 from gwaslab.qc_fix_sumstats import fixpos
@@ -76,6 +78,8 @@ from gwaslab.util_ex_ldsc import _estimate_rg_by_ldsc
 from gwaslab.util_ex_ldsc import _estimate_h2_cts_by_ldsc
 from gwaslab.util_ex_ldsc import _estimate_partitioned_h2_by_ldsc
 from gwaslab.bd_get_hapmap3 import gethapmap3
+from gwaslab.util_abf_finemapping import abf_finemapping
+from gwaslab.util_abf_finemapping import make_cs
 import gc
 #20220309
@@ -121,6 +125,8 @@ class Sumstats():
              snpr2=None,
              status=None,
              other=[],
+             chrom_pat=None,
+             snpid_pat=None,
              usekeys=None,
              direction=None,
              verbose=True,
@@ -135,6 +141,7 @@ class Sumstats():
         self.data = pd.DataFrame()
         self.log = Log()
         self.ldsc_h2 = None
+        self.ldsc_h2_results = None
         self.ldsc_rg = None
         self.ldsc_h2_cts = None
         self.ldsc_partitioned_h2_summary = None
@@ -202,6 +209,8 @@ class Sumstats():
           status=status,
           other=other,
           usekeys=usekeys,
+          chrom_pat=chrom_pat,
+          snpid_pat=snpid_pat,
           verbose=verbose,
           readargs=readargs,
           log=self.log)
@@ -415,6 +424,10 @@ class Sumstats():
     #customizable API to build your own QC pipeline
     def fix_id(self,**kwargs):
         self.data = fixID(self.data,log=self.log,**kwargs)
+    def flip_snpid(self,**kwargs):
+        self.data = flipSNPID(self.data,log=self.log,**kwargs)
+    def strip_snpid(self,**kwargs):
+        self.data = stripSNPID(self.data,log=self.log,**kwargs)
     def fix_chr(self,**kwargs):
         self.data = fixchr(self.data,log=self.log,**kwargs)
     def fix_pos(self,**kwargs):
@@ -756,13 +769,20 @@ class Sumstats():
         else:
             output = lambdaGC(self.data[["CHR",mode]],mode=mode,**kwargs)
             self.meta["Genomic inflation factor"] = output
-            return output
+            return output
+    def abf_finemapping(self, region=None, chrpos=None, snpid=None,**kwargs):
+        region_data = abf_finemapping(self.data.copy(),region=region,chrpos=chrpos,snpid=snpid,log=self.log, **kwargs)
+        credible_sets = make_cs(region_data,threshold=0.95,log=self.log)
+        return region_data, credible_sets
 ## LDSC ##############################################################################################
     def estimate_h2_by_ldsc(self, build=None, verbose=True, match_allele=True, **kwargs):
         if build is None:
             build = self.meta["gwaslab"]["genome_build"]
         insumstats = gethapmap3(self.data.copy(), build=build, verbose=verbose , match_allele=True, how="right" )
-        self.ldsc_h2 = _estimate_h2_by_ldsc(insumstats=insumstats, log=self.log, verbose=verbose, **kwargs)
+        self.ldsc_h2, self.ldsc_h2_results = _estimate_h2_by_ldsc(insumstats=insumstats, log=self.log, verbose=verbose, **kwargs)
     def estimate_rg_by_ldsc(self, build=None, verbose=True, match_allele=True, **kwargs):
         if build is None:

{gwaslab-3.4.47 → gwaslab-3.4.49}/src/gwaslab/g_vchange_status.py RENAMED Viewed

@@ -1,6 +1,6 @@
 import pandas as pd
-CATEGORIES = {str(j+i) for j in [1900000,3800000,9700000,9800000,9900000] for i in range(0,100000)}
+CATEGORIES = {str(j+i) for j in [1300000,1800000,1900000,3800000,9700000,9800000,9900000] for i in range(0,100000)}
 def vchange_status(status,digit,before,after):
     dic={}

{gwaslab-3.4.47 → gwaslab-3.4.49}/src/gwaslab/g_version.py RENAMED Viewed

@@ -15,8 +15,8 @@ def _get_version():
 def gwaslab_info():
     # version meta information
     dic={
-   "version":"3.4.47",
-   "release_date":"20240703"
+   "version":"3.4.49",
+   "release_date":"20241019"
     }
     return dic

{gwaslab-3.4.47 → gwaslab-3.4.49}/src/gwaslab/hm_harmonize_sumstats.py RENAMED Viewed

@@ -21,6 +21,7 @@ from gwaslab.qc_check_datatype import check_dataframe_shape
 from gwaslab.bd_common_data import get_number_to_chr
 from gwaslab.bd_common_data import get_chr_list
 from gwaslab.bd_common_data import get_chr_to_number
+from gwaslab.bd_common_data import get_number_to_NC
 from gwaslab.bd_common_data import _maketrans
 from gwaslab.g_vchange_status import vchange_status
 from gwaslab.g_version import _get_version
@@ -355,7 +356,7 @@ def oldcheckref(sumstats,ref_seq,chrom="CHR",pos="POS",ea="EA",nea="NEA",status=
     log.write("\n",end="",show_time=False,verbose=verbose)
-    CATEGORIES = {str(j+i) for j in [1900000,3800000,9700000,9800000,9900000] for i in range(0,100000)}
+    CATEGORIES = {str(j+i) for j in [1300000,1800000,1900000,3800000,9700000,9800000,9900000] for i in range(0,100000)}
     sumstats[status] = pd.Categorical(sumstats[status],categories=CATEGORIES)
     #sumstats[status] = sumstats[status].astype("string")
@@ -674,7 +675,7 @@ def checkref(sumstats,ref_seq,chrom="CHR",pos="POS",ea="EA",nea="NEA",status="ST
         sumstats.loc[to_check_ref,status] = check_status(sumstats_to_check, all_records_dict, log=log, verbose=verbose)
         log.write(" -Finished checking records", verbose=verbose)
-    CATEGORIES = {str(j+i) for j in [1900000,3800000,9700000,9800000,9900000] for i in range(0,100000)}
+    CATEGORIES = {str(j+i) for j in [1300000,1800000,1900000,3800000,9700000,9800000,9900000] for i in range(0,100000)}
     sumstats[status] = pd.Categorical(sumstats[status],categories=CATEGORIES)
     #sumstats[status] = sumstats[status].astype("string")
@@ -1496,17 +1497,21 @@ def infer_af(chr,start,end,ref,alt,vcf_reader,alt_freq,chr_dict=None):
 def auto_check_vcf_chr_dict(vcf_path, vcf_chr_dict, verbose, log):
     if vcf_path is not None:
         if vcf_chr_dict is None:
-            log.write(" -Checking prefix for chromosomes in vcf files..." ,verbose=verbose)
-            prefix = check_vcf_chr_prefix(vcf_path)
+            log.write(" -Checking chromosome notations in VCF/BCF files..." ,verbose=verbose)
+            vcf_chr_dict = check_vcf_chr_NC(vcf_path, log, verbose)
+            if vcf_chr_dict is not None:
+                return vcf_chr_dict
+            log.write(" -Checking prefix for chromosomes in VCF/BCF files..." ,verbose=verbose)
+            prefix = check_vcf_chr_prefix(vcf_path, log,verbose)
             if prefix is not None:
                 log.write(" -Prefix for chromosomes: ",prefix)
                 vcf_chr_dict = get_number_to_chr(prefix=prefix)
             else:
-                log.write(" -No prefix for chromosomes in the VCF files." ,verbose=verbose)
+                log.write(" -No prefix for chromosomes in the VCF/BCF files." ,verbose=verbose)
                 vcf_chr_dict = get_number_to_chr()
     return vcf_chr_dict
-def check_vcf_chr_prefix(vcf_bcf_path):
+def check_vcf_chr_prefix(vcf_bcf_path,log,verbose):
     vcf_bcf = VariantFile(vcf_bcf_path)
     for i in list(vcf_bcf.header.contigs):
         m = re.search('(chr|Chr|CHR)([0-9xXyYmM]+)', i)
@@ -1514,5 +1519,16 @@ def check_vcf_chr_prefix(vcf_bcf_path):
             return m.group(1)
     else:
         return None
+def check_vcf_chr_NC(vcf_bcf_path,log,verbose):
+    vcf_bcf = VariantFile(vcf_bcf_path)
+    for i in list(vcf_bcf.header.contigs):
+        if i in get_number_to_NC(build="19").values():
+            log.write("  -RefSeq ID detected (hg19) in VCF/BCF...",verbose=verbose)
+            return get_number_to_NC(build="19")
+        elif i in get_number_to_NC(build="38").values():
+            log.write("  -RefSeq ID detected (hg38) in VCF/BCF...",verbose=verbose)
+            return get_number_to_NC(build="38")
+    else:
+        return None

{gwaslab-3.4.47 → gwaslab-3.4.49}/src/gwaslab/io_preformat_input.py RENAMED Viewed

@@ -56,6 +56,8 @@ def preformat(sumstats,
           build=None,
           other=[],
           usekeys=None,
+          chrom_pat=None,
+          snpid_pat=None,
           verbose=False,
           readargs=None,
           log=None):
@@ -84,7 +86,10 @@ def preformat(sumstats,
         if "format_separator" in meta_data.keys():
             if "sep" not in readargs.keys():
                 readargs["sep"] = meta_data["format_separator"]
+            else:
+                if readargs["sep"] != meta_data["format_separator"]:
+                    log.write('  - format_separator will be changed to: "{}"'.format(readargs["sep"]),verbose=verbose)
         if "format_na" in meta_data.keys():
             readargs["na_values"] = meta_data["format_na"]
@@ -92,7 +97,7 @@ def preformat(sumstats,
             readargs["comment"] = meta_data["format_comment"]
         if "sep" not in readargs.keys():
-             readargs["sep"] = "\t"
+            readargs["sep"] = "\t"
 #########################################################################################################################################################
@@ -323,10 +328,30 @@ def preformat(sumstats,
                 skip_rows = get_skip_rows(inpath)
                 readargs["skiprows"] = skip_rows
                 log.write("Start to initialize gl.Sumstats from file :" + inpath,verbose=verbose)
-                sumstats = pd.read_table(inpath,
-                                 usecols=set(usecols),
-                                 dtype=dtype_dictionary,
-                                 **readargs)
+                if chrom_pat is not None:
+                    sumstats = _load_single_chr(inpath,
+                                                usecols,
+                                                dtype_dictionary,
+                                                readargs=readargs,
+                                                rename_dictionary=rename_dictionary,
+                                                chrom_pat=chrom_pat,
+                                                log=log,
+                                                verbose=verbose)
+                elif snpid_pat is not None:
+                    sumstats = _load_variants_with_pattern(inpath,
+                                                usecols,
+                                                dtype_dictionary,
+                                                readargs=readargs,
+                                                rename_dictionary=rename_dictionary,
+                                                snpid_pat=snpid_pat,
+                                                log=log,
+                                                verbose=verbose)
+                else:
+                    sumstats = pd.read_table(inpath,
+                                    usecols=set(usecols),
+                                    dtype=dtype_dictionary,
+                                    **readargs)
         elif type(sumstats) is pd.DataFrame:
             ## loading data from dataframe
@@ -520,9 +545,49 @@ def process_status(sumstats,build,log,verbose):
     #sumstats["STATUS"] = int(build)*(10**5) +99999
     build = _process_build(build,log,verbose)
     sumstats["STATUS"] = build +"99999"
-    categories = {str(j+i) for j in [1900000,3800000,9700000,9800000,9900000] for i in range(0,100000)}
+    categories = {str(j+i) for j in [1300000,1800000,1900000,3800000,9700000,9800000,9900000] for i in range(0,100000)}
     sumstats["STATUS"] = pd.Categorical(sumstats["STATUS"],categories=categories)
     return sumstats
+def _load_single_chr(inpath,usecols,dtype_dictionary,readargs,rename_dictionary,chrom_pat,log,verbose):
+    sumstats_iter = pd.read_table(inpath,
+                usecols=set(usecols),
+                dtype=dtype_dictionary,
+                iterator=True,
+                chunksize=500000,
+                **readargs)
+    # get chr
+    for k,v in rename_dictionary.items():
+        if v=="CHR":
+            if k in usecols:
+                log.write(" -Columns used to filter variants: {}".format(k),verbose=verbose)
+                chunk_chrom = k
+                break
+    log.write(" -Loading only variants on chromosome with pattern : {} ...".format(chrom_pat),verbose=verbose)
+    sumstats_filtered = pd.concat([chunk[chunk[chunk_chrom].str.match(chrom_pat, case=False,na=False) ] for chunk in sumstats_iter])
+    log.write(" -Loaded {} variants on chromosome with pattern :{} ...".format(len(sumstats_filtered), chrom_pat),verbose=verbose)
+    return sumstats_filtered
+def _load_variants_with_pattern(inpath,usecols,dtype_dictionary,readargs,rename_dictionary,snpid_pat,log,verbose):
+    sumstats_iter = pd.read_table(inpath,
+                usecols=set(usecols),
+                dtype=dtype_dictionary,
+                iterator=True,
+                chunksize=500000,
+                **readargs)
+    # get chr
+    for k,v in rename_dictionary.items():
+        if v=="SNPID":
+            if k in usecols:
+                log.write(" -Columns used to filter variants: {}".format(k),verbose=verbose)
+                chunk_snpid = k
+                break
+    log.write(" -Loading only variants with pattern :  {} ...".format(snpid_pat),verbose=verbose)
+    sumstats_filtered = pd.concat([chunk[chunk[chunk_snpid].str.match(snpid_pat, case=False,na=False) ] for chunk in sumstats_iter])
+    log.write(" -Loaded {} variants with pattern : {} ...".format(len(sumstats_filtered), snpid_pat),verbose=verbose)
+    return sumstats_filtered

{gwaslab-3.4.47 → gwaslab-3.4.49}/src/gwaslab/io_read_ldsc.py RENAMED Viewed

@@ -198,16 +198,29 @@ def read_greml(filelist=[]):
     return summary
 def parse_ldsc_summary(ldsc_summary):
-    summary = pd.DataFrame(columns = ['h2_obs', 'h2_se','Lambda_gc','Mean_chi2','Intercept','Intercept_se',"Ratio","Ratio_se"])
     lines = ldsc_summary.split("\n")
+    columns = ['h2_obs', 'h2_se','Lambda_gc','Mean_chi2','Intercept','Intercept_se',"Ratio","Ratio_se","Catagories"]
+    summary = pd.DataFrame(columns = columns)
     row={}
     try:
         objects = re.compile('[a-zA-Z\s\d]+:|[-0-9.]+[e]?[-0-9.]+|NA').findall(lines[0])
         row["h2_obs"]=objects[1]
         row["h2_se"]=objects[2]
-        ##next line lambda gc
+        ##check categories
+        if len(lines) == 6:
+            objects = re.compile('  -Categories:(.+)').findall(lines[1])
+            row["Catagories"] = objects[0].strip()
+            lines.pop(1)
+        else:
+            row["Catagories"] = "NA"
+        ##next line lambda gc
         objects = re.compile('[a-zA-Z\s\d]+:|[-0-9.]+[e]?[-0-9.]+|NA').findall(lines[1])
         row["Lambda_gc"] = objects[1]
         ##next line Mean_chi2
@@ -240,6 +253,7 @@ def parse_ldsc_summary(ldsc_summary):
         row["Intercept_se"]="NA"
         row["Ratio"]="NA"
         row["Ratio_se"]="NA"
+        row["Catagories"] = "NA"
     #summary = summary.append(row,ignore_index=True)
     row = pd.DataFrame([row], columns = summary.columns)

{gwaslab-3.4.47 → gwaslab-3.4.49}/src/gwaslab/io_to_formats.py RENAMED Viewed

@@ -342,7 +342,7 @@ def tofmt(sumstats,
         meta_data,rename_dictionary = get_format_dict(fmt,inverse=True)
         print_format_info(fmt=fmt, meta_data=meta_data,rename_dictionary=rename_dictionary,verbose=verbose, log=log, output=True)
-        ymal_path = path + "."+suffix+".tsv-meta.ymal"
+        yaml_path = path + "."+suffix+".tsv-meta.yaml"
         path = path + "."+suffix+".tsv.gz"
         log.write(" -Output path:",path, verbose=verbose)
@@ -361,7 +361,7 @@ def tofmt(sumstats,
             md5_value = calculate_md5sum_file(path)
         ## update ssf-style meta data and export to yaml file
-        _configure_ssf_meta(sumstats, fmt, ssfmeta, meta, meta_data, path, md5_value, ymal_path, log, verbose)
+        _configure_ssf_meta(sumstats, fmt, ssfmeta, meta, meta_data, path, md5_value, yaml_path, log, verbose)
         return sumstats
@@ -476,7 +476,7 @@ def _configure_output_cols_and_args(sumstats, rename_dictionary, cols, no_status
     return sumstats, to_csvargs
-def _configure_ssf_meta(sumstats, fmt, ssfmeta, meta, meta_data, path, md5_value, ymal_path, log, verbose):
+def _configure_ssf_meta(sumstats, fmt, ssfmeta, meta, meta_data, path, md5_value, yaml_path, log, verbose):
     ### calculate meta data
     if "EAF" in sumstats.columns:
         min_maf = sumstats["EAF"].min()
@@ -506,8 +506,8 @@ def _configure_ssf_meta(sumstats, fmt, ssfmeta, meta, meta_data, path, md5_value
         sumstats_meta_copy["gwaslab"]["samples"]["sample_size_min"] = n_min
         sumstats_meta_copy["gwaslab"]["samples"]["sample_size_median"] = n_median
         sumstats_meta_copy["gwaslab"]["variants"]["variant_number"] = len(sumstats)
-        log.write(" -Exporting SSF-style meta data to {}".format(ymal_path),verbose=verbose)
-        with open(ymal_path, 'w') as outfile:
+        log.write(" -Exporting SSF-style meta data to {}".format(yaml_path),verbose=verbose)
+        with open(yaml_path, 'w') as outfile:
             yaml.dump(sumstats_meta_copy, outfile)

{gwaslab-3.4.47 → gwaslab-3.4.49}/src/gwaslab/qc_fix_sumstats.py RENAMED Viewed

@@ -5,6 +5,7 @@ import numpy as np
 from itertools import repeat
 from multiprocessing import  Pool
 from liftover import get_lifter
+from liftover import ChainFile
 from functools import partial
 from gwaslab.g_vchange_status import vchange_status
 from gwaslab.g_vchange_status import status_match
@@ -19,6 +20,7 @@ from gwaslab.g_version import _get_version
 from gwaslab.util_in_fill_data import _convert_betase_to_mlog10p
 from gwaslab.util_in_fill_data import _convert_betase_to_p
 from gwaslab.util_in_fill_data import _convert_mlog10p_to_p
+from gwaslab.bd_common_data import get_chain
 #process build
 #setbuild
 #fixID
@@ -43,9 +45,15 @@ def _process_build(build,log,verbose):
     if str(build).lower() in ["hg19","19","37","b37","grch37"]:
         log.write(" -Genomic coordinates are based on GRCh37/hg19...", verbose=verbose)
         final_build = "19"
+    elif str(build).lower() in ["hg18","18","36","b36","grch36"]:
+        log.write(" -Genomic coordinates are based on GRCh36/hg18...", verbose=verbose)
+        final_build = "18"
     elif str(build).lower() in ["hg38","38","b38","grch38"]:
         log.write(" -Genomic coordinates are based on GRCh38/hg38...", verbose=verbose)
         final_build = "38"
+    elif str(build).lower() in ["t2t","hs1","chm13","13"]:
+        log.write(" -Genomic coordinates are based on T2T-CHM13...", verbose=verbose)
+        final_build = "13"
     else:
         log.warning("Version of genomic coordinates is unknown...", verbose=verbose)
         final_build = "99"
@@ -358,6 +366,76 @@ def fixID(sumstats,
 ""
+def stripSNPID(sumstats,snpid="SNPID",overwrite=False,verbose=True,log=Log()):
+    '''
+    flip EA and NEA SNPid   CHR:POS:EA:NEA -> CHR:POS:NEA:EA
+    '''
+    ##start function with col checking##########################################################
+    _start_line = "strip SNPID"
+    _end_line = "stripping SNPID"
+    _start_cols =["SNPID"]
+    _start_function = ".strip_snpid()"
+    _must_args ={}
+    is_enough_info = start_to(sumstats=sumstats,
+                            log=log,
+                            verbose=verbose,
+                            start_line=_start_line,
+                            end_line=_end_line,
+                            start_cols=_start_cols,
+                            start_function=_start_function,
+                            **_must_args)
+    if is_enough_info == False: return sumstats
+    log.write(" -Checking if SNPID is (xxx:)CHR:POS:ATCG_Allele:ATCG_Allele(:xxx)...(separator: - ,: , _)",verbose=verbose)
+    is_chrposrefalt = sumstats[snpid].str.contains(r'[:_-]?\w+[:_-]\d+[:_-][ATCG]+[:_-][ATCG]+[:_-]?', case=False, flags=0, na=False)
+    # check if SNPID is NA
+    is_snpid_na = sumstats[snpid].isna()
+    log.write(" -Stripping {} non-NA fixable SNPIDs...".format(sum(is_chrposrefalt)),verbose=verbose)
+    # flip
+    sumstats.loc[is_chrposrefalt,snpid] = \
+        sumstats.loc[is_chrposrefalt,snpid].str.extract(r'[:_-]?(chr)?(\w+[:_-]\d+[:_-][ATCG]+[:_-][ATCG]+)[:_-]?',flags=re.IGNORECASE|re.ASCII)[1].astype("string")
+    finished(log,verbose,_end_line)
+    return sumstats
+def flipSNPID(sumstats,snpid="SNPID",overwrite=False,verbose=True,log=Log()):
+    '''
+    flip EA and NEA SNPid   CHR:POS:EA:NEA -> CHR:POS:NEA:EA
+    '''
+    ##start function with col checking##########################################################
+    _start_line = "flip SNPID"
+    _end_line = "flipping SNPID"
+    _start_cols =["SNPID"]
+    _start_function = ".flip_snpid()"
+    _must_args ={}
+    is_enough_info = start_to(sumstats=sumstats,
+                            log=log,
+                            verbose=verbose,
+                            start_line=_start_line,
+                            end_line=_end_line,
+                            start_cols=_start_cols,
+                            start_function=_start_function,
+                            **_must_args)
+    if is_enough_info == False: return sumstats
+    log.warning("This function only flips alleles in SNPID without changing EA, NEA, STATUS or any statistics.")
+    log.write(" -Checking if SNPID is CHR:POS:ATCG_Allele:ATCG_Allele...(separator: - ,: , _)",verbose=verbose)
+    is_chrposrefalt = sumstats[snpid].str.match(r'^\w+[:_-]\d+[:_-][ATCG]+[:_-][ATCG]+$', case=False, flags=0, na=False)
+    # check if SNPID is NA
+    is_snpid_na = sumstats[snpid].isna()
+    log.write(" -Flipping {} non-NA fixable SNPIDs...".format(sum(is_chrposrefalt)),verbose=verbose)
+    # flip
+    sumstats.loc[is_chrposrefalt,snpid] = \
+        sumstats.loc[is_chrposrefalt,snpid].str.extract(r'^(chr)?(\w+[:_-]\d+)[:_-]([ATCG]+)[:_-]([ATCG]+)$',flags=re.IGNORECASE|re.ASCII)[1].astype("string")  \
+        + ":"+sumstats.loc[is_chrposrefalt,snpid].str.extract(r'^(chr)?(\w+)[:_-](\d+)[:_-]([ATCG]+)[:_-]([ATCG]+)$',flags=re.IGNORECASE|re.ASCII)[4].astype("string") \
+        + ":"+sumstats.loc[is_chrposrefalt,snpid].str.extract(r'^(chr)?(\w+)[:_-](\d+)[:_-]([ATCG]+)[:_-]([ATCG]+)$',flags=re.IGNORECASE|re.ASCII)[3].astype("string")
+    finished(log,verbose,_end_line)
+    return sumstats
 ###############################################################################################################
 # 20230128
@@ -1041,7 +1119,7 @@ def check_range(sumstats, var_range, header, coltocheck, cols_to_check, log, ver
         cols_to_check.append(header)
         if header=="STATUS":
             log.write(" -Checking STATUS and converting STATUS to categories....", verbose=verbose)
-            categories = {str(j+i) for j in [1900000,3800000,9700000,9800000,9900000] for i in range(0,100000)}
+            categories = {str(j+i) for j in [1300000,1800000,1900000,3800000,9700000,9800000,9900000] for i in range(0,100000)}
             sumstats[header] = pd.Categorical(sumstats[header],categories=categories)
             return sumstats
@@ -1496,8 +1574,20 @@ def liftover_variant(sumstats,
              pos="POS",
              status="STATUS",
              from_build="19",
-             to_build="38"):
-    converter = get_lifter("hg"+from_build,"hg"+to_build)
+             to_build="38",
+             chain=None):
+    try:
+        if chain is None:
+            converter = get_lifter(from_build,to_build,one_based=True)
+        else:
+            converter = ChainFile(chain, one_based=True)
+    except:
+        if chain is None:
+            converter = get_lifter(from_build,to_build)
+        else:
+            converter = ChainFile(chain)
     dic= get_number_to_chr(in_chr=False,xymt=["X","Y","M"])
     dic2= get_chr_to_number(out_chr=False)
     for i in sumstats[chrom].unique():
@@ -1509,7 +1599,7 @@ def liftover_variant(sumstats,
         sumstats.loc[variants_on_chrom_to_convert,chrom]    =  lifted.str[0].map(dic2).astype("Int64")
     return sumstats
-def parallelizeliftovervariant(sumstats,n_cores=1,chrom="CHR", pos="POS", from_build="19", to_build="38",status="STATUS",remove=True, verbose=True,log=Log()):
+def parallelizeliftovervariant(sumstats,n_cores=1,chrom="CHR", pos="POS", from_build="19", to_build="38",status="STATUS",remove=True,chain=None, verbose=True,log=Log()):
     ##start function with col checking##########################################################
     _start_line = "perform liftover"
     _end_line = "liftover"
@@ -1528,8 +1618,21 @@ def parallelizeliftovervariant(sumstats,n_cores=1,chrom="CHR", pos="POS", from_b
                             **_must_args)
     if is_enough_info == False: return sumstats
     ############################################################################################
+    lifter_from_build = _process_build(from_build,log=log,verbose=False)
+    lifter_to_build = _process_build(to_build,log=log,verbose=False)
-    log.write(" -Creating converter : hg" + from_build +" to hg"+ to_build, verbose=verbose)
+    if chain is not None:
+        log.write(" -Creating converter using ChainFile: {}".format(chain), verbose=verbose)
+    else:
+        try:
+            chain = get_chain(from_build=from_build, to_build=to_build)
+            log.write(" -Creating converter using ChainFile: {}".format(chain), verbose=verbose)
+        except:
+            chain = None
+            lifter_from_build=from_build
+            lifter_to_build=to_build
+    log.write(" -Creating converter : {} -> {}".format(lifter_from_build, lifter_to_build), verbose=verbose)
     # valid chr and pos
     pattern = r"\w\w\w0\w\w\w"
     to_lift = sumstats[status].str.match(pattern)
@@ -1545,11 +1648,10 @@ def parallelizeliftovervariant(sumstats,n_cores=1,chrom="CHR", pos="POS", from_b
         pool = Pool(n_cores)
         #df = pd.concat(pool.starmap(func, df_split))
         func=liftover_variant
-        sumstats[[chrom,pos,status]] = pd.concat(pool.map(partial(func,chrom=chrom,pos=pos,from_build=from_build,to_build=to_build,status=status),df_split))
+        sumstats[[chrom,pos,status]] = pd.concat(pool.map(partial(func,chrom=chrom,pos=pos,from_build=from_build,to_build=to_build,status=status,chain=chain),df_split))
         pool.close()
         pool.join()
     ############################################################################
     unmap_num = len(sumstats.loc[sumstats[pos].isna(),:])
     if remove is True:

gwaslab 3.4.47__tar.gz → 3.4.49__tar.gz

Potentially problematic release.

gwaslab 3.4.47__tar.gz → 3.4.49__tar.gz