gwaslab 3.4.48__py3-none-any.whl → 3.5.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of gwaslab might be problematic. Click here for more details.

gwaslab/bd_common_data.py CHANGED
@@ -274,7 +274,9 @@ def get_gtf(chrom, build="19",source="ensembl"):
274
274
  gtf = pd.DataFrame(columns=["seqname","start","end","strand","feature","gene_biotype","gene_id","gene_name"])
275
275
  return gtf
276
276
 
277
-
277
def get_chain(from_build="19", to_build="38"):
    """Return the local path of the liftover chain file for a build pair.

    The download dictionary keys look like "19to38"; check_and_download
    presumably fetches the file when it is not cached locally — confirm.
    """
    return check_and_download("{}to{}".format(from_build, to_build))
278
280
  ####################################################################################################################
279
281
  def gtf_to_protein_coding(gtfpath,log=Log(),verbose=True):
280
282
  protein_coding_path = gtfpath[:-6]+"protein_coding.gtf.gz"
@@ -90,12 +90,20 @@
90
90
  "1kg_dbsnp151_hg38_x_md5":"48c05eeb1454c0dd4cbee3cb26382e8e",
91
91
  "recombination_hg19":"https://www.dropbox.com/s/wbesl8haxknonuc/recombination_hg19.tar.gz?dl=1",
92
92
  "recombination_hg38":"https://www.dropbox.com/s/vuo8mvqx0fpibzj/recombination_hg38.tar.gz?dl=1",
93
- "ensembl_hg19_gtf":"https://ftp.ensembl.org/pub/grch37/current/gtf/homo_sapiens/Homo_sapiens.GRCh37.87.chr.gtf.gz",
93
+ "ensembl_hg19_gtf":"https://ftp.ensembl.org/pub/grch37/release-87/gtf/homo_sapiens/Homo_sapiens.GRCh37.87.chr.gtf.gz",
94
94
  "ensembl_hg38_gtf":"https://ftp.ensembl.org/pub/release-109/gtf/homo_sapiens//Homo_sapiens.GRCh38.109.chr.gtf.gz",
95
95
  "refseq_hg19_gtf":"https://ftp.ncbi.nlm.nih.gov/refseq/H_sapiens/annotation/GRCh37_latest/refseq_identifiers/GRCh37_latest_genomic.gtf.gz",
96
96
  "refseq_hg38_gtf":"https://ftp.ncbi.nlm.nih.gov/refseq/H_sapiens/annotation/GRCh38_latest/refseq_identifiers/GRCh38_latest_genomic.gtf.gz",
97
97
  "testlink":"https://www.dropbox.com/s/8u7capwge0ihshu/EAS.chr22.split_norm_af.1kgp3v5.vcf.gz?dl=1",
98
- "testlink_tbi":"https://www.dropbox.com/s/hdneg53t6u1j6ib/EAS.chr22.split_norm_af.1kgp3v5.vcf.gz.tbi?dl=1"
98
+ "testlink_tbi":"https://www.dropbox.com/s/hdneg53t6u1j6ib/EAS.chr22.split_norm_af.1kgp3v5.vcf.gz.tbi?dl=1",
99
+ "19to38":"https://hgdownload.soe.ucsc.edu/goldenPath/hg19/liftOver/hg19ToHg38.over.chain.gz",
100
+ "19to13":"https://s3-us-west-2.amazonaws.com/human-pangenomics/T2T/CHM13/assemblies/chain/v1_nflo/hg19-chm13v2.chain",
101
+ "38to19":"https://hgdownload.soe.ucsc.edu/goldenPath/hg38/liftOver/hg38ToHg19.over.chain.gz",
102
+ "38to13":"https://s3-us-west-2.amazonaws.com/human-pangenomics/T2T/CHM13/assemblies/chain/v1_nflo/grch38-chm13v2.chain",
103
+ "13to19":"https://s3-us-west-2.amazonaws.com/human-pangenomics/T2T/CHM13/assemblies/chain/v1_nflo/chm13v2-hg19.chain",
104
+ "13to38":"https://s3-us-west-2.amazonaws.com/human-pangenomics/T2T/CHM13/assemblies/chain/v1_nflo/chm13v2-grch38.chain",
105
+ "18to19":"https://hgdownload.soe.ucsc.edu/goldenPath/hg18/liftOver/hg18ToHg19.over.chain.gz",
106
+ "18to38":"https://hgdownload.soe.ucsc.edu/goldenPath/hg18/liftOver/hg18ToHg38.over.chain.gz"
99
107
  }
100
108
 
101
109
 
gwaslab/g_Sumstats.py CHANGED
@@ -8,6 +8,8 @@ from gwaslab.io_preformat_input import preformat
8
8
  from gwaslab.io_to_formats import _to_format
9
9
  from gwaslab.g_Log import Log
10
10
  from gwaslab.qc_fix_sumstats import fixID
11
+ from gwaslab.qc_fix_sumstats import flipSNPID
12
+ from gwaslab.qc_fix_sumstats import stripSNPID
11
13
  from gwaslab.qc_fix_sumstats import removedup
12
14
  from gwaslab.qc_fix_sumstats import fixchr
13
15
  from gwaslab.qc_fix_sumstats import fixpos
@@ -79,6 +81,7 @@ from gwaslab.bd_get_hapmap3 import gethapmap3
79
81
  from gwaslab.util_abf_finemapping import abf_finemapping
80
82
  from gwaslab.util_abf_finemapping import make_cs
81
83
  import gc
84
+ from gwaslab.viz_plot_phe_heatmap import _gwheatmap
82
85
 
83
86
  #20220309
84
87
  class Sumstats():
@@ -123,6 +126,8 @@ class Sumstats():
123
126
  snpr2=None,
124
127
  status=None,
125
128
  other=[],
129
+ chrom_pat=None,
130
+ snpid_pat=None,
126
131
  usekeys=None,
127
132
  direction=None,
128
133
  verbose=True,
@@ -205,6 +210,8 @@ class Sumstats():
205
210
  status=status,
206
211
  other=other,
207
212
  usekeys=usekeys,
213
+ chrom_pat=chrom_pat,
214
+ snpid_pat=snpid_pat,
208
215
  verbose=verbose,
209
216
  readargs=readargs,
210
217
  log=self.log)
@@ -418,6 +425,10 @@ class Sumstats():
418
425
  #customizable API to build your own QC pipeline
419
426
  def fix_id(self,**kwargs):
420
427
  self.data = fixID(self.data,log=self.log,**kwargs)
428
+ def flip_snpid(self,**kwargs):
429
+ self.data = flipSNPID(self.data,log=self.log,**kwargs)
430
+ def strip_snpid(self,**kwargs):
431
+ self.data = stripSNPID(self.data,log=self.log,**kwargs)
421
432
  def fix_chr(self,**kwargs):
422
433
  self.data = fixchr(self.data,log=self.log,**kwargs)
423
434
  def fix_pos(self,**kwargs):
@@ -592,6 +603,11 @@ class Sumstats():
592
603
  def plot_daf(self, **kwargs):
593
604
  fig,outliers = plotdaf(self.data, **kwargs)
594
605
  return fig, outliers
606
+
607
+ def plot_gwheatmap(self, **kwargs):
608
+ fig = _gwheatmap(self.data, **kwargs)
609
+ return fig
610
+
595
611
  def plot_mqq(self, build=None, **kwargs):
596
612
 
597
613
  chrom="CHR"
@@ -695,7 +711,7 @@ class Sumstats():
695
711
  # return sumstats object
696
712
  return output
697
713
 
698
- def check_cis(self, **kwargs):
714
+ def check_cis(self, gls=False, **kwargs):
699
715
  if "SNPID" in self.data.columns:
700
716
  id_to_use = "SNPID"
701
717
  else:
@@ -707,7 +723,13 @@ class Sumstats():
707
723
  p="P",
708
724
  log=self.log,
709
725
  **kwargs)
710
- # return sumstats object
726
+
727
+ # return sumstats object
728
+ if gls == True:
729
+ new_Sumstats_object = copy.deepcopy(self)
730
+ new_Sumstats_object.data = output
731
+ gc.collect()
732
+ return new_Sumstats_object
711
733
  return output
712
734
 
713
735
  def check_novel_set(self, **kwargs):
@@ -1,6 +1,6 @@
1
1
  import pandas as pd
2
2
 
3
- CATEGORIES = {str(j+i) for j in [1900000,3800000,9700000,9800000,9900000] for i in range(0,100000)}
3
+ CATEGORIES = {str(j+i) for j in [1300000,1800000,1900000,3800000,9700000,9800000,9900000] for i in range(0,100000)}
4
4
 
5
5
  def vchange_status(status,digit,before,after):
6
6
  dic={}
gwaslab/g_version.py CHANGED
@@ -15,8 +15,8 @@ def _get_version():
15
15
def gwaslab_info():
    """Return gwaslab release meta information (version and release date)."""
    return {
        "version": "3.5.0",
        "release_date": "20241029",
    }
22
22
 
@@ -21,6 +21,7 @@ from gwaslab.qc_check_datatype import check_dataframe_shape
21
21
  from gwaslab.bd_common_data import get_number_to_chr
22
22
  from gwaslab.bd_common_data import get_chr_list
23
23
  from gwaslab.bd_common_data import get_chr_to_number
24
+ from gwaslab.bd_common_data import get_number_to_NC
24
25
  from gwaslab.bd_common_data import _maketrans
25
26
  from gwaslab.g_vchange_status import vchange_status
26
27
  from gwaslab.g_version import _get_version
@@ -355,7 +356,7 @@ def oldcheckref(sumstats,ref_seq,chrom="CHR",pos="POS",ea="EA",nea="NEA",status=
355
356
 
356
357
  log.write("\n",end="",show_time=False,verbose=verbose)
357
358
 
358
- CATEGORIES = {str(j+i) for j in [1900000,3800000,9700000,9800000,9900000] for i in range(0,100000)}
359
+ CATEGORIES = {str(j+i) for j in [1300000,1800000,1900000,3800000,9700000,9800000,9900000] for i in range(0,100000)}
359
360
  sumstats[status] = pd.Categorical(sumstats[status],categories=CATEGORIES)
360
361
  #sumstats[status] = sumstats[status].astype("string")
361
362
 
@@ -674,7 +675,7 @@ def checkref(sumstats,ref_seq,chrom="CHR",pos="POS",ea="EA",nea="NEA",status="ST
674
675
  sumstats.loc[to_check_ref,status] = check_status(sumstats_to_check, all_records_dict, log=log, verbose=verbose)
675
676
  log.write(" -Finished checking records", verbose=verbose)
676
677
 
677
- CATEGORIES = {str(j+i) for j in [1900000,3800000,9700000,9800000,9900000] for i in range(0,100000)}
678
+ CATEGORIES = {str(j+i) for j in [1300000,1800000,1900000,3800000,9700000,9800000,9900000] for i in range(0,100000)}
678
679
  sumstats[status] = pd.Categorical(sumstats[status],categories=CATEGORIES)
679
680
  #sumstats[status] = sumstats[status].astype("string")
680
681
 
@@ -1496,17 +1497,21 @@ def infer_af(chr,start,end,ref,alt,vcf_reader,alt_freq,chr_dict=None):
1496
1497
  def auto_check_vcf_chr_dict(vcf_path, vcf_chr_dict, verbose, log):
1497
1498
  if vcf_path is not None:
1498
1499
  if vcf_chr_dict is None:
1499
- log.write(" -Checking prefix for chromosomes in vcf files..." ,verbose=verbose)
1500
- prefix = check_vcf_chr_prefix(vcf_path)
1500
+ log.write(" -Checking chromosome notations in VCF/BCF files..." ,verbose=verbose)
1501
+ vcf_chr_dict = check_vcf_chr_NC(vcf_path, log, verbose)
1502
+ if vcf_chr_dict is not None:
1503
+ return vcf_chr_dict
1504
+ log.write(" -Checking prefix for chromosomes in VCF/BCF files..." ,verbose=verbose)
1505
+ prefix = check_vcf_chr_prefix(vcf_path, log,verbose)
1501
1506
  if prefix is not None:
1502
1507
  log.write(" -Prefix for chromosomes: ",prefix)
1503
1508
  vcf_chr_dict = get_number_to_chr(prefix=prefix)
1504
1509
  else:
1505
- log.write(" -No prefix for chromosomes in the VCF files." ,verbose=verbose)
1510
+ log.write(" -No prefix for chromosomes in the VCF/BCF files." ,verbose=verbose)
1506
1511
  vcf_chr_dict = get_number_to_chr()
1507
1512
  return vcf_chr_dict
1508
1513
 
1509
- def check_vcf_chr_prefix(vcf_bcf_path):
1514
+ def check_vcf_chr_prefix(vcf_bcf_path,log,verbose):
1510
1515
  vcf_bcf = VariantFile(vcf_bcf_path)
1511
1516
  for i in list(vcf_bcf.header.contigs):
1512
1517
  m = re.search('(chr|Chr|CHR)([0-9xXyYmM]+)', i)
@@ -1514,5 +1519,16 @@ def check_vcf_chr_prefix(vcf_bcf_path):
1514
1519
  return m.group(1)
1515
1520
  else:
1516
1521
  return None
1517
-
1522
+
1523
def check_vcf_chr_NC(vcf_bcf_path, log, verbose):
    """Detect RefSeq-style (NC_*) contig names in a VCF/BCF header.

    Scans every contig declared in the header; when one matches the hg19 or
    hg38 RefSeq accession set, returns the corresponding number->NC mapping
    from get_number_to_NC. Returns None when no RefSeq-style contig is found.

    Fix: the original had `else: return None` inside the loop, so only the
    FIRST contig was ever inspected; now all contigs are scanned.
    """
    vcf_bcf = VariantFile(vcf_bcf_path)
    # build the accession sets once instead of per contig
    nc_hg19 = set(get_number_to_NC(build="19").values())
    nc_hg38 = set(get_number_to_NC(build="38").values())
    for contig in vcf_bcf.header.contigs:
        if contig in nc_hg19:
            log.write(" -RefSeq ID detected (hg19) in VCF/BCF...",verbose=verbose)
            return get_number_to_NC(build="19")
        if contig in nc_hg38:
            log.write(" -RefSeq ID detected (hg38) in VCF/BCF...",verbose=verbose)
            return get_number_to_NC(build="38")
    return None
1518
1534
 
@@ -56,6 +56,8 @@ def preformat(sumstats,
56
56
  build=None,
57
57
  other=[],
58
58
  usekeys=None,
59
+ chrom_pat=None,
60
+ snpid_pat=None,
59
61
  verbose=False,
60
62
  readargs=None,
61
63
  log=None):
@@ -84,7 +86,10 @@ def preformat(sumstats,
84
86
  if "format_separator" in meta_data.keys():
85
87
  if "sep" not in readargs.keys():
86
88
  readargs["sep"] = meta_data["format_separator"]
87
-
89
+ else:
90
+ if readargs["sep"] != meta_data["format_separator"]:
91
+ log.write(' - format_separator will be changed to: "{}"'.format(readargs["sep"]),verbose=verbose)
92
+
88
93
  if "format_na" in meta_data.keys():
89
94
  readargs["na_values"] = meta_data["format_na"]
90
95
 
@@ -92,7 +97,7 @@ def preformat(sumstats,
92
97
  readargs["comment"] = meta_data["format_comment"]
93
98
 
94
99
  if "sep" not in readargs.keys():
95
- readargs["sep"] = "\t"
100
+ readargs["sep"] = "\t"
96
101
 
97
102
  #########################################################################################################################################################
98
103
 
@@ -323,10 +328,30 @@ def preformat(sumstats,
323
328
  skip_rows = get_skip_rows(inpath)
324
329
  readargs["skiprows"] = skip_rows
325
330
  log.write("Start to initialize gl.Sumstats from file :" + inpath,verbose=verbose)
326
- sumstats = pd.read_table(inpath,
327
- usecols=set(usecols),
328
- dtype=dtype_dictionary,
329
- **readargs)
331
+ if chrom_pat is not None:
332
+ sumstats = _load_single_chr(inpath,
333
+ usecols,
334
+ dtype_dictionary,
335
+ readargs=readargs,
336
+ rename_dictionary=rename_dictionary,
337
+ chrom_pat=chrom_pat,
338
+ log=log,
339
+ verbose=verbose)
340
+ elif snpid_pat is not None:
341
+
342
+ sumstats = _load_variants_with_pattern(inpath,
343
+ usecols,
344
+ dtype_dictionary,
345
+ readargs=readargs,
346
+ rename_dictionary=rename_dictionary,
347
+ snpid_pat=snpid_pat,
348
+ log=log,
349
+ verbose=verbose)
350
+ else:
351
+ sumstats = pd.read_table(inpath,
352
+ usecols=set(usecols),
353
+ dtype=dtype_dictionary,
354
+ **readargs)
330
355
 
331
356
  elif type(sumstats) is pd.DataFrame:
332
357
  ## loading data from dataframe
@@ -520,9 +545,49 @@ def process_status(sumstats,build,log,verbose):
520
545
  #sumstats["STATUS"] = int(build)*(10**5) +99999
521
546
  build = _process_build(build,log,verbose)
522
547
  sumstats["STATUS"] = build +"99999"
523
- categories = {str(j+i) for j in [1900000,3800000,9700000,9800000,9900000] for i in range(0,100000)}
548
+ categories = {str(j+i) for j in [1300000,1800000,1900000,3800000,9700000,9800000,9900000] for i in range(0,100000)}
524
549
  sumstats["STATUS"] = pd.Categorical(sumstats["STATUS"],categories=categories)
525
550
  return sumstats
526
551
 
527
552
 
528
-
553
def _load_single_chr(inpath,usecols,dtype_dictionary,readargs,rename_dictionary,chrom_pat,log,verbose):
    """Load only the rows whose chromosome column matches chrom_pat, chunk-wise.

    Reads `inpath` in 500k-row chunks to keep memory bounded and keeps rows
    whose raw chromosome column matches the (case-insensitive) regex pattern.

    Raises:
        ValueError: when no column in `usecols` maps to "CHR" (previously an
        unbound-variable NameError).
    """
    sumstats_iter = pd.read_table(inpath,
                                  usecols=set(usecols),
                                  dtype=dtype_dictionary,
                                  iterator=True,
                                  chunksize=500000,
                                  **readargs)
    # locate the raw header that the format dictionary maps to "CHR"
    chunk_chrom = None
    for k, v in rename_dictionary.items():
        if v == "CHR" and k in usecols:
            log.write(" -Columns used to filter variants: {}".format(k),verbose=verbose)
            chunk_chrom = k
            break
    if chunk_chrom is None:
        raise ValueError("No chromosome column was detected in the input file; chrom_pat cannot be applied.")

    log.write(" -Loading only variants on chromosome with pattern : {} ...".format(chrom_pat),verbose=verbose)
    sumstats_filtered = pd.concat([chunk[chunk[chunk_chrom].str.match(chrom_pat, case=False,na=False) ] for chunk in sumstats_iter])
    log.write(" -Loaded {} variants on chromosome with pattern :{} ...".format(len(sumstats_filtered), chrom_pat),verbose=verbose)
    return sumstats_filtered
573
+
574
def _load_variants_with_pattern(inpath,usecols,dtype_dictionary,readargs,rename_dictionary,snpid_pat,log,verbose):
    """Load only the rows whose SNPID column matches snpid_pat, chunk-wise.

    Reads `inpath` in 500k-row chunks to keep memory bounded and keeps rows
    whose raw SNPID column matches the (case-insensitive) regex pattern.

    Raises:
        ValueError: when no column in `usecols` maps to "SNPID" (previously an
        unbound-variable NameError).
    """
    sumstats_iter = pd.read_table(inpath,
                                  usecols=set(usecols),
                                  dtype=dtype_dictionary,
                                  iterator=True,
                                  chunksize=500000,
                                  **readargs)
    # locate the raw header that the format dictionary maps to "SNPID"
    chunk_snpid = None
    for k, v in rename_dictionary.items():
        if v == "SNPID" and k in usecols:
            log.write(" -Columns used to filter variants: {}".format(k),verbose=verbose)
            chunk_snpid = k
            break
    if chunk_snpid is None:
        raise ValueError("No SNPID column was detected in the input file; snpid_pat cannot be applied.")

    log.write(" -Loading only variants with pattern : {} ...".format(snpid_pat),verbose=verbose)
    sumstats_filtered = pd.concat([chunk[chunk[chunk_snpid].str.match(snpid_pat, case=False,na=False) ] for chunk in sumstats_iter])
    log.write(" -Loaded {} variants with pattern : {} ...".format(len(sumstats_filtered), snpid_pat),verbose=verbose)
    return sumstats_filtered
gwaslab/io_to_formats.py CHANGED
@@ -342,7 +342,7 @@ def tofmt(sumstats,
342
342
  meta_data,rename_dictionary = get_format_dict(fmt,inverse=True)
343
343
  print_format_info(fmt=fmt, meta_data=meta_data,rename_dictionary=rename_dictionary,verbose=verbose, log=log, output=True)
344
344
 
345
- ymal_path = path + "."+suffix+".tsv-meta.ymal"
345
+ yaml_path = path + "."+suffix+".tsv-meta.yaml"
346
346
  path = path + "."+suffix+".tsv.gz"
347
347
  log.write(" -Output path:",path, verbose=verbose)
348
348
 
@@ -361,7 +361,7 @@ def tofmt(sumstats,
361
361
  md5_value = calculate_md5sum_file(path)
362
362
 
363
363
  ## update ssf-style meta data and export to yaml file
364
- _configure_ssf_meta(sumstats, fmt, ssfmeta, meta, meta_data, path, md5_value, ymal_path, log, verbose)
364
+ _configure_ssf_meta(sumstats, fmt, ssfmeta, meta, meta_data, path, md5_value, yaml_path, log, verbose)
365
365
 
366
366
  return sumstats
367
367
 
@@ -476,7 +476,7 @@ def _configure_output_cols_and_args(sumstats, rename_dictionary, cols, no_status
476
476
  return sumstats, to_csvargs
477
477
 
478
478
 
479
- def _configure_ssf_meta(sumstats, fmt, ssfmeta, meta, meta_data, path, md5_value, ymal_path, log, verbose):
479
+ def _configure_ssf_meta(sumstats, fmt, ssfmeta, meta, meta_data, path, md5_value, yaml_path, log, verbose):
480
480
  ### calculate meta data
481
481
  if "EAF" in sumstats.columns:
482
482
  min_maf = sumstats["EAF"].min()
@@ -506,8 +506,8 @@ def _configure_ssf_meta(sumstats, fmt, ssfmeta, meta, meta_data, path, md5_value
506
506
  sumstats_meta_copy["gwaslab"]["samples"]["sample_size_min"] = n_min
507
507
  sumstats_meta_copy["gwaslab"]["samples"]["sample_size_median"] = n_median
508
508
  sumstats_meta_copy["gwaslab"]["variants"]["variant_number"] = len(sumstats)
509
- log.write(" -Exporting SSF-style meta data to {}".format(ymal_path),verbose=verbose)
510
- with open(ymal_path, 'w') as outfile:
509
+ log.write(" -Exporting SSF-style meta data to {}".format(yaml_path),verbose=verbose)
510
+ with open(yaml_path, 'w') as outfile:
511
511
  yaml.dump(sumstats_meta_copy, outfile)
512
512
 
513
513
 
@@ -5,6 +5,7 @@ import numpy as np
5
5
  from itertools import repeat
6
6
  from multiprocessing import Pool
7
7
  from liftover import get_lifter
8
+ from liftover import ChainFile
8
9
  from functools import partial
9
10
  from gwaslab.g_vchange_status import vchange_status
10
11
  from gwaslab.g_vchange_status import status_match
@@ -19,6 +20,7 @@ from gwaslab.g_version import _get_version
19
20
  from gwaslab.util_in_fill_data import _convert_betase_to_mlog10p
20
21
  from gwaslab.util_in_fill_data import _convert_betase_to_p
21
22
  from gwaslab.util_in_fill_data import _convert_mlog10p_to_p
23
+ from gwaslab.bd_common_data import get_chain
22
24
  #process build
23
25
  #setbuild
24
26
  #fixID
@@ -43,9 +45,15 @@ def _process_build(build,log,verbose):
43
45
  if str(build).lower() in ["hg19","19","37","b37","grch37"]:
44
46
  log.write(" -Genomic coordinates are based on GRCh37/hg19...", verbose=verbose)
45
47
  final_build = "19"
48
+ elif str(build).lower() in ["hg18","18","36","b36","grch36"]:
49
+ log.write(" -Genomic coordinates are based on GRCh36/hg18...", verbose=verbose)
50
+ final_build = "18"
46
51
  elif str(build).lower() in ["hg38","38","b38","grch38"]:
47
52
  log.write(" -Genomic coordinates are based on GRCh38/hg38...", verbose=verbose)
48
53
  final_build = "38"
54
+ elif str(build).lower() in ["t2t","hs1","chm13","13"]:
55
+ log.write(" -Genomic coordinates are based on T2T-CHM13...", verbose=verbose)
56
+ final_build = "13"
49
57
  else:
50
58
  log.warning("Version of genomic coordinates is unknown...", verbose=verbose)
51
59
  final_build = "99"
@@ -358,6 +366,76 @@ def fixID(sumstats,
358
366
 
359
367
  ""
360
368
 
369
def stripSNPID(sumstats,snpid="SNPID",overwrite=False,verbose=True,log=Log()):
    '''
    Strip SNPID down to CHR:POS:Allele:Allele by removing extra leading or
    trailing fields (e.g. xxx:CHR:POS:EA:NEA:xxx -> CHR:POS:EA:NEA) and any
    "chr" prefix. Accepted separators: "-", ":", "_".
    (The previous docstring incorrectly described allele flipping.)
    '''
    ##start function with col checking##########################################################
    _start_line = "strip SNPID"
    _end_line = "stripping SNPID"
    _start_cols =["SNPID"]
    _start_function = ".strip_snpid()"
    _must_args ={}

    is_enough_info = start_to(sumstats=sumstats,
                              log=log,
                              verbose=verbose,
                              start_line=_start_line,
                              end_line=_end_line,
                              start_cols=_start_cols,
                              start_function=_start_function,
                              **_must_args)
    if is_enough_info == False: return sumstats
    log.write(" -Checking if SNPID is (xxx:)CHR:POS:ATCG_Allele:ATCG_Allele(:xxx)...(separator: - ,: , _)",verbose=verbose)
    # rows whose SNPID embeds a CHR:POS:allele:allele core (NA-safe: na=False)
    is_chrposrefalt = sumstats[snpid].str.contains(r'[:_-]?\w+[:_-]\d+[:_-][ATCG]+[:_-][ATCG]+[:_-]?', case=False, flags=0, na=False)

    log.write(" -Stripping {} non-NA fixable SNPIDs...".format(sum(is_chrposrefalt)),verbose=verbose)

    # keep only the CHR:POS:allele:allele core (capture group 1);
    # group 0 is an optional "chr" prefix that is dropped
    sumstats.loc[is_chrposrefalt,snpid] = \
        sumstats.loc[is_chrposrefalt,snpid].str.extract(r'[:_-]?(chr)?(\w+[:_-]\d+[:_-][ATCG]+[:_-][ATCG]+)[:_-]?',flags=re.IGNORECASE|re.ASCII)[1].astype("string")

    finished(log,verbose,_end_line)
    return sumstats
402
+
403
def flipSNPID(sumstats,snpid="SNPID",overwrite=False,verbose=True,log=Log()):
    '''
    Flip EA and NEA inside SNPID: CHR:POS:EA:NEA -> CHR:POS:NEA:EA.
    Only the SNPID string is rewritten; EA, NEA, STATUS and all statistics
    columns are left untouched. Accepted separators: "-", ":", "_".
    '''
    ##start function with col checking##########################################################
    _start_line = "flip SNPID"
    _end_line = "flipping SNPID"
    _start_cols =["SNPID"]
    _start_function = ".flip_snpid()"
    _must_args ={}

    is_enough_info = start_to(sumstats=sumstats,
                              log=log,
                              verbose=verbose,
                              start_line=_start_line,
                              end_line=_end_line,
                              start_cols=_start_cols,
                              start_function=_start_function,
                              **_must_args)
    if is_enough_info == False: return sumstats
    log.warning("This function only flips alleles in SNPID without changing EA, NEA, STATUS or any statistics.")
    log.write(" -Checking if SNPID is CHR:POS:ATCG_Allele:ATCG_Allele...(separator: - ,: , _)",verbose=verbose)
    is_chrposrefalt = sumstats[snpid].str.match(r'^\w+[:_-]\d+[:_-][ATCG]+[:_-][ATCG]+$', case=False, flags=0, na=False)

    log.write(" -Flipping {} non-NA fixable SNPIDs...".format(sum(is_chrposrefalt)),verbose=verbose)

    # single extraction instead of three redundant passes; capture groups:
    #   0: optional "chr" prefix (dropped), 1: CHR<sep>POS (original separator
    #   preserved, as before), 2: EA, 3: NEA
    extracted = sumstats.loc[is_chrposrefalt,snpid].str.extract(
        r'^(chr)?(\w+[:_-]\d+)[:_-]([ATCG]+)[:_-]([ATCG]+)$',
        flags=re.IGNORECASE|re.ASCII)
    sumstats.loc[is_chrposrefalt,snpid] = \
        (extracted[1] + ":" + extracted[3] + ":" + extracted[2]).astype("string")

    finished(log,verbose,_end_line)
    return sumstats
361
439
 
362
440
  ###############################################################################################################
363
441
  # 20230128
@@ -1041,7 +1119,7 @@ def check_range(sumstats, var_range, header, coltocheck, cols_to_check, log, ver
1041
1119
  cols_to_check.append(header)
1042
1120
  if header=="STATUS":
1043
1121
  log.write(" -Checking STATUS and converting STATUS to categories....", verbose=verbose)
1044
- categories = {str(j+i) for j in [1900000,3800000,9700000,9800000,9900000] for i in range(0,100000)}
1122
+ categories = {str(j+i) for j in [1300000,1800000,1900000,3800000,9700000,9800000,9900000] for i in range(0,100000)}
1045
1123
  sumstats[header] = pd.Categorical(sumstats[header],categories=categories)
1046
1124
  return sumstats
1047
1125
 
@@ -1496,11 +1574,19 @@ def liftover_variant(sumstats,
1496
1574
  pos="POS",
1497
1575
  status="STATUS",
1498
1576
  from_build="19",
1499
- to_build="38"):
1577
+ to_build="38",
1578
+ chain=None):
1579
+
1500
1580
  try:
1501
- converter = get_lifter("hg"+from_build,"hg"+to_build,one_based=True)
1581
+ if chain is None:
1582
+ converter = get_lifter(from_build,to_build,one_based=True)
1583
+ else:
1584
+ converter = ChainFile(chain, one_based=True)
1502
1585
  except:
1503
- converter = get_lifter("hg"+from_build,"hg"+to_build)
1586
+ if chain is None:
1587
+ converter = get_lifter(from_build,to_build)
1588
+ else:
1589
+ converter = ChainFile(chain)
1504
1590
 
1505
1591
  dic= get_number_to_chr(in_chr=False,xymt=["X","Y","M"])
1506
1592
  dic2= get_chr_to_number(out_chr=False)
@@ -1513,7 +1599,7 @@ def liftover_variant(sumstats,
1513
1599
  sumstats.loc[variants_on_chrom_to_convert,chrom] = lifted.str[0].map(dic2).astype("Int64")
1514
1600
  return sumstats
1515
1601
 
1516
- def parallelizeliftovervariant(sumstats,n_cores=1,chrom="CHR", pos="POS", from_build="19", to_build="38",status="STATUS",remove=True, verbose=True,log=Log()):
1602
+ def parallelizeliftovervariant(sumstats,n_cores=1,chrom="CHR", pos="POS", from_build="19", to_build="38",status="STATUS",remove=True,chain=None, verbose=True,log=Log()):
1517
1603
  ##start function with col checking##########################################################
1518
1604
  _start_line = "perform liftover"
1519
1605
  _end_line = "liftover"
@@ -1532,8 +1618,21 @@ def parallelizeliftovervariant(sumstats,n_cores=1,chrom="CHR", pos="POS", from_b
1532
1618
  **_must_args)
1533
1619
  if is_enough_info == False: return sumstats
1534
1620
  ############################################################################################
1621
+
1622
+ lifter_from_build = _process_build(from_build,log=log,verbose=False)
1623
+ lifter_to_build = _process_build(to_build,log=log,verbose=False)
1535
1624
 
1536
- log.write(" -Creating converter : hg" + from_build +" to hg"+ to_build, verbose=verbose)
1625
+ if chain is not None:
1626
+ log.write(" -Creating converter using ChainFile: {}".format(chain), verbose=verbose)
1627
+ else:
1628
+ try:
1629
+ chain = get_chain(from_build=from_build, to_build=to_build)
1630
+ log.write(" -Creating converter using ChainFile: {}".format(chain), verbose=verbose)
1631
+ except:
1632
+ chain = None
1633
+ lifter_from_build=from_build
1634
+ lifter_to_build=to_build
1635
+ log.write(" -Creating converter : {} -> {}".format(lifter_from_build, lifter_to_build), verbose=verbose)
1537
1636
  # valid chr and pos
1538
1637
  pattern = r"\w\w\w0\w\w\w"
1539
1638
  to_lift = sumstats[status].str.match(pattern)
@@ -1549,7 +1648,7 @@ def parallelizeliftovervariant(sumstats,n_cores=1,chrom="CHR", pos="POS", from_b
1549
1648
  pool = Pool(n_cores)
1550
1649
  #df = pd.concat(pool.starmap(func, df_split))
1551
1650
  func=liftover_variant
1552
- sumstats[[chrom,pos,status]] = pd.concat(pool.map(partial(func,chrom=chrom,pos=pos,from_build=from_build,to_build=to_build,status=status),df_split))
1651
+ sumstats[[chrom,pos,status]] = pd.concat(pool.map(partial(func,chrom=chrom,pos=pos,from_build=from_build,to_build=to_build,status=status,chain=chain),df_split))
1553
1652
  pool.close()
1554
1653
  pool.join()
1555
1654
  ############################################################################
@@ -184,7 +184,8 @@ def fill_mlog10p(sumstats,log,verbose=True,filled_count=0):
184
184
  else:
185
185
  return 0,filled_count
186
186
  return 1,filled_count
187
- def fill_extreme_mlog10p(sumstats,log,verbose=True,filled_count=0):
187
+
188
+ def fill_extreme_mlog10p(sumstats,df,log,verbose=True,filled_count=0):
188
189
  # ref: https://stackoverflow.com/questions/46416027/how-to-compute-p-values-from-z-scores-in-r-when-the-z-score-is-large-pvalue-muc/46416222#46416222
189
190
  if "Z" in sumstats.columns:
190
191
  # P -> MLOG10P
@@ -198,6 +199,10 @@ def fill_extreme_mlog10p(sumstats,log,verbose=True,filled_count=0):
198
199
  log.write(" - Filling MLOG10P using Z column...", verbose=verbose)
199
200
  sumstats = fill_extreme_mlog10(sumstats, "Z")
200
201
  filled_count +=1
202
+ elif "CHISQ" in sumstats.columns and "DOF" in sumstats.columns:
203
+ log.write(" - Filling MLOG10P using CHISQ and DOF column...", verbose=verbose)
204
+ sumstats = fill_extreme_mlog10_chisq(sumstats, "CHISQ", df)
205
+ filled_count +=1
201
206
  else:
202
207
  return 0,filled_count
203
208
  return 1,filled_count
@@ -223,6 +228,19 @@ def fill_extreme_mlog10(sumstats, z):
223
228
  sumstats["P_EXPONENT"]= exponent
224
229
  return sumstats
225
230
 
231
def fill_extreme_mlog10_chisq(sumstats, chisq, df):
    """Fill MLOG10P (plus mantissa/exponent of P) from a chi-square statistic.

    Uses the log survival function so that P values far below float underflow
    remain representable as -log10(P) with a mantissa/exponent decomposition.
    ref: https://stackoverflow.com/a/46416222/199475
    """
    # log10(P) is negative; chi2.logsf avoids underflow for extreme statistics
    log10_pvalue = ss.chi2.logsf(sumstats[chisq], sumstats[df]) / np.log(10)
    sumstats["MLOG10P"] = -log10_pvalue
    # P == P_MANTISSA * 10**P_EXPONENT (Python % keeps the fraction in [0, 1))
    sumstats["P_MANTISSA"] = 10 ** (log10_pvalue % 1)
    sumstats["P_EXPONENT"] = log10_pvalue // 1
    return sumstats
243
+
226
244
  ####################################################################################################################
227
245
  def fill_iteratively(sumstats,raw_to_fill,log,only_sig,df,extreme,verbose,sig_level):
228
246
  to_fill = raw_to_fill.copy()
@@ -260,7 +278,7 @@ def fill_iteratively(sumstats,raw_to_fill,log,only_sig,df,extreme,verbose,sig_le
260
278
  # p to -log10(P) ###############################################################################################
261
279
  if "MLOG10P" in to_fill:
262
280
  if extreme==True:
263
- status,filled_count = fill_extreme_mlog10p(sumstats,log,verbose=verbose,filled_count=filled_count)
281
+ status,filled_count = fill_extreme_mlog10p(sumstats,df, log,verbose=verbose,filled_count=filled_count)
264
282
  filled_count +=1
265
283
  elif "P" not in sumstats.columns:
266
284
  fill_p(sumstats,log,verbose=verbose)
@@ -619,8 +619,10 @@ def _check_cis(insumstats,
619
619
  except:
620
620
  pass
621
621
 
622
- allsig["CIS/TRANS"] = allsig.apply(lambda x: determine_if_cis(x, group_key,windowsizekb, reference_dict), axis=1)
623
-
622
+ #allsig["CIS/TRANS"] = allsig.apply(lambda x: determine_if_cis(x, group_key,windowsizekb, reference_dict), axis=1)
623
+ cis_tuples = allsig.apply(lambda x: determine_if_cis2(x, group_key,windowsizekb, reference_dict), axis=1)
624
+ allsig[["CIS/TRANS","REF_CHR","REF_START","REF_END"]] = pd.DataFrame(cis_tuples.tolist(), index=allsig.index)
625
+
624
626
  try:
625
627
  allsig = allsig.where(~pd.isna(allsig), pd.NA)
626
628
  except:
@@ -689,6 +691,20 @@ def determine_if_cis(x, group_key,windowsizekb, reference_dict):
689
691
  else:
690
692
  return "NoReference"
691
693
 
694
def determine_if_cis2(x, group_key, windowsizekb, reference_dict):
    """Classify one significant variant as Cis/Trans against its reference region.

    Returns a 4-tuple (label, ref_chr, ref_start, ref_end). The variant is
    "Cis" when it lies on the reference chromosome within windowsizekb kb of
    the reference interval, otherwise "Trans". When the group key has no entry
    in reference_dict, label is "NoReference" and the coordinates are pd.NA.
    """
    if x[group_key] not in reference_dict:
        return "NoReference", pd.NA, pd.NA, pd.NA
    ref = reference_dict[x[group_key]]
    ref_chr, ref_start, ref_end = int(ref[0]), int(ref[1]), int(ref[2])
    window_bp = windowsizekb * 1000
    on_same_chr = str(ref[0]) == str(x["CHR"])
    within_window = (ref_start - window_bp <= x["POS"]) and (x["POS"] <= ref_end + window_bp)
    label = "Cis" if (on_same_chr and within_window) else "Trans"
    return label, ref_chr, ref_start, ref_end
706
+
707
+
692
708
  def determine_distance(allsig, knownsig):
693
709
  if len(allsig)==0:
694
710
  return allsig