PyPI - gwaslab - Versions diffs - 3.4.44__tar.gz → 3.4.46__tar.gz - Mend

gwaslab 3.4.44__tar.gz → 3.4.46__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.

This version of gwaslab might be problematic. Click here for more details.

Files changed (89) hide show

{gwaslab-3.4.44/src/gwaslab.egg-info → gwaslab-3.4.46}/PKG-INFO RENAMED Viewed

@@ -1,6 +1,6 @@
 Metadata-Version: 2.1
 Name: gwaslab
-Version: 3.4.44
+Version: 3.4.46
 Summary: A collection of handy tools for GWAS SumStats
 Author-email: Yunye <yunye@gwaslab.com>
 Project-URL: Homepage, https://cloufield.github.io/gwaslab/
@@ -8,16 +8,16 @@ Project-URL: Github, https://github.com/Cloufield/gwaslab
 Classifier: Programming Language :: Python :: 3
 Classifier: License :: OSI Approved :: MIT License
 Classifier: Operating System :: OS Independent
-Requires-Python: <=3.10,>=3.9
+Requires-Python: <3.11,>=3.9
 Description-Content-Type: text/markdown
 License-File: LICENSE
 License-File: LICENSE_before_v3.4.39
 Requires-Dist: pandas!=1.5,>=1.3
-Requires-Dist: numpy>=1.21.2
-Requires-Dist: matplotlib!=3.7.2,>=3.5
+Requires-Dist: numpy<2,>=1.21.2
+Requires-Dist: matplotlib!=3.7.2,<3.9,>=3.5
 Requires-Dist: seaborn>=0.12
 Requires-Dist: scipy>=1.12
-Requires-Dist: pySAM<0.20,>=0.18.1
+Requires-Dist: pySAM==0.22.1
 Requires-Dist: Biopython>=1.79
 Requires-Dist: adjustText<=0.8,>=0.7.3
 Requires-Dist: liftover>=1.1.13
@@ -51,7 +51,7 @@ Warning: Known issues of GWASLab are summarized in [https://cloufield.github.io/
 ### install via pip
 ```
-pip install gwaslab==3.4.43
+pip install gwaslab==3.4.45
 ```
 ```python
@@ -90,7 +90,7 @@ Create a Python 3.9 environment and install gwaslab using pip:
 ```
 conda env create -n gwaslab_test -c conda-forge python=3.9
 conda activate gwaslab
-pip install gwaslab==3.4.43
+pip install gwaslab==3.4.45
 ```
 or create a new environment using yml file [environment_3.4.40.yml](https://github.com/Cloufield/gwaslab/blob/main/environment_3.4.40.yml)

{gwaslab-3.4.44 → gwaslab-3.4.46}/README.md RENAMED Viewed

@@ -23,7 +23,7 @@ Warning: Known issues of GWASLab are summarized in [https://cloufield.github.io/
 ### install via pip
 ```
-pip install gwaslab==3.4.43
+pip install gwaslab==3.4.45
 ```
 ```python
@@ -62,7 +62,7 @@ Create a Python 3.9 environment and install gwaslab using pip:
 ```
 conda env create -n gwaslab_test -c conda-forge python=3.9
 conda activate gwaslab
-pip install gwaslab==3.4.43
+pip install gwaslab==3.4.45
 ```
 or create a new environment using yml file [environment_3.4.40.yml](https://github.com/Cloufield/gwaslab/blob/main/environment_3.4.40.yml)

{gwaslab-3.4.44 → gwaslab-3.4.46}/pyproject.toml RENAMED Viewed

@@ -7,7 +7,7 @@ build-backend = "setuptools.build_meta"
 [project]
 name = "gwaslab"
-version = "3.4.44"
+version = "3.4.46"
 authors = [
   { name="Yunye", email="yunye@gwaslab.com" },
 ]
@@ -17,11 +17,11 @@ readme = "README.md"
 dependencies = [
     "pandas>=1.3,!=1.5",
-    "numpy>=1.21.2",
-    "matplotlib>=3.5,!=3.7.2",
+    "numpy>=1.21.2,<2",
+    "matplotlib>=3.5,!=3.7.2,<3.9",
     "seaborn>=0.12",
     "scipy>=1.12",
-    "pySAM>=0.18.1,<0.20",
+    "pySAM==0.22.1",
     "Biopython>=1.79",
     "adjustText>=0.7.3, <=0.8",
     "liftover>=1.1.13",
@@ -31,7 +31,7 @@ dependencies = [
     "h5py>=3.10.0"
 ]
-requires-python = ">=3.9,<=3.10"
+requires-python = ">=3.9,<3.11"
 classifiers = [
     "Programming Language :: Python :: 3",
     "License :: OSI Approved :: MIT License",

{gwaslab-3.4.44 → gwaslab-3.4.46}/src/gwaslab/__init__.py RENAMED Viewed

@@ -44,4 +44,5 @@ from gwaslab.viz_plot_trumpetplot import plot_power
 from gwaslab.viz_plot_trumpetplot import plot_power_x
 from gwaslab.util_ex_process_h5 import process_vcf_to_hfd5
 from gwaslab.util_ex_run_susie import _run_susie_rss as run_susie_rss
-from gwaslab.io_read_tabular import _read_tabular as read_tabular
+from gwaslab.io_read_tabular import _read_tabular as read_tabular
+from gwaslab.util_in_meta import meta_analyze

{gwaslab-3.4.44 → gwaslab-3.4.46}/src/gwaslab/bd_common_data.py RENAMED Viewed

@@ -298,6 +298,28 @@ def gtf_to_protein_coding(gtfpath,log=Log(),verbose=True):
     return protein_coding_path
+def gtf_to_all_gene(gtfpath,log=Log(),verbose=True):
+    all_gene_path = gtfpath[:-6]+"all_genes.gtf.gz"
+    # if not existing, extract protein coding records and output to a new file
+    if not path.isfile(all_gene_path):
+        # get gene list
+        log.write(" - Extracting genes from {}".format(gtfpath),verbose=verbose)
+        gtf = read_gtf(gtfpath,usecols=["feature","gene_biotype","gene_id","gene_name"])
+        gene_list = gtf.loc[gtf["feature"]=="gene","gene_id"].values
+        log.write(" - Loaded {} genes.".format(len(gene_list)),verbose=verbose)
+        # extract entry using csv
+        gtf_raw = pd.read_csv(gtfpath,sep="\t",header=None,comment="#",dtype="string")
+        gtf_raw["_gene_id"] = gtf_raw[8].str.extract(r'gene_id "([\w\.-]+)"')
+        gtf_raw = gtf_raw.loc[ gtf_raw["_gene_id"].isin(gene_list) ,:]
+        gtf_raw = gtf_raw.drop("_gene_id",axis=1)
+        log.write(" - Extracted records are saved to : {} ".format(all_gene_path),verbose=verbose)
+        gtf_raw.to_csv(all_gene_path, header=None, index=None, sep="\t")
+    return all_gene_path
 ####################################################################################################################
 # From BioPython: https://github.com/biopython/biopython/blob/c5a6b1374267d769b19c1022b4b45472316e78b4/Bio/Seq.py#L36
 def _maketrans(complement_mapping):

{gwaslab-3.4.44 → gwaslab-3.4.46}/src/gwaslab/g_Sumstats.py RENAMED Viewed

@@ -121,6 +121,7 @@ class Sumstats():
              snpr2=None,
              status=None,
              other=[],
+             usekeys=None,
              direction=None,
              verbose=True,
              study="Study_1",
@@ -200,6 +201,7 @@ class Sumstats():
           trait=trait,
           status=status,
           other=other,
+          usekeys=usekeys,
           verbose=verbose,
           readargs=readargs,
           log=self.log)

{gwaslab-3.4.44 → gwaslab-3.4.46}/src/gwaslab/g_SumstatsPair.py RENAMED Viewed

@@ -139,7 +139,7 @@ class SumstatsPair( ):
         self.clumps["clumps"], self.clumps["plink_log"] = _clump(self.data, log=self.log, p="P_1",mlog10p="MLOG10P_1", study = self.study_name, **kwargs)
     def to_coloc(self,**kwargs):
-        self.to_finemapping_file_path, self.plink_log = tofinemapping(self.data,study=self.study_name,suffixes=self.suffixes,log=self.log,**kwargs)
+        self.to_finemapping_file_path, output_file_list, self.plink_log = tofinemapping(self.data,study=self.study_name,suffixes=self.suffixes,log=self.log,**kwargs)
     def run_coloc_susie(self,**kwargs):

{gwaslab-3.4.44 → gwaslab-3.4.46}/src/gwaslab/g_vchange_status.py RENAMED Viewed

@@ -1,13 +1,15 @@
 import pandas as pd
+CATEGORIES = {str(j+i) for j in [1900000,3800000,9700000,9800000,9900000] for i in range(0,100000)}
 def vchange_status(status,digit,before,after):
     dic={}
     for i in range(len(before)):
         dic[before[i]]=after[i]
     if digit>1:
-        return status.str[:digit-1]+status.str[digit-1].replace(dic)+status.str[digit:]
+        return pd.Categorical(status.str[:digit-1]+status.str[digit-1].replace(dic)+status.str[digit:],categories=CATEGORIES)
     else:
-        return status.str[digit-1].replace(dic)+status.str[digit:]
+        return pd.Categorical(status.str[digit-1].replace(dic)+status.str[digit:],categories=CATEGORIES)
 def copy_status(from_status,to_status, digit):
     if digit>1:

{gwaslab-3.4.44 → gwaslab-3.4.46}/src/gwaslab/g_version.py RENAMED Viewed

@@ -15,8 +15,8 @@ def _get_version():
 def gwaslab_info():
     # version meta information
     dic={
-   "version":"3.4.44",
-   "release_date":"20240424"
+   "version":"3.4.46",
+   "release_date":"20240624"
     }
     return dic

{gwaslab-3.4.44 → gwaslab-3.4.46}/src/gwaslab/hm_harmonize_sumstats.py RENAMED Viewed

@@ -355,7 +355,11 @@ def oldcheckref(sumstats,ref_seq,chrom="CHR",pos="POS",ea="EA",nea="NEA",status=
     log.write("\n",end="",show_time=False,verbose=verbose)
-    sumstats[status] = sumstats[status].astype("string")
+    CATEGORIES = {str(j+i) for j in [1900000,3800000,9700000,9800000,9900000] for i in range(0,100000)}
+    sumstats[status] = pd.Categorical(sumstats[status],categories=CATEGORIES)
+    #sumstats[status] = sumstats[status].astype("string")
     available_to_check =sum( (~sumstats[pos].isna()) & (~sumstats[nea].isna()) & (~sumstats[ea].isna()))
     status_0=sum(sumstats["STATUS"].str.match("\w\w\w\w\w[0]\w", case=False, flags=0, na=False))
     status_3=sum(sumstats["STATUS"].str.match("\w\w\w\w\w[3]\w", case=False, flags=0, na=False))
@@ -669,9 +673,11 @@ def checkref(sumstats,ref_seq,chrom="CHR",pos="POS",ea="EA",nea="NEA",status="ST
         sumstats_to_check = sumstats.loc[to_check_ref,[chrom,pos,ea,nea,status]]
         sumstats.loc[to_check_ref,status] = check_status(sumstats_to_check, all_records_dict, log=log, verbose=verbose)
         log.write(" -Finished checking records", verbose=verbose)
-    sumstats[status] = sumstats[status].astype("string")
+    CATEGORIES = {str(j+i) for j in [1900000,3800000,9700000,9800000,9900000] for i in range(0,100000)}
+    sumstats[status] = pd.Categorical(sumstats[status],categories=CATEGORIES)
+    #sumstats[status] = sumstats[status].astype("string")
     available_to_check =sum( (~sumstats[pos].isna()) & (~sumstats[nea].isna()) & (~sumstats[ea].isna()))
     status_0=sum(sumstats["STATUS"].str.match("\w\w\w\w\w[0]\w", case=False, flags=0, na=False))
     status_3=sum(sumstats["STATUS"].str.match("\w\w\w\w\w[3]\w", case=False, flags=0, na=False))
@@ -700,6 +706,7 @@ def checkref(sumstats,ref_seq,chrom="CHR",pos="POS",ea="EA",nea="NEA",status="ST
     if remove is True:
         sumstats = sumstats.loc[~sumstats["STATUS"].str.match("\w\w\w\w\w[8]\w"),:]
         log.write(" -Variants not on given reference sequence were removed.",verbose=verbose)
     finished(log, verbose, _end_line)
     return sumstats
@@ -861,8 +868,9 @@ def parallelizeassignrsid(sumstats, path, ref_mode="vcf",snpid="SNPID",rsid="rsI
         if is_enough_info == False: return sumstats
         ############################################################################################
-        standardized_normalized = sumstats["STATUS"].str.match("\w\w\w[0][01234]\w\w", case=False, flags=0, na=False)
+        #standardized_normalized = sumstats["STATUS"].str.match("\w\w\w[0][01234]\w\w", case=False, flags=0, na=False)
+        standardized_normalized = sumstats["STATUS"] == sumstats["STATUS"]
         if rsid not in sumstats.columns:
             sumstats[rsid]=pd.Series(dtype="string")

{gwaslab-3.4.44 → gwaslab-3.4.46}/src/gwaslab/io_preformat_input.py RENAMED Viewed

@@ -55,6 +55,7 @@ def preformat(sumstats,
           trait=None,
           build=None,
           other=[],
+          usekeys=None,
           verbose=False,
           readargs=None,
           log=None):
@@ -65,6 +66,11 @@ def preformat(sumstats,
     dtype_dictionary ={}
  #######################################################################################################################################################
+    # workflow:
+    # 1. formatbook
+    # 2. user specified header
+    # 3. usekeys
     if fmt is not None:
         # loading format parameters
         log.write("Start to load format from formatbook....",verbose=verbose)
@@ -129,6 +135,8 @@ def preformat(sumstats,
         ################################################
         for key,value in rename_dictionary.items():
+            # check avaiable keys  key->raw header
+            # usecols : a list of raw headers to load from file/DataFrame
             if key in raw_cols:
                 usecols.append(key)
             if value in ["EA","NEA"]:
@@ -137,7 +145,7 @@ def preformat(sumstats,
                 dtype_dictionary[value]="string"
     except ValueError:
-        raise ValueError("Please input a path or a pd.DataFrame, and make sure the columns you specified are in the file.")
+        raise ValueError("Please input a path or a pd.DataFrame, and make sure the separator is correct and the columns you specified are in the file.")
     ###################################################################################################################################################
     ## check columns/datatype to use
@@ -276,6 +284,19 @@ def preformat(sumstats,
         else:
             study = raw_cols[9]
             usecols =  usecols + [study]
+    if usekeys is not None:
+    # extract only specified keys
+        usecols_new =[]
+        for i in usekeys:
+            for k, v in rename_dictionary.items():
+                if i == v:
+                    usecols_new.append(k)
+        usecols_valid =[]
+        for i in usecols_new:
+            if i in usecols:
+                usecols_valid.append(i)
+        usecols = usecols_valid
  #loading data ##########################################################################################################
     try:

{gwaslab-3.4.44 → gwaslab-3.4.46}/src/gwaslab/qc_fix_sumstats.py RENAMED Viewed

@@ -1061,6 +1061,13 @@ def check_range(sumstats, var_range, header, coltocheck, cols_to_check, log, ver
             if sum(is_low_p) >0:
                 log.warning("Extremely low P detected (P=0 or P < minimum positive value of float64) : {}".format(sum(is_low_p)))
                 log.warning("Please consider using MLOG10P instead.")
+        if header=="INFO":
+            is_high_info =  sumstats["INFO"]>1
+            if sum(is_high_info) >0:
+                log.warning("High INFO detected (INFO>1) : {}".format(sum(is_high_info)))
+                log.warning("max(INFO): {}".format(sumstats["INFO"].max()))
+                log.warning("Please check if this is as expected.")
         if sum(~is_valid)>0:
             try:
@@ -1102,7 +1109,7 @@ def sanitycheckstats(sumstats,
                      HR=(-100,100),
                      HR_95L=(0,float("Inf")),
                      HR_95U=(0,float("Inf")),
-                     info=(0,1),
+                     info=(0,2),
                      float_tolerence = 1e-7,
                      verbose=True,
                      log=Log()):

{gwaslab-3.4.44 → gwaslab-3.4.46}/src/gwaslab/util_in_filter_value.py RENAMED Viewed

@@ -10,6 +10,7 @@ from gwaslab.g_vchange_status import vchange_status
 from gwaslab.qc_fix_sumstats import sortcoordinate
 from gwaslab.qc_fix_sumstats import start_to
 from gwaslab.qc_fix_sumstats import finished
+from gwaslab.qc_fix_sumstats import _process_build
 from gwaslab.hm_harmonize_sumstats import is_palindromic
 import gc
@@ -430,8 +431,43 @@ def _filter_snp(sumstats, mode="in", ea="EA",nea="NEA", log=Log(),verbose=True):
     log.write("Finished filtering SNPs.",verbose=verbose)
     return snp
-def _exclude_hla(sumstats, chrom="CHR", pos="POS", lower=25000000 ,upper=34000000 ,log=Log(), verbose=True):
+def _exclude_hla(sumstats, chrom="CHR", pos="POS", lower=None ,upper=None, build=None, mode="xmhc", log=Log(), verbose=True):
+    if build is not None:
+        build = _process_build(build = build,log = log,verbose = verbose)
+        # xMHC : HIST1H2AA ~ 7.6mb ~ RPL12P1
+        # reference: Horton, R., Wilming, L., Rand, V., Lovering, R. C., Bruford, E. A., Khodiyar, V. K., ... & Beck, S. (2004). Gene map of the extended human MHC. Nature Reviews Genetics, 5(12), 889-899.
+        # hg38:  25,726,063 ~ 33,400,644
+        # hg19 : 25,726,291 ~ 33,368,421
+        # HLA : GABBR1 ~ 3.78mb ~ KIFC1
+        # reference: Shiina, T., Hosomichi, K., Inoko, H., & Kulski, J. K. (2009). The HLA genomic loci map: expression, interaction, diversity and disease. Journal of human genetics, 54(1), 15-39.
+        # hg38:  29,602,238 ~ 33,409,896
+        # hg19:  29,570,015 ~ 33,377,673
+        if build == "19":
+            if mode =="xmhc":
+                lower=25000000
+                upper=34000000
+            if mode =="hla" or mode =="mhc":
+                lower=29500000
+                upper=33500000
+        if build == "38":
+            if mode =="xmhc":
+                lower=25000000
+                upper=34000000
+            if mode =="hla" or mode =="mhc":
+                lower=29500000
+                upper=33500000
+    else:
+        # -> 25,000,000 ~ 34,000,000
+        if mode =="xmhc":
+            lower=25000000
+            upper=34000000
+        if mode =="hla" or mode =="mhc":
+            lower=29500000
+            upper=33500000
     raw_len = len(sumstats)
     if str(sumstats[chrom].dtype) == "string":

{gwaslab-3.4.44 → gwaslab-3.4.46}/src/gwaslab/util_in_get_sig.py RENAMED Viewed

@@ -11,6 +11,7 @@ from gwaslab.bd_common_data import get_chr_to_number
 from gwaslab.bd_common_data import get_number_to_chr
 from gwaslab.bd_common_data import get_chr_to_NC
 from gwaslab.bd_common_data import gtf_to_protein_coding
+from gwaslab.bd_common_data import gtf_to_all_gene
 from gwaslab.bd_download import check_and_download
 from gwaslab.util_ex_gwascatalog import gwascatalog_trait
 from gwaslab.qc_fix_sumstats import check_dataframe_shape
@@ -38,6 +39,7 @@ def getsig(insumstats,
            wc_correction=False,
            build="19",
            source="ensembl",
+           gtf_path=None,
            verbose=True):
     """
     Extract the lead variants using a sliding window. P or MLOG10P will be used and converted to SCALEDP for sorting.
@@ -172,6 +174,7 @@ def getsig(insumstats,
                xymt=xymt,
                build=build,
                source=source,
+               gtf_path=gtf_path,
                verbose=verbose)
     # drop internal id
@@ -253,6 +256,7 @@ def annogene(
            xymt=["X","Y","MT"],
            build="19",
            source="ensembl",
+           gtf_path=None,
            verbose=True):
     log.write("Start to annotate variants with nearest gene name(s)...", verbose=verbose)
@@ -267,8 +271,13 @@ def annogene(
             #| gzip >Homo_sapiens.GRCh37.75.processed.chr.gtf.gz
             #gtf_path = check_and_download("ensembl_hg19_gtf_protein_coding")
-            gtf_path = check_and_download("ensembl_hg19_gtf")
-            gtf_path = gtf_to_protein_coding(gtf_path,log=log,verbose=verbose)
+            if gtf_path is None:
+                gtf_path = check_and_download("ensembl_hg19_gtf")
+                gtf_path = gtf_to_protein_coding(gtf_path,log=log,verbose=verbose)
+            else:
+                log.write(" -Using user-provided gtf:{}".format(gtf_path))
+                gtf_path = gtf_to_all_gene(gtf_path,log=log,verbose=verbose)
             gtf_db_path = gtf_path[:-2]+"db"
             data = Genome(
@@ -283,8 +292,13 @@ def annogene(
         elif build=="38":
             log.write(" -Assigning Gene name using ensembl_hg38_gtf for protein coding genes", verbose=verbose)
             #gtf_path = check_and_download("ensembl_hg38_gtf_protein_coding")
-            gtf_path = check_and_download("ensembl_hg38_gtf")
-            gtf_path = gtf_to_protein_coding(gtf_path,log=log,verbose=verbose)
+            if gtf_path is None:
+                gtf_path = check_and_download("ensembl_hg38_gtf")
+                gtf_path = gtf_to_protein_coding(gtf_path,log=log,verbose=verbose)
+            else:
+                log.write(" -Using user-provided gtf:{}".format(gtf_path))
+                gtf_path = gtf_to_all_gene(gtf_path,log=log,verbose=verbose)
             gtf_db_path = gtf_path[:-2]+"db"
             data = Genome(
                 reference_name='GRCh38',
@@ -300,8 +314,13 @@ def annogene(
         if build=="19":
             log.write(" -Assigning Gene name using NCBI refseq latest GRCh37 for protein coding genes", verbose=verbose)
             #gtf_path = check_and_download("refseq_hg19_gtf_protein_coding")
-            gtf_path = check_and_download("refseq_hg19_gtf")
-            gtf_path = gtf_to_protein_coding(gtf_path,log=log,verbose=verbose)
+            if gtf_path is None:
+                gtf_path = check_and_download("refseq_hg19_gtf")
+                gtf_path = gtf_to_protein_coding(gtf_path,log=log,verbose=verbose)
+            else:
+                log.write(" -Using user-provided gtf:{}".format(gtf_path))
+                gtf_path = gtf_to_all_gene(gtf_path,log=log,verbose=verbose)
             gtf_db_path = gtf_path[:-2]+"db"
             data = Genome(
                 reference_name='GRCh37',
@@ -315,8 +334,13 @@ def annogene(
         elif build=="38":
             log.write(" -Assigning Gene name using NCBI refseq latest GRCh38 for protein coding genes", verbose=verbose)
             #gtf_path = check_and_download("refseq_hg38_gtf_protein_coding")
-            gtf_path = check_and_download("refseq_hg38_gtf")
-            gtf_path = gtf_to_protein_coding(gtf_path,log=log,verbose=verbose)
+            if gtf_path is None:
+                gtf_path = check_and_download("refseq_hg38_gtf")
+                gtf_path = gtf_to_protein_coding(gtf_path,log=log,verbose=verbose)
+            else:
+                log.write(" -Using user-provided gtf:{}".format(gtf_path))
+                gtf_path = gtf_to_all_gene(gtf_path,log=log,verbose=verbose)
             gtf_db_path = gtf_path[:-2]+"db"
             data = Genome(
                 reference_name='GRCh38',

gwaslab 3.4.44__tar.gz → 3.4.46__tar.gz

Potentially problematic release.

gwaslab 3.4.44__tar.gz → 3.4.46__tar.gz