PyPI - gwaslab - Versions diffs - 3.4.45__py3-none-any.whl → 3.4.47__py3-none-any.whl - Mend

gwaslab 3.4.45__py3-none-any.whl → 3.4.47__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.

This version of gwaslab might be problematic. Click here for more details.

Files changed (27) hide show

gwaslab/__init__.py +2 -1
gwaslab/bd_common_data.py +22 -0
gwaslab/g_Sumstats.py +2 -0
gwaslab/g_version.py +7 -7
gwaslab/hm_harmonize_sumstats.py +3 -2
gwaslab/io_preformat_input.py +22 -1
gwaslab/io_to_formats.py +8 -3
gwaslab/qc_fix_sumstats.py +8 -1
gwaslab/util_ex_calculate_ldmatrix.py +20 -7
gwaslab/util_ex_calculate_prs.py +13 -7
gwaslab/util_ex_process_ref.py +22 -11
gwaslab/util_in_filter_value.py +38 -2
gwaslab/util_in_get_sig.py +32 -8
gwaslab/util_in_meta.py +234 -0
gwaslab/util_in_snphwe.py +58 -0
gwaslab/viz_aux_chromatin.py +112 -0
gwaslab/viz_plot_compare_effect.py +4 -1
gwaslab/viz_plot_mqqplot.py +82 -42
gwaslab/viz_plot_regional2.py +792 -0
gwaslab/viz_plot_regionalplot.py +4 -0
gwaslab/viz_plot_stackedregional.py +97 -22
{gwaslab-3.4.45.dist-info → gwaslab-3.4.47.dist-info}/METADATA +5 -5
{gwaslab-3.4.45.dist-info → gwaslab-3.4.47.dist-info}/RECORD +27 -23
{gwaslab-3.4.45.dist-info → gwaslab-3.4.47.dist-info}/WHEEL +1 -1
{gwaslab-3.4.45.dist-info → gwaslab-3.4.47.dist-info}/LICENSE +0 -0
{gwaslab-3.4.45.dist-info → gwaslab-3.4.47.dist-info}/LICENSE_before_v3.4.39 +0 -0
{gwaslab-3.4.45.dist-info → gwaslab-3.4.47.dist-info}/top_level.txt +0 -0

gwaslab/__init__.py CHANGED Viewed

@@ -44,4 +44,5 @@ from gwaslab.viz_plot_trumpetplot import plot_power
 from gwaslab.viz_plot_trumpetplot import plot_power_x
 from gwaslab.util_ex_process_h5 import process_vcf_to_hfd5
 from gwaslab.util_ex_run_susie import _run_susie_rss as run_susie_rss
-from gwaslab.io_read_tabular import _read_tabular as read_tabular
+from gwaslab.io_read_tabular import _read_tabular as read_tabular
+from gwaslab.util_in_meta import meta_analyze

gwaslab/bd_common_data.py CHANGED Viewed

@@ -298,6 +298,28 @@ def gtf_to_protein_coding(gtfpath,log=Log(),verbose=True):
     return protein_coding_path
+def gtf_to_all_gene(gtfpath,log=Log(),verbose=True):
+    all_gene_path = gtfpath[:-6]+"all_genes.gtf.gz"
+    # if not existing, extract records of all genes (no biotype filter) and output to a new file
+    if not path.isfile(all_gene_path):
+        # get gene list
+        log.write(" - Extracting genes from {}".format(gtfpath),verbose=verbose)
+        gtf = read_gtf(gtfpath,usecols=["feature","gene_biotype","gene_id","gene_name"])
+        gene_list = gtf.loc[gtf["feature"]=="gene","gene_id"].values
+        log.write(" - Loaded {} genes.".format(len(gene_list)),verbose=verbose)
+        # extract entry using csv
+        gtf_raw = pd.read_csv(gtfpath,sep="\t",header=None,comment="#",dtype="string")
+        gtf_raw["_gene_id"] = gtf_raw[8].str.extract(r'gene_id "([\w\.-]+)"')
+        gtf_raw = gtf_raw.loc[ gtf_raw["_gene_id"].isin(gene_list) ,:]
+        gtf_raw = gtf_raw.drop("_gene_id",axis=1)
+        log.write(" - Extracted records are saved to : {} ".format(all_gene_path),verbose=verbose)
+        gtf_raw.to_csv(all_gene_path, header=None, index=None, sep="\t")
+    return all_gene_path
 ####################################################################################################################
 # From BioPython: https://github.com/biopython/biopython/blob/c5a6b1374267d769b19c1022b4b45472316e78b4/Bio/Seq.py#L36
 def _maketrans(complement_mapping):

gwaslab/g_Sumstats.py CHANGED Viewed

@@ -121,6 +121,7 @@ class Sumstats():
              snpr2=None,
              status=None,
              other=[],
+             usekeys=None,
              direction=None,
              verbose=True,
              study="Study_1",
@@ -200,6 +201,7 @@ class Sumstats():
           trait=trait,
           status=status,
           other=other,
+          usekeys=usekeys,
           verbose=verbose,
           readargs=readargs,
           log=self.log)

gwaslab/g_version.py CHANGED Viewed

@@ -15,16 +15,16 @@ def _get_version():
 def gwaslab_info():
     # version meta information
     dic={
-   "version":"3.4.45",
-   "release_date":"20240509"
+   "version":"3.4.47",
+   "release_date":"20240703"
     }
     return dic
-def _checking_plink_version(v=2,log=Log(), verbose=True):
-    if v==1:
-        which_plink_script = "plink --version"
-    elif v==2:
-        which_plink_script = "plink2 --version"
+def _checking_plink_version(plink=None,plink2=None,log=Log(), verbose=True):
+    if plink is not None:
+        which_plink_script = "{} --version".format(plink)
+    elif plink2 is not None:
+        which_plink_script = "{}  --version".format(plink2)
     output = subprocess.check_output(which_plink_script, stderr=subprocess.STDOUT, shell=True,text=True)
     log.write(" -PLINK version: {}".format(output.strip()))
     return log

gwaslab/hm_harmonize_sumstats.py CHANGED Viewed

@@ -868,8 +868,9 @@ def parallelizeassignrsid(sumstats, path, ref_mode="vcf",snpid="SNPID",rsid="rsI
         if is_enough_info == False: return sumstats
         ############################################################################################
-        standardized_normalized = sumstats["STATUS"].str.match("\w\w\w[0][01234]\w\w", case=False, flags=0, na=False)
+        #standardized_normalized = sumstats["STATUS"].str.match("\w\w\w[0][01234]\w\w", case=False, flags=0, na=False)
+        standardized_normalized = sumstats["STATUS"] == sumstats["STATUS"]
         if rsid not in sumstats.columns:
             sumstats[rsid]=pd.Series(dtype="string")

gwaslab/io_preformat_input.py CHANGED Viewed

@@ -55,6 +55,7 @@ def preformat(sumstats,
           trait=None,
           build=None,
           other=[],
+          usekeys=None,
           verbose=False,
           readargs=None,
           log=None):
@@ -65,6 +66,11 @@ def preformat(sumstats,
     dtype_dictionary ={}
  #######################################################################################################################################################
+    # workflow:
+    # 1. formatbook
+    # 2. user specified header
+    # 3. usekeys
     if fmt is not None:
         # loading format parameters
         log.write("Start to load format from formatbook....",verbose=verbose)
@@ -129,6 +135,8 @@ def preformat(sumstats,
         ################################################
         for key,value in rename_dictionary.items():
+            # check available keys  key->raw header
+            # usecols : a list of raw headers to load from file/DataFrame
             if key in raw_cols:
                 usecols.append(key)
             if value in ["EA","NEA"]:
@@ -137,7 +145,7 @@ def preformat(sumstats,
                 dtype_dictionary[value]="string"
     except ValueError:
-        raise ValueError("Please input a path or a pd.DataFrame, and make sure the columns you specified are in the file.")
+        raise ValueError("Please input a path or a pd.DataFrame, and make sure the separator is correct and the columns you specified are in the file.")
     ###################################################################################################################################################
     ## check columns/datatype to use
@@ -276,6 +284,19 @@ def preformat(sumstats,
         else:
             study = raw_cols[9]
             usecols =  usecols + [study]
+    if usekeys is not None:
+    # extract only specified keys
+        usecols_new =[]
+        for i in usekeys:
+            for k, v in rename_dictionary.items():
+                if i == v:
+                    usecols_new.append(k)
+        usecols_valid =[]
+        for i in usecols_new:
+            if i in usecols:
+                usecols_valid.append(i)
+        usecols = usecols_valid
  #loading data ##########################################################################################################
     try:

gwaslab/io_to_formats.py CHANGED Viewed

@@ -212,8 +212,10 @@ def tofmt(sumstats,
     log.write(" -Start outputting sumstats in "+fmt+" format...")
     if "CHR" in sumstats.columns:
+        # output X,Y,MT instead of 23,24,25
         if xymt_number is False and pd.api.types.is_integer_dtype(sumstats["CHR"]):
             sumstats["CHR"]= sumstats["CHR"].map(get_number_to_chr(xymt=xymt,prefix=chr_prefix))
+        # add prefix to CHR
         elif chr_prefix is not None:
             sumstats["CHR"]= chr_prefix + sumstats["CHR"].astype("string")
@@ -437,17 +439,20 @@ def _configure_output_cols_and_args(sumstats, rename_dictionary, cols, no_status
             ouput_cols.append(i)
     # + additional cols and remove duplicated
-    ouput_cols = list(set(ouput_cols + cols))
+    ouput_cols_final = []
+    for i in ouput_cols + cols:
+        if i not in ouput_cols_final:
+            ouput_cols_final.append(i)
     # remove STATUS
     try:
         if no_status == True:
-            ouput_cols.remove("STATUS")
+            ouput_cols_final.remove("STATUS")
     except:
         pass
     #filter and rename to target format headers
-    sumstats = sumstats[ouput_cols]
+    sumstats = sumstats[ouput_cols_final]
     sumstats = sumstats.rename(columns=rename_dictionary)
     # configure target format args and reorder columns

gwaslab/qc_fix_sumstats.py CHANGED Viewed

@@ -1061,6 +1061,13 @@ def check_range(sumstats, var_range, header, coltocheck, cols_to_check, log, ver
             if sum(is_low_p) >0:
                 log.warning("Extremely low P detected (P=0 or P < minimum positive value of float64) : {}".format(sum(is_low_p)))
                 log.warning("Please consider using MLOG10P instead.")
+        if header=="INFO":
+            is_high_info =  sumstats["INFO"]>1
+            if sum(is_high_info) >0:
+                log.warning("High INFO detected (INFO>1) : {}".format(sum(is_high_info)))
+                log.warning("max(INFO): {}".format(sumstats["INFO"].max()))
+                log.warning("Please check if this is as expected.")
         if sum(~is_valid)>0:
             try:
@@ -1102,7 +1109,7 @@ def sanitycheckstats(sumstats,
                      HR=(-100,100),
                      HR_95L=(0,float("Inf")),
                      HR_95U=(0,float("Inf")),
-                     info=(0,1),
+                     info=(0,2),
                      float_tolerence = 1e-7,
                      verbose=True,
                      log=Log()):

gwaslab/util_ex_calculate_ldmatrix.py CHANGED Viewed

@@ -17,6 +17,8 @@ def tofinemapping(sumstats,
                   vcf=None,
                   loci=None,
                   out="./",
+                  plink="plink",
+                  plink2="plink2",
                   windowsizekb=1000,
                   n_cores=1,
                   mode="r",
@@ -56,6 +58,9 @@ def tofinemapping(sumstats,
     else:
         sig_df = sumstats.loc[sumstats["SNPID"].isin(loci),:]
+    log.write(" -plink1.9 path: {}".format(plink),verbose=verbose)
+    log.write(" -plink2 path: {}".format(plink2),verbose=verbose)
     # Drop duplicate!!!!
     log.write(" -Dropping duplicated SNPIDs...",verbose=verbose)
     sumstats = sumstats.drop_duplicates(subset=["SNPID"]).copy()
@@ -68,11 +73,13 @@ def tofinemapping(sumstats,
     if exclude_hla==True:
         sig_df = _exclude_hla(sig_df, log=log, verbose=verbose)
+    sig_df = sig_df.reset_index()
     ## for each lead variant
     for index, row in sig_df.iterrows():
         # extract snplist in each locus
         gc.collect()
+        log.write(" -Locus #{}---------------------------------------------------------------".format(index+1))
         log.write(" -Processing locus with lead variant {} at CHR {} POS {} ...".format(row["SNPID"],row["CHR"],row["POS"]))
         locus_sumstats = _extract_variants_in_locus(sumstats, windowsizekb, locus = (row["CHR"],row["POS"]))
@@ -84,7 +91,10 @@ def tofinemapping(sumstats,
                                                                     n_cores=n_cores,
                                                                     log=log,
                                                                     load_bim=True,
-                                                                    overwrite=overwrite,**kwargs)
+                                                                    overwrite=overwrite,
+                                                                    plink=plink,
+                                                                    plink2=plink2,
+                                                                    **kwargs)
         ## check available snps with reference file
         matched_sumstats = _align_sumstats_with_bim(row=row,
@@ -114,7 +124,10 @@ def tofinemapping(sumstats,
                                                             windowsizekb=windowsizekb,
                                                             out=out,
                                                             plink_log=plink_log,
-                                                            log=log,filetype=filetype,
+                                                            log=log,
+                                                            filetype=filetype,
+                                                            plink=plink,
+                                                            plink2=plink2,
                                                             verbose=verbose)
@@ -143,12 +156,12 @@ def tofinemapping(sumstats,
-def _calculate_ld_r(study, matched_sumstats_snpid, row, bfile_prefix, n_cores, windowsizekb,out,plink_log,log,memory,mode,filetype,verbose=True):
+def _calculate_ld_r(study, matched_sumstats_snpid, row, bfile_prefix, n_cores, windowsizekb,out,plink_log,log,memory,mode,filetype,plink,plink2,verbose=True):
     '''
     Calculate LD r matrix by calling PLINK; return file name and log
     '''
     log.write(" -Start to calculate LD r matrix...",verbose=verbose)
-    log = _checking_plink_version(v=1, log=log)
+    log = _checking_plink_version(plink=plink, log=log)
     if "@" in bfile_prefix:
         bfile_to_use = bfile_prefix.replace("@",str(row["CHR"]))
     else:
@@ -165,7 +178,7 @@ def _calculate_ld_r(study, matched_sumstats_snpid, row, bfile_prefix, n_cores, w
             raise ValueError("Please use bfile instead of pfile for PLINK1.")
         script_vcf_to_bfile = """
-        plink \
+        {} \
             --bfile {} \
             --keep-allele-order \
             --extract {} \
@@ -175,7 +188,7 @@ def _calculate_ld_r(study, matched_sumstats_snpid, row, bfile_prefix, n_cores, w
             --threads {} {}\
             --write-snplist \
             --out {}
-        """.format(bfile_to_use, snplist_path , row["CHR"], mode, n_cores, memory_flag if memory is not None else "", output_prefix)
+        """.format(plink, bfile_to_use, snplist_path , row["CHR"], mode, n_cores, memory_flag if memory is not None else "", output_prefix)
         try:
             output = subprocess.check_output(script_vcf_to_bfile, stderr=subprocess.STDOUT, shell=True,text=True)

gwaslab/util_ex_calculate_prs.py CHANGED Viewed

@@ -18,6 +18,8 @@ def _calculate_prs(sumstats,
           memory=None,
           overwrite=False,
           mode=None,delete=True,
+          plink="plink",
+          plink2="plink2",
           log=Log(),**kwargs):
     #matching_alleles
@@ -30,14 +32,18 @@ def _calculate_prs(sumstats,
         chrlist.sort()
         plink_log = ""
         #process reference files
-        bfile_prefix, plink_log, ref_bim, filetype = _process_plink_input_files(  chrlist=chrlist,
+        bfile_prefix, plink_log, ref_bim, filetype = _process_plink_input_files(
+                                                                    chrlist=chrlist,
                                                                     bfile=bfile,
                                                                     vcf=vcf,
                                                                     plink_log=plink_log,
                                                                     n_cores=n_cores,
                                                                     log=log,
                                                                     load_bim=False,
-                                                                    overwrite=overwrite,**kwargs)
+                                                                    overwrite=overwrite,
+                                                                    plink=plink,
+                                                                    plink2=plink2,
+                                                                    **kwargs)
         score_file_path_list =[]
         for index, chrom in enumerate(chrlist):
             chr_sumstats = sumstats.loc[sumstats["CHR"]==chrom,:].copy()
@@ -61,7 +67,7 @@ def _calculate_prs(sumstats,
                                plink_log=plink_log,
                                log=log,
                                memory=memory,
-                               mode=mode,filetype=filetype)
+                               mode=mode,filetype=filetype,plink2=plink2)
             score_file_path_list.append(score_file_path)
             if delete == True:
                 os.remove(model_path)
@@ -71,10 +77,10 @@ def _calculate_prs(sumstats,
-def _run_calculate_prs(study, chrom , model_path, bfile_prefix, n_cores, out, plink_log, log, memory,filetype, mode=None):
+def _run_calculate_prs(study, chrom , model_path, bfile_prefix, n_cores, out, plink_log, log, memory,filetype, plink2,mode=None):
     log.write(" -Start to calculate PRS for Chr {}...".format(chrom))
-    _checking_plink_version(v=2, log=log)
+    _checking_plink_version(plink2=plink2, log=log)
     if "@" in bfile_prefix:
         bpfile_to_use = bfile_prefix.replace("@",str(chrom))
@@ -92,13 +98,13 @@ def _run_calculate_prs(study, chrom , model_path, bfile_prefix, n_cores, out, pl
         memory_flag = "--memory {}".format(memory)
     script_vcf_to_bfile = """
-    plink2 \
+    {} \
         {} \
         --score {} 1 2 3 header {} cols=+scoresums,+denom ignore-dup-ids \
         --chr {} \
         --threads {} {}\
         --out {}
-    """.format(file_flag, model_path ,  mode if mode is not None else "", chrom, n_cores, memory_flag if memory is not None else "", output_prefix)
+    """.format(plink2, file_flag, model_path ,  mode if mode is not None else "", chrom, n_cores, memory_flag if memory is not None else "", output_prefix)
     try:
         output = subprocess.check_output(script_vcf_to_bfile, stderr=subprocess.STDOUT, shell=True,text=True)

gwaslab/util_ex_process_ref.py CHANGED Viewed

@@ -20,7 +20,9 @@ def _process_plink_input_files(chrlist,
                                bgen_mode="ref-first",
                                convert="bfile",
                                memory=None,
-                               load_bim=False):
+                               load_bim=False,
+                               plink="plink",
+                               plink2="plink2"):
     """
     Process input files (bfile,pfile,vcf,bgen) to either PLINK1 bed/bim/fam or PLINK2 pgen/psam/pvar.
@@ -66,7 +68,9 @@ def _process_plink_input_files(chrlist,
                                                             convert=convert,
                                                             memory=memory,
                                                             overwrite=overwrite,
-                                                            load_bim=load_bim)
+                                                            load_bim=load_bim,
+                                                            plink=plink,
+                                                            plink2=plink2)
         filetype = convert
     elif filetype == "bgen":
         ref_file_prefix, plink_log, ref_bims = _process_bgen(ref_file_prefix=ref_file_prefix,
@@ -81,7 +85,9 @@ def _process_plink_input_files(chrlist,
                                                             convert=convert,
                                                             memory=memory,
                                                             overwrite=overwrite,
-                                                            load_bim=load_bim)
+                                                            load_bim=load_bim,
+                                                            plink=plink,
+                                                            plink2=plink2)
         filetype = convert
     return ref_file_prefix, plink_log, ref_bims, filetype
@@ -199,11 +205,13 @@ def _process_vcf(ref_file_prefix,
                  convert="bfile",
                  memory=None,
                  overwrite=False,
-                 load_bim=False):
+                 load_bim=False,
+                 plink="plink",
+                 plink2="plink2"):
     log.write(" -Processing VCF : {}...".format(ref_file_prefix))
     #check plink version
-    log = _checking_plink_version(v=2,log=log)
+    log = _checking_plink_version(plink2=plink2,log=log)
     # file path prefix to return
     if is_wild_card==True:
@@ -243,14 +251,15 @@ def _process_vcf(ref_file_prefix,
         #if not existing or overwrite is True
         if (not is_file_exist) or overwrite:
             script_vcf_to_bfile = """
-            plink2 \
+            {} \
                 --vcf {} \
                 --chr {} \
                 {} \
                 --rm-dup force-first \
                 --threads {}{}\
                 --out {}
-            """.format(vcf_to_load,
+            """.format(plink2,
+                        vcf_to_load,
                        i,
                        make_flag,
                        n_cores, memory_flag,
@@ -288,11 +297,13 @@ def _process_bgen(ref_file_prefix,
                   convert="bfile",
                   memory=None,
                   overwrite=False,
-                  load_bim=False):
+                  load_bim=False,
+                  plink="plink",
+                 plink2="plink2"):
     log.write(" -Processing BGEN files : {}...".format(ref_file_prefix))
     #check plink version
-    log = _checking_plink_version(v=2,log=log)
+    log = _checking_plink_version(log=log,plink2=plink2)
     # file path prefix to return
     if is_wild_card==True:
@@ -338,14 +349,14 @@ def _process_bgen(ref_file_prefix,
         #if not existing or overwrite is True
         if (not is_file_exist) or overwrite:
             script_vcf_to_bfile = """
-            plink2 \
+            {} \
                 --bgen {} {} {}\
                 --chr {} \
                 {} \
                 --rm-dup force-first \
                 --threads {}{}\
                 --out {}
-            """.format(bgen_to_load, bgen_mode, sample_flag,
+            """.format(plink2,bgen_to_load, bgen_mode, sample_flag,
                        i,
                        make_flag,
                        n_cores, memory_flag,

gwaslab/util_in_filter_value.py CHANGED Viewed

@@ -10,6 +10,7 @@ from gwaslab.g_vchange_status import vchange_status
 from gwaslab.qc_fix_sumstats import sortcoordinate
 from gwaslab.qc_fix_sumstats import start_to
 from gwaslab.qc_fix_sumstats import finished
+from gwaslab.qc_fix_sumstats import _process_build
 from gwaslab.hm_harmonize_sumstats import is_palindromic
 import gc
@@ -430,8 +431,43 @@ def _filter_snp(sumstats, mode="in", ea="EA",nea="NEA", log=Log(),verbose=True):
     log.write("Finished filtering SNPs.",verbose=verbose)
     return snp
-def _exclude_hla(sumstats, chrom="CHR", pos="POS", lower=25000000 ,upper=34000000 ,log=Log(), verbose=True):
+def _exclude_hla(sumstats, chrom="CHR", pos="POS", lower=None ,upper=None, build=None, mode="xmhc", log=Log(), verbose=True):
+    if build is not None:
+        build = _process_build(build = build,log = log,verbose = verbose)
+        # xMHC : HIST1H2AA ~ 7.6mb ~ RPL12P1
+        # reference: Horton, R., Wilming, L., Rand, V., Lovering, R. C., Bruford, E. A., Khodiyar, V. K., ... & Beck, S. (2004). Gene map of the extended human MHC. Nature Reviews Genetics, 5(12), 889-899.
+        # hg38:  25,726,063 ~ 33,400,644
+        # hg19 : 25,726,291 ~ 33,368,421
+        # HLA : GABBR1 ~ 3.78mb ~ KIFC1
+        # reference: Shiina, T., Hosomichi, K., Inoko, H., & Kulski, J. K. (2009). The HLA genomic loci map: expression, interaction, diversity and disease. Journal of human genetics, 54(1), 15-39.
+        # hg38:  29,602,238 ~ 33,409,896
+        # hg19:  29,570,015 ~ 33,377,673
+        if build == "19":
+            if mode =="xmhc":
+                lower=25000000
+                upper=34000000
+            if mode =="hla" or mode =="mhc":
+                lower=29500000
+                upper=33500000
+        if build == "38":
+            if mode =="xmhc":
+                lower=25000000
+                upper=34000000
+            if mode =="hla" or mode =="mhc":
+                lower=29500000
+                upper=33500000
+    else:
+        # -> 25,000,000 ~ 34,000,000
+        if mode =="xmhc":
+            lower=25000000
+            upper=34000000
+        if mode =="hla" or mode =="mhc":
+            lower=29500000
+            upper=33500000
     raw_len = len(sumstats)
     if str(sumstats[chrom].dtype) == "string":

gwaslab/util_in_get_sig.py CHANGED Viewed

@@ -11,6 +11,7 @@ from gwaslab.bd_common_data import get_chr_to_number
 from gwaslab.bd_common_data import get_number_to_chr
 from gwaslab.bd_common_data import get_chr_to_NC
 from gwaslab.bd_common_data import gtf_to_protein_coding
+from gwaslab.bd_common_data import gtf_to_all_gene
 from gwaslab.bd_download import check_and_download
 from gwaslab.util_ex_gwascatalog import gwascatalog_trait
 from gwaslab.qc_fix_sumstats import check_dataframe_shape
@@ -38,6 +39,7 @@ def getsig(insumstats,
            wc_correction=False,
            build="19",
            source="ensembl",
+           gtf_path=None,
            verbose=True):
     """
     Extract the lead variants using a sliding window. P or MLOG10P will be used and converted to SCALEDP for sorting.
@@ -172,6 +174,7 @@ def getsig(insumstats,
                xymt=xymt,
                build=build,
                source=source,
+               gtf_path=gtf_path,
                verbose=verbose)
     # drop internal id
@@ -253,6 +256,7 @@ def annogene(
            xymt=["X","Y","MT"],
            build="19",
            source="ensembl",
+           gtf_path=None,
            verbose=True):
     log.write("Start to annotate variants with nearest gene name(s)...", verbose=verbose)
@@ -267,8 +271,13 @@ def annogene(
             #| gzip >Homo_sapiens.GRCh37.75.processed.chr.gtf.gz
             #gtf_path = check_and_download("ensembl_hg19_gtf_protein_coding")
-            gtf_path = check_and_download("ensembl_hg19_gtf")
-            gtf_path = gtf_to_protein_coding(gtf_path,log=log,verbose=verbose)
+            if gtf_path is None:
+                gtf_path = check_and_download("ensembl_hg19_gtf")
+                gtf_path = gtf_to_protein_coding(gtf_path,log=log,verbose=verbose)
+            else:
+                log.write(" -Using user-provided gtf:{}".format(gtf_path))
+                gtf_path = gtf_to_all_gene(gtf_path,log=log,verbose=verbose)
             gtf_db_path = gtf_path[:-2]+"db"
             data = Genome(
@@ -283,8 +292,13 @@ def annogene(
         elif build=="38":
             log.write(" -Assigning Gene name using ensembl_hg38_gtf for protein coding genes", verbose=verbose)
             #gtf_path = check_and_download("ensembl_hg38_gtf_protein_coding")
-            gtf_path = check_and_download("ensembl_hg38_gtf")
-            gtf_path = gtf_to_protein_coding(gtf_path,log=log,verbose=verbose)
+            if gtf_path is None:
+                gtf_path = check_and_download("ensembl_hg38_gtf")
+                gtf_path = gtf_to_protein_coding(gtf_path,log=log,verbose=verbose)
+            else:
+                log.write(" -Using user-provided gtf:{}".format(gtf_path))
+                gtf_path = gtf_to_all_gene(gtf_path,log=log,verbose=verbose)
             gtf_db_path = gtf_path[:-2]+"db"
             data = Genome(
                 reference_name='GRCh38',
@@ -300,8 +314,13 @@ def annogene(
         if build=="19":
             log.write(" -Assigning Gene name using NCBI refseq latest GRCh37 for protein coding genes", verbose=verbose)
             #gtf_path = check_and_download("refseq_hg19_gtf_protein_coding")
-            gtf_path = check_and_download("refseq_hg19_gtf")
-            gtf_path = gtf_to_protein_coding(gtf_path,log=log,verbose=verbose)
+            if gtf_path is None:
+                gtf_path = check_and_download("refseq_hg19_gtf")
+                gtf_path = gtf_to_protein_coding(gtf_path,log=log,verbose=verbose)
+            else:
+                log.write(" -Using user-provided gtf:{}".format(gtf_path))
+                gtf_path = gtf_to_all_gene(gtf_path,log=log,verbose=verbose)
             gtf_db_path = gtf_path[:-2]+"db"
             data = Genome(
                 reference_name='GRCh37',
@@ -315,8 +334,13 @@ def annogene(
         elif build=="38":
             log.write(" -Assigning Gene name using NCBI refseq latest GRCh38 for protein coding genes", verbose=verbose)
             #gtf_path = check_and_download("refseq_hg38_gtf_protein_coding")
-            gtf_path = check_and_download("refseq_hg38_gtf")
-            gtf_path = gtf_to_protein_coding(gtf_path,log=log,verbose=verbose)
+            if gtf_path is None:
+                gtf_path = check_and_download("refseq_hg38_gtf")
+                gtf_path = gtf_to_protein_coding(gtf_path,log=log,verbose=verbose)
+            else:
+                log.write(" -Using user-provided gtf:{}".format(gtf_path))
+                gtf_path = gtf_to_all_gene(gtf_path,log=log,verbose=verbose)
             gtf_db_path = gtf_path[:-2]+"db"
             data = Genome(
                 reference_name='GRCh38',

gwaslab 3.4.45__py3-none-any.whl → 3.4.47__py3-none-any.whl

Potentially problematic release.

gwaslab 3.4.45__py3-none-any.whl → 3.4.47__py3-none-any.whl