gwaslab 3.5.4__py3-none-any.whl → 3.5.6__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release: this version of gwaslab might be problematic.
- gwaslab/__init__.py +3 -1
- gwaslab/g_Sumstats.py +56 -9
- gwaslab/g_SumstatsPair.py +16 -12
- gwaslab/g_SumstatsSet.py +663 -0
- gwaslab/g_headers.py +131 -0
- gwaslab/g_meta.py +2 -1
- gwaslab/g_version.py +3 -3
- gwaslab/hm_harmonize_sumstats.py +91 -1
- gwaslab/io_preformat_input.py +29 -7
- gwaslab/io_read_pipcs.py +23 -0
- gwaslab/io_to_formats.py +45 -44
- gwaslab/qc_check_datatype.py +65 -42
- gwaslab/qc_fix_sumstats.py +1 -1
- gwaslab/util_ex_ldproxyfinder.py +162 -3
- gwaslab/util_ex_ldsc.py +9 -0
- gwaslab/util_ex_run_2samplemr.py +34 -0
- gwaslab/util_ex_run_clumping.py +4 -2
- gwaslab/util_in_fill_data.py +28 -3
- gwaslab/util_in_filter_value.py +66 -1
- gwaslab/util_in_merge.py +51 -0
- gwaslab/viz_aux_save_figure.py +2 -1
- gwaslab/viz_plot_credible_sets.py +99 -0
- gwaslab/viz_plot_effect.py +283 -0
- gwaslab/viz_plot_miamiplot2.py +1 -1
- gwaslab/viz_plot_mqqplot.py +31 -11
- gwaslab/viz_plot_regional2.py +133 -32
- gwaslab/viz_plot_stackedregional.py +64 -34
- {gwaslab-3.5.4.dist-info → gwaslab-3.5.6.dist-info}/METADATA +4 -4
- {gwaslab-3.5.4.dist-info → gwaslab-3.5.6.dist-info}/RECORD +33 -28
- {gwaslab-3.5.4.dist-info → gwaslab-3.5.6.dist-info}/WHEEL +1 -1
- gwaslab/vis_plot_credible sets.py +0 -0
- {gwaslab-3.5.4.dist-info → gwaslab-3.5.6.dist-info}/LICENSE +0 -0
- {gwaslab-3.5.4.dist-info → gwaslab-3.5.6.dist-info}/LICENSE_before_v3.4.39 +0 -0
- {gwaslab-3.5.4.dist-info → gwaslab-3.5.6.dist-info}/top_level.txt +0 -0
gwaslab/util_ex_ldproxyfinder.py
CHANGED
@@ -37,7 +37,7 @@ from gwaslab.hm_harmonize_sumstats import auto_check_vcf_chr_dict
 #check if in outcome and exposure snp list
 #replace
 
-def _extract_with_ld_proxy(
+def _extract_with_ld_proxy( snplist=None,
                             common_sumstats=None,
                             sumstats1=None,
                             vcf_path=None,
@@ -58,6 +58,7 @@ def _extract_with_ld_proxy( snplist=None,
     is_needed=[]
     no_need =[]
 
+    print(common_sumstats.head())
     for i in snplist:
         if i in common_sumstats["SNPID"].values:
             no_need.append(i)
@@ -72,7 +73,7 @@ def _extract_with_ld_proxy( snplist=None,
     if len(in_sumstats)==0:
         log.write(" -No available variants for LD proxy checking...Skipping... ", verbose=verbose)
     else:
-        log.write(" -{}
+        log.write(" -{} available variants for LD proxy checking... ".format(len(in_sumstats)), verbose=verbose)
 
     for index,row in in_sumstats.iterrows():
         # determine SNP and select region
@@ -93,6 +94,16 @@ def _extract_with_ld_proxy( snplist=None,
         if len(flanking_sumstats)==0:
             log.write(" -No availble variants in the region...Skipping!", verbose=verbose)
             continue
+
+        _get_rsq_single(in_sumstats.loc[index,["POS","NEA_1","EA_1"]],
+                        row_pos=row["POS"],
+                        vcf_path=vcf_path,
+                        region=region,
+                        log=log,
+                        verbose=verbose,
+                        vcf_chr_dict=vcf_chr_dict,
+                        tabix=tabix)
+
 
         flanking_sumstats = _get_rsq(row =in_sumstats.loc[index,["POS","NEA_1","EA_1"]],
                                      sumstats = flanking_sumstats,
@@ -126,6 +137,81 @@ def _extract_with_ld_proxy( snplist=None,
     return extracted_sumstats
 
 
+def _extract_ld_proxy( snplist=None,
+                       common_sumstats=None,
+                       vcf_path=None,
+                       vcf_chr_dict=None,
+                       tabix=None,
+                       log=Log(),
+                       verbose=True,
+                       windowsizekb=100,
+                       ld_threshold=0.8
+                       ):
+    ### Load vcf#######################################################################################
+    log.write("Start to load reference genotype...", verbose=verbose)
+    log.write(" -reference vcf path : "+ vcf_path, verbose=verbose)
+    if tabix is None:
+        tabix = which("tabix")
+    vcf_chr_dict = auto_check_vcf_chr_dict(vcf_path=vcf_path, vcf_chr_dict=vcf_chr_dict, verbose=verbose, log=log)
+
+    ld_proxies = pd.DataFrame()
+    in_sumstats = common_sumstats.loc[common_sumstats["SNPID"].isin(snplist),:]
+
+    if len(in_sumstats)==0:
+        log.write(" -No available variants for LD proxy checking...Skipping... ", verbose=verbose)
+    else:
+        log.write(" -{} available variants for LD proxy checking... ".format(len(in_sumstats)), verbose=verbose)
+
+    for index,row in in_sumstats.iterrows():
+        # determine SNP and select region
+        snpid = row["SNPID"]
+        chrom= int(row["CHR"])
+        start= int(row["POS"]-windowsizekb*1000)
+        end= int(row["POS"]+windowsizekb*1000)
+
+        region = (chrom, start, end)
+
+        ### #######################################################################################
+        #is_flanking = common_sumstats["CHR"] == chrom & common_sumstats["CHR"]>start & common_sumstats["CHR"]<end
+        #flanking_sumstats = common_sumstats.loc[is_flanking,:]
+        flanking_sumstats = common_sumstats.query('CHR == @chrom and @start < POS < @end',engine='python').copy()
+
+        log.write(" -Extract {} variants in flanking region of {} for checking: {}:{}-{}".format(len(flanking_sumstats), snpid, chrom, start, end), verbose=verbose)
+
+        if len(flanking_sumstats)==0:
+            log.write(" -No availble variants in the region...Skipping!", verbose=verbose)
+            continue
+
+        flanking_sumstats = _get_rsq(row =in_sumstats.loc[index,["POS","NEA","EA"]],
+                                     sumstats = flanking_sumstats,
+                                     row_pos=row["POS"],
+                                     vcf_path=vcf_path,
+                                     region=region,
+                                     log=log,
+                                     verbose=verbose,
+                                     vcf_chr_dict=vcf_chr_dict,
+                                     tabix=tabix)
+        if flanking_sumstats is None:
+            log.write(" -{} is not found in the vcf...Skipping!".format(snpid))
+            continue
+        flanking_sumstats = flanking_sumstats.loc[flanking_sumstats["RSQ"]>ld_threshold,:]
+
+        log.write(" -Variants in LD with {} (RSQ > {}): {}".format(snpid, ld_threshold,len(flanking_sumstats)), verbose=verbose)
+
+        if len(flanking_sumstats)>0:
+            flanking_sumstats["LD_REF_VARIANT"]= snpid
+            for i,row_with_rsq in flanking_sumstats.iterrows():
+                if row_with_rsq["SNPID"] in common_sumstats["SNPID"].values:
+                    log.write(" -Top Proxy for {} is found: {} (LD RSQ= {})".format(snpid, row_with_rsq["SNPID"], row_with_rsq["RSQ"]))
+                    break
+            #row_with_rsq = pd.DataFrame(row_with_rsq)
+            ld_proxies = pd.concat([ld_proxies, flanking_sumstats], ignore_index=True)
+
+
+    log.write("Finished loading reference genotype successfully!", verbose=verbose)
+    return ld_proxies.sort_values(by="RSQ",ascending=False)
+
+
 def _get_rsq( row,
               sumstats,
               row_pos,
@@ -205,4 +291,77 @@ def _get_rsq( row,
     return sumstats
 
 def _check_if_in_sumstats2(row, sumstast):
-    pass
+    pass
+
+def _get_rsq_single( row,
+                     row_pos,
+                     vcf_path,
+                     region,
+                     log,
+                     verbose,
+                     vcf_chr_dict,
+                     tabix):
+    #load genotype data of the targeted region
+    ref_genotype = read_vcf(vcf_path,region=vcf_chr_dict[region[0]]+":"+str(region[1])+"-"+str(region[2]),tabix=tabix)
+
+    if ref_genotype is None:
+        log.warning("No data was retrieved. Skipping ...", verbose=verbose)
+        ref_genotype=dict()
+        ref_genotype["variants/POS"]=np.array([],dtype="int64")
+        return None
+
+    log.write(" -Retrieving index...", verbose=verbose)
+    log.write(" -Ref variants in the region: {}".format(len(ref_genotype["variants/POS"])), verbose=verbose)
+    # match sumstats pos and ref pos:
+    # get ref index for its first appearance of sumstats pos
+    #######################################################################################
+    def match_varaint(x):
+        # x: "POS,NEA,EA"
+        if np.any(ref_genotype["variants/POS"] == x.iloc[0]):
+            if len(np.where(ref_genotype["variants/POS"] == x.iloc[0] )[0])>1:
+                # multiple position matches
+                for j in np.where(ref_genotype["variants/POS"] == x.iloc[0])[0]:
+                    # for each possible match, compare ref and alt
+                    if x.iloc[1] == ref_genotype["variants/REF"][j]:
+                        if x.iloc[2] in ref_genotype["variants/ALT"][j]:
+                            return j
+                    elif x.iloc[1] in ref_genotype["variants/ALT"][j]:
+                        if x.iloc[2] == ref_genotype["variants/REF"][j]:
+                            return j
+                    else:
+                        return None
+            else:
+                # single match
+                return np.where(ref_genotype["variants/POS"] == x.iloc[0] )[0][0]
+        else:
+            # no position match
+            return None
+
+    #############################################################################################
+    lead_pos = row_pos
+
+    # if lead pos is available:
+    if lead_pos in ref_genotype["variants/POS"]:
+
+        # get ref index for lead snp
+        lead_snp_ref_index = match_varaint(row)
+        #lead_snp_ref_index = np.where(ref_genotype["variants/POS"] == lead_pos)[0][0]
+
+        # non-na other snp index
+        other_snps_ref_index = list(range(len(ref_genotype["calldata/GT"])))
+        other_snps_ref_index.remove(lead_snp_ref_index)
+
+        # get genotype
+        lead_snp_genotype = GenotypeArray([ref_genotype["calldata/GT"][lead_snp_ref_index]]).to_n_alt()
+        other_snp_genotype = GenotypeArray(ref_genotype["calldata/GT"][other_snps_ref_index]).to_n_alt()
+
+        log.write(" -Calculating Rsq...", verbose=verbose)
+
+        if len(other_snp_genotype)>1:
+            valid_r2= np.power(rogers_huff_r_between(lead_snp_genotype,other_snp_genotype)[0],2)
+        else:
+            valid_r2= np.power(rogers_huff_r_between(lead_snp_genotype,other_snp_genotype),2)
+
+        ld_proxy = pd.DataFrame( {"SNPID":ref_genotype["variants/ID"][other_snps_ref_index],"RSQ":valid_r2 })
+
+        return ld_proxy.sort_values(by="RSQ",ascending=False)
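Note on the new _extract_ld_proxy helper above: it windows plus/minus windowsizekb kb around each requested variant, loads reference genotypes from the VCF, and keeps flanking variants whose Rogers-Huff r-squared with the lead variant exceeds ld_threshold. Below is a minimal, self-contained sketch of just that r-squared step using scikit-allel, whose GenotypeArray and rogers_huff_r_between calls appear in the diff; the toy genotypes, variable names, and the printed comments are illustrative assumptions, not gwaslab's API.

import numpy as np
from allel import GenotypeArray, rogers_huff_r_between  # scikit-allel

# toy data: one lead variant and three candidate proxies, five samples each
lead_genotype = GenotypeArray([[[0, 1], [1, 1], [0, 0], [0, 1], [1, 1]]]).to_n_alt()
proxy_genotypes = GenotypeArray([
    [[0, 1], [1, 1], [0, 0], [0, 1], [1, 1]],   # identical dosages  -> r2 should be 1
    [[1, 0], [0, 0], [1, 1], [1, 0], [0, 0]],   # mirrored dosages   -> r2 should be 1
    [[0, 0], [0, 1], [0, 0], [1, 1], [0, 1]],   # weakly correlated
]).to_n_alt()

# same call pattern as _get_rsq / _get_rsq_single: r between allele dosages, then squared
rsq = np.power(rogers_huff_r_between(lead_genotype, proxy_genotypes)[0], 2)
print(rsq)  # proxies with rsq above ld_threshold (default 0.8) would be kept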
gwaslab/util_ex_ldsc.py
CHANGED
@@ -260,6 +260,9 @@ class ARGS():
 def _estimate_h2_by_ldsc(insumstats, log, verbose=True, munge=False, munge_args=None, **kwargs):
     sumstats = insumstats.copy()
 
+    if "N" in sumstats.columns:
+        sumstats["N"] = sumstats["N"].astype("int64")
+
     if munge:
         if munge_args is None:
             munge_args={}
@@ -320,6 +323,8 @@ def _estimate_h2_by_ldsc(insumstats, log, verbose=True, munge=False, munge_args=
 
 def _estimate_partitioned_h2_by_ldsc(insumstats, log, verbose=True, **kwargs):
     sumstats = insumstats.copy()
+    if "N" in sumstats.columns:
+        sumstats["N"] = sumstats["N"].astype("int64")
     ##start function with col checking##########################################################
     _start_line = "run LD score regression"
     _end_line = "running LD score regression"
@@ -366,6 +371,8 @@ def _estimate_partitioned_h2_by_ldsc(insumstats, log, verbose=True, **kwargs):
 
 def _estimate_rg_by_ldsc(insumstats, other_traits ,log, verbose=True, **kwargs):
     sumstats = insumstats.copy()
+    if "N" in sumstats.columns:
+        sumstats["N"] = sumstats["N"].astype("int64")
     ##start function with col checking##########################################################
     _start_line = "run LD score regression for genetic correlation"
     _end_line = "running LD score regression for genetic correlation"
@@ -426,6 +433,8 @@ def _estimate_rg_by_ldsc(insumstats, other_traits ,log, verbose=True, **kwargs):
 
 def _estimate_h2_cts_by_ldsc(insumstats, log, verbose=True, **kwargs):
     sumstats = insumstats.copy()
+    if "N" in sumstats.columns:
+        sumstats["N"] = sumstats["N"].astype("int64")
     ##start function with col checking##########################################################
     _start_line = "run LD score regression"
     _end_line = "running LD score regression"
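The only change in util_ex_ldsc.py is that each LDSC wrapper now casts an existing N column to int64 before running LD score regression. A small illustrative sketch of that cast; the toy frame is made up, and whether N ends up as float64 in practice depends on how the sumstats were built upstream (for example merges or NA handling).

import pandas as pd

# hypothetical sumstats whose per-variant sample size became float64 upstream
sumstats = pd.DataFrame({"SNPID": ["rs1", "rs2"], "Z": [1.2, -0.4], "N": [10000.0, 9876.0]})
print(sumstats["N"].dtype)   # float64

# the cast each _estimate_*_by_ldsc wrapper now applies in 3.5.6
if "N" in sumstats.columns:
    sumstats["N"] = sumstats["N"].astype("int64")
print(sumstats["N"].dtype)   # int64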
gwaslab/util_ex_run_2samplemr.py
CHANGED
@@ -21,6 +21,8 @@ def _run_two_sample_mr(sumstatspair_object,
                        n1=None,
                        n2=None,
                        binary1=False,
+                       cck1=None,
+                       cck2=None,
                        ncase1=None,
                        ncontrol1=None,
                        prevalence1=None,
@@ -35,6 +37,22 @@ def _run_two_sample_mr(sumstatspair_object,
     if methods is None:
         methods = ["mr_ivw","mr_simple_mode","mr_weighted_median","mr_egger_regression","mr_ivw_mre", "mr_weighted_mode"]
     methods_string = '"{}"'.format('","'.join(methods))
+
+    if cck1 is not None:
+        log.write(" - ncase1, ncontrol1, prevalence1:{}".format(cck1))
+        binary1 = True
+        ncase1 = cck1[0]
+        ncontrol1 = cck1[1]
+        prevalence1 = cck1[2]
+        n1 = ncase1 + ncontrol1
+    if cck2 is not None:
+        log.write(" - ncase2, ncontrol2, prevalence2:{}".format(cck2))
+        binary2 = True
+        ncase2 = cck2[0]
+        ncontrol2 = cck2[1]
+        prevalence2 = cck2[2]
+        n2 = ncase2 + ncontrol2
+
     if clump==True:
         sumstatspair = sumstatspair_object.clumps["clumps"]
     else:
@@ -64,10 +82,16 @@ def _run_two_sample_mr(sumstatspair_object,
 
     ###
     calculate_r_script = ""
+
     if binary1==True:
         calculate_r_script+= _make_script_for_calculating_r("exposure", ncase1, ncontrol1, prevalence1)
+    else:
+        calculate_r_script+= _make_script_for_calculating_r_quant("exposure")
+
     if binary2==True:
         calculate_r_script+= _make_script_for_calculating_r("outcome", ncase2, ncontrol2, prevalence2)
+    else:
+        calculate_r_script+= _make_script_for_calculating_r_quant("outcome")
 
     # create scripts
     directionality_test_script='''
@@ -218,6 +242,16 @@ def _make_script_for_calculating_r(exposure_or_outcome, ncase, ncontrol, prevale
     return script
 
 
+def _make_script_for_calculating_r_quant(exposure_or_outcome):
+    script = """
+    harmonized_data$"r.{exposure_or_outcome}" <- get_r_from_bsen( harmonized_data$"beta.{exposure_or_outcome}",
+                                                                  harmonized_data$"se.{exposure_or_outcome}",
+                                                                  harmonized_data$"samplesize.{exposure_or_outcome}"
+                                                                  )
+    """.format(
+        exposure_or_outcome = exposure_or_outcome
+    )
+    return script
 
 
 def _filter_by_f(sumstatspair, f_check, n1, binary1=None, ncase1=None, ncontrol1=None, prevalence1=None, log=Log() ):
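Two additions here: cck1/cck2 let callers pass (ncase, ncontrol, prevalence) as a single tuple per trait, and quantitative traits now get an r column via TwoSampleMR's get_r_from_bsen instead of being left without one. A minimal sketch of how a cck tuple is unpacked; the numbers are made up, and this mirrors the logic in the diff rather than calling gwaslab itself.

# hypothetical case/control/prevalence tuple for the exposure trait
cck1 = (20000, 180000, 0.05)   # (ncase, ncontrol, prevalence)

binary1, ncase1, ncontrol1, prevalence1, n1 = False, None, None, None, None
if cck1 is not None:
    binary1 = True
    ncase1, ncontrol1, prevalence1 = cck1
    n1 = ncase1 + ncontrol1      # total N forwarded to the TwoSampleMR script
print(binary1, n1)               # True 200000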
gwaslab/util_ex_run_clumping.py
CHANGED
@@ -162,7 +162,7 @@ def _clump(insumstats, vcf=None, scaled=False, out="clumping_plink2",
             log.write(e.output)
         #os.system(script)
 
-        clumped = pd.read_csv("{}.clumps".format(out_single_chr),
+        clumped = pd.read_csv("{}.clumps".format(out_single_chr),sep="\s+")
         results = pd.concat([results,clumped],ignore_index=True)
 
     # remove temp SNPIDP file
@@ -172,7 +172,9 @@ def _clump(insumstats, vcf=None, scaled=False, out="clumping_plink2",
     log.write("Finished clumping.",verbose=verbose)
     results_sumstats = insumstats.loc[insumstats["SNPID"].isin(results["SNPID"]),:].copy()
     finished(log=log, verbose=verbose, end_line=_end_line)
-
+
+    return results_sumstats, results, plink_log
+
 
 
 
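The clumping fix reads the .clumps output with sep="\s+" and adds an explicit return of the clumped sumstats, the clump table, and the plink log. An illustrative pandas sketch of what the whitespace separator does; the file content below is a made-up stand-in for a .clumps table, and the column names are assumptions for illustration only.

import io
import pandas as pd

# made-up stand-in for a whitespace-aligned .clumps table
clumps_text = """#CHROM POS ID P
1 12345 rs1 1e-9
2 67890 rs2 3e-8
"""

clumped = pd.read_csv(io.StringIO(clumps_text), sep=r"\s+")  # same idea as the sep="\s+" fix
print(clumped.columns.tolist())   # ['#CHROM', 'POS', 'ID', 'P']
print(clumped["ID"].tolist())     # ['rs1', 'rs2']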
gwaslab/util_in_fill_data.py
CHANGED
@@ -1,6 +1,7 @@
 import pandas as pd
 import numpy as np
 import scipy.stats as ss
+from scipy.stats import norm
 from scipy import stats
 from gwaslab.g_Log import Log
 import gc
@@ -8,6 +9,7 @@ import gc
 from gwaslab.g_version import _get_version
 from gwaslab.qc_check_datatype import check_datatype
 
+
 def filldata(
     insumstats,
     to_fill=None,
@@ -38,7 +40,7 @@ def filldata(
         for i in skip_cols:
             to_fill.remove(i)
         log.write(" -Skipping columns: ",skip_cols, verbose=verbose)
-    if len(set(to_fill) & set(["OR","OR_95L","OR_95U","BETA","SE","P","Z","CHISQ","MLOG10P","MAF"]))==0:
+    if len(set(to_fill) & set(["OR","OR_95L","OR_95U","BETA","SE","P","Z","CHISQ","MLOG10P","MAF","SIG"]))==0:
         log.write(" -No available columns to fill. Skipping.", verbose=verbose)
         log.write("Finished filling data using existing columns.", verbose=verbose)
         return sumstats
@@ -217,6 +219,20 @@ def fill_maf(sumstats,log,verbose=True,filled_count=0):
         return 0,filled_count
     return 1,filled_count
 
+def fill_sig(sumstats,log,sig_level=5e-8, verbose=True,filled_count=0):
+    if "P" in sumstats.columns or "MLOG10P" in sumstats.columns:
+        log.write(" - Determining significant using P and MLOG10P with threshold:{}".format(sig_level), verbose=verbose)
+        if "P" in sumstats.columns:
+            is_sig = sumstats["P"]<sig_level
+        elif "MLOG10P" in sumstats.columns:
+            is_sig = sumstats["MLOG10P"]>np.log10(sig_level)
+        sumstats["SIGNIFICANT"] = False
+        sumstats.loc[is_sig, "SIGNIFICANT"] = True
+        filled_count +=1
+    else:
+        return 0,filled_count
+    return 1,filled_count
+
 ####################################################################################################################
 def fill_extreme_mlog10(sumstats, z):
     log_pvalue = np.log(2) + ss.norm.logsf(np.abs(sumstats[z])) #two-sided
@@ -287,7 +303,10 @@ def fill_iteratively(sumstats,raw_to_fill,log,only_sig,df,extreme,verbose,sig_le
             else:
                 status,filled_count = fill_mlog10p(sumstats,log,verbose=verbose)
                 if status == 1 : to_fill.remove("MLOG10P")
-
+
+        if "SIG" in to_fill:
+            status,filled_count = fill_sig(sumstats,sig_level=sig_level ,log=log,verbose=verbose,filled_count=filled_count)
+            if status == 1 : to_fill.remove("SIG")
         if filled_count == 0:
             break
 
@@ -330,4 +349,10 @@ def _convert_or_to_beta(OR):
     return np.log(OR)
 
 def _convert_beta_to_or(beta):
-    return np.exp(beta)
+    return np.exp(beta)
+
+def rank_based_int(series, c=3/8):
+    #https://onlinelibrary.wiley.com/doi/10.1111/biom.13214
+    n=sum(~series.isna())
+    normalized_value = norm.ppf((series.rank()-c)/(n+1-2*c))
+    return normalized_value
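Besides the new SIG/SIGNIFICANT filler, this file gains rank_based_int, a rank-based inverse-normal transform with offset c = 3/8 (the reference linked in the source comment). A self-contained sketch of the same formula on a toy series; the input values are invented for illustration.

import numpy as np
import pandas as pd
from scipy.stats import norm

def rank_based_int(series, c=3/8):
    # rank-based inverse normal transform, same formula as the new gwaslab helper
    n = sum(~series.isna())
    return norm.ppf((series.rank() - c) / (n + 1 - 2 * c))

x = pd.Series([0.1, 3.2, 1.5, 7.8, 2.0])
print(rank_based_int(x).round(3).tolist())  # roughly symmetric standard-normal quantiles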
gwaslab/util_in_filter_value.py
CHANGED
@@ -513,4 +513,69 @@ def _exclude(sumstats, exclude=None, id_use="SNPID", log=Log(), verbose=True ):
     log.write(" -Excluding {} variants from sumstats...".format(len(exclude)),verbose=verbose)
     sumstats = sumstats.loc[~sumstats[id_use].isin(exclude),:]
     log.write(" -Excluded {} variants from sumstats...".format(len(sumstats)),verbose=verbose)
-    return sumstats
+    return sumstats
+
+def _filter_region(sumstats, region, chrom="CHR",pos="POS",log=Log(),verbose=True):
+    if region is not None:
+        region_chr = region[0]
+        region_start = region[1]
+        region_end = region[2]
+
+        log.write(" -Extract SNPs in region : chr{}:{}-{}...".format(region_chr, region[1], region[2]),verbose=verbose)
+
+        in_region_snp = (sumstats[chrom]==region_chr) & (sumstats[pos]<region_end) & (sumstats[pos]>region_start)
+
+        log.write(" -Extract SNPs in specified regions: "+str(sum(in_region_snp)),verbose=verbose)
+        sumstats = sumstats.loc[in_region_snp,:]
+    return sumstats.copy()
+
+def _search_variants( sumstats, snplist=None,
+                      snpid="SNPID" ,rsid="rsID",
+                      chrom="CHR",pos="POS",ea="EA",nea="NEA",
+                      log=Log(),verbose=True):
+    log.write("Start to search for variants...", verbose=verbose)
+    # create a boolean col with FALSE
+    if snpid in sumstats.columns:
+        is_extract = sumstats[snpid]!=sumstats[snpid]
+    else:
+        is_extract = sumstats[rsid]!=sumstats[rsid]
+
+    # search each variant
+    for variant in snplist:
+
+        if pd.api.types.is_list_like(variant):
+            # (1:1234)
+            single_chrom=variant[0]
+            single_pos=variant[1]
+            is_extract = is_extract | ((sumstats[pos] == single_pos ) &(sumstats[chrom] == single_chrom))
+
+        elif pd.api.types.is_string_dtype(type(variant)):
+            # rs123
+            if "rsID" in sumstats.columns:
+                is_extract = is_extract | (sumstats["rsID"] == variant)
+
+            # 1:123:A:D
+            if "SNPID" in sumstats.columns:
+                is_extract = is_extract | (sumstats["SNPID"] == variant)
+
+            # 1:123:A:D -> (1:1234)
+            a= re.match(r'^(chr|Chr|CHR)?(\d+)[:_-](\d+)([:_-]([ATCG]+)[:_-]([ATCG]+))?$', variant, flags=0)
+
+            if a is not None:
+                if a[4] is None:
+                    single_chrom=int(a[2])
+                    single_pos=int(a[3])
+                    is_extract = is_extract | ((sumstats[pos] == single_pos ) &(sumstats[chrom] == single_chrom))
+                else:
+                    single_chrom = int(a[2])
+                    single_pos = int(a[3])
+                    single_ea = a[5]
+                    single_nea = a[6]
+                    a_match = ((sumstats[nea] == single_nea) & (sumstats[ea] == single_ea)) | ((sumstats[nea] == single_ea) & (sumstats[ea] == single_nea))
+                    is_extract = is_extract | ((sumstats[pos] == single_pos ) &(sumstats[chrom] == single_chrom) & a_match)
+
+    to_search = sumstats.loc[is_extract,:].copy()
+    log.write(" -Found {} variants...".format(len(to_search)),verbose=verbose)
+
+    log.write("Finished searching variants.", verbose=verbose)
+    return to_search
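The new _search_variants accepts a mixed snplist: (chrom, pos) tuples, rsIDs, SNPIDs, and strings such as chromosome:position with or without alleles. The regex it uses for the string forms can be exercised on its own; a short sketch follows, with made-up query strings.

import re

# the pattern _search_variants uses for "chr:pos" and "chr:pos:allele:allele" strings
pattern = r'^(chr|Chr|CHR)?(\d+)[:_-](\d+)([:_-]([ATCG]+)[:_-]([ATCG]+))?$'

for query in ["1:725932", "chr7_92383888_C_T", "rs12345"]:
    m = re.match(pattern, query)
    if m is None:
        print(query, "-> handled only by the rsID/SNPID equality lookup")
    elif m[4] is None:
        print(query, "-> chr", int(m[2]), "pos", int(m[3]))
    else:
        print(query, "-> chr", int(m[2]), "pos", int(m[3]), "alleles", m[5], "/", m[6])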
gwaslab/util_in_merge.py
ADDED
@@ -0,0 +1,51 @@
+import pandas as pd
+from gwaslab.g_Log import Log
+import re
+
+def _extract_variant(variant_set, sumstats_dic, log=Log(), verbose=True):
+
+    combined = pd.DataFrame()
+    log.write("Start to initialize gl.SumstatsSet...", verbose=verbose)
+    for key, sumstats_gls in sumstats_dic.items():
+        log.write(" -{} : {}".format(key, sumstats_gls), verbose=verbose)
+
+    for key, sumstats_gls in sumstats_dic.items():
+
+        sumstats_single = sumstats_gls.data
+
+        # create a boolean col with FALSE
+        is_extract = sumstats_single["SNPID"]!=sumstats_single["SNPID"]
+
+        for variant in variant_set:
+
+            if pd.api.types.is_list_like(variant):
+
+                chrom=variant[0]
+                pos=variant[1]
+
+                is_extract = is_extract | ((sumstats_single["POS"] == pos ) &(sumstats_single["CHR"] == chrom))
+            elif pd.api.types.is_string_dtype(type(variant)):
+
+                is_extract = is_extract | (sumstats_single["SNPID"] == variant)
+
+                a= re.search(r'^(chr|Chr|CHR)?(\d+)[:_-](\d+)[:_-][ATCG]+[:_-][ATCG]+$', variant, flags=0)
+                if a is not None:
+                    chrom=int(a[2])
+                    pos=int(a[3])
+                    is_extract = is_extract | ((sumstats_single["POS"] == pos ) &(sumstats_single["CHR"] == chrom))
+
+        to_extract = sumstats_single.loc[is_extract,:].copy()
+        log.write(" -Extracted {} variants from {}".format(len(to_extract), key),verbose=verbose)
+        to_extract["STUDY"] = key
+
+        to_extract_cols=["STUDY"]
+
+        default_cols=["SNPID","EA","NEA","CHR","POS","BETA","SE","P","MLOG10P","EAF","MAF","STATUS"]
+
+        for i in default_cols:
+            if i in sumstats_single.columns:
+                to_extract_cols.append(i)
+
+        combined = pd.concat([combined, to_extract[to_extract_cols]], ignore_index=True)
+    log.write("Finished initializing gl.SumstatsSet.", verbose=verbose)
+    return combined
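util_in_merge.py is new: _extract_variant walks a dict of study name to Sumstats object, builds a boolean mask per study for the requested variants, tags the hits with a STUDY column, and stacks them for gl.SumstatsSet. A minimal sketch of the mask-building idea on a plain DataFrame; this is not a gwaslab Sumstats object, and the table and variant set are made up.

import pandas as pd

# stand-in for one study's .data table
sumstats_single = pd.DataFrame({
    "SNPID": ["1:100:A:G", "1:200:C:T", "2:300:G:A"],
    "CHR":   [1, 1, 2],
    "POS":   [100, 200, 300],
})
variant_set = ["1:100:A:G", (2, 300)]

# start from an all-False mask, exactly like the SNPID != SNPID trick in the helper
is_extract = sumstats_single["SNPID"] != sumstats_single["SNPID"]
for variant in variant_set:
    if pd.api.types.is_list_like(variant):   # (CHR, POS) tuple
        is_extract |= (sumstats_single["CHR"] == variant[0]) & (sumstats_single["POS"] == variant[1])
    else:                                     # SNPID string
        is_extract |= sumstats_single["SNPID"] == variant

print(sumstats_single.loc[is_extract, "SNPID"].tolist())   # ['1:100:A:G', '2:300:G:A']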
gwaslab/viz_aux_save_figure.py
CHANGED
@@ -52,7 +52,8 @@ def get_default_path(keyword,fmt="png"):
                        "esc":"effect_size_comparision",
                        "afc":"allele_frequency_comparision",
                        "gwheatmap":"genome_wide_heatmap",
-                       "scatter":"scatter"
+                       "scatter":"scatter",
+                       "forest":"forest"
                        }
     prefix = path_dictionary[keyword]
     count = 1
gwaslab/viz_plot_credible_sets.py
ADDED
@@ -0,0 +1,99 @@
+import numpy as np
+import matplotlib.pyplot as plt
+import pandas as pd
+import seaborn as sns
+from gwaslab.g_Log import Log
+from gwaslab.viz_aux_quickfix import _quick_assign_i_with_rank
+from gwaslab.viz_plot_mqqplot import _process_xtick
+from gwaslab.viz_plot_mqqplot import _process_xlabel
+from gwaslab.bd_common_data import get_number_to_chr
+from gwaslab.util_in_filter_value import _filter_region
+from gwaslab.io_process_args import _extract_kwargs
+
+def _plot_cs(pipcs,
+             region,
+             figax=None,
+             _posdiccul=None,
+             xtick_chr_dict=None,
+             pip="PIP",
+             onlycs=False,
+             cs="CREDIBLE_SET_INDEX",
+             marker_size=(45,85),
+             fontsize = 12,
+             font_family = "Arial",
+             legend_title="Credible sets",
+             log=Log(),
+             verbose=True,
+             **kwargs):
+    '''
+    pipcs : a DataFrame of finemapping results
+    '''
+    ## parameters #############################
+    if xtick_chr_dict is None:
+        xtick_chr_dict = get_number_to_chr()
+
+    scatter_kwargs = _extract_kwargs("scatter", dict(), locals())
+
+    region_marker_shapes = ['o', '^','s','D','*','P','X','h','8']
+    region_ld_colors_m = ["grey","#E51819","green","#F07818","#AD5691","yellow","purple"]
+
+
+    ## filter data #############################
+    pipcs = _filter_region(pipcs, region)
+    if onlycs ==True:
+        pipcs = pipcs.loc[pipcs[cs]>0,:]
+
+    pipcs[cs] = pipcs[cs].astype("string")
+
+    ## figure and ax #############################
+    if figax is not None:
+        ax=figax[1]
+        fig=figax[0]
+    else:
+        fig, ax = plt.subplots()
+
+    # assign i
+    pipcs,chrom_df=_quick_assign_i_with_rank(pipcs, chrpad=0.00,
+                                             use_rank=False,
+                                             chrom="CHR",pos="POS",
+                                             drop_chr_start=False,
+                                             _posdiccul=_posdiccul)
+    pipcs = pipcs.sort_values(by=cs,ascending=True)
+
+    ## plot ##########################################
+    scatter_kwargs["markers"]= {m:region_marker_shapes[i] for i,m in enumerate(pipcs[cs].unique())}
+    palette = sns.color_palette(region_ld_colors_m,n_colors=pipcs[cs].nunique())
+    edgecolor="none"
+
+    plot = sns.scatterplot(data=pipcs,
+                           x="i",
+                           y=pip,
+                           hue=cs,
+                           edgecolor=edgecolor,
+                           palette=palette,
+                           style=cs,
+                           s=marker_size[1],
+                           ax=ax,
+                           **scatter_kwargs)
+
+    # process legend
+    handles, labels = ax.get_legend_handles_labels()
+    new_labels = []
+    new_handles = []
+    ncol = len(labels)
+
+    for i,label in enumerate(labels):
+        if label in [str(j) for j in range(1,10)]:
+            new_labels.append(labels[i])
+            new_handles.append(handles[i])
+
+    ax.legend(labels =new_labels,
+              handles=new_handles,
+              loc="upper right",
+              bbox_to_anchor=(0.995, 0.995),
+              ncol=1,
+              scatterpoints=2,
+              title=legend_title,
+              frameon=True)
+
+    return fig, log
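The new _plot_cs draws PIP against position, with colour and marker style keyed to the credible-set index and a legend restricted to set labels 1-9. A standalone seaborn sketch of the same plotting pattern; the fine-mapping values are invented, and the column names simply follow the function's defaults pip="PIP" and cs="CREDIBLE_SET_INDEX".

import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt

# toy fine-mapping table; values are invented for illustration
pipcs = pd.DataFrame({
    "POS": [101_000, 102_500, 104_000, 105_500, 107_000],
    "PIP": [0.85, 0.10, 0.62, 0.05, 0.30],
    "CREDIBLE_SET_INDEX": ["1", "1", "2", "0", "2"],   # "0" = not in any credible set
})

fig, ax = plt.subplots()
sns.scatterplot(data=pipcs, x="POS", y="PIP",
                hue="CREDIBLE_SET_INDEX", style="CREDIBLE_SET_INDEX",
                s=85, edgecolor="none", ax=ax)
ax.legend(title="Credible sets", loc="upper right")
fig.savefig("credible_sets_sketch.png")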