gwaslab 3.4.45__py3-none-any.whl → 3.4.47__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release: this version of gwaslab has been flagged as possibly problematic.

@@ -0,0 +1,234 @@
+
+import pandas as pd
+import numpy as np
+from scipy.stats.distributions import chi2
+from scipy.stats import norm
+from gwaslab.g_Log import Log
+from gwaslab.io_to_pickle import load_data_from_pickle
+from gwaslab.g_Sumstats import Sumstats
+import gc
+
+def meta_analyze(sumstats_list, random_effects=False, match_allele=True, log=Log()):
+
+    ###########################################################################
+    columns = ["SNPID","CHR","POS","EA","NEA"]
+    results_df = pd.DataFrame(columns=columns)
+
+    log.write("Start to perform meta-analysis...")
+    log.write(" -Datasets:")
+    for index, sumstats_path in enumerate(sumstats_list):
+        if isinstance(sumstats_path, pd.DataFrame):
+            log.write("  -Sumstats #{}: pandas.DataFrame with {} variants ".format(index, len(sumstats_path)))
+        elif isinstance(sumstats_path, Sumstats):
+            log.write("  -Sumstats #{}: gwaslab.Sumstats with {} variants ".format(index, len(sumstats_path.data)))
+        else:
+            log.write("  -Sumstats #{}: {} ".format(index, sumstats_path))
+
+
+    # extract all variants information
+    log.write(" -Iterating through {} datasets to determine variant list...".format(len(sumstats_list)))
+
+    for index, sumstats_path in enumerate(sumstats_list):
+        sumstats = get_sumstats(sumstats_path, usekeys=["SNPID","CHR","POS","EA","NEA"])
+        new_rows = sumstats.loc[~sumstats["SNPID"].isin(results_df["SNPID"]), ["SNPID","CHR","POS","EA","NEA"]]
+        log.write(" -Sumstats #{}: {} new variants (out of {}) are being added to analysis...".format(index, len(new_rows), len(sumstats)))
+
+        if len(new_rows) > 0:
+            if len(results_df) == 0:
+                results_df = new_rows
+            else:
+                results_df = pd.concat([results_df, new_rows], ignore_index=True)
+        del sumstats
+        del new_rows
+        gc.collect()
+
+
+
+    ###########################################################################
+    log.write(" -Initiating result DataFrame...")
+    columns = ["SNPID","CHR","POS","EA","NEA","_BETAW_SUM","_EA_N","_NEA_N","_BETA2W_SUM","_W_SUM","EAF","N","DIRECTION","BETA","SE","DOF"]
+    results_df = results_df.set_index("SNPID")
+    results_df["N"] = 0
+    results_df["_BETAW_SUM"] = 0.0
+    results_df["_BETA2W_SUM"] = 0.0
+    results_df["_W_SUM"] = 0.0
+    results_df["_W2_SUM"] = 0.0
+    results_df["_EA_N"] = 0.0
+    results_df["_NEA_N"] = 0.0
+    results_df["N"] = 0
+    results_df["DIRECTION"] = ""
+    results_df["BETA"] = 0.0
+    results_df["SE"] = 0.0
+    results_df["DOF"] = -1  # incremented once per contributing study, so DOF ends up as k - 1
+
+    dtype_dict = {
+        "_BETAW_SUM":"float64",
+        "_EA_N":"float64",
+        "_NEA_N":"float64",
+        "_BETA2W_SUM":"float64",
+        "_W_SUM":"float64",
+        "BETA":"float64",
+        "SE":"float64",
+        "N":"Int64",
+        "DOF":"Int64"
+    }
+    results_df = results_df.astype(dtype_dict)
+    ###########################################################################
+
+    log.write(" -Iterating through {} datasets to compute statistics for fixed-effect model...".format(len(sumstats_list)))
+    for index, sumstats_path in enumerate(sumstats_list):
+        to_use_sumstats = process_sumstats(sumstats_path,
+                                           results_df[["EA","NEA"]],
+                                           index=index,
+                                           match_allele=match_allele)
+        sumstats_index = to_use_sumstats.index
+        results_df_not_in_sumstat_index = results_df.index[~results_df.index.isin(to_use_sumstats.index)]
+
+        # N and DOF
+        results_df.loc[sumstats_index, "N"] += to_use_sumstats["N"]
+        results_df.loc[sumstats_index, "DOF"] += 1
+
+        # BETA and SE
+        results_df.loc[sumstats_index, "_BETA2W_SUM"] += to_use_sumstats["BETA"]**2 * (1/(to_use_sumstats["SE"]**2))
+        results_df.loc[sumstats_index, "_BETAW_SUM"] += to_use_sumstats["BETA"] * (1/(to_use_sumstats["SE"]**2))
+        results_df.loc[sumstats_index, "_W_SUM"] += 1/(to_use_sumstats["SE"]**2)
+        results_df.loc[sumstats_index, "_W2_SUM"] += (1/(to_use_sumstats["SE"]**2))**2  # sum of squared weights, needed for DerSimonian-Laird tau^2
+
+        # EAF
+        results_df.loc[sumstats_index, "_EA_N"] += to_use_sumstats["N"] * to_use_sumstats["EAF"]
+        results_df.loc[sumstats_index, "_NEA_N"] += to_use_sumstats["N"] * (1 - to_use_sumstats["EAF"])
+
+        # DIRECTION
+        beta_index = to_use_sumstats[to_use_sumstats["BETA"]>0].index
+        results_df.loc[beta_index, "DIRECTION"] += "+"
+        beta_index = to_use_sumstats[to_use_sumstats["BETA"]==0].index
+        results_df.loc[beta_index, "DIRECTION"] += "0"
+        beta_index = to_use_sumstats[to_use_sumstats["BETA"]<0].index
+        results_df.loc[beta_index, "DIRECTION"] += "-"
+        results_df.loc[results_df_not_in_sumstat_index, "DIRECTION"] += "?"
+
+        del to_use_sumstats
+        gc.collect()
+
+    ##############################################################################
+    # fixed-effect statistics
+    results_df["BETA"] = results_df["_BETAW_SUM"] / results_df["_W_SUM"]
+    results_df["EAF"] = results_df["_EA_N"] / (results_df["_EA_N"] + results_df["_NEA_N"])
+    results_df["SE"] = np.sqrt(1/results_df["_W_SUM"])
+    results_df["Z"] = results_df["BETA"] / results_df["SE"]
+    results_df["P"] = norm.sf(abs(results_df["Z"]))*2
+    results_df["Q"] = results_df["_BETA2W_SUM"] - (results_df["_BETAW_SUM"]**2 / results_df["_W_SUM"])
+
+    for dof in results_df["DOF"].unique():
+        results_df_dof_index = results_df["DOF"] == dof
+        results_df.loc[results_df_dof_index, "P_HET"] = chi2.sf(results_df.loc[results_df_dof_index, "Q"].values, dof)
+        gc.collect()
+
+    results_df["I2_HET"] = (results_df["Q"] - results_df["DOF"]) / results_df["Q"]
+    results_df.loc[results_df["I2_HET"] < 0, "I2_HET"] = 0
+
+    results_df = results_df.drop(columns=["_EA_N","_NEA_N"])
+    gc.collect()
+
+    ###########################################################################
+    if random_effects == True:
+        log.write(" -Iterating through {} datasets to compute statistics for random-effects model...".format(len(sumstats_list)))
+        results_df["_R2"] = (results_df["Q"] - results_df["DOF"]) / (results_df["_W_SUM"] - (results_df["_W2_SUM"]/results_df["_W_SUM"]))
+        results_df.loc[results_df["_R2"] < 0, "_R2"] = 0
+        variant_index_random = results_df[results_df["_R2"] > 0].index
+
+        results_df["_BETAW_SUM_R"] = 0.0
+        results_df["_W_SUM_R"] = 0.0
+        results_df["BETA_RANDOM"] = results_df["BETA"]
+        results_df["SE_RANDOM"] = results_df["SE"]
+
+        for index, sumstats_path in enumerate(sumstats_list):
+            to_use_sumstats = process_sumstats(sumstats_path,
+                                               results_df.loc[variant_index_random, ["EA","NEA"]],
+                                               index=index,
+                                               match_allele=match_allele,
+                                               extract_index=variant_index_random)
+
+            sumstats_index = to_use_sumstats.index
+
+            # BETA and SE
+            results_df.loc[sumstats_index, "_BETAW_SUM_R"] += to_use_sumstats["BETA"] * (1/(to_use_sumstats["SE"]**2 + results_df.loc[sumstats_index, "_R2"]))
+            results_df.loc[sumstats_index, "_W_SUM_R"] += 1/(to_use_sumstats["SE"]**2 + results_df.loc[sumstats_index, "_R2"])
+
+            del to_use_sumstats
+            del sumstats_index
+            gc.collect()
+
+        results_df.loc[variant_index_random, "BETA_RANDOM"] = results_df.loc[variant_index_random, "_BETAW_SUM_R"] / results_df.loc[variant_index_random, "_W_SUM_R"]
+        results_df.loc[variant_index_random, "SE_RANDOM"] = np.sqrt(1/results_df.loc[variant_index_random, "_W_SUM_R"])
+        results_df["Z_RANDOM"] = results_df["BETA_RANDOM"] / results_df["SE_RANDOM"]
+        results_df["P_RANDOM"] = norm.sf(abs(results_df["Z_RANDOM"]))*2
+        results_df = results_df.drop(columns=["_BETAW_SUM_R","_W_SUM_R"])
+
+    gc.collect()
+    ###########################################################################
+    results_df = results_df.drop(columns=["_BETAW_SUM","_BETA2W_SUM","_W_SUM","_R2","_W2_SUM"], errors="ignore").sort_values(by=["CHR","POS"])  # _R2 only exists when random_effects=True
+    gc.collect()
+    log.write("Finished meta-analysis successfully!")
+
+    return results_df
+
+def process_sumstats(sumstats_path, results_df, index, extract_index=None, match_allele=True, log=Log()):
+
+    if extract_index is None:
+        extract_index = results_df.index
+
+    sumstats = get_sumstats(sumstats_path)
+
+    to_use_sumstats = sumstats.loc[sumstats["SNPID"].isin(extract_index.values), ["SNPID","EA","NEA","BETA","N","SE","EAF"]]
+
+    if len(to_use_sumstats) > 0:
+        n_pre_dup = len(to_use_sumstats)
+        log.write(" -Processing {} variants from sumstats #{}".format(len(to_use_sumstats), index))
+
+        to_use_sumstats = to_use_sumstats.drop_duplicates(subset="SNPID").set_index("SNPID")
+        n_post_dup = len(to_use_sumstats)
+
+        if n_pre_dup - n_post_dup > 0:
+            log.write(" -Dropping {} duplicated variants from sumstats #{}".format(n_pre_dup - n_post_dup, index))
+
+        if match_allele == True:
+            sumstats_index = to_use_sumstats.index
+            # drop variants whose alleles neither match nor flip relative to the reference
+            is_match = (to_use_sumstats.loc[sumstats_index,"EA"] == results_df.loc[sumstats_index,"EA"]) & (to_use_sumstats.loc[sumstats_index,"NEA"] == results_df.loc[sumstats_index,"NEA"])
+            is_flip = (to_use_sumstats.loc[sumstats_index,"EA"] == results_df.loc[sumstats_index,"NEA"]) & (to_use_sumstats.loc[sumstats_index,"NEA"] == results_df.loc[sumstats_index,"EA"])
+            is_flip = is_flip | ((to_use_sumstats.loc[sumstats_index,"NEA"] == results_df.loc[sumstats_index,"EA"]) & (to_use_sumstats.loc[sumstats_index,"EA"] == results_df.loc[sumstats_index,"NEA"]))
+            is_to_use = is_match | is_flip
+
+            if sum(~is_to_use) > 0:
+                log.write(" -Dropping {} variants with unmatched alleles from sumstats #{}".format(sum(~is_to_use), index))
+
+            to_use_sumstats.loc[is_flip[is_flip].index, "BETA"] = -to_use_sumstats.loc[is_flip[is_flip].index, "BETA"]
+            to_use_sumstats.loc[is_flip[is_flip].index, "EAF"] = 1 - to_use_sumstats.loc[is_flip[is_flip].index, "EAF"]
+            to_use_sumstats = to_use_sumstats.loc[is_to_use[is_to_use].index,:]
+
+    gc.collect()
+
+    return to_use_sumstats
+
+def get_sumstats(input_path, usekeys=None):
+    if isinstance(input_path, tuple):
+        path = input_path[0]
+        path_args = input_path[1]
+    else:
+        path = input_path
+        path_args = {}
+
+    if isinstance(path, pd.DataFrame):
+        sumstats = Sumstats(path, fmt="auto", verbose=False, usekeys=usekeys, **path_args).data
+    elif isinstance(path, Sumstats):
+        sumstats = path.data
+        if usekeys is not None:
+            sumstats = sumstats[usekeys]
+    elif path[-6:] == "pickle":
+        sumstats = load_data_from_pickle(path)
+        if usekeys is not None:
+            sumstats = sumstats[usekeys]
+    else:
+        sumstats = Sumstats(path, fmt="auto", verbose=False, usekeys=usekeys, **path_args).data
+    return sumstats
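For context, the accumulators in meta_analyze implement textbook inverse-variance-weighted (IVW) meta-analysis: Cochran's Q and I2 for heterogeneity, and a DerSimonian-Laird between-study variance (the _R2 column above) for the random-effects re-weighting. Below is a self-contained sketch of the same arithmetic on toy numbers; it uses only numpy/scipy, does not call any gwaslab API, and all values and variable names are illustrative only.

import numpy as np
from scipy.stats import norm
from scipy.stats.distributions import chi2

beta = np.array([0.10, 0.14, 0.05])  # toy per-study effect sizes
se = np.array([0.02, 0.03, 0.04])    # toy per-study standard errors

w = 1 / se**2                                             # inverse-variance weights (_W_SUM terms)
beta_fe = (w * beta).sum() / w.sum()                      # fixed-effect BETA
se_fe = np.sqrt(1 / w.sum())                              # fixed-effect SE
p_fe = 2 * norm.sf(abs(beta_fe / se_fe))                  # fixed-effect P

dof = len(beta) - 1                                       # DOF = k - 1
q = (w * beta**2).sum() - (w * beta).sum()**2 / w.sum()   # Cochran's Q
p_het = chi2.sf(q, dof)                                   # P_HET
i2 = max(0.0, (q - dof) / q)                              # I2_HET, floored at 0

tau2 = max(0.0, (q - dof) / (w.sum() - (w**2).sum() / w.sum()))  # DerSimonian-Laird tau^2 (_R2)
w_r = 1 / (se**2 + tau2)                                  # random-effects weights
beta_re = (w_r * beta).sum() / w_r.sum()                  # BETA_RANDOM
se_re = np.sqrt(1 / w_r.sum())                            # SE_RANDOM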
@@ -0,0 +1,58 @@
+
+import numpy as np
+import pandas as pd
+
+def snphwe(obs_hets, obs_hom1, obs_hom2):
+    # Converted to Python from the C++ code by Jeremy McRae
+    # https://github.com/jeremymcrae/snphwe/blob/master/src/snp_hwe.cpp
+    #/* (original comments)
+    #// This code implements an exact SNP test of Hardy-Weinberg Equilibrium as
+    #// described in Wigginton, JE, Cutler, DJ, and Abecasis, GR (2005) A Note on
+    #// Exact Tests of Hardy-Weinberg Equilibrium. AJHG 76: 887-893
+    #//
+    #// Written by Jan Wigginton
+    #*/
+
+    obs_homr = min(obs_hom1, obs_hom2)  # rare-allele homozygotes
+    obs_homc = max(obs_hom1, obs_hom2)  # common-allele homozygotes
+
+    rare = 2 * obs_homr + obs_hets  # rare allele count
+    genotypes = obs_hets + obs_homc + obs_homr
+
+    probs = np.zeros(rare + 1)
+
+    mid = rare * (2 * genotypes - rare) // (2 * genotypes)  # most probable heterozygote count
+
+    if mid % 2 != rare % 2:  # mid must have the same parity as the rare allele count
+        mid += 1
+
+    probs[mid] = 1.0
+
+    sum_p = 1  # probs[mid]
+    curr_homr = (rare - mid) // 2
+    curr_homc = genotypes - mid - curr_homr
+
+
+    for curr_hets in range(mid, 1, -2):  # recurse downwards from mid
+        probs[curr_hets - 2] = probs[curr_hets] * curr_hets * (curr_hets - 1.0) / (4.0 * (curr_homr + 1.0) * (curr_homc + 1.0))
+        sum_p += probs[curr_hets - 2]
+        curr_homr += 1
+        curr_homc += 1
+
+    curr_homr = (rare - mid) // 2
+    curr_homc = genotypes - mid - curr_homr
+
+    for curr_hets in range(mid, rare - 1, 2):  # recurse upwards from mid
+        probs[curr_hets + 2] = probs[curr_hets] * 4.0 * curr_homr * curr_homc / ((curr_hets + 2.0) * (curr_hets + 1.0))
+        sum_p += probs[curr_hets + 2]
+        curr_homr -= 1
+        curr_homc -= 1
+
+    target = probs[obs_hets]
+    p_hwe = 0.0
+
+    for p in probs:  # sum probabilities of configurations no more likely than the observed one
+        if p <= target:
+            p_hwe += p / sum_p
+
+    return min(p_hwe, 1)
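As a quick sanity check of the port (assuming snphwe is importable from wherever this module lands in the released package, which the diff does not show): genotype counts sitting exactly at Hardy-Weinberg proportions should return a p-value of about 1, while a strong heterozygote deficit at the same allele frequencies should return a very small one.

# 50 hets and 25/25 homozygotes: allele frequency 0.5, expected hets = 50 -> p close to 1
print(snphwe(obs_hets=50, obs_hom1=25, obs_hom2=25))

# no hets at all with the same allele frequencies -> p close to 0
print(snphwe(obs_hets=0, obs_hom1=50, obs_hom2=50))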
@@ -0,0 +1,112 @@
+import numpy as np
+import matplotlib.pyplot as plt
+import pandas as pd
+from gwaslab.g_Log import Log
+
+# STATE NO.  MNEMONIC   DESCRIPTION                 COLOR NAME         COLOR CODE
+# 1          TssA       Active TSS                  Red                255,0,0
+# 2          TssAFlnk   Flanking Active TSS         Orange Red         255,69,0
+# 3          TxFlnk     Transcr. at gene 5' and 3'  LimeGreen          50,205,50
+# 4          Tx         Strong transcription        Green              0,128,0
+# 5          TxWk       Weak transcription          DarkGreen          0,100,0
+# 6          EnhG       Genic enhancers             GreenYellow        194,225,5
+# 7          Enh        Enhancers                   Yellow             255,255,0
+# 8          ZNF/Rpts   ZNF genes & repeats         Medium Aquamarine  102,205,170
+# 9          Het        Heterochromatin             PaleTurquoise      138,145,208
+# 10         TssBiv     Bivalent/Poised TSS         IndianRed          205,92,92
+# 11         BivFlnk    Flanking Bivalent TSS/Enh   DarkSalmon         233,150,122
+# 12         EnhBiv     Bivalent Enhancer           DarkKhaki          189,183,107
+# 13         ReprPC     Repressed PolyComb          Silver             128,128,128
+# 14         ReprPCWk   Weak Repressed PolyComb     Gainsboro          192,192,192
+# 15         Quies      Quiescent/Low               White              255,255,255
+
+color_dict = {
+    "E1": np.array([255,0,0]),
+    "E2": np.array([255,69,0]),
+    "E3": np.array([50,205,50]),
+    "E4": np.array([0,128,0]),
+    "E5": np.array([0,100,0]),
+    "E6": np.array([194,225,5]),
+    "E7": np.array([255,255,0]),
+    "E8": np.array([102,205,170]),
+    "E9": np.array([138,145,208]),
+    "E10":np.array([205,92,92]),
+    "E11":np.array([233,150,122]),
+    "E12":np.array([189,183,107]),
+    "E13":np.array([128,128,128]),
+    "E14":np.array([192,192,192]),
+    "E15":np.array([255,255,255])
+}
+
+color_dict_i = {
+    1: np.array([255,0,0]),
+    2: np.array([255,69,0]),
+    3: np.array([50,205,50]),
+    4: np.array([0,128,0]),
+    5: np.array([0,100,0]),
+    6: np.array([194,225,5]),
+    7: np.array([255,255,0]),
+    8: np.array([102,205,170]),
+    9: np.array([138,145,208]),
+    10:np.array([205,92,92]),
+    11:np.array([233,150,122]),
+    12:np.array([189,183,107]),
+    13:np.array([128,128,128]),
+    14:np.array([192,192,192]),
+    15:np.array([255,255,255])
+}
+
+
+def _plot_chromatin_state(region_chromatin_files,
+                          region_chromatin_labels,
+                          region,
+                          fig,
+                          ax,
+                          xlim_i,
+                          fontsize=12,
+                          font_family="Arial",
+                          log=Log(),
+                          verbose=True):
+    '''
+    region_chromatin_files : a list of paths to chromatin state (e.g. 15-state ChromHMM) segmentation files
+    '''
+    target_chr = region[0]
+    target_start = region[1]
+    target_end = region[2]
+
+    offset_i = xlim_i[0] - region[1]
+
+    ax.set_ylim([-0.05, 0.1*len(region_chromatin_files)-0.05])
+    ax.set_xlim([offset_i+target_start, offset_i+target_end])
+
+    px_for_01 = abs(ax.transData.transform([0,0])[1] - ax.transData.transform([0,0.1])[1])
+
+    point = fig.dpi/72  # pixels per point
+    points_for_01 = px_for_01/point  # height of 0.1 data units, in points
+
+    # each tissue
+    for i, file in enumerate(region_chromatin_files):
+        log.write(" -Loading : {}".format(file), verbose=verbose)
+        enh = pd.read_csv(file, sep="\t", header=None)
+        enh.columns = ["ID","START","END","STATE"]
+        enh["CHR"] = enh["ID"].str.extract(r"chr([0-9]+)").astype("float").astype("Int64")
+        enh["STATE_i"] = enh["STATE"].str.extract(r"([0-9]+)_*").astype("float").astype("Int64")
+        enh_in_region = (enh["CHR"] == target_chr) & ((enh["END"] > target_start) & (enh["START"] < target_end))
+        df = enh.loc[enh_in_region, ["STATE_i","START","END"]].sort_values("STATE_i", ascending=False)
+        log.write(" -Number of records in specified region: {}".format(len(df)), verbose=verbose)
+        # each block
+        for index, row in df.iterrows():
+            color = color_dict_i[row["STATE_i"]]
+            ax.plot([offset_i + row["START"], offset_i + row["END"]],
+                    [i*0.1, i*0.1],
+                    c=color/255, linewidth=points_for_01, solid_capstyle="butt")
+
+    ## add stripe label
+    if len(region_chromatin_labels) == len(region_chromatin_files):
+        ax.set_yticks([i*0.1 for i in range(len(region_chromatin_labels))], region_chromatin_labels, fontsize=fontsize, family=font_family)
+    else:
+        ax.set_yticks(ticks=[])
+
+    #ax.set_xticks(ticks=[])
+    ax.invert_yaxis()
+    return fig
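For orientation, _plot_chromatin_state draws one horizontal stripe per input file (one file per tissue), coloring each interval by the 15-state palette above. A hedged usage sketch follows; the file names are hypothetical placeholders in the style of Roadmap 15-state ChromHMM mnemonics files, and in practice the function is driven by gwaslab's regional plot rather than called directly.

import matplotlib.pyplot as plt

# Hypothetical ChromHMM segmentation files: tab-separated, no header,
# rows of the form: chr2 <START> <END> <N>_<Mnemonic>
files = ["E066_15_coreMarks_mnemonics.bed", "E087_15_coreMarks_mnemonics.bed"]
labels = ["Liver", "Pancreatic Islets"]

region = (2, 45000000, 46000000)  # (CHR, START, END)
fig, ax = plt.subplots(figsize=(10, 1.5))

# Passing the region bounds as xlim_i keeps the internal x-offset at zero.
fig = _plot_chromatin_state(files, labels, region, fig, ax,
                            xlim_i=(region[1], region[2]))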
@@ -75,7 +75,10 @@ def compare_effect(path1,
     if scaled == True:
         scaled1 = True
         scaled2 = True
-
+    if is_q_mc=="fdr" or is_q_mc=="bon":
+        is_q = True
+    else:
+        raise ValueError("Please select either fdr or bon for is_q_mc.")
     if save_args is None:
         save_args = {"dpi":300,"facecolor":"white"}
     if reg_box is None:
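The new is_q_mc check gates multiple-testing correction of the per-variant heterogeneity test in compare_effect: as written, any value other than "fdr" or "bon" raises, and either accepted value also forces is_q to True. For reference, a sketch of what the two options conventionally denote, using statsmodels; whether compare_effect computes the corrections exactly this way is not visible in this hunk.

import numpy as np
from statsmodels.stats.multitest import multipletests

p = np.array([0.001, 0.02, 0.04, 0.30])  # toy heterogeneity p-values

reject_bon, p_bon, _, _ = multipletests(p, alpha=0.05, method="bonferroni")  # "bon"
reject_fdr, p_fdr, _, _ = multipletests(p, alpha=0.05, method="fdr_bh")      # "fdr" (Benjamini-Hochberg)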