gwaslab 3.4.45__py3-none-any.whl → 3.4.47__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release: this version of gwaslab has been flagged as possibly problematic.

@@ -0,0 +1,234 @@
+
+import pandas as pd
+import numpy as np
+from scipy.stats.distributions import chi2
+from scipy.stats import norm
+from gwaslab.g_Log import Log
+from gwaslab.io_to_pickle import load_data_from_pickle
+from gwaslab.g_Sumstats import Sumstats
+import gc
+
+def meta_analyze(sumstats_list, random_effects=False, match_allele=True, log=Log()):
+
+    ###########################################################################
+    columns = ["SNPID","CHR","POS","EA","NEA"]
+    results_df = pd.DataFrame(columns=columns)
+
+    log.write("Start to perform meta-analysis...")
+    log.write(" -Datasets:")
+    for index, sumstats_path in enumerate(sumstats_list):
+        if isinstance(sumstats_path, pd.DataFrame):
+            log.write("  -Sumstats #{}: pandas.DataFrame with {} variants ".format(index, len(sumstats_path)))
+        elif isinstance(sumstats_path, Sumstats):
+            log.write("  -Sumstats #{}: gwaslab.Sumstats with {} variants ".format(index, len(sumstats_path.data)))
+        else:
+            log.write("  -Sumstats #{}: {} ".format(index, sumstats_path))
+
+
+    # extract all variants information
+    log.write(" -Iterating through {} datasets to determine variant list...".format(len(sumstats_list)))
+
+    for index, sumstats_path in enumerate(sumstats_list):
+        sumstats = get_sumstats(sumstats_path, usekeys=["SNPID","CHR","POS","EA","NEA"])
+        new_rows = sumstats.loc[~sumstats["SNPID"].isin(results_df["SNPID"]), ["SNPID","CHR","POS","EA","NEA"]]
+        log.write(" -Sumstats #{}: {} new variants (out of {}) are being added to analysis...".format(index, len(new_rows), len(sumstats)))
+
+        if len(new_rows) > 0:
+            if len(results_df) == 0:
+                results_df = new_rows
+            else:
+                results_df = pd.concat([results_df, new_rows], ignore_index=True)
+        del sumstats
+        del new_rows
+        gc.collect()
+
+
+
+    ###########################################################################
+    log.write(" -Initiating result DataFrame...")
+    columns = ["SNPID","CHR","POS","EA","NEA","_BETAW_SUM","_EA_N","_NEA_N","_BETA2W_SUM","_W_SUM","EAF","N","DIRECTION","BETA","SE","DOF"]
+    results_df = results_df.set_index("SNPID")
+    results_df["N"] = 0
+    results_df["_BETAW_SUM"] = 0.0
+    results_df["_BETA2W_SUM"] = 0.0
+    results_df["_W_SUM"] = 0.0
+    results_df["_W2_SUM"] = 0.0
+    results_df["_EA_N"] = 0.0
+    results_df["_NEA_N"] = 0.0
+    results_df["N"] = 0
+    results_df["DIRECTION"] = ""
+    results_df["BETA"] = 0.0
+    results_df["SE"] = 0.0
+    results_df["DOF"] = -1  # incremented once per contributing study, so DOF ends up as k - 1
+
+    dtype_dict = {
+        "_BETAW_SUM":"float64",
+        "_EA_N":"float64",
+        "_NEA_N":"float64",
+        "_BETA2W_SUM":"float64",
+        "_W_SUM":"float64",
+        "BETA":"float64",
+        "SE":"float64",
+        "N":"Int64",
+        "DOF":"Int64"
+    }
+    results_df = results_df.astype(dtype_dict)
+    ###########################################################################
+
+    log.write(" -Iterating through {} datasets to compute statistics for fixed-effect model...".format(len(sumstats_list)))
+    for index, sumstats_path in enumerate(sumstats_list):
+        to_use_sumstats = process_sumstats(sumstats_path,
+                                           results_df[["EA","NEA"]],
+                                           index=index,
+                                           match_allele=match_allele)
+        sumstats_index = to_use_sumstats.index
+        results_df_not_in_sumstat_index = results_df.index[~results_df.index.isin(to_use_sumstats.index)]
+
+        # N and DOF
+        results_df.loc[sumstats_index, "N"] += to_use_sumstats["N"]
+        results_df.loc[sumstats_index, "DOF"] += 1
+
+        # BETA and SE
+        results_df.loc[sumstats_index, "_BETA2W_SUM"] += to_use_sumstats["BETA"]**2 * (1/(to_use_sumstats["SE"]**2))
+        results_df.loc[sumstats_index, "_BETAW_SUM"] += to_use_sumstats["BETA"] * (1/(to_use_sumstats["SE"]**2))
+        results_df.loc[sumstats_index, "_W_SUM"] += 1/(to_use_sumstats["SE"]**2)
+        results_df.loc[sumstats_index, "_W2_SUM"] += (1/(to_use_sumstats["SE"]**2))**2  # sum of squared weights, needed for DerSimonian-Laird tau^2
+
+        # EAF
+        results_df.loc[sumstats_index, "_EA_N"] += to_use_sumstats["N"] * to_use_sumstats["EAF"]
+        results_df.loc[sumstats_index, "_NEA_N"] += to_use_sumstats["N"] * (1 - to_use_sumstats["EAF"])
+
+        # DIRECTION
+        beta_index = to_use_sumstats[to_use_sumstats["BETA"]>0].index
+        results_df.loc[beta_index, "DIRECTION"] += "+"
+        beta_index = to_use_sumstats[to_use_sumstats["BETA"]==0].index
+        results_df.loc[beta_index, "DIRECTION"] += "0"
+        beta_index = to_use_sumstats[to_use_sumstats["BETA"]<0].index
+        results_df.loc[beta_index, "DIRECTION"] += "-"
+        results_df.loc[results_df_not_in_sumstat_index, "DIRECTION"] += "?"
+
+        del to_use_sumstats
+        gc.collect()
+
+    ##############################################################################
+    # fixed-effect statistics
+    results_df["BETA"] = results_df["_BETAW_SUM"] / results_df["_W_SUM"]
+    results_df["EAF"] = results_df["_EA_N"] / (results_df["_EA_N"] + results_df["_NEA_N"])
+    results_df["SE"] = np.sqrt(1/results_df["_W_SUM"])
+    results_df["Z"] = results_df["BETA"] / results_df["SE"]
+    results_df["P"] = norm.sf(abs(results_df["Z"]))*2
+    results_df["Q"] = results_df["_BETA2W_SUM"] - (results_df["_BETAW_SUM"]**2 / results_df["_W_SUM"])
+
+    for dof in results_df["DOF"].unique():
+        results_df_dof_index = results_df["DOF"] == dof
+        results_df.loc[results_df_dof_index, "P_HET"] = chi2.sf(results_df.loc[results_df_dof_index, "Q"].values, dof)
+        gc.collect()
+
+    results_df["I2_HET"] = (results_df["Q"] - results_df["DOF"]) / results_df["Q"]
+    results_df.loc[results_df["I2_HET"] < 0, "I2_HET"] = 0
+
+    results_df = results_df.drop(columns=["_EA_N","_NEA_N"])
+    gc.collect()
+
+    ###########################################################################
+    if random_effects == True:
+        log.write(" -Iterating through {} datasets to compute statistics for random-effects model...".format(len(sumstats_list)))
+        results_df["_R2"] = (results_df["Q"] - results_df["DOF"]) / (results_df["_W_SUM"] - (results_df["_W2_SUM"]/results_df["_W_SUM"]))
+        results_df.loc[results_df["_R2"] < 0, "_R2"] = 0
+        variant_index_random = results_df[results_df["_R2"] > 0].index
+
+        results_df["_BETAW_SUM_R"] = 0.0
+        results_df["_W_SUM_R"] = 0.0
+        results_df["BETA_RANDOM"] = results_df["BETA"]
+        results_df["SE_RANDOM"] = results_df["SE"]
+
+        for index, sumstats_path in enumerate(sumstats_list):
+            to_use_sumstats = process_sumstats(sumstats_path,
+                                               results_df.loc[variant_index_random, ["EA","NEA"]],
+                                               index=index,
+                                               match_allele=match_allele,
+                                               extract_index=variant_index_random)
+
+            sumstats_index = to_use_sumstats.index
+
+            # BETA and SE
+            results_df.loc[sumstats_index, "_BETAW_SUM_R"] += to_use_sumstats["BETA"] * (1/(to_use_sumstats["SE"]**2 + results_df.loc[sumstats_index, "_R2"]))
+            results_df.loc[sumstats_index, "_W_SUM_R"] += 1/(to_use_sumstats["SE"]**2 + results_df.loc[sumstats_index, "_R2"])
+
+            del to_use_sumstats
+            del sumstats_index
+            gc.collect()
+
+        results_df.loc[variant_index_random, "BETA_RANDOM"] = results_df.loc[variant_index_random, "_BETAW_SUM_R"] / results_df.loc[variant_index_random, "_W_SUM_R"]
+        results_df.loc[variant_index_random, "SE_RANDOM"] = np.sqrt(1/results_df.loc[variant_index_random, "_W_SUM_R"])
+        results_df["Z_RANDOM"] = results_df["BETA_RANDOM"] / results_df["SE_RANDOM"]
+        results_df["P_RANDOM"] = norm.sf(abs(results_df["Z_RANDOM"]))*2
+        results_df = results_df.drop(columns=["_BETAW_SUM_R","_W_SUM_R"])
+
+    gc.collect()
+    ###########################################################################
+    results_df = results_df.drop(columns=["_BETAW_SUM","_BETA2W_SUM","_W_SUM","_R2","_W2_SUM"], errors="ignore").sort_values(by=["CHR","POS"])  # _R2 only exists when random_effects=True
+    gc.collect()
+    log.write("Finished meta-analysis successfully!")
+
+    return results_df
+
+def process_sumstats(sumstats_path, results_df, index, extract_index=None, match_allele=True, log=Log()):
+
+    if extract_index is None:
+        extract_index = results_df.index
+
+    sumstats = get_sumstats(sumstats_path)
+
+    to_use_sumstats = sumstats.loc[sumstats["SNPID"].isin(extract_index.values), ["SNPID","EA","NEA","BETA","N","SE","EAF"]]
+
+    if len(to_use_sumstats) > 0:
+        n_pre_dup = len(to_use_sumstats)
+        log.write(" -Processing {} variants from sumstats #{}".format(len(to_use_sumstats), index))
+
+        to_use_sumstats = to_use_sumstats.drop_duplicates(subset="SNPID").set_index("SNPID")
+        n_post_dup = len(to_use_sumstats)
+
+        if n_pre_dup - n_post_dup > 0:
+            log.write(" -Dropping {} duplicated variants from sumstats #{}".format(n_pre_dup - n_post_dup, index))
+
+        if match_allele == True:
+            sumstats_index = to_use_sumstats.index
+            # drop variants whose alleles neither match nor flip relative to the reference
+            is_match = (to_use_sumstats.loc[sumstats_index,"EA"] == results_df.loc[sumstats_index,"EA"]) & (to_use_sumstats.loc[sumstats_index,"NEA"] == results_df.loc[sumstats_index,"NEA"])
+            is_flip = (to_use_sumstats.loc[sumstats_index,"EA"] == results_df.loc[sumstats_index,"NEA"]) & (to_use_sumstats.loc[sumstats_index,"NEA"] == results_df.loc[sumstats_index,"EA"])
+            is_flip = is_flip | ((to_use_sumstats.loc[sumstats_index,"NEA"] == results_df.loc[sumstats_index,"EA"]) & (to_use_sumstats.loc[sumstats_index,"EA"] == results_df.loc[sumstats_index,"NEA"]))
+            is_to_use = is_match | is_flip
+
+            if sum(~is_to_use) > 0:
+                log.write(" -Dropping {} variants with unmatched alleles from sumstats #{}".format(sum(~is_to_use), index))
+
+            to_use_sumstats.loc[is_flip[is_flip].index, "BETA"] = -to_use_sumstats.loc[is_flip[is_flip].index, "BETA"]
+            to_use_sumstats.loc[is_flip[is_flip].index, "EAF"] = 1 - to_use_sumstats.loc[is_flip[is_flip].index, "EAF"]
+            to_use_sumstats = to_use_sumstats.loc[is_to_use[is_to_use].index,:]
+
+    gc.collect()
+
+    return to_use_sumstats
+
+def get_sumstats(input_path, usekeys=None):
+    if isinstance(input_path, tuple):
+        path = input_path[0]
+        path_args = input_path[1]
+    else:
+        path = input_path
+        path_args = {}
+
+    if isinstance(path, pd.DataFrame):
+        sumstats = Sumstats(path, fmt="auto", verbose=False, usekeys=usekeys, **path_args).data
+    elif isinstance(path, Sumstats):
+        sumstats = path.data
+        if usekeys is not None:
+            sumstats = sumstats[usekeys]
+    elif path[-6:] == "pickle":
+        sumstats = load_data_from_pickle(path)
+        if usekeys is not None:
+            sumstats = sumstats[usekeys]
+    else:
+        sumstats = Sumstats(path, fmt="auto", verbose=False, usekeys=usekeys, **path_args).data
+    return sumstats
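For context, the accumulators in meta_analyze implement textbook inverse-variance-weighted (IVW) meta-analysis: Cochran's Q and I2 for heterogeneity, and a DerSimonian-Laird between-study variance (the _R2 column above) for the random-effects re-weighting. Below is a self-contained sketch of the same arithmetic on toy numbers; it uses only numpy/scipy, does not call any gwaslab API, and all values and variable names are illustrative only.

import numpy as np
from scipy.stats import norm
from scipy.stats.distributions import chi2

beta = np.array([0.10, 0.14, 0.05])  # toy per-study effect sizes
se = np.array([0.02, 0.03, 0.04])    # toy per-study standard errors

w = 1 / se**2                                             # inverse-variance weights (_W_SUM terms)
beta_fe = (w * beta).sum() / w.sum()                      # fixed-effect BETA
se_fe = np.sqrt(1 / w.sum())                              # fixed-effect SE
p_fe = 2 * norm.sf(abs(beta_fe / se_fe))                  # fixed-effect P

dof = len(beta) - 1                                       # DOF = k - 1
q = (w * beta**2).sum() - (w * beta).sum()**2 / w.sum()   # Cochran's Q
p_het = chi2.sf(q, dof)                                   # P_HET
i2 = max(0.0, (q - dof) / q)                              # I2_HET, floored at 0

tau2 = max(0.0, (q - dof) / (w.sum() - (w**2).sum() / w.sum()))  # DerSimonian-Laird tau^2 (_R2)
w_r = 1 / (se**2 + tau2)                                  # random-effects weights
beta_re = (w_r * beta).sum() / w_r.sum()                  # BETA_RANDOM
se_re = np.sqrt(1 / w_r.sum())                            # SE_RANDOM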
@@ -0,0 +1,58 @@
+
+import numpy as np
+import pandas as pd
+
+def snphwe(obs_hets, obs_hom1, obs_hom2):
+    # Converted to Python from the C++ code by Jeremy McRae
+    # https://github.com/jeremymcrae/snphwe/blob/master/src/snp_hwe.cpp
+    #/* (original comments)
+    #// This code implements an exact SNP test of Hardy-Weinberg Equilibrium as
+    #// described in Wigginton, JE, Cutler, DJ, and Abecasis, GR (2005) A Note on
+    #// Exact Tests of Hardy-Weinberg Equilibrium. AJHG 76: 887-893
+    #//
+    #// Written by Jan Wigginton
+    #*/
+
+    obs_homr = min(obs_hom1, obs_hom2)  # rare-allele homozygotes
+    obs_homc = max(obs_hom1, obs_hom2)  # common-allele homozygotes
+
+    rare = 2 * obs_homr + obs_hets  # rare allele count
+    genotypes = obs_hets + obs_homc + obs_homr
+
+    probs = np.zeros(rare + 1)
+
+    mid = rare * (2 * genotypes - rare) // (2 * genotypes)  # most probable heterozygote count
+
+    if mid % 2 != rare % 2:  # mid must have the same parity as the rare allele count
+        mid += 1
+
+    probs[mid] = 1.0
+
+    sum_p = 1  # probs[mid]
+    curr_homr = (rare - mid) // 2
+    curr_homc = genotypes - mid - curr_homr
+
+
+    for curr_hets in range(mid, 1, -2):  # recurse downwards from mid
+        probs[curr_hets - 2] = probs[curr_hets] * curr_hets * (curr_hets - 1.0) / (4.0 * (curr_homr + 1.0) * (curr_homc + 1.0))
+        sum_p += probs[curr_hets - 2]
+        curr_homr += 1
+        curr_homc += 1
+
+    curr_homr = (rare - mid) // 2
+    curr_homc = genotypes - mid - curr_homr
+
+    for curr_hets in range(mid, rare - 1, 2):  # recurse upwards from mid
+        probs[curr_hets + 2] = probs[curr_hets] * 4.0 * curr_homr * curr_homc / ((curr_hets + 2.0) * (curr_hets + 1.0))
+        sum_p += probs[curr_hets + 2]
+        curr_homr -= 1
+        curr_homc -= 1
+
+    target = probs[obs_hets]
+    p_hwe = 0.0
+
+    for p in probs:  # sum probabilities of configurations no more likely than the observed one
+        if p <= target:
+            p_hwe += p / sum_p
+
+    return min(p_hwe, 1)
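As a quick sanity check of the port (assuming snphwe is importable from wherever this module lands in the released package, which the diff does not show): genotype counts sitting exactly at Hardy-Weinberg proportions should return a p-value of about 1, while a strong heterozygote deficit at the same allele frequencies should return a very small one.

# 50 hets and 25/25 homozygotes: allele frequency 0.5, expected hets = 50 -> p close to 1
print(snphwe(obs_hets=50, obs_hom1=25, obs_hom2=25))

# no hets at all with the same allele frequencies -> p close to 0
print(snphwe(obs_hets=0, obs_hom1=50, obs_hom2=50))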
@@ -0,0 +1,112 @@
+import numpy as np
+import matplotlib.pyplot as plt
+import pandas as pd
+from gwaslab.g_Log import Log
+
+# STATE NO.  MNEMONIC   DESCRIPTION                 COLOR NAME         COLOR CODE
+# 1          TssA       Active TSS                  Red                255,0,0
+# 2          TssAFlnk   Flanking Active TSS         Orange Red         255,69,0
+# 3          TxFlnk     Transcr. at gene 5' and 3'  LimeGreen          50,205,50
+# 4          Tx         Strong transcription        Green              0,128,0
+# 5          TxWk       Weak transcription          DarkGreen          0,100,0
+# 6          EnhG       Genic enhancers             GreenYellow        194,225,5
+# 7          Enh        Enhancers                   Yellow             255,255,0
+# 8          ZNF/Rpts   ZNF genes & repeats         Medium Aquamarine  102,205,170
+# 9          Het        Heterochromatin             PaleTurquoise      138,145,208
+# 10         TssBiv     Bivalent/Poised TSS         IndianRed          205,92,92
+# 11         BivFlnk    Flanking Bivalent TSS/Enh   DarkSalmon         233,150,122
+# 12         EnhBiv     Bivalent Enhancer           DarkKhaki          189,183,107
+# 13         ReprPC     Repressed PolyComb          Silver             128,128,128
+# 14         ReprPCWk   Weak Repressed PolyComb     Gainsboro          192,192,192
+# 15         Quies      Quiescent/Low               White              255,255,255
+
+color_dict = {
+    "E1": np.array([255,0,0]),
+    "E2": np.array([255,69,0]),
+    "E3": np.array([50,205,50]),
+    "E4": np.array([0,128,0]),
+    "E5": np.array([0,100,0]),
+    "E6": np.array([194,225,5]),
+    "E7": np.array([255,255,0]),
+    "E8": np.array([102,205,170]),
+    "E9": np.array([138,145,208]),
+    "E10":np.array([205,92,92]),
+    "E11":np.array([233,150,122]),
+    "E12":np.array([189,183,107]),
+    "E13":np.array([128,128,128]),
+    "E14":np.array([192,192,192]),
+    "E15":np.array([255,255,255])
+}
+
+color_dict_i = {
+    1: np.array([255,0,0]),
+    2: np.array([255,69,0]),
+    3: np.array([50,205,50]),
+    4: np.array([0,128,0]),
+    5: np.array([0,100,0]),
+    6: np.array([194,225,5]),
+    7: np.array([255,255,0]),
+    8: np.array([102,205,170]),
+    9: np.array([138,145,208]),
+    10:np.array([205,92,92]),
+    11:np.array([233,150,122]),
+    12:np.array([189,183,107]),
+    13:np.array([128,128,128]),
+    14:np.array([192,192,192]),
+    15:np.array([255,255,255])
+}
+
+
+def _plot_chromatin_state(region_chromatin_files,
+                          region_chromatin_labels,
+                          region,
+                          fig,
+                          ax,
+                          xlim_i,
+                          fontsize=12,
+                          font_family="Arial",
+                          log=Log(),
+                          verbose=True):
+    '''
+    region_chromatin_files : a list of paths to chromatin state (e.g. 15-state ChromHMM) segmentation files
+    '''
+    target_chr = region[0]
+    target_start = region[1]
+    target_end = region[2]
+
+    offset_i = xlim_i[0] - region[1]
+
+    ax.set_ylim([-0.05, 0.1*len(region_chromatin_files)-0.05])
+    ax.set_xlim([offset_i+target_start, offset_i+target_end])
+
+    px_for_01 = abs(ax.transData.transform([0,0])[1] - ax.transData.transform([0,0.1])[1])
+
+    point = fig.dpi/72  # pixels per point
+    points_for_01 = px_for_01/point  # height of 0.1 data units, in points
+
+    # each tissue
+    for i, file in enumerate(region_chromatin_files):
+        log.write(" -Loading : {}".format(file), verbose=verbose)
+        enh = pd.read_csv(file, sep="\t", header=None)
+        enh.columns = ["ID","START","END","STATE"]
+        enh["CHR"] = enh["ID"].str.extract(r"chr([0-9]+)").astype("float").astype("Int64")
+        enh["STATE_i"] = enh["STATE"].str.extract(r"([0-9]+)_*").astype("float").astype("Int64")
+        enh_in_region = (enh["CHR"] == target_chr) & ((enh["END"] > target_start) & (enh["START"] < target_end))
+        df = enh.loc[enh_in_region, ["STATE_i","START","END"]].sort_values("STATE_i", ascending=False)
+        log.write(" -Number of records in specified region: {}".format(len(df)), verbose=verbose)
+        # each block
+        for index, row in df.iterrows():
+            color = color_dict_i[row["STATE_i"]]
+            ax.plot([offset_i + row["START"], offset_i + row["END"]],
+                    [i*0.1, i*0.1],
+                    c=color/255, linewidth=points_for_01, solid_capstyle="butt")
+
+    ## add stripe label
+    if len(region_chromatin_labels) == len(region_chromatin_files):
+        ax.set_yticks([i*0.1 for i in range(len(region_chromatin_labels))], region_chromatin_labels, fontsize=fontsize, family=font_family)
+    else:
+        ax.set_yticks(ticks=[])
+
+    #ax.set_xticks(ticks=[])
+    ax.invert_yaxis()
+    return fig
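For orientation, _plot_chromatin_state draws one horizontal stripe per input file (one file per tissue), coloring each interval by the 15-state palette above. A hedged usage sketch follows; the file names are hypothetical placeholders in the style of Roadmap 15-state ChromHMM mnemonics files, and in practice the function is driven by gwaslab's regional plot rather than called directly.

import matplotlib.pyplot as plt

# Hypothetical ChromHMM segmentation files: tab-separated, no header,
# rows of the form: chr2 <START> <END> <N>_<Mnemonic>
files = ["E066_15_coreMarks_mnemonics.bed", "E087_15_coreMarks_mnemonics.bed"]
labels = ["Liver", "Pancreatic Islets"]

region = (2, 45000000, 46000000)  # (CHR, START, END)
fig, ax = plt.subplots(figsize=(10, 1.5))

# Passing the region bounds as xlim_i keeps the internal x-offset at zero.
fig = _plot_chromatin_state(files, labels, region, fig, ax,
                            xlim_i=(region[1], region[2]))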
@@ -75,7 +75,10 @@ def compare_effect(path1,
     if scaled == True:
         scaled1 = True
         scaled2 = True
-
+    if is_q_mc=="fdr" or is_q_mc=="bon":
+        is_q = True
+    else:
+        raise ValueError("Please select either fdr or bon for is_q_mc.")
     if save_args is None:
         save_args = {"dpi":300,"facecolor":"white"}
     if reg_box is None:
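The new is_q_mc check gates multiple-testing correction of the per-variant heterogeneity test in compare_effect: as written, any value other than "fdr" or "bon" raises, and either accepted value also forces is_q to True. For reference, a sketch of what the two options conventionally denote, using statsmodels; whether compare_effect computes the corrections exactly this way is not visible in this hunk.

import numpy as np
from statsmodels.stats.multitest import multipletests

p = np.array([0.001, 0.02, 0.04, 0.30])  # toy heterogeneity p-values

reject_bon, p_bon, _, _ = multipletests(p, alpha=0.05, method="bonferroni")  # "bon"
reject_fdr, p_fdr, _, _ = multipletests(p, alpha=0.05, method="fdr_bh")      # "fdr" (Benjamini-Hochberg)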