gwaslab 3.4.14__py3-none-any.whl → 3.4.16__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of gwaslab might be problematic. Click here for more details.
- gwaslab/Sumstats.py +6 -4
- gwaslab/__init__.py +3 -1
- gwaslab/annotateplot.py +2 -2
- gwaslab/calculate_power.py +119 -42
- gwaslab/compare_effect.py +83 -17
- gwaslab/download.py +19 -4
- gwaslab/fill.py +183 -57
- gwaslab/miamiplot.py +25 -10
- gwaslab/mqqplot.py +4 -3
- gwaslab/plotrg.py +208 -75
- gwaslab/regionalplot.py +21 -3
- gwaslab/retrievedata.py +49 -18
- gwaslab/to_pickle.py +12 -0
- gwaslab/trumpetplot.py +0 -0
- gwaslab/version.py +3 -3
- {gwaslab-3.4.14.dist-info → gwaslab-3.4.16.dist-info}/METADATA +2 -2
- {gwaslab-3.4.14.dist-info → gwaslab-3.4.16.dist-info}/RECORD +20 -19
- {gwaslab-3.4.14.dist-info → gwaslab-3.4.16.dist-info}/LICENSE +0 -0
- {gwaslab-3.4.14.dist-info → gwaslab-3.4.16.dist-info}/WHEEL +0 -0
- {gwaslab-3.4.14.dist-info → gwaslab-3.4.16.dist-info}/top_level.txt +0 -0
gwaslab/plotrg.py
CHANGED
|
@@ -17,7 +17,7 @@ def convert_p_to_width(p,sig_level):
|
|
|
17
17
|
#scaled using mlog10(p)
|
|
18
18
|
return max(-np.log10(p)/width_factor,0.1)
|
|
19
19
|
|
|
20
|
-
def
|
|
20
|
+
def convert_rg_to_color(rg,cmap):
|
|
21
21
|
#(1,1)
|
|
22
22
|
if rg>1: rg=1
|
|
23
23
|
if rg<-1: rg=-1
|
|
@@ -25,48 +25,120 @@ def conver_rg_to_color(rg,cmap):
|
|
|
25
25
|
####################################################################################################
|
|
26
26
|
|
|
27
27
|
def plot_rg(ldscrg,
|
|
28
|
-
p1="p1",
|
|
29
|
-
|
|
30
|
-
|
|
31
|
-
|
|
28
|
+
p1="p1",
|
|
29
|
+
p2="p2",
|
|
30
|
+
rg="rg",
|
|
31
|
+
p="p",
|
|
32
|
+
sig_levels=None,
|
|
33
|
+
rganno="non",
|
|
34
|
+
panno=True,
|
|
35
|
+
corrections=None,
|
|
36
|
+
panno_texts=None,
|
|
37
|
+
equal_aspect=True,
|
|
32
38
|
cmap = matplotlib.cm.get_cmap('RdBu'),
|
|
39
|
+
full_cell =None,
|
|
33
40
|
log=Log(),
|
|
34
41
|
panno_args=None,
|
|
42
|
+
rganno_args=None,
|
|
35
43
|
verbose=True,
|
|
36
44
|
asize=10,
|
|
37
45
|
sort_key=None,
|
|
38
46
|
square=False,
|
|
39
|
-
|
|
40
|
-
|
|
47
|
+
colorbar_args=None,
|
|
48
|
+
fig_args=None,
|
|
49
|
+
xticklabel_args=None,
|
|
50
|
+
yticklabel_args=None,
|
|
51
|
+
fdr_method="i",
|
|
52
|
+
fontsize=10,
|
|
53
|
+
save=None,
|
|
54
|
+
save_args=None):
|
|
41
55
|
|
|
42
|
-
if verbose: log.write("
|
|
56
|
+
if verbose: log.write("Start to create ldsc genetic correlation heatmap...")
|
|
57
|
+
# configure arguments
|
|
58
|
+
if fig_args is None:
|
|
59
|
+
fig_args = {"dpi":300}
|
|
60
|
+
if colorbar_args is None:
|
|
61
|
+
colorbar_args={"shrink":0.82}
|
|
62
|
+
if yticklabel_args is None:
|
|
63
|
+
yticklabel_args={"fontsize":fontsize, "fontfamily":"Arial"}
|
|
64
|
+
if xticklabel_args is None:
|
|
65
|
+
xticklabel_args={"rotation":45,"horizontalalignment":"left", "verticalalignment":"bottom","fontsize":fontsize, "fontfamily":"Arial"}
|
|
66
|
+
if sig_levels is None:
|
|
67
|
+
sig_levels = [0.05]
|
|
68
|
+
if corrections is None:
|
|
69
|
+
corrections = ["non", "fdr","bon"]
|
|
70
|
+
if panno_texts is None:
|
|
71
|
+
panno_texts = ["*"*(i+1) for i in range(len(sig_levels)*len(corrections))]
|
|
72
|
+
if full_cell is None:
|
|
73
|
+
full_cell = ("fdr",0.05)
|
|
74
|
+
if rganno_args is None:
|
|
75
|
+
rganno_args ={}
|
|
76
|
+
|
|
77
|
+
#drop na records in P column
|
|
78
|
+
if verbose: log.write("Raw dataset records:",len(ldscrg))
|
|
43
79
|
df=ldscrg.dropna(subset=[p]).copy()
|
|
80
|
+
|
|
81
|
+
if verbose: log.write(" -Raw dataset non-NA records:",len(df))
|
|
82
|
+
# create unique pair column
|
|
44
83
|
df["p1p2"]=df.apply(lambda x:"_".join(sorted([x[p1],x[p2]])),axis=1)
|
|
45
84
|
|
|
85
|
+
if verbose: log.write("Filling diagnal line and duplicated pair for plotting...")
|
|
86
|
+
# fill na
|
|
87
|
+
df_fill_reverse = df.loc[(df[p2].isin(df[p1].values)) & (df[p1].isin(df[p2].values)),:].copy()
|
|
88
|
+
df_fill_reverse = df_fill_reverse.rename(columns={p1:p2,p2:p1})
|
|
89
|
+
|
|
90
|
+
# fill dia
|
|
91
|
+
df_fill_dia = pd.DataFrame(columns=df.columns)
|
|
92
|
+
p1_dup_list = list(df.loc[(df[p2].isin(df[p1].values)),"p2"].values)
|
|
93
|
+
p2_dup_list = list(df.loc[(df[p1].isin(df[p2].values)),"p1"].values)
|
|
94
|
+
p_dup_list = p2_dup_list + p1_dup_list
|
|
95
|
+
if len(set(p_dup_list)) > 0:
|
|
96
|
+
if verbose: log.write(" -Diagnal records:", len(set(p_dup_list)))
|
|
97
|
+
df_fill_dia["p1"] = p_dup_list
|
|
98
|
+
df_fill_dia["p2"] = df_fill_dia["p1"]
|
|
99
|
+
df_fill_dia["rg"] = 1
|
|
100
|
+
|
|
101
|
+
df_fill_na = pd.DataFrame(columns=df.columns)
|
|
102
|
+
df_fill_na[[p1,p2]] = [(i,j) for i in df[p1].sort_values(ascending=False).drop_duplicates() for j in df[p2].sort_values(ascending=False).drop_duplicates()]
|
|
103
|
+
# fill diagonal
|
|
104
|
+
df = pd.concat([df,df_fill_reverse,df_fill_dia,df_fill_na],ignore_index=True).sort_values(by=p).drop_duplicates(subset=[p1,p2])
|
|
105
|
+
#if verbose: log.write(" -Dataset shape match:", len(df)==)
|
|
106
|
+
#
|
|
107
|
+
## remove record with p1 = p2, dropna in P column
|
|
46
108
|
dfp=ldscrg.loc[ldscrg[p1]!=ldscrg[p2],:].dropna(subset=[p]).copy()
|
|
109
|
+
|
|
110
|
+
## create pair column
|
|
47
111
|
dfp["p1p2"]=dfp.apply(lambda x:"_".join(sorted([x[p1],x[p2]])),axis=1)
|
|
112
|
+
|
|
113
|
+
## drop duplicate and keep only unique pairs
|
|
48
114
|
dfp = dfp.drop_duplicates(subset=["p1p2"]).copy()
|
|
49
115
|
|
|
50
|
-
if verbose: log.write("Valid unique
|
|
51
|
-
if verbose: log.write("
|
|
116
|
+
if verbose: log.write("Valid unique trait pairs:",len(dfp))
|
|
117
|
+
if verbose: log.write(" -Valid unique trait1:",dfp["p1"].nunique())
|
|
118
|
+
if verbose: log.write(" -Valid unique trait2:",dfp["p2"].nunique())
|
|
119
|
+
if verbose: log.write(" -Significant correlations with P < 0.05:",sum(dfp[p]<0.05))
|
|
120
|
+
if verbose: log.write(" -Significant correlations after Bonferroni correction:",sum(dfp[p]<(0.05/len(dfp))))
|
|
52
121
|
|
|
53
|
-
if correction=="fdr":
|
|
54
|
-
|
|
55
|
-
|
|
56
|
-
|
|
57
|
-
|
|
58
|
-
|
|
59
|
-
|
|
60
|
-
|
|
61
|
-
|
|
62
|
-
|
|
122
|
+
#if correction=="fdr":
|
|
123
|
+
# fdr corrected p
|
|
124
|
+
dfp["fdr_p"]=fdrcorrection(dfp[p],alpha=1,method=fdr_method)[1]
|
|
125
|
+
# is fdr < sig_level
|
|
126
|
+
dfp["fdr"]=fdrcorrection(dfp[p],alpha=0.05,method=fdr_method)[0]
|
|
127
|
+
if verbose: log.write(" -Significant correlations with FDR <0.05:",sum(dfp["fdr"]))
|
|
128
|
+
# convert to dict for annotation and plotting
|
|
129
|
+
df_rawp = dfp.set_index("p1p2").loc[:,p].to_dict()
|
|
130
|
+
dfp = dfp.set_index("p1p2").loc[:,"fdr_p"].to_dict()
|
|
131
|
+
|
|
63
132
|
#########ticks dict###########################################
|
|
64
133
|
dic_p1={}
|
|
65
134
|
dic_p2={}
|
|
135
|
+
|
|
66
136
|
dic_p1_r={}
|
|
67
137
|
dic_p2_r={}
|
|
68
138
|
|
|
139
|
+
## sort position
|
|
69
140
|
if sort_key is None:
|
|
141
|
+
# alphabetic order
|
|
70
142
|
for i,p1_name in enumerate(df[p1].sort_values(ascending=False).drop_duplicates()):
|
|
71
143
|
dic_p1[p1_name] = i
|
|
72
144
|
dic_p1_r[i] = p1_name
|
|
@@ -74,6 +146,7 @@ def plot_rg(ldscrg,
|
|
|
74
146
|
dic_p2[p2_name] = i
|
|
75
147
|
dic_p2_r[i] = p2_name
|
|
76
148
|
else:
|
|
149
|
+
# user-provided order
|
|
77
150
|
for i,p1_name in enumerate(df[p1].sort_values(ascending=False,key=sort_key).drop_duplicates()):
|
|
78
151
|
dic_p1[p1_name] = i
|
|
79
152
|
dic_p1_r[i] = p1_name
|
|
@@ -81,14 +154,17 @@ def plot_rg(ldscrg,
|
|
|
81
154
|
dic_p2[p2_name] = i
|
|
82
155
|
dic_p2_r[i] = p2_name
|
|
83
156
|
|
|
157
|
+
# assign coordinate
|
|
84
158
|
df["y"]=df[p1].map(dic_p1)
|
|
85
159
|
df["y_x"]=df[p1].map(dic_p2)
|
|
86
160
|
df["x"]=df[p2].map(dic_p2)
|
|
87
161
|
df["x_y"]=df[p2].map(dic_p1)
|
|
88
162
|
|
|
89
|
-
if verbose: log.write("Plotting...")
|
|
163
|
+
if verbose: log.write("Plotting heatmap...")
|
|
90
164
|
########ticks###############################################
|
|
91
|
-
fig,ax = plt.subplots(
|
|
165
|
+
fig,ax = plt.subplots(**fig_args)
|
|
166
|
+
|
|
167
|
+
# configure x/y ticks
|
|
92
168
|
xticks=df["x"].sort_values().drop_duplicates().astype(int)
|
|
93
169
|
yticks=df["y"].sort_values().drop_duplicates().astype(int)
|
|
94
170
|
ax.xaxis.tick_top()
|
|
@@ -103,88 +179,145 @@ def plot_rg(ldscrg,
|
|
|
103
179
|
ax.tick_params('both', length=0, width=0, which='minor')
|
|
104
180
|
|
|
105
181
|
#labels
|
|
106
|
-
ax.set_yticklabels(yticks.map(dic_p1_r)
|
|
107
|
-
ax.set_xticklabels(xticks.map(dic_p2_r),rotation=45,horizontalalignment="left", verticalalignment="bottom",fontsize=15)
|
|
108
|
-
|
|
109
|
-
width_max=1
|
|
182
|
+
ax.set_yticklabels(yticks.map(dic_p1_r),**yticklabel_args)
|
|
110
183
|
|
|
184
|
+
ax.set_xticklabels(xticks.map(dic_p2_r),**xticklabel_args)
|
|
111
185
|
|
|
112
186
|
#########patches###########################################
|
|
113
187
|
|
|
114
188
|
squares=[]
|
|
115
|
-
|
|
189
|
+
panno_list={1:{},2:{}}
|
|
116
190
|
rgtoanno=[]
|
|
117
|
-
maxsigp=sig_level
|
|
118
191
|
|
|
119
|
-
|
|
120
|
-
|
|
121
|
-
# maxsigp = df.loc[df["fdr"]==True,p].max()*1.0001
|
|
122
|
-
#
|
|
123
|
-
# else:
|
|
124
|
-
# maxsigp = sig_level/len(df.dropna(subset=[p]))
|
|
125
|
-
if correction=="fdr":
|
|
126
|
-
p="fdr_p"
|
|
127
|
-
|
|
128
|
-
|
|
192
|
+
if verbose: log.write("Full cell : {}-corrected P == {}".format(full_cell[0],full_cell[1]))
|
|
193
|
+
|
|
129
194
|
for i,row in df.iterrows():
|
|
130
195
|
xcenter=row["x"]
|
|
131
196
|
ycenter=row["y"]
|
|
132
|
-
|
|
197
|
+
|
|
198
|
+
if np.isnan(row[rg]):
|
|
133
199
|
width=1
|
|
134
200
|
x=xcenter-width/2
|
|
135
201
|
y=ycenter-width/2
|
|
136
|
-
|
|
137
|
-
|
|
138
|
-
else:
|
|
139
|
-
adjusted_p = dfp["_".join(sorted([row[p1],row[p2]]))]
|
|
140
|
-
if adjusted_p<0.05 and square is True:
|
|
141
|
-
if xcenter + ycenter < len(df[p1].unique()):
|
|
142
|
-
panno.append([xcenter,ycenter,adjusted_p])
|
|
143
|
-
elif adjusted_p<0.05:
|
|
144
|
-
panno.append([xcenter,ycenter,adjusted_p])
|
|
145
|
-
|
|
146
|
-
width= convert_p_to_width(adjusted_p,sig_level)
|
|
147
|
-
x=xcenter-width/2
|
|
148
|
-
y=ycenter-width/2
|
|
149
|
-
rgba = conver_rg_to_color(row[rg],cmap)
|
|
150
|
-
|
|
151
|
-
if xcenter + ycenter > len(df[p1].unique())-1 and (square is True) and (rganno is True):
|
|
152
|
-
rgtoanno.append([xcenter,ycenter,row[rg],rgba])
|
|
202
|
+
ax.plot([x,x+width],[y,y+width],c="grey")
|
|
203
|
+
ax.plot([x,x+width],[y+width,y],c="grey")
|
|
153
204
|
|
|
154
|
-
|
|
155
|
-
|
|
156
|
-
|
|
157
|
-
|
|
158
|
-
|
|
159
|
-
|
|
160
|
-
|
|
205
|
+
else:
|
|
206
|
+
if row[p1]==row[p2]:
|
|
207
|
+
# diagonal line
|
|
208
|
+
width=1
|
|
209
|
+
x=xcenter-width/2
|
|
210
|
+
y=ycenter-width/2
|
|
211
|
+
rgba = convert_rg_to_color(1,cmap)
|
|
212
|
+
else:
|
|
213
|
+
# get the adjusted p value from dict
|
|
214
|
+
if xcenter + ycenter < len(df[p1].unique()):
|
|
215
|
+
panno_set=1
|
|
216
|
+
else:
|
|
217
|
+
panno_set=2
|
|
218
|
+
for i,correction in enumerate(corrections):
|
|
219
|
+
for j,sig_level in enumerate(sig_levels):
|
|
220
|
+
|
|
221
|
+
index = len(sig_levels)*i + j
|
|
222
|
+
|
|
223
|
+
p1p2="_".join(sorted([row[p1],row[p2]]))
|
|
224
|
+
|
|
225
|
+
raw_p = df_rawp[p1p2]
|
|
226
|
+
|
|
227
|
+
if correction in ["B","bonferroni ","bon","Bon","b"]:
|
|
228
|
+
current_threhold = sig_level/len(dfp)
|
|
229
|
+
if raw_p < current_threhold:
|
|
230
|
+
panno_list[panno_set][p1p2] = [xcenter,ycenter,raw_p,"bon",panno_texts[index]]
|
|
231
|
+
|
|
232
|
+
elif correction in ["fdr","FDR","F","f"]:
|
|
233
|
+
adjusted_p = dfp[p1p2]
|
|
234
|
+
if adjusted_p < sig_level and square is True:
|
|
235
|
+
#if square is True, only annotate half
|
|
236
|
+
if xcenter + ycenter < len(df[p1].unique()):
|
|
237
|
+
panno_list[panno_set][p1p2]=[xcenter,ycenter,adjusted_p,"fdr",panno_texts[index]]
|
|
238
|
+
elif adjusted_p < sig_level:
|
|
239
|
+
panno_list[panno_set][p1p2]=[xcenter,ycenter,adjusted_p,"fdr",panno_texts[index]]
|
|
240
|
+
|
|
241
|
+
elif correction == "non":
|
|
242
|
+
if raw_p < sig_level:
|
|
243
|
+
panno_list[panno_set][p1p2]=[xcenter,ycenter,"raw",raw_p,panno_texts[index]]
|
|
244
|
+
|
|
245
|
+
# configuring the square
|
|
246
|
+
if full_cell[0] == "fdr":
|
|
247
|
+
width= convert_p_to_width(adjusted_p,full_cell[1])
|
|
248
|
+
elif full_cell[0] == "bon":
|
|
249
|
+
width= convert_p_to_width(raw_p*len(dfp),full_cell[1])
|
|
250
|
+
else:
|
|
251
|
+
width= convert_p_to_width(raw_p,full_cell[1])
|
|
252
|
+
|
|
253
|
+
x=xcenter-width/2
|
|
254
|
+
y=ycenter-width/2
|
|
255
|
+
rgba = convert_rg_to_color(row[rg],cmap)
|
|
256
|
+
if xcenter + ycenter > len(df[p1].unique())-1 and (square is True) and (rganno == "half"):
|
|
257
|
+
rgtoanno.append([xcenter,ycenter,row[rg],rgba])
|
|
258
|
+
elif "full" in rganno:
|
|
259
|
+
rgtoanno.append([xcenter,ycenter,row[rg],rgba])
|
|
260
|
+
|
|
261
|
+
#if xcenter + ycenter < len(df[p1].unique()) and (square is True) and (rganno == "half"):
|
|
262
|
+
# squares.append(patches.Rectangle((x,y),width=width,height=width,fc=rgba,ec="white",lw=0))
|
|
263
|
+
#elif (square is not True):
|
|
264
|
+
if ("nb" not in rganno):
|
|
265
|
+
if rganno == "half":
|
|
266
|
+
if xcenter + ycenter < len(df[p1].unique()) and (square is True):
|
|
267
|
+
squares.append(patches.Rectangle((x,y),width=width,height=width,fc=rgba,ec="white",lw=0))
|
|
268
|
+
else:
|
|
269
|
+
squares.append(patches.Rectangle((x,y),width=width,height=width,fc=rgba,ec="white",lw=0))
|
|
161
270
|
|
|
162
271
|
squares_collection = matplotlib.collections.PatchCollection(squares,match_original=True)
|
|
163
272
|
ax.add_collection(squares_collection)
|
|
164
273
|
|
|
165
274
|
if rganno is not False:
|
|
275
|
+
rganno_default_args = {"weight":"bold","ha":"center", "va":"center", "fontfamily":"Arial","fontsize":fontsize}
|
|
276
|
+
for key, value in rganno_args.items():
|
|
277
|
+
rganno_default_args[key] = value
|
|
166
278
|
for i in rgtoanno:
|
|
167
279
|
if i[2]>1: i[2]=1
|
|
168
280
|
if i[2]<-1: i[2]=-1
|
|
169
|
-
|
|
170
|
-
|
|
171
|
-
|
|
281
|
+
if "color" in rganno_default_args.keys() or "c" in rganno_default_args.keys():
|
|
282
|
+
ax.text(i[0],i[1],"{:.3f}".format(i[2]),**rganno_default_args)
|
|
283
|
+
else:
|
|
284
|
+
ax.text(i[0],i[1],"{:.3f}".format(i[2]),color=i[3],**rganno_default_args)
|
|
172
285
|
|
|
173
|
-
|
|
286
|
+
# configure args for p annotation
|
|
287
|
+
panno_default_args={"size":asize,"color":"white","weight":"bold","horizontalalignment":"center","verticalalignment":"center_baseline","font":"Arial"}
|
|
174
288
|
if panno_args is not None:
|
|
175
289
|
for key, value in panno_args.items():
|
|
176
290
|
panno_default_args[key] = value
|
|
177
291
|
|
|
178
|
-
|
|
179
|
-
|
|
180
|
-
|
|
181
|
-
|
|
182
|
-
|
|
183
|
-
|
|
292
|
+
# annotate p
|
|
293
|
+
if panno is True:
|
|
294
|
+
if verbose: log.write("P value annotation text : ")
|
|
295
|
+
for i,correction in enumerate(corrections):
|
|
296
|
+
for j,sig_level in enumerate(sig_levels):
|
|
297
|
+
index = len(sig_levels)*i + j
|
|
298
|
+
if verbose: log.write(" -{} : {}-corrected P < {}".format(panno_texts[index], correction, sig_level))
|
|
299
|
+
for panno_set_number in panno_list.keys():
|
|
300
|
+
for key, i in panno_list[panno_set_number].items():
|
|
301
|
+
if panno_set_number == 1:
|
|
302
|
+
ax.text(i[0],i[1]-0.1,i[4], **panno_default_args)
|
|
303
|
+
else:
|
|
304
|
+
ax.text(i[0],i[1]-0.1,i[4], **panno_default_args)
|
|
184
305
|
|
|
185
306
|
## color bar ###############################################
|
|
186
307
|
norm = matplotlib.colors.Normalize(vmin=-1, vmax=1)
|
|
187
|
-
fig.colorbar(matplotlib.cm.ScalarMappable(norm=norm, cmap=cmap), ax=ax, **
|
|
308
|
+
fig.colorbar(matplotlib.cm.ScalarMappable(norm=norm, cmap=cmap), ax=ax, **colorbar_args)
|
|
188
309
|
|
|
189
|
-
|
|
310
|
+
if equal_aspect is True:
|
|
311
|
+
ax.set_aspect('equal', adjustable='box')
|
|
312
|
+
|
|
313
|
+
if save:
|
|
314
|
+
if verbose: log.write("Saving plot:")
|
|
315
|
+
if save==True:
|
|
316
|
+
fig.savefig("./ldscrg_heatmap.png",bbox_inches="tight",**save_args)
|
|
317
|
+
log.write(" -Saved to "+ "./ldscrg_heatmap.png" + " successfully!" )
|
|
318
|
+
else:
|
|
319
|
+
fig.savefig(save,bbox_inches="tight",**save_args)
|
|
320
|
+
log.write(" -Saved to "+ save + " successfully!" )
|
|
321
|
+
if verbose: log.write("Finished creating ldsc genetic correlation heatmap!")
|
|
322
|
+
return fig,ax,log,df
|
|
190
323
|
|
gwaslab/regionalplot.py
CHANGED
|
@@ -10,6 +10,7 @@ from gwaslab.CommonData import get_chr_to_number
|
|
|
10
10
|
from gwaslab.CommonData import get_number_to_chr
|
|
11
11
|
from gwaslab.CommonData import get_recombination_rate
|
|
12
12
|
from gwaslab.CommonData import get_gtf
|
|
13
|
+
from gwaslab.retrievedata import check_vcf_chr_prefix
|
|
13
14
|
from pyensembl import EnsemblRelease
|
|
14
15
|
from allel import GenotypeArray
|
|
15
16
|
from allel import read_vcf
|
|
@@ -34,7 +35,7 @@ def _plot_regional(
|
|
|
34
35
|
chrom_df,
|
|
35
36
|
xtick_chr_dict,
|
|
36
37
|
cut_line_color,
|
|
37
|
-
vcf_chr_dict =
|
|
38
|
+
vcf_chr_dict = None,
|
|
38
39
|
gtf_path="default",
|
|
39
40
|
gtf_chr_dict = get_number_to_chr(),
|
|
40
41
|
gtf_gene_name=None,
|
|
@@ -65,7 +66,17 @@ def _plot_regional(
|
|
|
65
66
|
pos="POS",
|
|
66
67
|
verbose=True,
|
|
67
68
|
log=Log()
|
|
68
|
-
):
|
|
69
|
+
):
|
|
70
|
+
if vcf_path is not None:
|
|
71
|
+
if vcf_chr_dict is None:
|
|
72
|
+
if verbose: log.write(" -Checking prefix for chromosomes in vcf files..." )
|
|
73
|
+
prefix = check_vcf_chr_prefix(vcf_path)
|
|
74
|
+
if prefix is not None:
|
|
75
|
+
if verbose: log.write(" -Prefix for chromosomes: ",prefix)
|
|
76
|
+
vcf_chr_dict = get_number_to_chr(prefix=prefix)
|
|
77
|
+
else:
|
|
78
|
+
if verbose: log.write(" -No prefix for chromosomes." )
|
|
79
|
+
vcf_chr_dict = get_number_to_chr()
|
|
69
80
|
|
|
70
81
|
# if regional plot : pinpoint lead , add color bar ##################################################
|
|
71
82
|
if (region is not None) :
|
|
@@ -231,11 +242,17 @@ def _get_lead_id(sumstats, region_ref, log):
|
|
|
231
242
|
if len(lead_id)>0:
|
|
232
243
|
lead_id = int(lead_id[0])
|
|
233
244
|
if region_ref is not None:
|
|
234
|
-
|
|
245
|
+
if type(lead_id) is list:
|
|
246
|
+
if len(lead_id)==0 :
|
|
247
|
+
log.write(" -WARNING: {} not found. Roll back to lead variant...".format(region_ref))
|
|
248
|
+
lead_id = sumstats["scaled_P"].idxmax()
|
|
249
|
+
else:
|
|
250
|
+
log.write(" -Reference variant ID: {} - {}".format(region_ref, lead_id))
|
|
235
251
|
|
|
236
252
|
if lead_id is None:
|
|
237
253
|
log.write(" -Extracting lead variant...")
|
|
238
254
|
lead_id = sumstats["scaled_P"].idxmax()
|
|
255
|
+
|
|
239
256
|
return lead_id
|
|
240
257
|
|
|
241
258
|
def _pinpoint_lead(sumstats,ax1,region_ref, region_ld_threshold, region_ld_colors, marker_size, log):
|
|
@@ -464,6 +481,7 @@ def _plot_gene_track(
|
|
|
464
481
|
def process_vcf(sumstats, vcf_path, region,region_ref, region_ref2, log, verbose, pos ,nea,ea, region_ld_threshold, vcf_chr_dict,tabix):
|
|
465
482
|
if verbose: log.write("Start to load reference genotype...")
|
|
466
483
|
if verbose: log.write(" -reference vcf path : "+ vcf_path)
|
|
484
|
+
|
|
467
485
|
# load genotype data of the targeted region
|
|
468
486
|
ref_genotype = read_vcf(vcf_path,region=vcf_chr_dict[region[0]]+":"+str(region[1])+"-"+str(region[2]),tabix=tabix)
|
|
469
487
|
if ref_genotype is None:
|
gwaslab/retrievedata.py
CHANGED
|
@@ -290,7 +290,7 @@ def assign_rsid_single(sumstats,path,rsid="rsID",chr="CHR",pos="POS",ref="NEA",a
|
|
|
290
290
|
|
|
291
291
|
def parallelizeassignrsid(sumstats, path, ref_mode="vcf",snpid="SNPID",rsid="rsID",chr="CHR",pos="POS",ref="NEA",alt="EA",status="STATUS",
|
|
292
292
|
n_cores=1,chunksize=5000000,ref_snpid="SNPID",ref_rsid="rsID",
|
|
293
|
-
overwrite="empty",verbose=True,log=Log(),chr_dict=
|
|
293
|
+
overwrite="empty",verbose=True,log=Log(),chr_dict=None):
|
|
294
294
|
'''
|
|
295
295
|
overwrite mode :
|
|
296
296
|
all , overwrite rsid for all availalbe rsid
|
|
@@ -303,7 +303,12 @@ def parallelizeassignrsid(sumstats, path, ref_mode="vcf",snpid="SNPID",rsid="rsI
|
|
|
303
303
|
if verbose: log.write(" -Current Dataframe shape :",len(sumstats)," x ", len(sumstats.columns))
|
|
304
304
|
if verbose: log.write(" -CPU Cores to use :",n_cores)
|
|
305
305
|
if verbose: log.write(" -Reference VCF file:", path)
|
|
306
|
+
|
|
307
|
+
chr_dict = auto_check_vcf_chr_dict(path, chr_dict, verbose, log)
|
|
308
|
+
|
|
306
309
|
if verbose: log.write(" -Assigning rsID based on chr:pos and ref:alt/alt:ref...")
|
|
310
|
+
|
|
311
|
+
|
|
307
312
|
##############################################
|
|
308
313
|
if rsid not in sumstats.columns:
|
|
309
314
|
sumstats[rsid]=pd.Series(dtype="string")
|
|
@@ -476,11 +481,13 @@ def check_indel(sumstats,ref_infer,ref_alt_freq=None,chr="CHR",pos="POS",ref="NE
|
|
|
476
481
|
|
|
477
482
|
def parallelinferstrand(sumstats,ref_infer,ref_alt_freq=None,maf_threshold=0.40,remove_snp="",mode="pi",n_cores=1,remove_indel="",
|
|
478
483
|
chr="CHR",pos="POS",ref="NEA",alt="EA",eaf="EAF",status="STATUS",
|
|
479
|
-
chr_dict=
|
|
484
|
+
chr_dict=None,verbose=True,log=Log()):
|
|
480
485
|
if verbose: log.write("Start to infer strand for palindromic SNPs...")
|
|
481
486
|
if verbose: log.write(" -Current Dataframe shape :",len(sumstats)," x ", len(sumstats.columns))
|
|
482
487
|
if verbose: log.write(" -Reference vcf file:", ref_infer)
|
|
483
|
-
|
|
488
|
+
|
|
489
|
+
chr_dict = auto_check_vcf_chr_dict(ref_infer, chr_dict, verbose, log)
|
|
490
|
+
|
|
484
491
|
# check if the columns are complete
|
|
485
492
|
if not ((chr in sumstats.columns) and (pos in sumstats.columns) and (ref in sumstats.columns) and (alt in sumstats.columns) and (status in sumstats.columns)):
|
|
486
493
|
raise ValueError("Not enough information: CHR, POS, NEA , EA, ALT, STATUS...")
|
|
@@ -601,13 +608,16 @@ def parallelinferstrand(sumstats,ref_infer,ref_alt_freq=None,maf_threshold=0.40,
|
|
|
601
608
|
|
|
602
609
|
|
|
603
610
|
################################################################################################################
|
|
604
|
-
def parallelecheckaf(sumstats,ref_infer,ref_alt_freq=None,maf_threshold=0.4,n_cores=1,chr="CHR",pos="POS",ref="NEA",alt="EA",eaf="EAF",status="STATUS",chr_dict=
|
|
611
|
+
def parallelecheckaf(sumstats,ref_infer,ref_alt_freq=None,maf_threshold=0.4,column_name="DAF",suffix="",n_cores=1, chr="CHR",pos="POS",ref="NEA",alt="EA",eaf="EAF",status="STATUS",chr_dict=None,force=False, verbose=True,log=Log()):
|
|
605
612
|
|
|
606
613
|
if verbose: log.write("Start to check the difference between EAF and refence vcf alt frequency ...")
|
|
607
614
|
if verbose: log.write(" -Current Dataframe shape :",len(sumstats)," x ", len(sumstats.columns))
|
|
608
615
|
if verbose: log.write(" -Reference vcf file:", ref_infer)
|
|
609
616
|
if verbose: log.write(" -CPU Cores to use :",n_cores)
|
|
617
|
+
|
|
618
|
+
chr_dict = auto_check_vcf_chr_dict(ref_infer, chr_dict, verbose, log)
|
|
610
619
|
|
|
620
|
+
column_name = column_name + suffix
|
|
611
621
|
# check if the columns are complete
|
|
612
622
|
if not ((chr in sumstats.columns) and (pos in sumstats.columns) and (ref in sumstats.columns) and (alt in sumstats.columns) and (status in sumstats.columns)):
|
|
613
623
|
raise ValueError("Not enough information: CHR, POS, NEA , EA, ALT, STATUS...")
|
|
@@ -618,7 +628,7 @@ def parallelecheckaf(sumstats,ref_infer,ref_alt_freq=None,maf_threshold=0.4,n_co
|
|
|
618
628
|
if not force:
|
|
619
629
|
good_chrpos = sumstats[status].str.match(r'\w\w\w[0]\w\w\w', case=False, flags=0, na=False)
|
|
620
630
|
if verbose: log.write(" -Checking variants:", sum(good_chrpos))
|
|
621
|
-
sumstats[
|
|
631
|
+
sumstats[column_name]=np.nan
|
|
622
632
|
|
|
623
633
|
########################
|
|
624
634
|
if sum(~sumstats[eaf].isna())<10000:
|
|
@@ -626,8 +636,8 @@ def parallelecheckaf(sumstats,ref_infer,ref_alt_freq=None,maf_threshold=0.4,n_co
|
|
|
626
636
|
df_split = np.array_split(sumstats.loc[good_chrpos,[chr,pos,ref,alt,eaf]], n_cores)
|
|
627
637
|
pool = Pool(n_cores)
|
|
628
638
|
if sum(~sumstats[eaf].isna())>0:
|
|
629
|
-
map_func = partial(checkaf,chr=chr,pos=pos,ref=ref,alt=alt,eaf=eaf,ref_infer=ref_infer,ref_alt_freq=ref_alt_freq,chr_dict=chr_dict)
|
|
630
|
-
sumstats.loc[good_chrpos,[
|
|
639
|
+
map_func = partial(checkaf,chr=chr,pos=pos,ref=ref,alt=alt,eaf=eaf,ref_infer=ref_infer,ref_alt_freq=ref_alt_freq,column_name=column_name,chr_dict=chr_dict)
|
|
640
|
+
sumstats.loc[good_chrpos,[column_name]] = pd.concat(pool.map(map_func,df_split))
|
|
631
641
|
pool.close()
|
|
632
642
|
pool.join()
|
|
633
643
|
###########################
|
|
@@ -635,24 +645,24 @@ def parallelecheckaf(sumstats,ref_infer,ref_alt_freq=None,maf_threshold=0.4,n_co
|
|
|
635
645
|
|
|
636
646
|
#sumstats.loc[good_chrpos,"DAF"] = status_inferred.values
|
|
637
647
|
#sumstats.loc[:,"DAF"]=sumstats.loc[:,"DAF"].astype("float")
|
|
638
|
-
if verbose: log.write(" -
|
|
639
|
-
if verbose: log.write(" -
|
|
640
|
-
if verbose: log.write(" -
|
|
641
|
-
if verbose: log.write(" - abs(
|
|
642
|
-
if verbose: log.write(" -
|
|
643
|
-
if verbose: log.write(" - abs(
|
|
644
|
-
|
|
648
|
+
if verbose: log.write(" - {} min:".format(column_name), np.nanmax(sumstats.loc[:,column_name]))
|
|
649
|
+
if verbose: log.write(" - {} max:".format(column_name), np.nanmin(sumstats.loc[:,column_name]))
|
|
650
|
+
if verbose: log.write(" - {} sd:".format(column_name), np.nanstd(sumstats.loc[:,column_name]))
|
|
651
|
+
if verbose: log.write(" - abs({}) min:".format(column_name), np.nanmin(np.abs(sumstats.loc[:,column_name])))
|
|
652
|
+
if verbose: log.write(" - abs({}) max:".format(column_name), np.nanmax(np.abs(sumstats.loc[:,column_name])))
|
|
653
|
+
if verbose: log.write(" - abs({}) sd:".format(column_name), np.nanstd(np.abs(sumstats.loc[:,column_name])))
|
|
654
|
+
if verbose: log.write("Finished allele frequency checking!")
|
|
645
655
|
return sumstats
|
|
646
656
|
|
|
647
|
-
def checkaf(sumstats,ref_infer,ref_alt_freq=None,chr="CHR",pos="POS",ref="NEA",alt="EA",eaf="EAF",chr_dict=None):
|
|
657
|
+
def checkaf(sumstats,ref_infer,ref_alt_freq=None,column_name="DAF",chr="CHR",pos="POS",ref="NEA",alt="EA",eaf="EAF",chr_dict=None):
|
|
648
658
|
#vcf_reader = vcf.Reader(open(ref_infer, 'rb'))
|
|
649
659
|
vcf_reader = VariantFile(ref_infer)
|
|
650
660
|
def afapply(x,vcf,alt_freq,chr_dict):
|
|
651
661
|
return check_daf(x[0],x[1]-1,x[1],x[2],x[3],x[4],vcf_reader,ref_alt_freq,chr_dict)
|
|
652
662
|
map_func = partial(afapply,vcf=vcf_reader,alt_freq=ref_alt_freq,chr_dict=chr_dict)
|
|
653
663
|
status_inferred = sumstats.apply(map_func,axis=1)
|
|
654
|
-
sumstats.loc[:,
|
|
655
|
-
sumstats.loc[:,
|
|
664
|
+
sumstats.loc[:,column_name] = status_inferred.values
|
|
665
|
+
sumstats.loc[:,column_name]=sumstats.loc[:,column_name].astype("float")
|
|
656
666
|
return sumstats
|
|
657
667
|
|
|
658
668
|
def check_daf(chr,start,end,ref,alt,eaf,vcf_reader,alt_freq,chr_dict=None):
|
|
@@ -665,4 +675,25 @@ def check_daf(chr,start,end,ref,alt,eaf,vcf_reader,alt_freq,chr_dict=None):
|
|
|
665
675
|
return eaf - record.info[alt_freq][0]
|
|
666
676
|
return np.nan
|
|
667
677
|
################################################################################################################
|
|
668
|
-
################################################################################################################
|
|
678
|
+
################################################################################################################
|
|
679
|
+
def auto_check_vcf_chr_dict(vcf_path, vcf_chr_dict, verbose, log):
|
|
680
|
+
if vcf_path is not None:
|
|
681
|
+
if vcf_chr_dict is None:
|
|
682
|
+
if verbose: log.write(" -Checking prefix for chromosomes in vcf files..." )
|
|
683
|
+
prefix = check_vcf_chr_prefix(vcf_path)
|
|
684
|
+
if prefix is not None:
|
|
685
|
+
if verbose: log.write(" -Prefix for chromosomes: ",prefix)
|
|
686
|
+
vcf_chr_dict = get_number_to_chr(prefix=prefix)
|
|
687
|
+
else:
|
|
688
|
+
if verbose: log.write(" -No prefix for chromosomes in the VCF files." )
|
|
689
|
+
vcf_chr_dict = get_number_to_chr()
|
|
690
|
+
return vcf_chr_dict
|
|
691
|
+
|
|
692
|
+
def check_vcf_chr_prefix(vcf_bcf_path):
|
|
693
|
+
vcf_bcf = VariantFile(vcf_bcf_path)
|
|
694
|
+
for i in list(vcf_bcf.header.contigs):
|
|
695
|
+
m = re.search('(chr|Chr|CHR)([0-9xXyYmM]+)', i)
|
|
696
|
+
if m is not None:
|
|
697
|
+
return m.group(1)
|
|
698
|
+
else:
|
|
699
|
+
return None
|
gwaslab/to_pickle.py
CHANGED
|
@@ -1,5 +1,6 @@
|
|
|
1
1
|
import pickle
|
|
2
2
|
import os
|
|
3
|
+
import gc
|
|
3
4
|
from gwaslab.Log import Log
|
|
4
5
|
|
|
5
6
|
def dump_pickle(glsumstats,path="~/mysumstats.pickle",overwrite=False):
|
|
@@ -20,3 +21,14 @@ def load_pickle(path):
|
|
|
20
21
|
return glsumstats
|
|
21
22
|
else:
|
|
22
23
|
Log().write("File not exists : ", path)
|
|
24
|
+
|
|
25
|
+
def load_data_from_pickle(path,usecols=None):
|
|
26
|
+
data = load_pickle(path).data
|
|
27
|
+
existing_cols = []
|
|
28
|
+
if usecols is not None:
|
|
29
|
+
for i in usecols:
|
|
30
|
+
if i in data.columns:
|
|
31
|
+
existing_cols.append(i)
|
|
32
|
+
data = data.loc[:,existing_cols]
|
|
33
|
+
gc.collect()
|
|
34
|
+
return data
|
gwaslab/trumpetplot.py
ADDED
|
File without changes
|
gwaslab/version.py
CHANGED
|
@@ -2,13 +2,13 @@ from gwaslab.Log import Log
|
|
|
2
2
|
|
|
3
3
|
def _show_version(log=Log()):
|
|
4
4
|
# show when loading sumstats
|
|
5
|
-
log.write("GWASLab version 3.4.
|
|
5
|
+
log.write("GWASLab version 3.4.15 https://cloufield.github.io/gwaslab/")
|
|
6
6
|
log.write("(C) 2022-2023, Yunye He, Kamatani Lab, MIT License, gwaslab@gmail.com")
|
|
7
7
|
|
|
8
8
|
def gwaslab_info():
|
|
9
9
|
# for output header
|
|
10
10
|
dic={
|
|
11
|
-
"version":"3.4.
|
|
12
|
-
"release_date":"
|
|
11
|
+
"version":"3.4.15",
|
|
12
|
+
"release_date":"20230620"
|
|
13
13
|
}
|
|
14
14
|
return dic
|
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
Metadata-Version: 2.1
|
|
2
2
|
Name: gwaslab
|
|
3
|
-
Version: 3.4.
|
|
3
|
+
Version: 3.4.16
|
|
4
4
|
Summary: A collection of handy tools for GWAS SumStats
|
|
5
5
|
Author-email: Yunye <yunye@gwaslab.com>
|
|
6
6
|
Project-URL: Homepage, https://cloufield.github.io/gwaslab/
|
|
@@ -45,7 +45,7 @@ Note: GWASLab is being updated very frequently for now. I will release the first
|
|
|
45
45
|
## Install
|
|
46
46
|
|
|
47
47
|
```
|
|
48
|
-
pip install gwaslab==3.4.
|
|
48
|
+
pip install gwaslab==3.4.15
|
|
49
49
|
```
|
|
50
50
|
|
|
51
51
|
|