gwaslab 3.4.14__py3-none-any.whl → 3.4.16__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of gwaslab might be problematic. Click here for more details.

gwaslab/plotrg.py CHANGED
@@ -17,7 +17,7 @@ def convert_p_to_width(p,sig_level):
17
17
  #scaled using mlog10(p)
18
18
  return max(-np.log10(p)/width_factor,0.1)
19
19
 
20
- def conver_rg_to_color(rg,cmap):
20
+ def convert_rg_to_color(rg,cmap):
21
21
  #(1,1)
22
22
  if rg>1: rg=1
23
23
  if rg<-1: rg=-1
@@ -25,48 +25,120 @@ def conver_rg_to_color(rg,cmap):
25
25
  ####################################################################################################
26
26
 
27
27
  def plot_rg(ldscrg,
28
- p1="p1",p2="p2",rg="rg",p="p",
29
- sig_level=0.05,
30
- rganno=False,
31
- correction="",
28
+ p1="p1",
29
+ p2="p2",
30
+ rg="rg",
31
+ p="p",
32
+ sig_levels=None,
33
+ rganno="non",
34
+ panno=True,
35
+ corrections=None,
36
+ panno_texts=None,
37
+ equal_aspect=True,
32
38
  cmap = matplotlib.cm.get_cmap('RdBu'),
39
+ full_cell =None,
33
40
  log=Log(),
34
41
  panno_args=None,
42
+ rganno_args=None,
35
43
  verbose=True,
36
44
  asize=10,
37
45
  sort_key=None,
38
46
  square=False,
39
- colorbarargs={"shrink":0.82},
40
- **args):
47
+ colorbar_args=None,
48
+ fig_args=None,
49
+ xticklabel_args=None,
50
+ yticklabel_args=None,
51
+ fdr_method="i",
52
+ fontsize=10,
53
+ save=None,
54
+ save_args=None):
41
55
 
42
- if verbose: log.write("Total non-NA records:",len(ldscrg.dropna(subset=[p])))
56
+ if verbose: log.write("Start to create ldsc genetic correlation heatmap...")
57
+ # configure arguments
58
+ if fig_args is None:
59
+ fig_args = {"dpi":300}
60
+ if colorbar_args is None:
61
+ colorbar_args={"shrink":0.82}
62
+ if yticklabel_args is None:
63
+ yticklabel_args={"fontsize":fontsize, "fontfamily":"Arial"}
64
+ if xticklabel_args is None:
65
+ xticklabel_args={"rotation":45,"horizontalalignment":"left", "verticalalignment":"bottom","fontsize":fontsize, "fontfamily":"Arial"}
66
+ if sig_levels is None:
67
+ sig_levels = [0.05]
68
+ if corrections is None:
69
+ corrections = ["non", "fdr","bon"]
70
+ if panno_texts is None:
71
+ panno_texts = ["*"*(i+1) for i in range(len(sig_levels)*len(corrections))]
72
+ if full_cell is None:
73
+ full_cell = ("fdr",0.05)
74
+ if rganno_args is None:
75
+ rganno_args ={}
76
+
77
+ #drop na records in P column
78
+ if verbose: log.write("Raw dataset records:",len(ldscrg))
43
79
  df=ldscrg.dropna(subset=[p]).copy()
80
+
81
+ if verbose: log.write(" -Raw dataset non-NA records:",len(df))
82
+ # create unique pair column
44
83
  df["p1p2"]=df.apply(lambda x:"_".join(sorted([x[p1],x[p2]])),axis=1)
45
84
 
85
+ if verbose: log.write("Filling diagnal line and duplicated pair for plotting...")
86
+ # fill na
87
+ df_fill_reverse = df.loc[(df[p2].isin(df[p1].values)) & (df[p1].isin(df[p2].values)),:].copy()
88
+ df_fill_reverse = df_fill_reverse.rename(columns={p1:p2,p2:p1})
89
+
90
+ # fill dia
91
+ df_fill_dia = pd.DataFrame(columns=df.columns)
92
+ p1_dup_list = list(df.loc[(df[p2].isin(df[p1].values)),"p2"].values)
93
+ p2_dup_list = list(df.loc[(df[p1].isin(df[p2].values)),"p1"].values)
94
+ p_dup_list = p2_dup_list + p1_dup_list
95
+ if len(set(p_dup_list)) > 0:
96
+ if verbose: log.write(" -Diagnal records:", len(set(p_dup_list)))
97
+ df_fill_dia["p1"] = p_dup_list
98
+ df_fill_dia["p2"] = df_fill_dia["p1"]
99
+ df_fill_dia["rg"] = 1
100
+
101
+ df_fill_na = pd.DataFrame(columns=df.columns)
102
+ df_fill_na[[p1,p2]] = [(i,j) for i in df[p1].sort_values(ascending=False).drop_duplicates() for j in df[p2].sort_values(ascending=False).drop_duplicates()]
103
+ # fill diagonal
104
+ df = pd.concat([df,df_fill_reverse,df_fill_dia,df_fill_na],ignore_index=True).sort_values(by=p).drop_duplicates(subset=[p1,p2])
105
+ #if verbose: log.write(" -Dataset shape match:", len(df)==)
106
+ #
107
+ ## remove record with p1 = p2, dropna in P column
46
108
  dfp=ldscrg.loc[ldscrg[p1]!=ldscrg[p2],:].dropna(subset=[p]).copy()
109
+
110
+ ## create pair column
47
111
  dfp["p1p2"]=dfp.apply(lambda x:"_".join(sorted([x[p1],x[p2]])),axis=1)
112
+
113
+ ## drop duplicate and keep only unique pairs
48
114
  dfp = dfp.drop_duplicates(subset=["p1p2"]).copy()
49
115
 
50
- if verbose: log.write("Valid unique records:",len(dfp))
51
- if verbose: log.write("Significant correlations after Bonferroni correction:",sum(dfp[p]<0.05/len(dfp)))
116
+ if verbose: log.write("Valid unique trait pairs:",len(dfp))
117
+ if verbose: log.write(" -Valid unique trait1:",dfp["p1"].nunique())
118
+ if verbose: log.write(" -Valid unique trait2:",dfp["p2"].nunique())
119
+ if verbose: log.write(" -Significant correlations with P < 0.05:",sum(dfp[p]<0.05))
120
+ if verbose: log.write(" -Significant correlations after Bonferroni correction:",sum(dfp[p]<(0.05/len(dfp))))
52
121
 
53
- if correction=="fdr":
54
- dfp["fdr_p"]=fdrcorrection(dfp[p],alpha=1)[1]
55
-
56
- dfp["fdr"]=fdrcorrection(dfp[p],alpha=sig_level)[0]
57
- if verbose: log.write("Significant correlations after FDR correction:",sum(dfp["fdr"]))
58
- dfp=dfp.set_index("p1p2").loc[:,"fdr_p"].to_dict()
59
-
60
- else:
61
- dfp=dfp.set_index("p1p2").loc[:,p].to_dict()
62
-
122
+ #if correction=="fdr":
123
+ # fdr corrected p
124
+ dfp["fdr_p"]=fdrcorrection(dfp[p],alpha=1,method=fdr_method)[1]
125
+ # is fdr < sig_level
126
+ dfp["fdr"]=fdrcorrection(dfp[p],alpha=0.05,method=fdr_method)[0]
127
+ if verbose: log.write(" -Significant correlations with FDR <0.05:",sum(dfp["fdr"]))
128
+ # convert to dict for annotation and plotting
129
+ df_rawp = dfp.set_index("p1p2").loc[:,p].to_dict()
130
+ dfp = dfp.set_index("p1p2").loc[:,"fdr_p"].to_dict()
131
+
63
132
  #########ticks dict###########################################
64
133
  dic_p1={}
65
134
  dic_p2={}
135
+
66
136
  dic_p1_r={}
67
137
  dic_p2_r={}
68
138
 
139
+ ## sort position
69
140
  if sort_key is None:
141
+ # alphabetic order
70
142
  for i,p1_name in enumerate(df[p1].sort_values(ascending=False).drop_duplicates()):
71
143
  dic_p1[p1_name] = i
72
144
  dic_p1_r[i] = p1_name
@@ -74,6 +146,7 @@ def plot_rg(ldscrg,
74
146
  dic_p2[p2_name] = i
75
147
  dic_p2_r[i] = p2_name
76
148
  else:
149
+ # user-provided order
77
150
  for i,p1_name in enumerate(df[p1].sort_values(ascending=False,key=sort_key).drop_duplicates()):
78
151
  dic_p1[p1_name] = i
79
152
  dic_p1_r[i] = p1_name
@@ -81,14 +154,17 @@ def plot_rg(ldscrg,
81
154
  dic_p2[p2_name] = i
82
155
  dic_p2_r[i] = p2_name
83
156
 
157
+ # assign coordinate
84
158
  df["y"]=df[p1].map(dic_p1)
85
159
  df["y_x"]=df[p1].map(dic_p2)
86
160
  df["x"]=df[p2].map(dic_p2)
87
161
  df["x_y"]=df[p2].map(dic_p1)
88
162
 
89
- if verbose: log.write("Plotting...")
163
+ if verbose: log.write("Plotting heatmap...")
90
164
  ########ticks###############################################
91
- fig,ax = plt.subplots(dpi=300,**args)
165
+ fig,ax = plt.subplots(**fig_args)
166
+
167
+ # configure x/y ticks
92
168
  xticks=df["x"].sort_values().drop_duplicates().astype(int)
93
169
  yticks=df["y"].sort_values().drop_duplicates().astype(int)
94
170
  ax.xaxis.tick_top()
@@ -103,88 +179,145 @@ def plot_rg(ldscrg,
103
179
  ax.tick_params('both', length=0, width=0, which='minor')
104
180
 
105
181
  #labels
106
- ax.set_yticklabels(yticks.map(dic_p1_r),fontsize=15)
107
- ax.set_xticklabels(xticks.map(dic_p2_r),rotation=45,horizontalalignment="left", verticalalignment="bottom",fontsize=15)
108
-
109
- width_max=1
182
+ ax.set_yticklabels(yticks.map(dic_p1_r),**yticklabel_args)
110
183
 
184
+ ax.set_xticklabels(xticks.map(dic_p2_r),**xticklabel_args)
111
185
 
112
186
  #########patches###########################################
113
187
 
114
188
  squares=[]
115
- panno=[]
189
+ panno_list={1:{},2:{}}
116
190
  rgtoanno=[]
117
- maxsigp=sig_level
118
191
 
119
- #if correction=="fdr":
120
- # if len(df.loc[df["fdr"]==True,p])>=1:
121
- # maxsigp = df.loc[df["fdr"]==True,p].max()*1.0001
122
- #
123
- # else:
124
- # maxsigp = sig_level/len(df.dropna(subset=[p]))
125
- if correction=="fdr":
126
- p="fdr_p"
127
-
128
-
192
+ if verbose: log.write("Full cell : {}-corrected P == {}".format(full_cell[0],full_cell[1]))
193
+
129
194
  for i,row in df.iterrows():
130
195
  xcenter=row["x"]
131
196
  ycenter=row["y"]
132
- if row[p1]==row[p2]:
197
+
198
+ if np.isnan(row[rg]):
133
199
  width=1
134
200
  x=xcenter-width/2
135
201
  y=ycenter-width/2
136
- rgba = conver_rg_to_color(1,cmap)
137
-
138
- else:
139
- adjusted_p = dfp["_".join(sorted([row[p1],row[p2]]))]
140
- if adjusted_p<0.05 and square is True:
141
- if xcenter + ycenter < len(df[p1].unique()):
142
- panno.append([xcenter,ycenter,adjusted_p])
143
- elif adjusted_p<0.05:
144
- panno.append([xcenter,ycenter,adjusted_p])
145
-
146
- width= convert_p_to_width(adjusted_p,sig_level)
147
- x=xcenter-width/2
148
- y=ycenter-width/2
149
- rgba = conver_rg_to_color(row[rg],cmap)
150
-
151
- if xcenter + ycenter > len(df[p1].unique())-1 and (square is True) and (rganno is True):
152
- rgtoanno.append([xcenter,ycenter,row[rg],rgba])
202
+ ax.plot([x,x+width],[y,y+width],c="grey")
203
+ ax.plot([x,x+width],[y+width,y],c="grey")
153
204
 
154
- if xcenter + ycenter < len(df[p1].unique()) and (square is True) and (rganno is True):
155
- squares.append(patches.Rectangle((x,y),width=width,height=width,fc=rgba,ec="white",lw=0))
156
- elif (square is not True):
157
- squares.append(patches.Rectangle((x,y),width=width,height=width,fc=rgba,ec="white",lw=0))
158
-
159
-
160
-
205
+ else:
206
+ if row[p1]==row[p2]:
207
+ # diagonal line
208
+ width=1
209
+ x=xcenter-width/2
210
+ y=ycenter-width/2
211
+ rgba = convert_rg_to_color(1,cmap)
212
+ else:
213
+ # get the adjusted p value from dict
214
+ if xcenter + ycenter < len(df[p1].unique()):
215
+ panno_set=1
216
+ else:
217
+ panno_set=2
218
+ for i,correction in enumerate(corrections):
219
+ for j,sig_level in enumerate(sig_levels):
220
+
221
+ index = len(sig_levels)*i + j
222
+
223
+ p1p2="_".join(sorted([row[p1],row[p2]]))
224
+
225
+ raw_p = df_rawp[p1p2]
226
+
227
+ if correction in ["B","bonferroni ","bon","Bon","b"]:
228
+ current_threhold = sig_level/len(dfp)
229
+ if raw_p < current_threhold:
230
+ panno_list[panno_set][p1p2] = [xcenter,ycenter,raw_p,"bon",panno_texts[index]]
231
+
232
+ elif correction in ["fdr","FDR","F","f"]:
233
+ adjusted_p = dfp[p1p2]
234
+ if adjusted_p < sig_level and square is True:
235
+ #if square is True, only annotate half
236
+ if xcenter + ycenter < len(df[p1].unique()):
237
+ panno_list[panno_set][p1p2]=[xcenter,ycenter,adjusted_p,"fdr",panno_texts[index]]
238
+ elif adjusted_p < sig_level:
239
+ panno_list[panno_set][p1p2]=[xcenter,ycenter,adjusted_p,"fdr",panno_texts[index]]
240
+
241
+ elif correction == "non":
242
+ if raw_p < sig_level:
243
+ panno_list[panno_set][p1p2]=[xcenter,ycenter,"raw",raw_p,panno_texts[index]]
244
+
245
+ # configuring the square
246
+ if full_cell[0] == "fdr":
247
+ width= convert_p_to_width(adjusted_p,full_cell[1])
248
+ elif full_cell[0] == "bon":
249
+ width= convert_p_to_width(raw_p*len(dfp),full_cell[1])
250
+ else:
251
+ width= convert_p_to_width(raw_p,full_cell[1])
252
+
253
+ x=xcenter-width/2
254
+ y=ycenter-width/2
255
+ rgba = convert_rg_to_color(row[rg],cmap)
256
+ if xcenter + ycenter > len(df[p1].unique())-1 and (square is True) and (rganno == "half"):
257
+ rgtoanno.append([xcenter,ycenter,row[rg],rgba])
258
+ elif "full" in rganno:
259
+ rgtoanno.append([xcenter,ycenter,row[rg],rgba])
260
+
261
+ #if xcenter + ycenter < len(df[p1].unique()) and (square is True) and (rganno == "half"):
262
+ # squares.append(patches.Rectangle((x,y),width=width,height=width,fc=rgba,ec="white",lw=0))
263
+ #elif (square is not True):
264
+ if ("nb" not in rganno):
265
+ if rganno == "half":
266
+ if xcenter + ycenter < len(df[p1].unique()) and (square is True):
267
+ squares.append(patches.Rectangle((x,y),width=width,height=width,fc=rgba,ec="white",lw=0))
268
+ else:
269
+ squares.append(patches.Rectangle((x,y),width=width,height=width,fc=rgba,ec="white",lw=0))
161
270
 
162
271
  squares_collection = matplotlib.collections.PatchCollection(squares,match_original=True)
163
272
  ax.add_collection(squares_collection)
164
273
 
165
274
  if rganno is not False:
275
+ rganno_default_args = {"weight":"bold","ha":"center", "va":"center", "fontfamily":"Arial","fontsize":fontsize}
276
+ for key, value in rganno_args.items():
277
+ rganno_default_args[key] = value
166
278
  for i in rgtoanno:
167
279
  if i[2]>1: i[2]=1
168
280
  if i[2]<-1: i[2]=-1
169
- ax.text(i[0],i[1],"{:.3f}".format(i[2]),color=i[3],weight="bold",ha="center", va="center",font="Arial")
170
-
171
-
281
+ if "color" in rganno_default_args.keys() or "c" in rganno_default_args.keys():
282
+ ax.text(i[0],i[1],"{:.3f}".format(i[2]),**rganno_default_args)
283
+ else:
284
+ ax.text(i[0],i[1],"{:.3f}".format(i[2]),color=i[3],**rganno_default_args)
172
285
 
173
- panno_default_args={"size":asize,"color":"white","weight":"bold","ha":"center","va":"center","font":"Arial"}
286
+ # configure args for p annotation
287
+ panno_default_args={"size":asize,"color":"white","weight":"bold","horizontalalignment":"center","verticalalignment":"center_baseline","font":"Arial"}
174
288
  if panno_args is not None:
175
289
  for key, value in panno_args.items():
176
290
  panno_default_args[key] = value
177
291
 
178
- for i in panno:
179
- if i[2]<sig_level/len(dfp):
180
- ax.text(i[0],i[1],"**", **panno_default_args)
181
- else:
182
- ax.text(i[0],i[1],"*", **panno_default_args)
183
-
292
+ # annotate p
293
+ if panno is True:
294
+ if verbose: log.write("P value annotation text : ")
295
+ for i,correction in enumerate(corrections):
296
+ for j,sig_level in enumerate(sig_levels):
297
+ index = len(sig_levels)*i + j
298
+ if verbose: log.write(" -{} : {}-corrected P < {}".format(panno_texts[index], correction, sig_level))
299
+ for panno_set_number in panno_list.keys():
300
+ for key, i in panno_list[panno_set_number].items():
301
+ if panno_set_number == 1:
302
+ ax.text(i[0],i[1]-0.1,i[4], **panno_default_args)
303
+ else:
304
+ ax.text(i[0],i[1]-0.1,i[4], **panno_default_args)
184
305
 
185
306
  ## color bar ###############################################
186
307
  norm = matplotlib.colors.Normalize(vmin=-1, vmax=1)
187
- fig.colorbar(matplotlib.cm.ScalarMappable(norm=norm, cmap=cmap), ax=ax, **colorbarargs)
308
+ fig.colorbar(matplotlib.cm.ScalarMappable(norm=norm, cmap=cmap), ax=ax, **colorbar_args)
188
309
 
189
- return fig,ax,log
310
+ if equal_aspect is True:
311
+ ax.set_aspect('equal', adjustable='box')
312
+
313
+ if save:
314
+ if verbose: log.write("Saving plot:")
315
+ if save==True:
316
+ fig.savefig("./ldscrg_heatmap.png",bbox_inches="tight",**save_args)
317
+ log.write(" -Saved to "+ "./ldscrg_heatmap.png" + " successfully!" )
318
+ else:
319
+ fig.savefig(save,bbox_inches="tight",**save_args)
320
+ log.write(" -Saved to "+ save + " successfully!" )
321
+ if verbose: log.write("Finished creating ldsc genetic correlation heatmap!")
322
+ return fig,ax,log,df
190
323
 
gwaslab/regionalplot.py CHANGED
@@ -10,6 +10,7 @@ from gwaslab.CommonData import get_chr_to_number
10
10
  from gwaslab.CommonData import get_number_to_chr
11
11
  from gwaslab.CommonData import get_recombination_rate
12
12
  from gwaslab.CommonData import get_gtf
13
+ from gwaslab.retrievedata import check_vcf_chr_prefix
13
14
  from pyensembl import EnsemblRelease
14
15
  from allel import GenotypeArray
15
16
  from allel import read_vcf
@@ -34,7 +35,7 @@ def _plot_regional(
34
35
  chrom_df,
35
36
  xtick_chr_dict,
36
37
  cut_line_color,
37
- vcf_chr_dict = get_number_to_chr(),
38
+ vcf_chr_dict = None,
38
39
  gtf_path="default",
39
40
  gtf_chr_dict = get_number_to_chr(),
40
41
  gtf_gene_name=None,
@@ -65,7 +66,17 @@ def _plot_regional(
65
66
  pos="POS",
66
67
  verbose=True,
67
68
  log=Log()
68
- ):
69
+ ):
70
+ if vcf_path is not None:
71
+ if vcf_chr_dict is None:
72
+ if verbose: log.write(" -Checking prefix for chromosomes in vcf files..." )
73
+ prefix = check_vcf_chr_prefix(vcf_path)
74
+ if prefix is not None:
75
+ if verbose: log.write(" -Prefix for chromosomes: ",prefix)
76
+ vcf_chr_dict = get_number_to_chr(prefix=prefix)
77
+ else:
78
+ if verbose: log.write(" -No prefix for chromosomes." )
79
+ vcf_chr_dict = get_number_to_chr()
69
80
 
70
81
  # if regional plot : pinpoint lead , add color bar ##################################################
71
82
  if (region is not None) :
@@ -231,11 +242,17 @@ def _get_lead_id(sumstats, region_ref, log):
231
242
  if len(lead_id)>0:
232
243
  lead_id = int(lead_id[0])
233
244
  if region_ref is not None:
234
- log.write(" -Lead variant ID: {} - {}".format(region_ref, lead_id))
245
+ if type(lead_id) is list:
246
+ if len(lead_id)==0 :
247
+ log.write(" -WARNING: {} not found. Roll back to lead variant...".format(region_ref))
248
+ lead_id = sumstats["scaled_P"].idxmax()
249
+ else:
250
+ log.write(" -Reference variant ID: {} - {}".format(region_ref, lead_id))
235
251
 
236
252
  if lead_id is None:
237
253
  log.write(" -Extracting lead variant...")
238
254
  lead_id = sumstats["scaled_P"].idxmax()
255
+
239
256
  return lead_id
240
257
 
241
258
  def _pinpoint_lead(sumstats,ax1,region_ref, region_ld_threshold, region_ld_colors, marker_size, log):
@@ -464,6 +481,7 @@ def _plot_gene_track(
464
481
  def process_vcf(sumstats, vcf_path, region,region_ref, region_ref2, log, verbose, pos ,nea,ea, region_ld_threshold, vcf_chr_dict,tabix):
465
482
  if verbose: log.write("Start to load reference genotype...")
466
483
  if verbose: log.write(" -reference vcf path : "+ vcf_path)
484
+
467
485
  # load genotype data of the targeted region
468
486
  ref_genotype = read_vcf(vcf_path,region=vcf_chr_dict[region[0]]+":"+str(region[1])+"-"+str(region[2]),tabix=tabix)
469
487
  if ref_genotype is None:
gwaslab/retrievedata.py CHANGED
@@ -290,7 +290,7 @@ def assign_rsid_single(sumstats,path,rsid="rsID",chr="CHR",pos="POS",ref="NEA",a
290
290
 
291
291
  def parallelizeassignrsid(sumstats, path, ref_mode="vcf",snpid="SNPID",rsid="rsID",chr="CHR",pos="POS",ref="NEA",alt="EA",status="STATUS",
292
292
  n_cores=1,chunksize=5000000,ref_snpid="SNPID",ref_rsid="rsID",
293
- overwrite="empty",verbose=True,log=Log(),chr_dict=get_number_to_chr()):
293
+ overwrite="empty",verbose=True,log=Log(),chr_dict=None):
294
294
  '''
295
295
  overwrite mode :
296
296
  all , overwrite rsid for all availalbe rsid
@@ -303,7 +303,12 @@ def parallelizeassignrsid(sumstats, path, ref_mode="vcf",snpid="SNPID",rsid="rsI
303
303
  if verbose: log.write(" -Current Dataframe shape :",len(sumstats)," x ", len(sumstats.columns))
304
304
  if verbose: log.write(" -CPU Cores to use :",n_cores)
305
305
  if verbose: log.write(" -Reference VCF file:", path)
306
+
307
+ chr_dict = auto_check_vcf_chr_dict(path, chr_dict, verbose, log)
308
+
306
309
  if verbose: log.write(" -Assigning rsID based on chr:pos and ref:alt/alt:ref...")
310
+
311
+
307
312
  ##############################################
308
313
  if rsid not in sumstats.columns:
309
314
  sumstats[rsid]=pd.Series(dtype="string")
@@ -476,11 +481,13 @@ def check_indel(sumstats,ref_infer,ref_alt_freq=None,chr="CHR",pos="POS",ref="NE
476
481
 
477
482
  def parallelinferstrand(sumstats,ref_infer,ref_alt_freq=None,maf_threshold=0.40,remove_snp="",mode="pi",n_cores=1,remove_indel="",
478
483
  chr="CHR",pos="POS",ref="NEA",alt="EA",eaf="EAF",status="STATUS",
479
- chr_dict=get_number_to_chr(),verbose=True,log=Log()):
484
+ chr_dict=None,verbose=True,log=Log()):
480
485
  if verbose: log.write("Start to infer strand for palindromic SNPs...")
481
486
  if verbose: log.write(" -Current Dataframe shape :",len(sumstats)," x ", len(sumstats.columns))
482
487
  if verbose: log.write(" -Reference vcf file:", ref_infer)
483
-
488
+
489
+ chr_dict = auto_check_vcf_chr_dict(ref_infer, chr_dict, verbose, log)
490
+
484
491
  # check if the columns are complete
485
492
  if not ((chr in sumstats.columns) and (pos in sumstats.columns) and (ref in sumstats.columns) and (alt in sumstats.columns) and (status in sumstats.columns)):
486
493
  raise ValueError("Not enough information: CHR, POS, NEA , EA, ALT, STATUS...")
@@ -601,13 +608,16 @@ def parallelinferstrand(sumstats,ref_infer,ref_alt_freq=None,maf_threshold=0.40,
601
608
 
602
609
 
603
610
  ################################################################################################################
604
- def parallelecheckaf(sumstats,ref_infer,ref_alt_freq=None,maf_threshold=0.4,n_cores=1,chr="CHR",pos="POS",ref="NEA",alt="EA",eaf="EAF",status="STATUS",chr_dict=get_number_to_chr(),force=False, verbose=True,log=Log()):
611
+ def parallelecheckaf(sumstats,ref_infer,ref_alt_freq=None,maf_threshold=0.4,column_name="DAF",suffix="",n_cores=1, chr="CHR",pos="POS",ref="NEA",alt="EA",eaf="EAF",status="STATUS",chr_dict=None,force=False, verbose=True,log=Log()):
605
612
 
606
613
  if verbose: log.write("Start to check the difference between EAF and refence vcf alt frequency ...")
607
614
  if verbose: log.write(" -Current Dataframe shape :",len(sumstats)," x ", len(sumstats.columns))
608
615
  if verbose: log.write(" -Reference vcf file:", ref_infer)
609
616
  if verbose: log.write(" -CPU Cores to use :",n_cores)
617
+
618
+ chr_dict = auto_check_vcf_chr_dict(ref_infer, chr_dict, verbose, log)
610
619
 
620
+ column_name = column_name + suffix
611
621
  # check if the columns are complete
612
622
  if not ((chr in sumstats.columns) and (pos in sumstats.columns) and (ref in sumstats.columns) and (alt in sumstats.columns) and (status in sumstats.columns)):
613
623
  raise ValueError("Not enough information: CHR, POS, NEA , EA, ALT, STATUS...")
@@ -618,7 +628,7 @@ def parallelecheckaf(sumstats,ref_infer,ref_alt_freq=None,maf_threshold=0.4,n_co
618
628
  if not force:
619
629
  good_chrpos = sumstats[status].str.match(r'\w\w\w[0]\w\w\w', case=False, flags=0, na=False)
620
630
  if verbose: log.write(" -Checking variants:", sum(good_chrpos))
621
- sumstats["DAF"]=np.nan
631
+ sumstats[column_name]=np.nan
622
632
 
623
633
  ########################
624
634
  if sum(~sumstats[eaf].isna())<10000:
@@ -626,8 +636,8 @@ def parallelecheckaf(sumstats,ref_infer,ref_alt_freq=None,maf_threshold=0.4,n_co
626
636
  df_split = np.array_split(sumstats.loc[good_chrpos,[chr,pos,ref,alt,eaf]], n_cores)
627
637
  pool = Pool(n_cores)
628
638
  if sum(~sumstats[eaf].isna())>0:
629
- map_func = partial(checkaf,chr=chr,pos=pos,ref=ref,alt=alt,eaf=eaf,ref_infer=ref_infer,ref_alt_freq=ref_alt_freq,chr_dict=chr_dict)
630
- sumstats.loc[good_chrpos,["DAF"]] = pd.concat(pool.map(map_func,df_split))
639
+ map_func = partial(checkaf,chr=chr,pos=pos,ref=ref,alt=alt,eaf=eaf,ref_infer=ref_infer,ref_alt_freq=ref_alt_freq,column_name=column_name,chr_dict=chr_dict)
640
+ sumstats.loc[good_chrpos,[column_name]] = pd.concat(pool.map(map_func,df_split))
631
641
  pool.close()
632
642
  pool.join()
633
643
  ###########################
@@ -635,24 +645,24 @@ def parallelecheckaf(sumstats,ref_infer,ref_alt_freq=None,maf_threshold=0.4,n_co
635
645
 
636
646
  #sumstats.loc[good_chrpos,"DAF"] = status_inferred.values
637
647
  #sumstats.loc[:,"DAF"]=sumstats.loc[:,"DAF"].astype("float")
638
- if verbose: log.write(" - DAF min:", np.nanmax(sumstats.loc[:,"DAF"]))
639
- if verbose: log.write(" - DAF max:", np.nanmin(sumstats.loc[:,"DAF"]))
640
- if verbose: log.write(" - abs(DAF) min:", np.nanmax(np.abs(sumstats.loc[:,"DAF"])))
641
- if verbose: log.write(" - abs(DAF) max:", np.nanmin(np.abs(sumstats.loc[:,"DAF"])))
642
- if verbose: log.write(" - DAF sd:", np.nanstd(sumstats.loc[:,"DAF"]))
643
- if verbose: log.write(" - abs(DAF) sd:", np.nanstd(np.abs(sumstats.loc[:,"DAF"])))
644
-
648
+ if verbose: log.write(" - {} min:".format(column_name), np.nanmax(sumstats.loc[:,column_name]))
649
+ if verbose: log.write(" - {} max:".format(column_name), np.nanmin(sumstats.loc[:,column_name]))
650
+ if verbose: log.write(" - {} sd:".format(column_name), np.nanstd(sumstats.loc[:,column_name]))
651
+ if verbose: log.write(" - abs({}) min:".format(column_name), np.nanmin(np.abs(sumstats.loc[:,column_name])))
652
+ if verbose: log.write(" - abs({}) max:".format(column_name), np.nanmax(np.abs(sumstats.loc[:,column_name])))
653
+ if verbose: log.write(" - abs({}) sd:".format(column_name), np.nanstd(np.abs(sumstats.loc[:,column_name])))
654
+ if verbose: log.write("Finished allele frequency checking!")
645
655
  return sumstats
646
656
 
647
- def checkaf(sumstats,ref_infer,ref_alt_freq=None,chr="CHR",pos="POS",ref="NEA",alt="EA",eaf="EAF",chr_dict=None):
657
+ def checkaf(sumstats,ref_infer,ref_alt_freq=None,column_name="DAF",chr="CHR",pos="POS",ref="NEA",alt="EA",eaf="EAF",chr_dict=None):
648
658
  #vcf_reader = vcf.Reader(open(ref_infer, 'rb'))
649
659
  vcf_reader = VariantFile(ref_infer)
650
660
  def afapply(x,vcf,alt_freq,chr_dict):
651
661
  return check_daf(x[0],x[1]-1,x[1],x[2],x[3],x[4],vcf_reader,ref_alt_freq,chr_dict)
652
662
  map_func = partial(afapply,vcf=vcf_reader,alt_freq=ref_alt_freq,chr_dict=chr_dict)
653
663
  status_inferred = sumstats.apply(map_func,axis=1)
654
- sumstats.loc[:,"DAF"] = status_inferred.values
655
- sumstats.loc[:,"DAF"]=sumstats.loc[:,"DAF"].astype("float")
664
+ sumstats.loc[:,column_name] = status_inferred.values
665
+ sumstats.loc[:,column_name]=sumstats.loc[:,column_name].astype("float")
656
666
  return sumstats
657
667
 
658
668
  def check_daf(chr,start,end,ref,alt,eaf,vcf_reader,alt_freq,chr_dict=None):
@@ -665,4 +675,25 @@ def check_daf(chr,start,end,ref,alt,eaf,vcf_reader,alt_freq,chr_dict=None):
665
675
  return eaf - record.info[alt_freq][0]
666
676
  return np.nan
667
677
  ################################################################################################################
668
- ################################################################################################################
678
+ ################################################################################################################
679
+ def auto_check_vcf_chr_dict(vcf_path, vcf_chr_dict, verbose, log):
680
+ if vcf_path is not None:
681
+ if vcf_chr_dict is None:
682
+ if verbose: log.write(" -Checking prefix for chromosomes in vcf files..." )
683
+ prefix = check_vcf_chr_prefix(vcf_path)
684
+ if prefix is not None:
685
+ if verbose: log.write(" -Prefix for chromosomes: ",prefix)
686
+ vcf_chr_dict = get_number_to_chr(prefix=prefix)
687
+ else:
688
+ if verbose: log.write(" -No prefix for chromosomes in the VCF files." )
689
+ vcf_chr_dict = get_number_to_chr()
690
+ return vcf_chr_dict
691
+
692
+ def check_vcf_chr_prefix(vcf_bcf_path):
693
+ vcf_bcf = VariantFile(vcf_bcf_path)
694
+ for i in list(vcf_bcf.header.contigs):
695
+ m = re.search('(chr|Chr|CHR)([0-9xXyYmM]+)', i)
696
+ if m is not None:
697
+ return m.group(1)
698
+ else:
699
+ return None
gwaslab/to_pickle.py CHANGED
@@ -1,5 +1,6 @@
1
1
  import pickle
2
2
  import os
3
+ import gc
3
4
  from gwaslab.Log import Log
4
5
 
5
6
  def dump_pickle(glsumstats,path="~/mysumstats.pickle",overwrite=False):
@@ -20,3 +21,14 @@ def load_pickle(path):
20
21
  return glsumstats
21
22
  else:
22
23
  Log().write("File not exists : ", path)
24
+
25
+ def load_data_from_pickle(path,usecols=None):
26
+ data = load_pickle(path).data
27
+ existing_cols = []
28
+ if usecols is not None:
29
+ for i in usecols:
30
+ if i in data.columns:
31
+ existing_cols.append(i)
32
+ data = data.loc[:,existing_cols]
33
+ gc.collect()
34
+ return data
gwaslab/trumpetplot.py ADDED
File without changes
gwaslab/version.py CHANGED
@@ -2,13 +2,13 @@ from gwaslab.Log import Log
2
2
 
3
3
  def _show_version(log=Log()):
4
4
  # show when loading sumstats
5
- log.write("GWASLab version 3.4.14 https://cloufield.github.io/gwaslab/")
5
+ log.write("GWASLab version 3.4.15 https://cloufield.github.io/gwaslab/")
6
6
  log.write("(C) 2022-2023, Yunye He, Kamatani Lab, MIT License, gwaslab@gmail.com")
7
7
 
8
8
  def gwaslab_info():
9
9
  # for output header
10
10
  dic={
11
- "version":"3.4.14",
12
- "release_date":"20230609"
11
+ "version":"3.4.15",
12
+ "release_date":"20230620"
13
13
  }
14
14
  return dic
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.1
2
2
  Name: gwaslab
3
- Version: 3.4.14
3
+ Version: 3.4.16
4
4
  Summary: A collection of handy tools for GWAS SumStats
5
5
  Author-email: Yunye <yunye@gwaslab.com>
6
6
  Project-URL: Homepage, https://cloufield.github.io/gwaslab/
@@ -45,7 +45,7 @@ Note: GWASLab is being updated very frequently for now. I will release the first
45
45
  ## Install
46
46
 
47
47
  ```
48
- pip install gwaslab==3.4.13
48
+ pip install gwaslab==3.4.15
49
49
  ```
50
50
 
51
51