gwaslab-3.4.35-py3-none-any.whl → gwaslab-3.4.37-py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.


@@ -3,31 +3,40 @@ import os
 import numpy as np
 from gwaslab.g_Log import Log
 
-def process_ref_vcf(vcf, directory=None, chr_dict=None, group_size=20000000,complevel=9,chunksize=20000000,log=Log()):
+def process_vcf_to_hfd5(vcf,
+                        directory=None,
+                        chr_dict=None,
+                        group_size=20000000,
+                        complevel=9,
+                        chunksize=20000000,
+                        log=Log(),
+                        verbose=True):
+
     #load vcf
-    log.write("Start processing VCF files:")
-    log.write(" -Reference VCF path:{}".format(vcf))
-    log.write(" -Output group size:{}".format(group_size))
-    log.write(" -Compression level:{}".format(complevel))
-    log.write(" -Loading chunksize:{}".format(chunksize))
+    log.write("Start to process VCF file to HDF5:", verbose=verbose)
+    log.write(" -Reference VCF path:{}".format(vcf), verbose=verbose)
+    log.write(" -Output group size:{}".format(group_size), verbose=verbose)
+    log.write(" -Compression level:{}".format(complevel), verbose=verbose)
+    log.write(" -Loading chunksize:{}".format(chunksize), verbose=verbose)
 
+    vcf_file_name = os.path.basename(vcf)
+    vcf_dir_path = os.path.dirname(vcf)
+
     if directory is None:
-        directory="./"
-
+        directory = vcf_dir_path
     elif directory[-1] == "/":
         directory = directory.rstrip('/')
 
-    h5_path = "{}/rsID_CHR_POS_groups_{}.h5".format(directory,int(group_size))
-    log_path = "{}/rsID_CHR_POS_groups_{}.log".format(directory,int(group_size))
-    log.write(" -HDF5 Output path: {}".format(h5_path))
-    log.write(" -Log output path: {}".format(log_path))
+    h5_path = "{}/{}.rsID_CHR_POS_groups_{}.h5".format(directory,vcf_file_name,int(group_size))
+    log_path = "{}/{}.rsID_CHR_POS_groups_{}.log".format(directory,vcf_file_name, int(group_size))
+    log.write(" -HDF5 Output path: {}".format(h5_path), verbose=verbose)
+    log.write(" -Log output path: {}".format(log_path), verbose=verbose)
 
     df = pd.read_table(vcf,comment="#",usecols=[0,1,2],header=None,chunksize=chunksize)
 
-
-    log.write(" -Processing chunk: ",end="")
+    log.write(" -Processing chunk: ",end="", verbose=verbose)
 
     for index,chunk in enumerate(df):
-        log.write(index,end=" ",show_time=False)
+        log.write(index,end=" ",show_time=False, verbose=verbose)
         chunk = chunk.rename(columns={0:"CHR",1:"POS",2:"rsn"})
         if chr_dict is not None:
             chunk["CHR"] = chunk["CHR"].map(chr_dict)
@@ -47,5 +56,5 @@ def process_ref_vcf(vcf, directory=None, chr_dict=None, group_size=20000000,comp
                 dropna=True,
                 format="table",
                 complevel=complevel)
-    log.write("Processing finished!")
-    log.save(log_path, verbose=False)
+    log.write("Processing finished!", verbose=verbose)
+    log.save(log_path, verbose=verbose)
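
Note on this first file: besides the rename from process_ref_vcf to process_vcf_to_hfd5 (the "hfd5" spelling is what the release ships) and the new verbose flag threaded through every log call, the default output location moves from the current working directory to the directory of the input VCF, and both output files are now prefixed with the VCF file name, so indexes built from different reference VCFs no longer collide. A minimal sketch of the new path logic; _resolve_output_paths is a hypothetical helper extracted here only for illustration:

    import os

    def _resolve_output_paths(vcf, directory=None, group_size=20000000):
        # mirrors the diff: default to the VCF's own directory,
        # and prefix outputs with the VCF file name
        vcf_file_name = os.path.basename(vcf)
        if directory is None:
            directory = os.path.dirname(vcf)
        elif directory[-1] == "/":
            directory = directory.rstrip("/")
        h5_path = "{}/{}.rsID_CHR_POS_groups_{}.h5".format(directory, vcf_file_name, int(group_size))
        log_path = "{}/{}.rsID_CHR_POS_groups_{}.log".format(directory, vcf_file_name, int(group_size))
        return h5_path, log_path

    # _resolve_output_paths("/data/ref.vcf.gz")
    # -> ("/data/ref.vcf.gz.rsID_CHR_POS_groups_20000000.h5",
    #     "/data/ref.vcf.gz.rsID_CHR_POS_groups_20000000.log")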
@@ -4,13 +4,13 @@ import scipy.stats as ss
 from scipy import stats
 from gwaslab.g_Log import Log
 import gc
-from gwaslab.qc_fix_sumstats import sortcolumn
+#from gwaslab.qc_fix_sumstats import sortcolumn
 from gwaslab.g_version import _get_version
 from gwaslab.qc_check_datatype import check_datatype
 
 def filldata(
     sumstats,
-    to_fill=[],
+    to_fill=None,
     df=None,
     overwrite=False,
     verbose=True,
@@ -38,7 +38,7 @@ def filldata(
     for i in skip_cols:
         to_fill.remove(i)
     if verbose: log.write(" -Skipping columns: ",skip_cols)
-    if len(set(to_fill) & set(["OR","OR95L","OR95U","BETA","SE","P","Z","CHI2","MLOG10P"]))==0:
+    if len(set(to_fill) & set(["OR","OR_95L","OR_95U","BETA","SE","P","Z","CHISQ","MLOG10P","MAF"]))==0:
         log.write(" -No available columns to fill. Skipping.", verbose=verbose)
         log.write("Finished filling data using existing columns.", verbose=verbose)
         return sumstats
@@ -46,7 +46,7 @@ def filldata(
     fill_iteratively(sumstats,to_fill,log,only_sig,df,extreme,verbose,sig_level)
 
     # ###################################################################################
-    sumstats = sortcolumn(sumstats, verbose=verbose, log=log)
+    #sumstats = sortcolumn(sumstats, verbose=verbose, log=log)
     gc.collect()
     if verbose: log.write("Finished filling data using existing columns.")
     return sumstats
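
The to_fill=[] → to_fill=None change in filldata fixes a classic Python pitfall: a mutable default argument is created once at function-definition time and shared by every call, and filldata mutates the list (to_fill.remove(i) in the hunk above), so repeated calls without an explicit to_fill would see state left over from earlier calls. A hypothetical demonstration of the failure mode, not gwaslab code:

    def bad(items=[]):          # one shared list for every call
        items.append("P")
        return items

    bad()   # ['P']
    bad()   # ['P', 'P']  <- leftover state from the first call

    def good(items=None):       # the pattern the diff switches to
        if items is None:
            items = []
        items.append("P")
        return items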
@@ -224,12 +224,12 @@ def fill_extreme_mlog10(sumstats, z):
     return sumstats
 
 ####################################################################################################################
-def fill_iteratively(sumstats,to_fill,log,only_sig,df,extreme,verbose,sig_level):
+def fill_iteratively(sumstats,raw_to_fill,log,only_sig,df,extreme,verbose,sig_level):
+    to_fill = raw_to_fill.copy()
     if verbose: log.write(" - Filling Columns iteratively...")
-    filled=[]
-    previous_count=0
+
     filled_count=0
-    for i in range(len(to_fill)):
+    for i in range(len(to_fill)+1):
         # beta to or ####################################################################################################
         if "OR" in to_fill:
             status, filled_count = fill_or(sumstats,log,verbose=verbose,filled_count=filled_count)
@@ -269,9 +269,47 @@ def fill_iteratively(sumstats,to_fill,log,only_sig,df,extreme,verbose,sig_level)
         else:
             status,filled_count = fill_mlog10p(sumstats,log,verbose=verbose)
             if status == 1 : to_fill.remove("MLOG10P")
-
-        previous_count+=filled_count
-        if previous_count == filled_count:
+
+        if filled_count == 0:
             break
 
-
+###Base functions########################################################################################
+
+def _convert_betase_to_z(beta, se):
+    return beta/se
+
+def _convert_betase_to_p(beta, se):
+    z = _convert_betase_to_z(beta, se)
+    p = _convert_z_to_p(z)
+    return p
+
+def _convert_betase_to_mlog10p(beta, se):
+    z = _convert_betase_to_z(beta, se)
+    mlog10p = _convert_z_to_mlog10p(z)
+    return mlog10p
+
+def _convert_p_to_chisq(p):
+    return ss.chi2.isf(p, 1)
+
+def _convert_z_to_chisq(z):
+    return (z)**2
+
+def _convert_z_to_p(z):
+    return ss.chi2.sf(z**2,1)
+
+def _convert_z_to_mlog10p(z):
+    log_pvalue = np.log(2) + ss.norm.logsf(np.abs(z)) #two-sided
+    mlog10p = log_pvalue/np.log(10)
+    return -mlog10p
+
+def _conver_chisq_to_p(chisq):
+    return ss.chi2.sf(chisq,1)
+
+def _convert_mlog10p_to_p(mlog10p):
+    return np.power(10, -mlog10p)
+
+def _convert_or_to_beta(OR):
+    return np.log(OR)
+
+def _convert_beta_to_or(beta):
+    return np.exp(beta)
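
Two things are happening in the fill_iteratively hunks. First, the function now copies raw_to_fill instead of mutating the caller's list, runs at most len(to_fill)+1 passes, and stops as soon as a pass fills nothing (filled_count == 0), replacing the old previous_count bookkeeping. Second, among the new base conversion helpers, _convert_z_to_mlog10p is the numerically careful one: it computes the two-sided p-value in log space, so extreme Z scores that would underflow to p = 0.0 in float64 still yield a finite MLOG10P. A quick check of the identity it relies on:

    import numpy as np
    import scipy.stats as ss

    z = 40.0
    p_naive = 2 * ss.norm.sf(abs(z))   # underflows to 0.0 in float64
    mlog10p = -(np.log(2) + ss.norm.logsf(abs(z))) / np.log(10)
    # mlog10p is roughly 349 -- still usable where p itself is not representable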
@@ -159,63 +159,64 @@ def _quick_assign_i(sumstats, chrom="CHR",pos="POS"):
     return sumstats, chrom_df
 
 def _quick_assign_i_with_rank(sumstats, chrpad, use_rank=False, chrom="CHR",pos="POS",drop_chr_start=False,_posdiccul=None):
-    sumstats = sumstats.sort_values([chrom,pos])
-    if use_rank is True:
-        sumstats["_POS_RANK"] = sumstats.groupby(chrom)[pos].rank("dense", ascending=True)
-        pos="_POS_RANK"
-    sumstats["_ID"]=range(len(sumstats))
-    sumstats=sumstats.set_index("_ID")
-
-    #create a df , groupby by chromosomes , and get the maximum position
-    if use_rank is True:
-        posdic = sumstats.groupby(chrom)["_POS_RANK"].max()
-    else:
-        posdic = sumstats.groupby(chrom)[pos].max()
+    # align all variants on a single axis (i)
+    sumstats = sumstats.sort_values([chrom,pos])
+    if use_rank is True:
+        sumstats["_POS_RANK"] = sumstats.groupby(chrom)[pos].rank("dense", ascending=True)
+        pos="_POS_RANK"
+    sumstats["_ID"]=range(len(sumstats))
+    sumstats=sumstats.set_index("_ID")
+
+    #create a df , groupby by chromosomes , and get the maximum position
+    if use_rank is True:
+        posdic = sumstats.groupby(chrom)["_POS_RANK"].max()
+    else:
+        posdic = sumstats.groupby(chrom)[pos].max()
+
+    if _posdiccul is None:
+        # convert to dictionary
+        posdiccul = dict(posdic)
 
-    if _posdiccul is None:
-        # convert to dictionary
-        posdiccul = dict(posdic)
-
-        # fill empty chr with 0
+        # fill empty chr with 0
+        for i in range(0,sumstats[chrom].max()+1):
+            if i in posdiccul:
+                continue
+            else:
+                posdiccul[i]=0
+
+        # cumulative sum dictionary
+        for i in range(1,sumstats[chrom].max()+1):
+            posdiccul[i]= posdiccul[i-1] + posdiccul[i] + sumstats[pos].max()*chrpad
+    else:
+        posdiccul = _posdiccul
+
+    # convert base pair postion to x axis position using the cumulative sum dictionary
+    sumstats["_ADD"]=sumstats[chrom].apply(lambda x : posdiccul[int(x)-1])
+
+    if drop_chr_start==True:
+        posdic_min = sumstats.groupby(chrom)[pos].min()
+        posdiccul_min= dict(posdic_min)
         for i in range(0,sumstats[chrom].max()+1):
-            if i in posdiccul:
+            if i in posdiccul_min:
                 continue
             else:
-                posdiccul[i]=0
-
-        # cumulative sum dictionary
+                posdiccul_min[i]=0
         for i in range(1,sumstats[chrom].max()+1):
-            posdiccul[i]= posdiccul[i-1] + posdiccul[i] + sumstats[pos].max()*chrpad
-    else:
-        posdiccul = _posdiccul
-
-    # convert base pair postion to x axis position using the cumulative sum dictionary
-    sumstats["_ADD"]=sumstats[chrom].apply(lambda x : posdiccul[int(x)-1])
-
-    if drop_chr_start==True:
-        posdic_min = sumstats.groupby(chrom)[pos].min()
-        posdiccul_min= dict(posdic_min)
-        for i in range(0,sumstats[chrom].max()+1):
-            if i in posdiccul_min:
-                continue
-            else:
-                posdiccul_min[i]=0
-        for i in range(1,sumstats[chrom].max()+1):
-            posdiccul_min[i]= posdiccul_min[i-1] + posdiccul_min[i]
-        sumstats["_ADD"]=sumstats["_ADD"] - sumstats[chrom].apply(lambda x : posdiccul_min[int(x)])
-
-    if use_rank is True:
-        sumstats["i"]=sumstats["_POS_RANK"]+sumstats["_ADD"]
-    else:
-        sumstats["i"]=sumstats[pos]+sumstats["_ADD"]
+            posdiccul_min[i]= posdiccul_min[i-1] + posdiccul_min[i]
+        sumstats["_ADD"]=sumstats["_ADD"] - sumstats[chrom].apply(lambda x : posdiccul_min[int(x)])
 
+    if use_rank is True:
+        sumstats["i"]=sumstats["_POS_RANK"]+sumstats["_ADD"]
+    else:
+        sumstats["i"]=sumstats[pos]+sumstats["_ADD"]
+
 
-    #for plot, get the chr text tick position
-    chrom_df=sumstats.groupby(chrom)['i'].agg(lambda x: (x.min()+x.max())/2)
-    #sumstats["i"] = sumstats["i"]+((sumstats[chrom].map(dict(chrom_df)).astype("int")))*0.02
-    #sumstats["i"] = sumstats["i"].astype("Int64")
-    sumstats["i"] = np.floor(pd.to_numeric(sumstats["i"], errors='coerce')).astype('Int64')
-    return sumstats, chrom_df
+    #for plot, get the chr text tick position
+    chrom_df=sumstats.groupby(chrom)['i'].agg(lambda x: (x.min()+x.max())/2)
+    #sumstats["i"] = sumstats["i"]+((sumstats[chrom].map(dict(chrom_df)).astype("int")))*0.02
+    #sumstats["i"] = sumstats["i"].astype("Int64")
+    sumstats["i"] = np.floor(pd.to_numeric(sumstats["i"], errors='coerce')).astype('Int64')
+    return sumstats, chrom_df
 
 def _quick_assign_marker_relative_size(series, sig_level = 5e-8, suggestive_sig_level=5e-6, lower_level=5e-4):
     size_series = series.copy()
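
For orientation, _quick_assign_i_with_rank (mostly re-indented in this release, with one explanatory comment added) builds the genome-wide x coordinate used by the Manhattan-style plots: variants are sorted by (CHR, POS), each chromosome gets an offset equal to the cumulative maximum positions of all preceding chromosomes plus a padding term, and i = POS + offset. A simplified, self-contained sketch of the offset construction with toy data; the real function additionally handles rank mode, missing chromosomes, _posdiccul reuse, and drop_chr_start:

    import pandas as pd

    df = pd.DataFrame({"CHR": [1, 1, 2, 2], "POS": [100, 400, 50, 300]})
    chrpad = 0.05
    pad = df["POS"].max() * chrpad                     # gap drawn between chromosomes
    posdic = df.groupby("CHR")["POS"].max().to_dict()  # max position per chromosome
    posdiccul = {0: 0}                                 # cumulative offsets; chr 0 anchors at 0
    for c in range(1, df["CHR"].max() + 1):
        posdiccul[c] = posdiccul[c - 1] + posdic.get(c, 0) + pad
    df["i"] = df["POS"] + df["CHR"].map(lambda c: posdiccul[c - 1])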
@@ -286,9 +287,9 @@ def _cut(series, mode,cutfactor,cut,skip, ylabels, cut_log, verbose,lines_to_plo
     maxy = series.max()
     series = series.copy()
     if "b" not in mode:
-        if verbose: log.write(" -Maximum -log10(P) values is "+str(maxy) +" .")
+        if verbose: log.write(" -Maximum -log10(P) value is "+str(maxy) +" .")
     elif "b" in mode:
-        if verbose: log.write(" -Maximum DENSITY values is "+str(maxy) +" .")
+        if verbose: log.write(" -Maximum DENSITY value is "+str(maxy) +" .")
 
     maxticker=int(np.round(series.max(skipna=True)))
 
@@ -4,7 +4,7 @@ import matplotlib.pyplot as plt
 import scipy.stats as ss
 import seaborn as sns
 import gc
-from statsmodels.stats.multitest import fdrcorrection
+import scipy.stats as ss
 from matplotlib.patches import Rectangle
 from adjustText import adjust_text
 from gwaslab.viz_aux_save_figure import save_figure
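
The dropped statsmodels import is replaced by SciPy's built-in Benjamini-Hochberg implementation (the resulting duplicate import scipy.stats as ss line ships as-is). scipy.stats.false_discovery_control was added in SciPy 1.11, so this change effectively raises the minimum SciPy version for this code path. Unlike fdrcorrection, which returns a (reject, corrected_pvalues) pair, it returns the adjusted p-values directly, which is why the [1] indexing disappears below:

    import scipy.stats as ss

    pvals = [0.001, 0.01, 0.04, 0.20]
    adjusted = ss.false_discovery_control(pvals)  # BH-adjusted p-values, same order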
@@ -36,6 +36,7 @@ def compare_effect(path1,
                    wc_correction=False,
                    null_beta=0,
                    is_q=False,
+                   is_q_mc = False,
                    include_all=True,
                    q_level=0.05,
                    sig_level=5e-8,
@@ -485,8 +486,10 @@ def compare_effect(path1,
             if verbose: log.write(" -No variants with EA not matching...")
         if fdr==True:
             if verbose: log.write(" -Using FDR...")
-            sig_list_merged["P_1"] = fdrcorrection(sig_list_merged["P_1"])[1]
-            sig_list_merged["P_2"] = fdrcorrection(sig_list_merged["P_2"])[1]
+            #sig_list_merged["P_1"] = fdrcorrection(sig_list_merged["P_1"])[1]
+            #sig_list_merged["P_2"] = fdrcorrection(sig_list_merged["P_2"])[1]
+            sig_list_merged["P_1"] =ss.false_discovery_control(sig_list_merged["P_1"])
+            sig_list_merged["P_2"] =ss.false_discovery_control(sig_list_merged["P_2"])
 
         ####################################################################################################################################
         ## winner's curse correction using aligned beta
@@ -528,9 +531,10 @@ def compare_effect(path1,
         if (is_q is True):
             if verbose: log.write(" -Calculating Cochran's Q statistics and peform chisq test...")
             if mode=="beta" or mode=="BETA" or mode=="Beta":
-                sig_list_merged = test_q(sig_list_merged,"EFFECT_1","SE_1","EFFECT_2_aligned","SE_2",q_level=q_level)
+                sig_list_merged = test_q(sig_list_merged,"EFFECT_1","SE_1","EFFECT_2_aligned","SE_2",q_level=q_level,is_q_mc=is_q_mc, log=log, verbose=verbose)
             else:
-                sig_list_merged = test_q(sig_list_merged,"BETA_1","SE_1","BETA_2_aligned","SE_2",q_level=q_level)
+                sig_list_merged = test_q(sig_list_merged,"BETA_1","SE_1","BETA_2_aligned","SE_2",q_level=q_level,is_q_mc=is_q_mc, log=log, verbose=verbose)
+
         ######################### save ###############################################################
         ## save the merged data
         save_path = label[0]+"_"+label[1]+"_beta_sig_list_merged.tsv"
@@ -804,8 +808,15 @@ def compare_effect(path1,
     if legend_mode == "full" and is_q==True :
         title_proxy = Rectangle((0,0), 0, 0, color='w',label=legend_title)
         title_proxy2 = Rectangle((0,0), 0, 0, color='w',label=legend_title2)
-        het_label_sig = r"$P_{het} < $" + "${}$".format(q_level)
-        het_label_sig2 = r"$P_{het} > $" + "${}$".format(q_level)
+        if is_q_mc=="fdr":
+            het_label_sig = r"$FDR_{het} < $" + "${}$".format(q_level)
+            het_label_sig2 = r"$FDR_{het} > $" + "${}$".format(q_level)
+        elif is_q_mc=="bon":
+            het_label_sig = r"$P_{het,bon} < $" + "${}$".format(q_level)
+            het_label_sig2 = r"$P_{het,bon} > $" + "${}$".format(q_level)
+        else:
+            het_label_sig = r"$P_{het} < $" + "${}$".format(q_level)
+            het_label_sig2 = r"$P_{het} > $" + "${}$".format(q_level)
         het_sig = Rectangle((0,0), 0, 0, facecolor='#cccccc',edgecolor="black", linewidth=1, label=het_label_sig)
         het_nonsig = Rectangle((0,0), 0, 0, facecolor='#cccccc',edgecolor="white",linewidth=1, label=het_label_sig2)
 
@@ -874,7 +885,7 @@ def reorderLegend(ax=None, order=None, add=None):
     new_handles = [info[l] for l in order]
     return new_handles, order
 
-def test_q(df,beta1,se1,beta2,se2,q_level=0.05):
+def test_q(df,beta1,se1,beta2,se2,q_level=0.05,is_q_mc=False, log=Log(), verbose=False):
     w1="Weight_1"
     w2="Weight_2"
     beta="BETA_FE"
@@ -889,6 +900,14 @@ def test_q(df,beta1,se1,beta2,se2,q_level=0.05):
     df[q] = df[w1]*(df[beta1]-df[beta])**2 + df[w2]*(df[beta2]-df[beta])**2
     df[pq] = ss.chi2.sf(df[q], 1)
     df["Edge_color"]="white"
+
+    if is_q_mc=="fdr":
+        if verbose: log.write(" -FDR correction applied...")
+        df[pq] = ss.false_discovery_control(df[pq])
+    elif is_q_mc=="bon":
+        if verbose: log.write(" -Bonferroni correction applied...")
+        df[pq] = df[pq] * len(df[pq])
+
     df.loc[df[pq]<q_level,"Edge_color"]="black"
     df.drop(columns=["Weight_1","Weight_2","BETA_FE"],inplace=True)
     # Huedo-Medina, T. B., Sánchez-Meca, J., Marín-Martínez, F., & Botella, J. (2006). Assessing heterogeneity in meta-analysis: Q statistic or I² index?. Psychological methods, 11(2), 193.
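
On test_q itself: for two studies, Cochran's Q reduces to the inverse-variance weighted squared deviation of each study's effect from the fixed-effect estimate, tested against a chi-square distribution with 1 degree of freedom. The new is_q_mc argument optionally adjusts the heterogeneity p-values before they are compared with q_level: "fdr" applies Benjamini-Hochberg, "bon" multiplies by the number of tests (the product is not clipped at 1, which only affects reporting, not the q_level comparison), and the legend labels above switch accordingly between FDR_het, P_het,bon, and plain P_het. A self-contained sketch of the statistic, assuming the usual inverse-variance weights; the Weight_1/Weight_2 computation itself falls outside the hunks shown:

    import scipy.stats as ss

    beta1, se1 = 0.12, 0.02   # made-up effect and standard error, study 1
    beta2, se2 = 0.05, 0.03   # made-up effect and standard error, study 2
    w1, w2 = 1 / se1**2, 1 / se2**2
    beta_fe = (w1 * beta1 + w2 * beta2) / (w1 + w2)    # fixed-effect estimate
    q = w1 * (beta1 - beta_fe)**2 + w2 * (beta2 - beta_fe)**2
    p_het = ss.chi2.sf(q, 1)                           # heterogeneity p-value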
@@ -2,7 +2,7 @@ import pandas as pd
 import numpy as np
 import matplotlib.pyplot as plt
 from scipy import stats, optimize
-from statsmodels.stats.meta_analysis import combine_effects
+#from statsmodels.stats.meta_analysis import combine_effects
 from matplotlib.patches import Polygon
 from matplotlib.collections import PatchCollection
 # plot_forest
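
Taken together with the fdrcorrection swap above, the commented-out combine_effects import points the same way: these modules no longer pull in statsmodels at import time. FDR adjustment now comes from scipy.stats.false_discovery_control, while the meta-analysis import is commented out rather than replaced, which suggests it was unused in this plotting path.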