gwaslab 3.4.38__py3-none-any.whl → 3.4.39__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of gwaslab might be problematic. Click here for more details.

Files changed (51)
  1. gwaslab/bd_common_data.py +6 -3
  2. gwaslab/bd_download.py +9 -9
  3. gwaslab/bd_get_hapmap3.py +43 -9
  4. gwaslab/g_Log.py +14 -5
  5. gwaslab/g_Sumstats.py +86 -18
  6. gwaslab/g_SumstatsPair.py +70 -23
  7. gwaslab/g_SumstatsT.py +2 -2
  8. gwaslab/g_version.py +10 -10
  9. gwaslab/hm_casting.py +9 -4
  10. gwaslab/hm_harmonize_sumstats.py +88 -83
  11. gwaslab/io_preformat_input.py +14 -14
  12. gwaslab/io_read_ldsc.py +49 -1
  13. gwaslab/ldsc_irwls.py +198 -0
  14. gwaslab/ldsc_jackknife.py +514 -0
  15. gwaslab/ldsc_ldscore.py +417 -0
  16. gwaslab/ldsc_parse.py +294 -0
  17. gwaslab/ldsc_regressions.py +747 -0
  18. gwaslab/ldsc_sumstats.py +629 -0
  19. gwaslab/qc_check_datatype.py +1 -1
  20. gwaslab/qc_fix_sumstats.py +163 -161
  21. gwaslab/util_ex_calculate_ldmatrix.py +2 -2
  22. gwaslab/util_ex_gwascatalog.py +24 -24
  23. gwaslab/util_ex_ldproxyfinder.py +9 -9
  24. gwaslab/util_ex_ldsc.py +189 -0
  25. gwaslab/util_in_calculate_gc.py +6 -6
  26. gwaslab/util_in_calculate_power.py +42 -43
  27. gwaslab/util_in_convert_h2.py +8 -8
  28. gwaslab/util_in_fill_data.py +28 -28
  29. gwaslab/util_in_filter_value.py +91 -52
  30. gwaslab/util_in_get_density.py +8 -8
  31. gwaslab/util_in_get_sig.py +407 -65
  32. gwaslab/viz_aux_annotate_plot.py +12 -12
  33. gwaslab/viz_aux_quickfix.py +18 -18
  34. gwaslab/viz_aux_reposition_text.py +3 -3
  35. gwaslab/viz_aux_save_figure.py +14 -5
  36. gwaslab/viz_plot_compare_af.py +29 -30
  37. gwaslab/viz_plot_compare_effect.py +63 -71
  38. gwaslab/viz_plot_miamiplot2.py +6 -6
  39. gwaslab/viz_plot_mqqplot.py +17 -3
  40. gwaslab/viz_plot_qqplot.py +1 -1
  41. gwaslab/viz_plot_regionalplot.py +33 -32
  42. gwaslab/viz_plot_rg_heatmap.py +28 -26
  43. gwaslab/viz_plot_stackedregional.py +40 -21
  44. gwaslab/viz_plot_trumpetplot.py +50 -55
  45. gwaslab-3.4.39.dist-info/LICENSE +674 -0
  46. {gwaslab-3.4.38.dist-info → gwaslab-3.4.39.dist-info}/METADATA +4 -3
  47. gwaslab-3.4.39.dist-info/RECORD +80 -0
  48. gwaslab-3.4.38.dist-info/RECORD +0 -72
  49. /gwaslab-3.4.38.dist-info/LICENSE → /gwaslab-3.4.39.dist-info/LICENSE_before_v3.4.39 +0 -0
  50. {gwaslab-3.4.38.dist-info → gwaslab-3.4.39.dist-info}/WHEEL +0 -0
  51. {gwaslab-3.4.38.dist-info → gwaslab-3.4.39.dist-info}/top_level.txt +0 -0
@@ -24,31 +24,31 @@ def filldata(
24
24
  if type(to_fill) is str:
25
25
  to_fill = [to_fill]
26
26
  sumstats = insumstats.copy()
27
- if verbose: log.write("Start filling data using existing columns...{}".format(_get_version()))
27
+ log.write("Start filling data using existing columns...{}".format(_get_version()), verbose=verbose)
28
28
 
29
29
  check_datatype(sumstats,verbose=verbose,log=log)
30
30
 
31
31
  # check dupication ##############################################################################################
32
32
  skip_cols=[]
33
- if verbose: log.write(" -Overwrite mode: ",overwrite)
33
+ log.write(" -Overwrite mode: ",overwrite, verbose=verbose)
34
34
  if overwrite is False:
35
35
  for i in to_fill:
36
36
  if i in sumstats.columns:
37
37
  skip_cols.append(i)
38
38
  for i in skip_cols:
39
39
  to_fill.remove(i)
40
- if verbose: log.write(" -Skipping columns: ",skip_cols)
40
+ log.write(" -Skipping columns: ",skip_cols, verbose=verbose)
41
41
  if len(set(to_fill) & set(["OR","OR_95L","OR_95U","BETA","SE","P","Z","CHISQ","MLOG10P","MAF"]))==0:
42
42
  log.write(" -No available columns to fill. Skipping.", verbose=verbose)
43
43
  log.write("Finished filling data using existing columns.", verbose=verbose)
44
44
  return sumstats
45
- if verbose: log.write(" -Filling columns: ",to_fill)
45
+ log.write(" -Filling columns: ",to_fill, verbose=verbose)
46
46
  fill_iteratively(sumstats,to_fill,log,only_sig,df,extreme,verbose,sig_level)
47
47
 
48
48
  # ###################################################################################
49
49
  #sumstats = sortcolumn(sumstats, verbose=verbose, log=log)
50
50
  gc.collect()
51
- if verbose: log.write("Finished filling data using existing columns.")
51
+ log.write("Finished filling data using existing columns.", verbose=verbose)
52
52
  return sumstats
53
53
 
54
54
  ##########################################################################################################################
@@ -56,20 +56,20 @@ def filldata(
56
56
  def fill_p(sumstats,log,df=None,only_sig=False,sig_level=5e-8,overwrite=False,verbose=True,filled_count=0):
57
57
  # MLOG10P -> P
58
58
  if "MLOG10P" in sumstats.columns:
59
- if verbose: log.write(" - Filling P value using MLOG10P column...")
59
+ log.write(" - Filling P value using MLOG10P column...", verbose=verbose)
60
60
  sumstats["P"] = np.power(10,-sumstats["MLOG10P"])
61
61
  filled_count +=1
62
62
 
63
63
  # Z -> P
64
64
  elif "Z" in sumstats.columns:
65
- if verbose: log.write(" - Filling P value using Z column...")
65
+ log.write(" - Filling P value using Z column...", verbose=verbose)
66
66
  stats.chisqprob = lambda chisq, degree_of_freedom: stats.chi2.sf(chisq, degree_of_freedom)
67
67
  sumstats["P"] = ss.chisqprob(sumstats["Z"]**2,1)
68
68
  filled_count +=1
69
69
 
70
70
  elif "CHISQ" in sumstats.columns:
71
71
  #CHISQ -> P
72
- if verbose: log.write(" - Filling P value using CHISQ column...")
72
+ log.write(" - Filling P value using CHISQ column...", verbose=verbose)
73
73
  stats.chisqprob = lambda chisq, degree_of_freedom: stats.chi2.sf(chisq, degree_of_freedom)
74
74
  if df is None:
75
75
  if only_sig is True and overwrite is True:
@@ -80,11 +80,11 @@ def fill_p(sumstats,log,df=None,only_sig=False,sig_level=5e-8,overwrite=False,ve
80
80
  filled_count +=1
81
81
  else:
82
82
  if only_sig is True and overwrite is True:
83
- if verbose: log.write(" - Filling P value using CHISQ column for variants:" , sum(sumstats["P"]<sig_level))
83
+ log.write(" - Filling P value using CHISQ column for variants:" , sum(sumstats["P"]<sig_level), verbose=verbose)
84
84
  sumstats.loc[sumstats["P"]<sig_level,"P"] = stats.chisqprob(sumstats.loc[sumstats["P"]<sig_level,"CHISQ"],sumstats.loc[sumstats["P"]<sig_level,df].astype("int"))
85
85
  filled_count +=1
86
86
  else:
87
- if verbose: log.write(" - Filling P value using CHISQ column for all valid variants:")
87
+ log.write(" - Filling P value using CHISQ column for all valid variants:", verbose=verbose)
88
88
  sumstats["P"] = stats.chisqprob(sumstats["CHISQ"],sumstats[df].astype("int"))
89
89
  filled_count +=1
90
90
  else:
@@ -94,7 +94,7 @@ def fill_p(sumstats,log,df=None,only_sig=False,sig_level=5e-8,overwrite=False,ve
94
94
  def fill_z(sumstats,log,verbose=True,filled_count=0):
95
95
  # BETA/SE -> Z
96
96
  if ("BETA" in sumstats.columns) and ("SE" in sumstats.columns):
97
- if verbose: log.write(" - Filling Z using BETA/SE column...")
97
+ log.write(" - Filling Z using BETA/SE column...", verbose=verbose)
98
98
  sumstats["Z"] = sumstats["BETA"]/sumstats["SE"]
99
99
  filled_count +=1
100
100
  else:
@@ -104,12 +104,12 @@ def fill_z(sumstats,log,verbose=True,filled_count=0):
104
104
  def fill_chisq(sumstats,log,verbose=True,filled_count=0):
105
105
  # Z -> CHISQ
106
106
  if "Z" in sumstats.columns:
107
- if verbose: log.write(" - Filling CHISQ using Z column...")
107
+ log.write(" - Filling CHISQ using Z column...", verbose=verbose)
108
108
  sumstats["CHISQ"] = (sumstats["Z"])**2
109
109
  filled_count +=1
110
110
  elif "P" in sumstats.columns:
111
111
  # P -> CHISQ
112
- if verbose: log.write(" - Filling CHISQ using P column...")
112
+ log.write(" - Filling CHISQ using P column...", verbose=verbose)
113
113
  sumstats["CHISQ"] = ss.chi2.isf(sumstats["P"], 1)
114
114
  filled_count +=1
115
115
  else:
@@ -119,13 +119,13 @@ def fill_chisq(sumstats,log,verbose=True,filled_count=0):
119
119
  def fill_or(sumstats,log,verbose=True,filled_count=0):
120
120
  # BETA -> OR
121
121
  if "BETA" in sumstats.columns:
122
- if verbose: log.write(" - Filling OR using BETA column...")
122
+ log.write(" - Filling OR using BETA column...", verbose=verbose)
123
123
  sumstats["OR"] = np.exp(sumstats["BETA"])
124
124
  filled_count +=1
125
125
  # BETA/SE -> OR_95L / OR_95U
126
126
  # get confidence interval 95
127
127
  if ("BETA" in sumstats.columns) and ("SE" in sumstats.columns):
128
- if verbose: log.write(" - Filling OR_95L/OR_95U using BETA/SE columns...")
128
+ log.write(" - Filling OR_95L/OR_95U using BETA/SE columns...", verbose=verbose)
129
129
  # beta - 1.96 x se , beta + 1.96 x se
130
130
  sumstats["OR_95L"] = np.exp(sumstats["BETA"]-ss.norm.ppf(0.975)*sumstats["SE"])
131
131
  sumstats["OR_95U"] = np.exp(sumstats["BETA"]+ss.norm.ppf(0.975)*sumstats["SE"])
@@ -136,7 +136,7 @@ def fill_or(sumstats,log,verbose=True,filled_count=0):
136
136
  def fill_or95(sumstats,log,verbose=True,filled_count=0):
137
137
  # get confidence interval 95
138
138
  if ("BETA" in sumstats.columns) and ("SE" in sumstats.columns):
139
- if verbose: log.write(" - Filling OR_95L/OR_95U using BETA/SE columns...")
139
+ log.write(" - Filling OR_95L/OR_95U using BETA/SE columns...", verbose=verbose)
140
140
  # beta - 1.96 x se , beta + 1.96 x se
141
141
  sumstats["OR_95L"] = np.exp(sumstats["BETA"]-ss.norm.ppf(0.975)*sumstats["SE"])
142
142
  sumstats["OR_95U"] = np.exp(sumstats["BETA"]+ss.norm.ppf(0.975)*sumstats["SE"])
@@ -148,7 +148,7 @@ def fill_or95(sumstats,log,verbose=True,filled_count=0):
148
148
  def fill_beta(sumstats,log,verbose=True,filled_count=0):
149
149
  # OR -> beta
150
150
  if "OR" in sumstats.columns:
151
- if verbose: log.write(" - Filling BETA value using OR column...")
151
+ log.write(" - Filling BETA value using OR column...", verbose=verbose)
152
152
  sumstats["BETA"] = np.log(sumstats["OR"])
153
153
  filled_count +=1
154
154
  else:
@@ -158,27 +158,27 @@ def fill_beta(sumstats,log,verbose=True,filled_count=0):
158
158
  def fill_se(sumstats,log,verbose=True,filled_count=0):
159
159
  # OR / OR_95L /OR_95U -> SE
160
160
  if ("P" in sumstats.columns) and ("BETA" in sumstats.columns):
161
- if verbose: log.write(" - Filling SE value using BETA and P column...")
161
+ log.write(" - Filling SE value using BETA and P column...", verbose=verbose)
162
162
  sumstats["SE"]= np.abs(sumstats["BETA"]/ ss.norm.ppf(1-sumstats["P"]/2))
163
163
  filled_count +=1
164
164
  elif ("OR" in sumstats.columns) and ("OR_95U" in sumstats.columns):
165
- if verbose: log.write(" - Filling SE value using OR/OR_95U column...")
165
+ log.write(" - Filling SE value using OR/OR_95U column...", verbose=verbose)
166
166
  #
167
167
  sumstats["SE"]=(np.log(sumstats["OR_95U"]) - np.log(sumstats["OR"]))/ss.norm.ppf(0.975)
168
168
  filled_count +=1
169
169
  elif ("OR" in sumstats.columns) and ("OR_95L" in sumstats.columns):
170
- if verbose: log.write(" - Filling SE value using OR/OR_95L column...")
170
+ log.write(" - Filling SE value using OR/OR_95L column...", verbose=verbose)
171
171
  sumstats["SE"]=(np.log(sumstats["OR"]) - np.log(sumstats["OR_95L"]))/ss.norm.ppf(0.975)
172
172
  filled_count +=1
173
173
  else:
174
- if verbose: log.write(" - Not enough information to fill SE...")
174
+ log.write(" - Not enough information to fill SE...", verbose=verbose)
175
175
  return 0,filled_count
176
176
  return 1,filled_count
177
177
 
178
178
  def fill_mlog10p(sumstats,log,verbose=True,filled_count=0):
179
179
  if "P" in sumstats.columns:
180
180
  # P -> MLOG10P
181
- if verbose: log.write(" - Filling MLOG10P using P column...")
181
+ log.write(" - Filling MLOG10P using P column...", verbose=verbose)
182
182
  sumstats["MLOG10P"] = -np.log10(sumstats["P"])
183
183
  filled_count +=1
184
184
  else:
@@ -188,14 +188,14 @@ def fill_extreme_mlog10p(sumstats,log,verbose=True,filled_count=0):
188
188
  # ref: https://stackoverflow.com/questions/46416027/how-to-compute-p-values-from-z-scores-in-r-when-the-z-score-is-large-pvalue-muc/46416222#46416222
189
189
  if "Z" in sumstats.columns:
190
190
  # P -> MLOG10P
191
- if verbose: log.write(" - Filling MLOG10P using Z column...")
191
+ log.write(" - Filling MLOG10P using Z column...", verbose=verbose)
192
192
  sumstats = fill_extreme_mlog10(sumstats, "Z")
193
193
  filled_count +=1
194
194
  elif "BETA" in sumstats.columns and "SE" in sumstats.columns:
195
- if verbose: log.write(" - Z column not available...")
196
- if verbose: log.write(" - Filling Z using BETA/SE column...")
195
+ log.write(" - Z column not available...", verbose=verbose)
196
+ log.write(" - Filling Z using BETA/SE column...", verbose=verbose)
197
197
  sumstats["Z"] = sumstats["BETA"]/sumstats["SE"]
198
- if verbose: log.write(" - Filling MLOG10P using Z column...")
198
+ log.write(" - Filling MLOG10P using Z column...", verbose=verbose)
199
199
  sumstats = fill_extreme_mlog10(sumstats, "Z")
200
200
  filled_count +=1
201
201
  else:
@@ -205,7 +205,7 @@ def fill_extreme_mlog10p(sumstats,log,verbose=True,filled_count=0):
205
205
  def fill_maf(sumstats,log,verbose=True,filled_count=0):
206
206
  if "EAF" in sumstats.columns:
207
207
  # EAF -> MAF
208
- if verbose: log.write(" - Filling MAF using EAF column...")
208
+ log.write(" - Filling MAF using EAF column...", verbose=verbose)
209
209
  sumstats["MAF"] = sumstats["EAF"].apply(lambda x: min(x,1-x) if pd.notnull(x) else np.nan)
210
210
  filled_count +=1
211
211
  else:
@@ -226,7 +226,7 @@ def fill_extreme_mlog10(sumstats, z):
226
226
  ####################################################################################################################
227
227
  def fill_iteratively(sumstats,raw_to_fill,log,only_sig,df,extreme,verbose,sig_level):
228
228
  to_fill = raw_to_fill.copy()
229
- if verbose: log.write(" - Filling Columns iteratively...")
229
+ log.write(" - Filling Columns iteratively...", verbose=verbose)
230
230
 
231
231
  filled_count=0
232
232
  for i in range(len(to_fill)+1):
@@ -10,65 +10,66 @@ from gwaslab.g_vchange_status import vchange_status
10
10
  from gwaslab.qc_fix_sumstats import sortcoordinate
11
11
  from gwaslab.qc_fix_sumstats import start_to
12
12
  from gwaslab.qc_fix_sumstats import finished
13
+ from gwaslab.hm_harmonize_sumstats import is_palindromic
13
14
 
14
15
  import gc
15
16
  def filtervalues(sumstats,expr,remove=False,verbose=True,log=Log()):
16
- if verbose: log.write("Start filtering values by condition:",expr)
17
+ log.write("Start filtering values by condition:",expr, verbose=verbose)
17
18
  prenum = len(sumstats)
18
19
  sumstats = sumstats.query(expr,engine='python').copy()
19
20
  afternum = len(sumstats)
20
- if verbose: log.write(" -Removing "+ str(prenum-afternum) +" variants not meeting the conditions:",expr)
21
- if verbose: log.write("Finished filtering values.")
21
+ log.write(" -Removing "+ str(prenum-afternum) +" variants not meeting the conditions:",expr, verbose=verbose)
22
+ log.write("Finished filtering values.", verbose=verbose)
22
23
  gc.collect()
23
24
  return sumstats
24
25
 
25
26
  def filterout(sumstats,interval={},lt={},gt={},eq={},remove=False,verbose=True,log=Log()):
26
- if verbose: log.write("Start filtering values:")
27
+ log.write("Start filtering values:", verbose=verbose)
27
28
  for key,threshold in gt.items():
28
29
  num = len(sumstats.loc[sumstats[key]>threshold,:])
29
- if verbose:log.write(" -Removing "+ str(num) +" variants with "+key+" > "+ str(threshold)+" ...")
30
+ log.write(" -Removing "+ str(num) +" variants with "+key+" > "+ str(threshold)+" ...", verbose=verbose)
30
31
  sumstats = sumstats.loc[sumstats[key]<threshold,:]
31
32
  for key,threshold in lt.items():
32
33
  num = len(sumstats.loc[sumstats[key]<threshold,:])
33
- if verbose:log.write(" -Removing "+ str(num) +" variants with "+key+" < "+ str(threshold)+" ...")
34
+ log.write(" -Removing "+ str(num) +" variants with "+key+" < "+ str(threshold)+" ...", verbose=verbose)
34
35
  sumstats = sumstats.loc[sumstats[key]>threshold,:]
35
36
  for key,threshold in eq.items():
36
37
  num = len(sumstats.loc[sumstats[key]==threshold,:])
37
- if verbose:log.write(" -Removing "+ str(num) +" variants with "+key+" = "+ str(threshold)+" ...")
38
+ log.write(" -Removing "+ str(num) +" variants with "+key+" = "+ str(threshold)+" ...", verbose=verbose)
38
39
  sumstats = sumstats.loc[sumstats[key]!=threshold,:]
39
- if verbose: log.write("Finished filtering values.")
40
+ log.write("Finished filtering values.", verbose=verbose)
40
41
  gc.collect()
41
42
  return sumstats.copy()
42
43
 
43
44
  def filterin(sumstats,lt={},gt={},eq={},remove=False,verbose=True,log=Log()):
44
- if verbose: log.write("Start filtering values:")
45
+ log.write("Start filtering values:", verbose=verbose)
45
46
  for key,threshold in gt.items():
46
47
  num = len(sumstats.loc[sumstats[key]>threshold,:])
47
- if verbose:log.write(" -Keeping "+ str(num) +" variants with "+key+" > "+ str(threshold)+" ...")
48
+ log.write(" -Keeping "+ str(num) +" variants with "+key+" > "+ str(threshold)+" ...", verbose=verbose)
48
49
  sumstats = sumstats.loc[sumstats[key]>threshold,:]
49
50
  for key,threshold in lt.items():
50
51
  num = len(sumstats.loc[sumstats[key]<threshold,:])
51
- if verbose:log.write(" -Keeping "+ str(num) +" variants with "+key+" < "+ str(threshold)+" ...")
52
+ log.write(" -Keeping "+ str(num) +" variants with "+key+" < "+ str(threshold)+" ...", verbose=verbose)
52
53
  sumstats = sumstats.loc[sumstats[key]<threshold,:]
53
54
  for key,threshold in eq.items():
54
55
  num = len(sumstats.loc[sumstats[key]==threshold,:])
55
- if verbose:log.write(" -Keeping "+ str(num) +" variants with "+key+" = "+ str(threshold)+" ...")
56
+ log.write(" -Keeping "+ str(num) +" variants with "+key+" = "+ str(threshold)+" ...", verbose=verbose)
56
57
  sumstats = sumstats.loc[sumstats[key]==threshold,:]
57
- if verbose: log.write("Finished filtering values.")
58
+ log.write("Finished filtering values.", verbose=verbose)
58
59
  gc.collect()
59
60
  return sumstats.copy()
60
61
 
61
62
  def filterregionin(sumstats,path=None, chrom="CHR",pos="POS", high_ld=False, build="19", verbose=True,log=Log()):
62
63
  sumstats = sortcoordinate(sumstats,verbose=verbose)
63
- if verbose: log.write("Start to filter in variants if in intervals defined in bed files:")
64
- if verbose: log.write(" -Current Dataframe shape :",len(sumstats)," x ", len(sumstats.columns))
64
+ log.write("Start to filter in variants if in intervals defined in bed files:", verbose=verbose)
65
+ log.write(" -Current Dataframe shape :",len(sumstats)," x ", len(sumstats.columns), verbose=verbose)
65
66
 
66
67
  if high_ld is True:
67
68
  path = get_high_ld(build=build)
68
- if verbose: log.write(" -Loading bed format file for hg"+build)
69
+ log.write(" -Loading bed format file for hg"+build, verbose=verbose)
69
70
 
70
71
  else:
71
- if verbose: log.write(" -Loading bed format file: " , path)
72
+ log.write(" -Loading bed format file: " , path, verbose=verbose)
72
73
  bed = pd.read_csv(path,sep="\s+",header=None,dtype={0:"string",1:"Int64",2:"Int64"})
73
74
 
74
75
  bed["tuple"] = bed.apply(lambda x: (x[1],x[2]),axis=1)
@@ -80,7 +81,7 @@ def filterregionin(sumstats,path=None, chrom="CHR",pos="POS", high_ld=False, bui
80
81
  sumstats = sumstats.sort_values(["CHR","POS"])
81
82
 
82
83
  if len(bed)<100:
83
- if verbose: log.write(" -Bed file < 100 lines: using pd IntervalIndex... ")
84
+ log.write(" -Bed file < 100 lines: using pd IntervalIndex... ", verbose=verbose)
84
85
  for i in sumstats[chrom].unique():
85
86
  if sum(bed[0]==i)>0:
86
87
  interval = pd.IntervalIndex.from_tuples(bed.loc[bed[0]==i,"tuple"])
@@ -88,7 +89,7 @@ def filterregionin(sumstats,path=None, chrom="CHR",pos="POS", high_ld=False, bui
88
89
  else:
89
90
  continue
90
91
  else:
91
- if verbose: log.write(" -Bed file > 100 lines: using two pointers, please make files are all sorted... ")
92
+ log.write(" -Bed file > 100 lines: using two pointers, please make files are all sorted... ", verbose=verbose)
92
93
  bed_num =0
93
94
  bed_chr =bed.iloc[bed_num,0]
94
95
  bed_left =bed.iloc[bed_num,1]
@@ -136,23 +137,23 @@ def filterregionin(sumstats,path=None, chrom="CHR",pos="POS", high_ld=False, bui
136
137
  ## in
137
138
 
138
139
  sumstats = sumstats.loc[sumstats["bed_indicator"],:]
139
- if verbose: log.write(" -Number of variants in the specified regions to keep:",sum(sumstats["bed_indicator"]))
140
- if verbose: log.write(" -Number of variants removed:",sum(~sumstats["bed_indicator"]))
140
+ log.write(" -Number of variants in the specified regions to keep:",sum(sumstats["bed_indicator"]), verbose=verbose)
141
+ log.write(" -Number of variants removed:",sum(~sumstats["bed_indicator"]), verbose=verbose)
141
142
  sumstats = sumstats.drop(columns="bed_indicator")
142
- if verbose: log.write("Finished filtering in variants.")
143
+ log.write("Finished filtering in variants.", verbose=verbose)
143
144
  gc.collect()
144
145
  return sumstats
145
146
 
146
147
  def filterregionout(sumstats, path=None, chrom="CHR",pos="POS", high_ld=False, build="19", verbose=True,log=Log()):
147
148
  sumstats = sortcoordinate(sumstats,verbose=verbose)
148
- if verbose: log.write("Start to filter out variants if in intervals defined in bed files:")
149
- if verbose: log.write(" -Current Dataframe shape :",len(sumstats)," x ", len(sumstats.columns))
149
+ log.write("Start to filter out variants if in intervals defined in bed files:", verbose=verbose)
150
+ log.write(" -Current Dataframe shape :",len(sumstats)," x ", len(sumstats.columns), verbose=verbose)
150
151
  if high_ld is True:
151
152
  path = get_high_ld(build=build)
152
- if verbose: log.write(" -Loading bed format file for hg"+build)
153
+ log.write(" -Loading bed format file for hg"+build, verbose=verbose)
153
154
 
154
155
  else:
155
- if verbose: log.write(" -Loading bed format file: " , path)
156
+ log.write(" -Loading bed format file: " , path, verbose=verbose)
156
157
 
157
158
  bed = pd.read_csv(path,sep="\s+",header=None,dtype={0:"string",1:"Int64",2:"Int64"})
158
159
  bed["tuple"] = bed.apply(lambda x: (x[1],x[2]),axis=1)
@@ -164,7 +165,7 @@ def filterregionout(sumstats, path=None, chrom="CHR",pos="POS", high_ld=False, b
164
165
  bed[0]=bed[0].astype("Int64")
165
166
 
166
167
  if len(bed)<100:
167
- if verbose: log.write(" -Bed file < 100 lines: using pd IntervalIndex... ")
168
+ log.write(" -Bed file < 100 lines: using pd IntervalIndex... ", verbose=verbose)
168
169
  for i in sumstats[chrom].unique():
169
170
  if sum(bed[0]==i)>0:
170
171
  interval = pd.IntervalIndex.from_tuples(bed.loc[bed[0]==i,"tuple"])
@@ -172,7 +173,7 @@ def filterregionout(sumstats, path=None, chrom="CHR",pos="POS", high_ld=False, b
172
173
  else:
173
174
  continue
174
175
  else:
175
- if verbose: log.write(" -Bed file > 100 lines: using two pointers, please make files are all sorted... ")
176
+ log.write(" -Bed file > 100 lines: using two pointers, please make files are all sorted... ", verbose=verbose)
176
177
  bed_num =0
177
178
  bed_chr =bed.iloc[bed_num,0]
178
179
  bed_left =bed.iloc[bed_num,1]
@@ -208,10 +209,10 @@ def filterregionout(sumstats, path=None, chrom="CHR",pos="POS", high_ld=False, b
208
209
  ## out
209
210
 
210
211
  sumstats = sumstats.loc[~sumstats["bed_indicator"],:]
211
- if verbose: log.write(" -Number of variants in the specified regions to exclude:",sum(sumstats["bed_indicator"]))
212
- if verbose: log.write(" -Number of variants left:",len(sumstats))
212
+ log.write(" -Number of variants in the specified regions to exclude:",sum(sumstats["bed_indicator"]), verbose=verbose)
213
+ log.write(" -Number of variants left:",len(sumstats), verbose=verbose)
213
214
  sumstats = sumstats.drop(columns="bed_indicator")
214
- if verbose: log.write("Finished filtering out variants.")
215
+ log.write("Finished filtering out variants.", verbose=verbose)
215
216
  gc.collect()
216
217
  return sumstats
217
218
 
@@ -235,14 +236,14 @@ def inferbuild(sumstats,status="STATUS",chrom="CHR", pos="POS", ea="EA", nea="NE
235
236
  ############################################################################################
236
237
 
237
238
  inferred_build="Unknown"
238
- if verbose:log.write("Start to infer genome build version using hapmap3 SNPs...")
239
+ log.write("Start to infer genome build version using hapmap3 SNPs...", verbose=verbose)
239
240
  data_path_19 = path.dirname(__file__) + '/data/hapmap3_SNPs/hapmap3_db150_hg19.snplist.gz'
240
241
  data_path_38 = path.dirname(__file__) + '/data/hapmap3_SNPs/hapmap3_db151_hg38.snplist.gz'
241
- if verbose:log.write(" -Loading Hapmap3 variants data...")
242
+ log.write(" -Loading Hapmap3 variants data...", verbose=verbose)
242
243
  hapmap3_ref_19 = pd.read_csv(data_path_19,sep="\s+",usecols=["#CHROM","POS"],dtype={"#CHROM":"string","POS":"string"})
243
244
  hapmap3_ref_38 = pd.read_csv(data_path_38,sep="\s+",usecols=["#CHROM","POS"],dtype={"#CHROM":"string","POS":"string"})
244
245
 
245
- if verbose: log.write(" -CHR:POS will be used for matching...")
246
+ log.write(" -CHR:POS will be used for matching...", verbose=verbose)
246
247
  raw_chrpos = sumstats[chrom].astype("string")+":"+sumstats[pos].astype("string")
247
248
 
248
249
  hapmap3_ref_19["chr:pos"] = hapmap3_ref_19["#CHROM"]+":"+hapmap3_ref_19["POS"]
@@ -251,50 +252,50 @@ def inferbuild(sumstats,status="STATUS",chrom="CHR", pos="POS", ea="EA", nea="NE
251
252
  match_count_for_19 = sum(raw_chrpos.isin(hapmap3_ref_19["chr:pos"].values))
252
253
  match_count_for_38 = sum(raw_chrpos.isin(hapmap3_ref_38["chr:pos"].values))
253
254
 
254
- if verbose:log.write(" -Matching variants for hg19: num_hg19 = ",match_count_for_19)
255
- if verbose:log.write(" -Matching variants for hg38: num_hg38 = ",match_count_for_38)
255
+ log.write(" -Matching variants for hg19: num_hg19 = ",match_count_for_19, verbose=verbose)
256
+ log.write(" -Matching variants for hg38: num_hg38 = ",match_count_for_38, verbose=verbose)
256
257
 
257
258
  if max(match_count_for_19, match_count_for_38)<10000:
258
- if verbose:log.write(" -Warning: please be cautious due to the limited number of variants.")
259
+ log.warning("Please be cautious due to the limited number of variants.", verbose=verbose)
259
260
 
260
261
  if match_count_for_19 > match_count_for_38:
261
- if verbose:log.write(" -Since num_hg19 >> num_hg38, assigning genome build hg19...")
262
+ log.write(" -Since num_hg19 >> num_hg38, assigning genome build hg19...", verbose=verbose)
262
263
  sumstats[status] = vchange_status(sumstats[status],1,"9","1")
263
264
  sumstats[status] = vchange_status(sumstats[status],2,"9","9")
264
265
  inferred_build="19"
265
266
  elif match_count_for_19 < match_count_for_38:
266
- if verbose:log.write(" -Since num_hg19 << num_hg38, assigning genome build hg38...")
267
+ log.write(" -Since num_hg19 << num_hg38, assigning genome build hg38...", verbose=verbose)
267
268
  sumstats[status] = vchange_status(sumstats[status],1,"9","3")
268
269
  sumstats[status] = vchange_status(sumstats[status],2,"9","8")
269
270
  inferred_build="38"
270
271
  else:
271
- if verbose:log.write(" -Since num_hg19 = num_hg38, unable to infer...")
272
+ log.write(" -Since num_hg19 = num_hg38, unable to infer...", verbose=verbose)
272
273
 
273
274
  finished(log,verbose,_end_line)
274
275
  return sumstats, inferred_build
275
276
 
276
277
  def sampling(sumstats,n=1, p=None, verbose=True,log=Log(),**args):
277
278
 
278
- if verbose:log.write("Start to randomly select variants from the sumstats...")
279
+ log.write("Start to randomly select variants from the sumstats...", verbose=verbose)
279
280
  if p is None:
280
- if verbose:log.write(" -Number of variants selected from the sumstats:",n)
281
+ log.write(" -Number of variants selected from the sumstats:",n, verbose=verbose)
281
282
  if n > len(sumstats):
282
283
  raise ValueError("Please input a number < {}".format(len(sumstats)))
283
284
  else:
284
285
  if p>-0.00000001 and p<1.00000001:
285
- if verbose:log.write(" -Percentage of variants selected from the sumstats: ",p)
286
+ log.write(" -Percentage of variants selected from the sumstats: ",p, verbose=verbose)
286
287
  n = int(len(sumstats)*p)
287
- if verbose:log.write(" -Number of variants selected from the sumstats:",n)
288
+ log.write(" -Number of variants selected from the sumstats:",n, verbose=verbose)
288
289
  else:
289
290
  raise ValueError("Please input a number in (0,1)")
290
291
 
291
292
  if "random_state" in args.keys():
292
- if verbose:log.write(" -Random state (seed): {}".format(args["random_state"]))
293
+ log.write(" -Random state (seed): {}".format(args["random_state"]), verbose=verbose)
293
294
  else:
294
295
  args["random_state"] = np.random.randint(0,4294967295)
295
- if verbose:log.write(" -Random state (seed): {}".format(args["random_state"]))
296
+ log.write(" -Random state (seed): {}".format(args["random_state"]), verbose=verbose)
296
297
  sampled = sumstats.sample(n=n,**args)
297
- if verbose:log.write("Finished sampling...")
298
+ log.write("Finished sampling...", verbose=verbose)
298
299
  gc.collect()
299
300
  return sampled
300
301
 
@@ -322,8 +323,8 @@ def _get_flanking(sumstats, snpid, windowsizekb=500, verbose=True,log=Log(),**ar
322
323
  def _get_flanking_by_id(sumstats, snpid, windowsizekb=500, verbose=True,log=Log(),**args):
323
324
 
324
325
  log.write("Start to extract variants in the flanking regions using rsID or SNPID...",verbose=verbose)
325
- log.write(" - Central variants: {}".format(snpid))
326
- log.write(" - Flanking windowsize in kb: {}".format(windowsizekb))
326
+ log.write(" - Central variants: {}".format(snpid), verbose=verbose)
327
+ log.write(" - Flanking windowsize in kb: {}".format(windowsizekb), verbose=verbose)
327
328
 
328
329
  if type(snpid) == str:
329
330
  snpid = [snpid]
@@ -361,8 +362,8 @@ def _get_flanking_by_id(sumstats, snpid, windowsizekb=500, verbose=True,log=Log(
361
362
  def _get_flanking_by_chrpos(sumstats, chrpos, windowsizekb=500, verbose=True,log=Log(),**args):
362
363
 
363
364
  log.write("Start to extract variants in the flanking regions using CHR and POS...",verbose=verbose)
364
- log.write(" - Central positions: {}".format(chrpos))
365
- log.write(" - Flanking windowsize in kb: {}".format(windowsizekb))
365
+ log.write(" - Central positions: {}".format(chrpos), verbose=verbose)
366
+ log.write(" - Flanking windowsize in kb: {}".format(windowsizekb), verbose=verbose)
366
367
 
367
368
  if type(chrpos) == tuple:
368
369
  chrpos_to_check = [chrpos]
@@ -389,4 +390,42 @@ def _get_flanking_by_chrpos(sumstats, chrpos, windowsizekb=500, verbose=True,log
389
390
  log.write(" - Extracted {} variants in the regions.".format(len(flanking)),verbose=verbose)
390
391
  log.write("Finished extracting variants in the flanking regions.",verbose=verbose)
391
392
 
392
- return flanking
393
+ return flanking
394
+
395
+ def _filter_palindromic(sumstats, mode="in", ea="EA",nea="NEA", log=Log(),verbose=True):
396
+ log.write("Start to filter palindromic variants...",verbose=verbose)
397
+ is_palindromic_snp = is_palindromic(sumstats[[nea,ea]],a1=nea,a2=ea)
398
+
399
+ log.write(" -Identified palindromic variants: {}".format(sum(is_palindromic_snp)),verbose=verbose)
400
+
401
+ if mode=="in":
402
+ palindromic = sumstats.loc[is_palindromic_snp,:]
403
+ else:
404
+ palindromic = sumstats.loc[~is_palindromic_snp,:]
405
+
406
+ log.write("Finished filtering palindromic variants.",verbose=verbose)
407
+ return palindromic
408
+
409
+ def _filter_indel(sumstats, mode="in", ea="EA",nea="NEA", log=Log(),verbose=True):
410
+ log.write("Start to filter indels...",verbose=verbose)
411
+ is_indel = (sumstats[ea].str.len()!=sumstats[nea].str.len())
412
+
413
+ log.write(" -Identified indels: {}".format(sum(is_indel)),verbose=verbose)
414
+ if mode=="in":
415
+ indel = sumstats.loc[is_indel,:]
416
+ else:
417
+ indel = sumstats.loc[~is_indel,:]
418
+ log.write("Finished filtering indels.",verbose=verbose)
419
+ return indel
420
+
421
+ def _filter_snp(sumstats, mode="in", ea="EA",nea="NEA", log=Log(),verbose=True):
422
+ log.write("Start to filter SNPs...",verbose=verbose)
423
+ is_snp = (sumstats[ea].str.len()==1) &(sumstats[nea].str.len()==1)
424
+
425
+ log.write(" -Identified SNPs: {}".format(sum(is_snp)),verbose=verbose)
426
+ if mode=="in":
427
+ snp = sumstats.loc[is_snp,:]
428
+ else:
429
+ snp = sumstats.loc[~is_snp,:]
430
+ log.write("Finished filtering SNPs.",verbose=verbose)
431
+ return snp
@@ -5,9 +5,9 @@ from gwaslab.g_Log import Log
5
5
  import gc
6
6
 
7
7
  def getsignaldensity(insumstats, id="SNPID", chrom="CHR",pos="POS", bwindowsizekb=100,log=Log(),verbose=True):
8
- if verbose:log.write("Start to calculate signal DENSITY...")
8
+ log.write("Start to calculate signal DENSITY..." ,verbose=verbose)
9
9
  sumstats = insumstats[[id,chrom,pos]].copy()
10
- if verbose:log.write(" -Calculating DENSITY with windowsize of ",bwindowsizekb ," kb")
10
+ log.write(" -Calculating DENSITY with windowsize of ",bwindowsizekb ," kb",verbose=verbose)
11
11
  #stack=[]
12
12
 
13
13
  large_number = 1000000000
@@ -58,13 +58,13 @@ def getsignaldensity(insumstats, id="SNPID", chrom="CHR",pos="POS", bwindowsizek
58
58
  bmax = sumstats["DENSITY"].max()
59
59
  bmaxid = sumstats["DENSITY"].idxmax()
60
60
 
61
- if verbose:log.write(" -Mean : {} signals per {} kb".format(bmean,bwindowsizekb))
62
- if verbose:log.write(" -SD : {}".format(bsd))
63
- if verbose:log.write(" -Median : {} signals per {} kb".format(bmedian,bwindowsizekb))
64
- if verbose:log.write(" -Max : {} signals per {} kb at variant(s) {}".format(bmax,bwindowsizekb,sumstats.loc[bmaxid,id]))
61
+ log.write(" -Mean : {} signals per {} kb".format(bmean,bwindowsizekb),verbose=verbose)
62
+ log.write(" -SD : {}".format(bsd),verbose=verbose)
63
+ log.write(" -Median : {} signals per {} kb".format(bmedian,bwindowsizekb),verbose=verbose)
64
+ log.write(" -Max : {} signals per {} kb at variant(s) {}".format(bmax,bwindowsizekb,sumstats.loc[bmaxid,id]),verbose=verbose)
65
65
 
66
66
  sumstats = sumstats.drop("TCHR+POS",axis=1)
67
- if verbose:log.write("Finished calculating signal DENSITY successfully!")
67
+ log.write("Finished calculating signal DENSITY successfully!",verbose=verbose)
68
68
  return sumstats["DENSITY"]
69
69
 
70
70
  def assigndensity(insumstats,
@@ -92,7 +92,7 @@ def assigndensity(insumstats,
92
92
  to_add =(sumstats["TCHR+POS"]>=(row["TCHR+POS"]- 1000*bwindowsizekb)) & (sumstats["TCHR+POS"]<=(row["TCHR+POS"]+ 1000*bwindowsizekb))
93
93
  sumstats.loc[to_add,"DENSITY"] += 1
94
94
  if counter%1000==0:
95
- if verbose:log.write(" -Processed {} signals".format(counter//1000))
95
+ log.write(" -Processed {} signals".format(counter//1000),verbose=verbose)
96
96
  gc.collect()
97
97
 
98
98
  return sumstats["DENSITY"]