gwaslab-3.4.37-py3-none-any.whl → gwaslab-3.4.39-py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of gwaslab might be problematic. Click here for more details.

Files changed (57)
  1. gwaslab/bd_common_data.py +6 -3
  2. gwaslab/bd_download.py +9 -9
  3. gwaslab/bd_get_hapmap3.py +43 -9
  4. gwaslab/data/formatbook.json +722 -721
  5. gwaslab/g_Log.py +22 -5
  6. gwaslab/g_Sumstats.py +110 -163
  7. gwaslab/g_SumstatsPair.py +76 -25
  8. gwaslab/g_SumstatsT.py +2 -2
  9. gwaslab/g_Sumstats_summary.py +3 -3
  10. gwaslab/g_version.py +10 -10
  11. gwaslab/hm_casting.py +36 -17
  12. gwaslab/hm_harmonize_sumstats.py +354 -221
  13. gwaslab/hm_rsid_to_chrpos.py +1 -1
  14. gwaslab/io_preformat_input.py +49 -43
  15. gwaslab/io_read_ldsc.py +49 -1
  16. gwaslab/io_to_formats.py +428 -295
  17. gwaslab/ldsc_irwls.py +198 -0
  18. gwaslab/ldsc_jackknife.py +514 -0
  19. gwaslab/ldsc_ldscore.py +417 -0
  20. gwaslab/ldsc_parse.py +294 -0
  21. gwaslab/ldsc_regressions.py +747 -0
  22. gwaslab/ldsc_sumstats.py +629 -0
  23. gwaslab/qc_check_datatype.py +3 -3
  24. gwaslab/qc_fix_sumstats.py +891 -778
  25. gwaslab/util_ex_calculate_ldmatrix.py +31 -13
  26. gwaslab/util_ex_gwascatalog.py +25 -25
  27. gwaslab/util_ex_ldproxyfinder.py +10 -10
  28. gwaslab/util_ex_ldsc.py +189 -0
  29. gwaslab/util_ex_process_ref.py +3 -3
  30. gwaslab/util_ex_run_coloc.py +26 -4
  31. gwaslab/util_in_calculate_gc.py +6 -6
  32. gwaslab/util_in_calculate_power.py +42 -43
  33. gwaslab/util_in_convert_h2.py +8 -8
  34. gwaslab/util_in_fill_data.py +30 -30
  35. gwaslab/util_in_filter_value.py +201 -74
  36. gwaslab/util_in_get_density.py +10 -10
  37. gwaslab/util_in_get_sig.py +445 -71
  38. gwaslab/viz_aux_annotate_plot.py +12 -12
  39. gwaslab/viz_aux_quickfix.py +42 -37
  40. gwaslab/viz_aux_reposition_text.py +10 -7
  41. gwaslab/viz_aux_save_figure.py +18 -8
  42. gwaslab/viz_plot_compare_af.py +32 -33
  43. gwaslab/viz_plot_compare_effect.py +63 -71
  44. gwaslab/viz_plot_miamiplot2.py +34 -26
  45. gwaslab/viz_plot_mqqplot.py +126 -75
  46. gwaslab/viz_plot_qqplot.py +11 -8
  47. gwaslab/viz_plot_regionalplot.py +36 -33
  48. gwaslab/viz_plot_rg_heatmap.py +28 -26
  49. gwaslab/viz_plot_stackedregional.py +40 -21
  50. gwaslab/viz_plot_trumpetplot.py +65 -61
  51. gwaslab-3.4.39.dist-info/LICENSE +674 -0
  52. {gwaslab-3.4.37.dist-info → gwaslab-3.4.39.dist-info}/METADATA +5 -4
  53. gwaslab-3.4.39.dist-info/RECORD +80 -0
  54. gwaslab-3.4.37.dist-info/RECORD +0 -72
  55. /gwaslab-3.4.37.dist-info/LICENSE → /gwaslab-3.4.39.dist-info/LICENSE_before_v3.4.39 +0 -0
  56. {gwaslab-3.4.37.dist-info → gwaslab-3.4.39.dist-info}/WHEEL +0 -0
  57. {gwaslab-3.4.37.dist-info → gwaslab-3.4.39.dist-info}/top_level.txt +0 -0
@@ -8,65 +8,68 @@ from gwaslab.bd_common_data import get_chr_to_number
8
8
  from gwaslab.g_Log import Log
9
9
  from gwaslab.g_vchange_status import vchange_status
10
10
  from gwaslab.qc_fix_sumstats import sortcoordinate
11
+ from gwaslab.qc_fix_sumstats import start_to
12
+ from gwaslab.qc_fix_sumstats import finished
13
+ from gwaslab.hm_harmonize_sumstats import is_palindromic
11
14
 
12
15
  import gc
13
16
  def filtervalues(sumstats,expr,remove=False,verbose=True,log=Log()):
14
- if verbose: log.write("Start filtering values by condition:",expr)
17
+ log.write("Start filtering values by condition:",expr, verbose=verbose)
15
18
  prenum = len(sumstats)
16
19
  sumstats = sumstats.query(expr,engine='python').copy()
17
20
  afternum = len(sumstats)
18
- if verbose: log.write(" -Removing "+ str(prenum-afternum) +" variants not meeting the conditions:",expr)
19
- if verbose: log.write("Finished filtering values.")
21
+ log.write(" -Removing "+ str(prenum-afternum) +" variants not meeting the conditions:",expr, verbose=verbose)
22
+ log.write("Finished filtering values.", verbose=verbose)
20
23
  gc.collect()
21
24
  return sumstats
22
25
 
23
26
  def filterout(sumstats,interval={},lt={},gt={},eq={},remove=False,verbose=True,log=Log()):
24
- if verbose: log.write("Start filtering values:")
27
+ log.write("Start filtering values:", verbose=verbose)
25
28
  for key,threshold in gt.items():
26
29
  num = len(sumstats.loc[sumstats[key]>threshold,:])
27
- if verbose:log.write(" -Removing "+ str(num) +" variants with "+key+" > "+ str(threshold)+" ...")
30
+ log.write(" -Removing "+ str(num) +" variants with "+key+" > "+ str(threshold)+" ...", verbose=verbose)
28
31
  sumstats = sumstats.loc[sumstats[key]<threshold,:]
29
32
  for key,threshold in lt.items():
30
33
  num = len(sumstats.loc[sumstats[key]<threshold,:])
31
- if verbose:log.write(" -Removing "+ str(num) +" variants with "+key+" < "+ str(threshold)+" ...")
34
+ log.write(" -Removing "+ str(num) +" variants with "+key+" < "+ str(threshold)+" ...", verbose=verbose)
32
35
  sumstats = sumstats.loc[sumstats[key]>threshold,:]
33
36
  for key,threshold in eq.items():
34
37
  num = len(sumstats.loc[sumstats[key]==threshold,:])
35
- if verbose:log.write(" -Removing "+ str(num) +" variants with "+key+" = "+ str(threshold)+" ...")
38
+ log.write(" -Removing "+ str(num) +" variants with "+key+" = "+ str(threshold)+" ...", verbose=verbose)
36
39
  sumstats = sumstats.loc[sumstats[key]!=threshold,:]
37
- if verbose: log.write("Finished filtering values.")
40
+ log.write("Finished filtering values.", verbose=verbose)
38
41
  gc.collect()
39
42
  return sumstats.copy()
40
43
 
41
44
  def filterin(sumstats,lt={},gt={},eq={},remove=False,verbose=True,log=Log()):
42
- if verbose: log.write("Start filtering values:")
45
+ log.write("Start filtering values:", verbose=verbose)
43
46
  for key,threshold in gt.items():
44
47
  num = len(sumstats.loc[sumstats[key]>threshold,:])
45
- if verbose:log.write(" -Keeping "+ str(num) +" variants with "+key+" > "+ str(threshold)+" ...")
48
+ log.write(" -Keeping "+ str(num) +" variants with "+key+" > "+ str(threshold)+" ...", verbose=verbose)
46
49
  sumstats = sumstats.loc[sumstats[key]>threshold,:]
47
50
  for key,threshold in lt.items():
48
51
  num = len(sumstats.loc[sumstats[key]<threshold,:])
49
- if verbose:log.write(" -Keeping "+ str(num) +" variants with "+key+" < "+ str(threshold)+" ...")
52
+ log.write(" -Keeping "+ str(num) +" variants with "+key+" < "+ str(threshold)+" ...", verbose=verbose)
50
53
  sumstats = sumstats.loc[sumstats[key]<threshold,:]
51
54
  for key,threshold in eq.items():
52
55
  num = len(sumstats.loc[sumstats[key]==threshold,:])
53
- if verbose:log.write(" -Keeping "+ str(num) +" variants with "+key+" = "+ str(threshold)+" ...")
56
+ log.write(" -Keeping "+ str(num) +" variants with "+key+" = "+ str(threshold)+" ...", verbose=verbose)
54
57
  sumstats = sumstats.loc[sumstats[key]==threshold,:]
55
- if verbose: log.write("Finished filtering values.")
58
+ log.write("Finished filtering values.", verbose=verbose)
56
59
  gc.collect()
57
60
  return sumstats.copy()
58
61
 
59
62
  def filterregionin(sumstats,path=None, chrom="CHR",pos="POS", high_ld=False, build="19", verbose=True,log=Log()):
60
63
  sumstats = sortcoordinate(sumstats,verbose=verbose)
61
- if verbose: log.write("Start to filter in variants if in intervals defined in bed files:")
62
- if verbose: log.write(" -Current Dataframe shape :",len(sumstats)," x ", len(sumstats.columns))
64
+ log.write("Start to filter in variants if in intervals defined in bed files:", verbose=verbose)
65
+ log.write(" -Current Dataframe shape :",len(sumstats)," x ", len(sumstats.columns), verbose=verbose)
63
66
 
64
67
  if high_ld is True:
65
68
  path = get_high_ld(build=build)
66
- if verbose: log.write(" -Loading bed format file for hg"+build)
69
+ log.write(" -Loading bed format file for hg"+build, verbose=verbose)
67
70
 
68
71
  else:
69
- if verbose: log.write(" -Loading bed format file: " , path)
72
+ log.write(" -Loading bed format file: " , path, verbose=verbose)
70
73
  bed = pd.read_csv(path,sep="\s+",header=None,dtype={0:"string",1:"Int64",2:"Int64"})
71
74
 
72
75
  bed["tuple"] = bed.apply(lambda x: (x[1],x[2]),axis=1)
@@ -78,7 +81,7 @@ def filterregionin(sumstats,path=None, chrom="CHR",pos="POS", high_ld=False, bui
78
81
  sumstats = sumstats.sort_values(["CHR","POS"])
79
82
 
80
83
  if len(bed)<100:
81
- if verbose: log.write(" -Bed file < 100 lines: using pd IntervalIndex... ")
84
+ log.write(" -Bed file < 100 lines: using pd IntervalIndex... ", verbose=verbose)
82
85
  for i in sumstats[chrom].unique():
83
86
  if sum(bed[0]==i)>0:
84
87
  interval = pd.IntervalIndex.from_tuples(bed.loc[bed[0]==i,"tuple"])
@@ -86,7 +89,7 @@ def filterregionin(sumstats,path=None, chrom="CHR",pos="POS", high_ld=False, bui
86
89
  else:
87
90
  continue
88
91
  else:
89
- if verbose: log.write(" -Bed file > 100 lines: using two pointers, please make files are all sorted... ")
92
+ log.write(" -Bed file > 100 lines: using two pointers, please make files are all sorted... ", verbose=verbose)
90
93
  bed_num =0
91
94
  bed_chr =bed.iloc[bed_num,0]
92
95
  bed_left =bed.iloc[bed_num,1]
@@ -134,23 +137,23 @@ def filterregionin(sumstats,path=None, chrom="CHR",pos="POS", high_ld=False, bui
134
137
  ## in
135
138
 
136
139
  sumstats = sumstats.loc[sumstats["bed_indicator"],:]
137
- if verbose: log.write(" -Number of variants in the specified regions to keep:",sum(sumstats["bed_indicator"]))
138
- if verbose: log.write(" -Number of variants removed:",sum(~sumstats["bed_indicator"]))
140
+ log.write(" -Number of variants in the specified regions to keep:",sum(sumstats["bed_indicator"]), verbose=verbose)
141
+ log.write(" -Number of variants removed:",sum(~sumstats["bed_indicator"]), verbose=verbose)
139
142
  sumstats = sumstats.drop(columns="bed_indicator")
140
- if verbose: log.write("Finished filtering in variants.")
143
+ log.write("Finished filtering in variants.", verbose=verbose)
141
144
  gc.collect()
142
145
  return sumstats
143
146
 
144
147
  def filterregionout(sumstats, path=None, chrom="CHR",pos="POS", high_ld=False, build="19", verbose=True,log=Log()):
145
148
  sumstats = sortcoordinate(sumstats,verbose=verbose)
146
- if verbose: log.write("Start to filter out variants if in intervals defined in bed files:")
147
- if verbose: log.write(" -Current Dataframe shape :",len(sumstats)," x ", len(sumstats.columns))
149
+ log.write("Start to filter out variants if in intervals defined in bed files:", verbose=verbose)
150
+ log.write(" -Current Dataframe shape :",len(sumstats)," x ", len(sumstats.columns), verbose=verbose)
148
151
  if high_ld is True:
149
152
  path = get_high_ld(build=build)
150
- if verbose: log.write(" -Loading bed format file for hg"+build)
153
+ log.write(" -Loading bed format file for hg"+build, verbose=verbose)
151
154
 
152
155
  else:
153
- if verbose: log.write(" -Loading bed format file: " , path)
156
+ log.write(" -Loading bed format file: " , path, verbose=verbose)
154
157
 
155
158
  bed = pd.read_csv(path,sep="\s+",header=None,dtype={0:"string",1:"Int64",2:"Int64"})
156
159
  bed["tuple"] = bed.apply(lambda x: (x[1],x[2]),axis=1)
@@ -162,7 +165,7 @@ def filterregionout(sumstats, path=None, chrom="CHR",pos="POS", high_ld=False, b
162
165
  bed[0]=bed[0].astype("Int64")
163
166
 
164
167
  if len(bed)<100:
165
- if verbose: log.write(" -Bed file < 100 lines: using pd IntervalIndex... ")
168
+ log.write(" -Bed file < 100 lines: using pd IntervalIndex... ", verbose=verbose)
166
169
  for i in sumstats[chrom].unique():
167
170
  if sum(bed[0]==i)>0:
168
171
  interval = pd.IntervalIndex.from_tuples(bed.loc[bed[0]==i,"tuple"])
@@ -170,7 +173,7 @@ def filterregionout(sumstats, path=None, chrom="CHR",pos="POS", high_ld=False, b
170
173
  else:
171
174
  continue
172
175
  else:
173
- if verbose: log.write(" -Bed file > 100 lines: using two pointers, please make files are all sorted... ")
176
+ log.write(" -Bed file > 100 lines: using two pointers, please make files are all sorted... ", verbose=verbose)
174
177
  bed_num =0
175
178
  bed_chr =bed.iloc[bed_num,0]
176
179
  bed_left =bed.iloc[bed_num,1]
@@ -206,78 +209,93 @@ def filterregionout(sumstats, path=None, chrom="CHR",pos="POS", high_ld=False, b
206
209
  ## out
207
210
 
208
211
  sumstats = sumstats.loc[~sumstats["bed_indicator"],:]
209
- if verbose: log.write(" -Number of variants in the specified regions to exclude:",sum(sumstats["bed_indicator"]))
210
- if verbose: log.write(" -Number of variants left:",len(sumstats))
212
+ log.write(" -Number of variants in the specified regions to exclude:",sum(sumstats["bed_indicator"]), verbose=verbose)
213
+ log.write(" -Number of variants left:",len(sumstats), verbose=verbose)
211
214
  sumstats = sumstats.drop(columns="bed_indicator")
212
- if verbose: log.write("Finished filtering out variants.")
215
+ log.write("Finished filtering out variants.", verbose=verbose)
213
216
  gc.collect()
214
217
  return sumstats
215
218
 
216
219
  def inferbuild(sumstats,status="STATUS",chrom="CHR", pos="POS", ea="EA", nea="NEA",build="19", verbose=True,log=Log()):
220
+ ##start function with col checking##########################################################
221
+ _start_line = "infer genome build version using hapmap3 SNPs"
222
+ _end_line = "inferring genome build version using hapmap3 SNPs"
223
+ _start_cols = [chrom,pos]
224
+ _start_function = ".infer_build()"
225
+ _must_args ={}
226
+
227
+ is_enough_info = start_to(sumstats=sumstats,
228
+ log=log,
229
+ verbose=verbose,
230
+ start_line=_start_line,
231
+ end_line=_end_line,
232
+ start_cols=_start_cols,
233
+ start_function=_start_function,
234
+ **_must_args)
235
+ if is_enough_info == False: return sumstats
236
+ ############################################################################################
237
+
217
238
  inferred_build="Unknown"
218
- if verbose:log.write("Start to infer genome build version using hapmap3 SNPs...")
239
+ log.write("Start to infer genome build version using hapmap3 SNPs...", verbose=verbose)
219
240
  data_path_19 = path.dirname(__file__) + '/data/hapmap3_SNPs/hapmap3_db150_hg19.snplist.gz'
220
241
  data_path_38 = path.dirname(__file__) + '/data/hapmap3_SNPs/hapmap3_db151_hg38.snplist.gz'
221
- if verbose:log.write(" -Loading Hapmap3 variants data...")
242
+ log.write(" -Loading Hapmap3 variants data...", verbose=verbose)
222
243
  hapmap3_ref_19 = pd.read_csv(data_path_19,sep="\s+",usecols=["#CHROM","POS"],dtype={"#CHROM":"string","POS":"string"})
223
244
  hapmap3_ref_38 = pd.read_csv(data_path_38,sep="\s+",usecols=["#CHROM","POS"],dtype={"#CHROM":"string","POS":"string"})
224
245
 
225
- if chrom in sumstats.columns and pos in sumstats.columns:
226
- if verbose: log.write(" -CHR:POS will be used for matching...")
227
- raw_chrpos = sumstats[chrom].astype("string")+":"+sumstats[pos].astype("string")
228
-
229
- hapmap3_ref_19["chr:pos"] = hapmap3_ref_19["#CHROM"]+":"+hapmap3_ref_19["POS"]
230
- hapmap3_ref_38["chr:pos"] = hapmap3_ref_38["#CHROM"]+":"+hapmap3_ref_38["POS"]
231
-
232
- match_count_for_19 = sum(raw_chrpos.isin(hapmap3_ref_19["chr:pos"].values))
233
- match_count_for_38 = sum(raw_chrpos.isin(hapmap3_ref_38["chr:pos"].values))
234
-
235
- if verbose:log.write(" -Matching variants for hg19: num_hg19 = ",match_count_for_19)
236
- if verbose:log.write(" -Matching variants for hg38: num_hg38 = ",match_count_for_38)
237
-
238
- if max(match_count_for_19, match_count_for_38)<10000:
239
- if verbose:log.write(" -Warning: please be cautious due to the limited number of variants.")
240
-
241
- if match_count_for_19 > match_count_for_38:
242
- if verbose:log.write(" -Since num_hg19 >> num_hg38, assigning genome build hg19...")
243
- sumstats.loc[:,status] = vchange_status(sumstats.loc[:,status],1,"9","1")
244
- sumstats.loc[:,status] = vchange_status(sumstats.loc[:,status],2,"9","9")
245
- inferred_build="19"
246
- elif match_count_for_19 < match_count_for_38:
247
- if verbose:log.write(" -Since num_hg19 << num_hg38, assigning genome build hg38...")
248
- sumstats.loc[:,status] = vchange_status(sumstats.loc[:,status],1,"9","3")
249
- sumstats.loc[:,status] = vchange_status(sumstats.loc[:,status],2,"9","8")
250
- inferred_build="38"
251
- else:
252
- if verbose:log.write(" -Since num_hg19 = num_hg38, unable to infer...")
253
- gc.collect()
254
- if verbose:log.write("Finished inferring genome build version using hapmap3 SNPs...")
255
- return sumstats, inferred_build
246
+ log.write(" -CHR:POS will be used for matching...", verbose=verbose)
247
+ raw_chrpos = sumstats[chrom].astype("string")+":"+sumstats[pos].astype("string")
248
+
249
+ hapmap3_ref_19["chr:pos"] = hapmap3_ref_19["#CHROM"]+":"+hapmap3_ref_19["POS"]
250
+ hapmap3_ref_38["chr:pos"] = hapmap3_ref_38["#CHROM"]+":"+hapmap3_ref_38["POS"]
251
+
252
+ match_count_for_19 = sum(raw_chrpos.isin(hapmap3_ref_19["chr:pos"].values))
253
+ match_count_for_38 = sum(raw_chrpos.isin(hapmap3_ref_38["chr:pos"].values))
254
+
255
+ log.write(" -Matching variants for hg19: num_hg19 = ",match_count_for_19, verbose=verbose)
256
+ log.write(" -Matching variants for hg38: num_hg38 = ",match_count_for_38, verbose=verbose)
257
+
258
+ if max(match_count_for_19, match_count_for_38)<10000:
259
+ log.warning("Please be cautious due to the limited number of variants.", verbose=verbose)
260
+
261
+ if match_count_for_19 > match_count_for_38:
262
+ log.write(" -Since num_hg19 >> num_hg38, assigning genome build hg19...", verbose=verbose)
263
+ sumstats[status] = vchange_status(sumstats[status],1,"9","1")
264
+ sumstats[status] = vchange_status(sumstats[status],2,"9","9")
265
+ inferred_build="19"
266
+ elif match_count_for_19 < match_count_for_38:
267
+ log.write(" -Since num_hg19 << num_hg38, assigning genome build hg38...", verbose=verbose)
268
+ sumstats[status] = vchange_status(sumstats[status],1,"9","3")
269
+ sumstats[status] = vchange_status(sumstats[status],2,"9","8")
270
+ inferred_build="38"
256
271
  else:
257
- gc.collect()
258
- raise ValueError("Not enough information to match SNPs. Please check if CHR and POS columns are in your sumstats...")
272
+ log.write(" -Since num_hg19 = num_hg38, unable to infer...", verbose=verbose)
273
+
274
+ finished(log,verbose,_end_line)
275
+ return sumstats, inferred_build
259
276
 
260
277
  def sampling(sumstats,n=1, p=None, verbose=True,log=Log(),**args):
261
- if verbose:log.write("Start to randomly select variants from the sumstats...")
278
+
279
+ log.write("Start to randomly select variants from the sumstats...", verbose=verbose)
262
280
  if p is None:
263
- if verbose:log.write(" -Number of variants selected from the sumstats:",n)
281
+ log.write(" -Number of variants selected from the sumstats:",n, verbose=verbose)
264
282
  if n > len(sumstats):
265
283
  raise ValueError("Please input a number < {}".format(len(sumstats)))
266
284
  else:
267
285
  if p>-0.00000001 and p<1.00000001:
268
- if verbose:log.write(" -Percentage of variants selected from the sumstats: ",p)
286
+ log.write(" -Percentage of variants selected from the sumstats: ",p, verbose=verbose)
269
287
  n = int(len(sumstats)*p)
270
- if verbose:log.write(" -Number of variants selected from the sumstats:",n)
288
+ log.write(" -Number of variants selected from the sumstats:",n, verbose=verbose)
271
289
  else:
272
290
  raise ValueError("Please input a number in (0,1)")
273
291
 
274
292
  if "random_state" in args.keys():
275
- if verbose:log.write(" -Random state (seed): {}".format(args["random_state"]))
293
+ log.write(" -Random state (seed): {}".format(args["random_state"]), verbose=verbose)
276
294
  else:
277
295
  args["random_state"] = np.random.randint(0,4294967295)
278
- if verbose:log.write(" -Random state (seed): {}".format(args["random_state"]))
296
+ log.write(" -Random state (seed): {}".format(args["random_state"]), verbose=verbose)
279
297
  sampled = sumstats.sample(n=n,**args)
280
- if verbose:log.write("Finished sampling...")
298
+ log.write("Finished sampling...", verbose=verbose)
281
299
  gc.collect()
282
300
  return sampled
283
301
 
@@ -301,4 +319,113 @@ def _get_flanking(sumstats, snpid, windowsizekb=500, verbose=True,log=Log(),**ar
301
319
  log.write("Finished extracting variants in the flanking regions.",verbose=verbose)
302
320
 
303
321
  return flanking
304
-
322
+
323
+ def _get_flanking_by_id(sumstats, snpid, windowsizekb=500, verbose=True,log=Log(),**args):
324
+
325
+ log.write("Start to extract variants in the flanking regions using rsID or SNPID...",verbose=verbose)
326
+ log.write(" - Central variants: {}".format(snpid), verbose=verbose)
327
+ log.write(" - Flanking windowsize in kb: {}".format(windowsizekb), verbose=verbose)
328
+
329
+ if type(snpid) == str:
330
+ snpid = [snpid]
331
+
332
+ if "rsID" in sumstats.columns and "SNPID" not in sumstats.columns:
333
+ is_specified = sumstats["rsID"].isin(snpid)
334
+ elif "rsID" not in sumstats.columns and "SNPID" in sumstats.columns:
335
+ is_specified = sumstats["SNPID"].isin(snpid)
336
+ else:
337
+ is_specified = sumstats["rsID"].isin(snpid) | sumstats["SNPID"].isin(snpid)
338
+
339
+ row = sumstats.loc[is_specified,:]
340
+
341
+ is_flanking = None
342
+ for index, row in row.iterrows():
343
+ chrom = row["CHR"]
344
+ left = row["POS"] - 1000 * windowsizekb
345
+ right = row["POS"] + 1000 * windowsizekb
346
+ is_flancking_in_this_region = (sumstats["CHR"] == chrom) & (sumstats["POS"] >= left) & (sumstats["POS"] <= right)
347
+
348
+ log.write(" - Variants in flanking region {}:{}-{} : {}".format(chrom, left, right, sum(is_flancking_in_this_region) ))
349
+
350
+ if is_flanking is None:
351
+ is_flanking = is_flancking_in_this_region
352
+ else:
353
+ is_flanking = is_flanking | is_flancking_in_this_region
354
+
355
+ flanking = sumstats.loc[is_flanking,:]
356
+
357
+ log.write(" - Extracted {} variants in the regions.".format(len(flanking)),verbose=verbose)
358
+ log.write("Finished extracting variants in the flanking regions.",verbose=verbose)
359
+
360
+ return flanking
361
+
362
+ def _get_flanking_by_chrpos(sumstats, chrpos, windowsizekb=500, verbose=True,log=Log(),**args):
363
+
364
+ log.write("Start to extract variants in the flanking regions using CHR and POS...",verbose=verbose)
365
+ log.write(" - Central positions: {}".format(chrpos), verbose=verbose)
366
+ log.write(" - Flanking windowsize in kb: {}".format(windowsizekb), verbose=verbose)
367
+
368
+ if type(chrpos) == tuple:
369
+ chrpos_to_check = [chrpos]
370
+ else:
371
+ chrpos_to_check = chrpos
372
+
373
+ is_flanking = None
374
+
375
+ for index, row in enumerate(chrpos_to_check):
376
+ chrom = row[0]
377
+ left = row[1] - 1000 * windowsizekb
378
+ right = row[1] + 1000 * windowsizekb
379
+ is_flancking_in_this_region = (sumstats["CHR"] == chrom) & (sumstats["POS"] >= left) & (sumstats["POS"] <= right)
380
+
381
+ log.write(" - Variants in flanking region {}:{}-{} : {}".format(chrom, left, right, sum(is_flancking_in_this_region) ))
382
+
383
+ if is_flanking is None:
384
+ is_flanking = is_flancking_in_this_region
385
+ else:
386
+ is_flanking = is_flanking | is_flancking_in_this_region
387
+
388
+ flanking = sumstats.loc[is_flanking,:]
389
+
390
+ log.write(" - Extracted {} variants in the regions.".format(len(flanking)),verbose=verbose)
391
+ log.write("Finished extracting variants in the flanking regions.",verbose=verbose)
392
+
393
+ return flanking
394
+
395
+ def _filter_palindromic(sumstats, mode="in", ea="EA",nea="NEA", log=Log(),verbose=True):
396
+ log.write("Start to filter palindromic variants...",verbose=verbose)
397
+ is_palindromic_snp = is_palindromic(sumstats[[nea,ea]],a1=nea,a2=ea)
398
+
399
+ log.write(" -Identified palindromic variants: {}".format(sum(is_palindromic_snp)),verbose=verbose)
400
+
401
+ if mode=="in":
402
+ palindromic = sumstats.loc[is_palindromic_snp,:]
403
+ else:
404
+ palindromic = sumstats.loc[~is_palindromic_snp,:]
405
+
406
+ log.write("Finished filtering palindromic variants.",verbose=verbose)
407
+ return palindromic
408
+
409
+ def _filter_indel(sumstats, mode="in", ea="EA",nea="NEA", log=Log(),verbose=True):
410
+ log.write("Start to filter indels...",verbose=verbose)
411
+ is_indel = (sumstats[ea].str.len()!=sumstats[nea].str.len())
412
+
413
+ log.write(" -Identified indels: {}".format(sum(is_indel)),verbose=verbose)
414
+ if mode=="in":
415
+ indel = sumstats.loc[is_indel,:]
416
+ else:
417
+ indel = sumstats.loc[~is_indel,:]
418
+ log.write("Finished filtering indels.",verbose=verbose)
419
+ return indel
420
+
421
+ def _filter_snp(sumstats, mode="in", ea="EA",nea="NEA", log=Log(),verbose=True):
422
+ log.write("Start to filter SNPs...",verbose=verbose)
423
+ is_snp = (sumstats[ea].str.len()==1) &(sumstats[nea].str.len()==1)
424
+
425
+ log.write(" -Identified SNPs: {}".format(sum(is_snp)),verbose=verbose)
426
+ if mode=="in":
427
+ snp = sumstats.loc[is_snp,:]
428
+ else:
429
+ snp = sumstats.loc[~is_snp,:]
430
+ log.write("Finished filtering SNPs.",verbose=verbose)
431
+ return snp
@@ -5,9 +5,9 @@ from gwaslab.g_Log import Log
5
5
  import gc
6
6
 
7
7
  def getsignaldensity(insumstats, id="SNPID", chrom="CHR",pos="POS", bwindowsizekb=100,log=Log(),verbose=True):
8
- if verbose:log.write("Start to calculate signal DENSITY...")
9
- sumstats = insumstats.loc[:,[id,chrom,pos]].copy()
10
- if verbose:log.write(" -Calculating DENSITY with windowsize of ",bwindowsizekb ," kb")
8
+ log.write("Start to calculate signal DENSITY..." ,verbose=verbose)
9
+ sumstats = insumstats[[id,chrom,pos]].copy()
10
+ log.write(" -Calculating DENSITY with windowsize of ",bwindowsizekb ," kb",verbose=verbose)
11
11
  #stack=[]
12
12
 
13
13
  large_number = 1000000000
@@ -58,13 +58,13 @@ def getsignaldensity(insumstats, id="SNPID", chrom="CHR",pos="POS", bwindowsizek
58
58
  bmax = sumstats["DENSITY"].max()
59
59
  bmaxid = sumstats["DENSITY"].idxmax()
60
60
 
61
- if verbose:log.write(" -Mean : {} signals per {} kb".format(bmean,bwindowsizekb))
62
- if verbose:log.write(" -SD : {}".format(bsd))
63
- if verbose:log.write(" -Median : {} signals per {} kb".format(bmedian,bwindowsizekb))
64
- if verbose:log.write(" -Max : {} signals per {} kb at variant(s) {}".format(bmax,bwindowsizekb,sumstats.loc[bmaxid,id]))
61
+ log.write(" -Mean : {} signals per {} kb".format(bmean,bwindowsizekb),verbose=verbose)
62
+ log.write(" -SD : {}".format(bsd),verbose=verbose)
63
+ log.write(" -Median : {} signals per {} kb".format(bmedian,bwindowsizekb),verbose=verbose)
64
+ log.write(" -Max : {} signals per {} kb at variant(s) {}".format(bmax,bwindowsizekb,sumstats.loc[bmaxid,id]),verbose=verbose)
65
65
 
66
66
  sumstats = sumstats.drop("TCHR+POS",axis=1)
67
- if verbose:log.write("Finished calculating signal DENSITY successfully!")
67
+ log.write("Finished calculating signal DENSITY successfully!",verbose=verbose)
68
68
  return sumstats["DENSITY"]
69
69
 
70
70
  def assigndensity(insumstats,
@@ -81,7 +81,7 @@ def assigndensity(insumstats,
81
81
  large_number = int(large_number * 10)
82
82
  else:
83
83
  break
84
- sumstats = insumstats.loc[:,[id,chrom,pos]].copy()
84
+ sumstats = insumstats[[id,chrom,pos]].copy()
85
85
  sumstats["DENSITY"] = 0
86
86
  sumstats["TCHR+POS"] = sumstats[chrom]*large_number + sumstats[pos]
87
87
  sig_sumstats["TCHR+POS"] = sig_sumstats[chrom]*large_number + sig_sumstats[pos]
@@ -92,7 +92,7 @@ def assigndensity(insumstats,
92
92
  to_add =(sumstats["TCHR+POS"]>=(row["TCHR+POS"]- 1000*bwindowsizekb)) & (sumstats["TCHR+POS"]<=(row["TCHR+POS"]+ 1000*bwindowsizekb))
93
93
  sumstats.loc[to_add,"DENSITY"] += 1
94
94
  if counter%1000==0:
95
- if verbose:log.write(" -Processed {} signals".format(counter//1000))
95
+ log.write(" -Processed {} signals".format(counter//1000),verbose=verbose)
96
96
  gc.collect()
97
97
 
98
98
  return sumstats["DENSITY"]