gwaslab 3.4.37__py3-none-any.whl → 3.4.38__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of gwaslab might be problematic. Click here for more details.

Files changed (37) hide show
  1. gwaslab/data/formatbook.json +722 -721
  2. gwaslab/g_Log.py +8 -0
  3. gwaslab/g_Sumstats.py +26 -147
  4. gwaslab/g_SumstatsPair.py +6 -2
  5. gwaslab/g_Sumstats_summary.py +3 -3
  6. gwaslab/g_version.py +2 -2
  7. gwaslab/hm_casting.py +29 -15
  8. gwaslab/hm_harmonize_sumstats.py +291 -163
  9. gwaslab/hm_rsid_to_chrpos.py +1 -1
  10. gwaslab/io_preformat_input.py +43 -37
  11. gwaslab/io_to_formats.py +428 -295
  12. gwaslab/qc_check_datatype.py +3 -3
  13. gwaslab/qc_fix_sumstats.py +793 -682
  14. gwaslab/util_ex_calculate_ldmatrix.py +29 -11
  15. gwaslab/util_ex_gwascatalog.py +1 -1
  16. gwaslab/util_ex_ldproxyfinder.py +1 -1
  17. gwaslab/util_ex_process_ref.py +3 -3
  18. gwaslab/util_ex_run_coloc.py +26 -4
  19. gwaslab/util_in_convert_h2.py +1 -1
  20. gwaslab/util_in_fill_data.py +2 -2
  21. gwaslab/util_in_filter_value.py +122 -34
  22. gwaslab/util_in_get_density.py +2 -2
  23. gwaslab/util_in_get_sig.py +41 -9
  24. gwaslab/viz_aux_quickfix.py +24 -19
  25. gwaslab/viz_aux_reposition_text.py +7 -4
  26. gwaslab/viz_aux_save_figure.py +6 -5
  27. gwaslab/viz_plot_compare_af.py +5 -5
  28. gwaslab/viz_plot_miamiplot2.py +28 -20
  29. gwaslab/viz_plot_mqqplot.py +109 -72
  30. gwaslab/viz_plot_qqplot.py +11 -8
  31. gwaslab/viz_plot_regionalplot.py +3 -1
  32. gwaslab/viz_plot_trumpetplot.py +15 -6
  33. {gwaslab-3.4.37.dist-info → gwaslab-3.4.38.dist-info}/METADATA +2 -2
  34. {gwaslab-3.4.37.dist-info → gwaslab-3.4.38.dist-info}/RECORD +37 -37
  35. {gwaslab-3.4.37.dist-info → gwaslab-3.4.38.dist-info}/LICENSE +0 -0
  36. {gwaslab-3.4.37.dist-info → gwaslab-3.4.38.dist-info}/WHEEL +0 -0
  37. {gwaslab-3.4.37.dist-info → gwaslab-3.4.38.dist-info}/top_level.txt +0 -0
@@ -12,6 +12,7 @@ def tofinemapping(sumstats,
12
12
  study=None,
13
13
  bfile=None,
14
14
  vcf=None,
15
+ loci=None,
15
16
  out="./",
16
17
  windowsizekb=1000,
17
18
  n_cores=1,
@@ -27,8 +28,13 @@ def tofinemapping(sumstats,
27
28
  suffixes=[""]
28
29
  if getlead_args is None:
29
30
  getlead_args={"windowsizekb":1000}
30
- sig_df = getsig(sumstats,id="SNPID",chrom="CHR",pos="POS",p="P"+suffixes[0],**getlead_args)
31
-
31
+
32
+ if loci is None:
33
+ log.write(" -Loci were not provided. All significant loci will be automatically extracted...")
34
+ sig_df = getsig(sumstats,id="SNPID",chrom="CHR",pos="POS",p="P"+suffixes[0],**getlead_args)
35
+ else:
36
+ sig_df = sumstats.loc[sumstats["SNPID"].isin(loci),:]
37
+
32
38
  # Drop duplicate!!!!
33
39
  log.write(" -Dropping duplicated SNPIDs...")
34
40
  sumstats = sumstats.drop_duplicates(subset=["SNPID"]).copy()
@@ -170,6 +176,7 @@ def _calculate_ld_r(study, matched_sumstats_snpid, row, bfile_prefix, n_cores, w
170
176
  def _align_sumstats_with_bim(row, locus_sumstats, ref_bim, log=Log(),suffixes=None):
171
177
  if suffixes is None:
172
178
  suffixes=[""]
179
+
173
180
  log.write(" -#variants in locus ({}): {}".format(row["SNPID"],len(locus_sumstats)))
174
181
  # convert category to string
175
182
  locus_sumstats["EA"] = locus_sumstats["EA"].astype("string")
@@ -180,28 +187,35 @@ def _align_sumstats_with_bim(row, locus_sumstats, ref_bim, log=Log(),suffixes=No
180
187
  combined_df = pd.merge(ref_bim, locus_sumstats, on="SNPID",how="inner")
181
188
 
182
189
  # match allele
183
- allele_match = ((combined_df["EA"] == combined_df["EA_bim"]) & (combined_df["NEA"] == combined_df["NEA_bim"]) ) | ((combined_df["EA"] == combined_df["NEA_bim"])& (combined_df["NEA"] == combined_df["EA_bim"]))
184
- log.write(" -#Variants with matched alleles:{}".format(sum(allele_match)))
190
+ perfect_match = ((combined_df["EA"] == combined_df["EA_bim"]) & (combined_df["NEA"] == combined_df["NEA_bim"]) )
191
+ log.write(" -#Variants with perfect matched alleles:{}".format(sum(perfect_match)))
185
192
 
186
193
  # flipped allele
187
- ea_mis_match = combined_df["EA"] != combined_df["EA_bim"]
188
- log.write(" -#Variants with flipped alleles:{}".format(sum(ea_mis_match)))
194
+ #ea_mis_match = combined_df["EA"] != combined_df["EA_bim"]
195
+ flipped_match = ((combined_df["EA"] == combined_df["NEA_bim"])& (combined_df["NEA"] == combined_df["EA_bim"]))
196
+ log.write(" -#Variants with flipped alleles:{}".format(sum(flipped_match)))
189
197
 
190
- if row["SNPID"] not in combined_df.loc[allele_match,"SNPID"].values:
198
+ allele_match = perfect_match | flipped_match
199
+ log.write(" -#Total Variants matched:{}".format(sum(allele_match)))
200
+
201
+ if row["SNPID"] not in combined_df.loc[perfect_match,"SNPID"].values:
191
202
  log.write(" -Warning: Lead variant was not available in reference!!!!!!!!!!!!!!!")
192
203
 
193
204
  # adjust statistics
194
205
  output_columns=["SNPID","CHR","POS","EA_bim","NEA_bim"]
195
206
  for suffix in suffixes:
196
207
  if ("BETA"+suffix in locus_sumstats.columns) and ("SE"+suffix in locus_sumstats.columns):
197
- combined_df.loc[ea_mis_match,"BETA"+suffix] = - combined_df.loc[ea_mis_match,"BETA"+suffix]
208
+ log.write(" -Flipping BETA{} for variants with flipped alleles...".format(suffix))
209
+ combined_df.loc[flipped_match,"BETA"+suffix] = - combined_df.loc[flipped_match,"BETA"+suffix]
198
210
  output_columns.append("BETA"+suffix)
199
211
  output_columns.append("SE"+suffix)
200
212
  if "Z" in locus_sumstats.columns:
201
- combined_df.loc[ea_mis_match,"Z"+suffix] = - combined_df.loc[ea_mis_match,"Z"+suffix]
213
+ log.write(" -Flipping Z{} for variants with flipped alleles...".format(suffix))
214
+ combined_df.loc[flipped_match,"Z"+suffix] = - combined_df.loc[flipped_match,"Z"+suffix]
202
215
  output_columns.append("Z"+suffix)
203
216
  if "EAF" in locus_sumstats.columns:
204
- combined_df.loc[ea_mis_match,"EAF"+suffix] = 1 - combined_df.loc[ea_mis_match,"EAF"+suffix]
217
+ log.write(" -Flipping EAF{} for variants with flipped alleles...".format(suffix))
218
+ combined_df.loc[flipped_match,"EAF"+suffix] = 1 - combined_df.loc[flipped_match,"EAF"+suffix]
205
219
  output_columns.append("EAF"+suffix)
206
220
  if "N" in locus_sumstats.columns:
207
221
  output_columns.append("N"+suffix)
@@ -215,6 +229,7 @@ def _export_snplist_and_locus_sumstats(matched_sumstats, out, study, row, window
215
229
  matched_snp_list_path = "{}/{}_{}_{}.snplist.raw".format(out.rstrip("/"), study, row["SNPID"] ,windowsizekb)
216
230
 
217
231
  matched_sumstats["SNPID"].to_csv(matched_snp_list_path, index=None, header=None)
232
+ log.write(" -Exporting SNP list of {} to: {}...".format(len(matched_sumstats) ,matched_snp_list_path))
218
233
 
219
234
  # create locus-sumstats EA, NEA, (BETA, SE), Z
220
235
  matched_sumstats_path = "{}/{}_{}_{}.sumstats.gz".format(out.rstrip("/"), study, row["SNPID"] ,windowsizekb)
@@ -230,7 +245,10 @@ def _export_snplist_and_locus_sumstats(matched_sumstats, out, study, row, window
230
245
  to_export_columns.append("EAF"+suffix)
231
246
  if "N"+suffix in matched_sumstats.columns:
232
247
  to_export_columns.append("N"+suffix)
233
- matched_sumstats.loc[:, ["SNPID"]+to_export_columns].to_csv(matched_sumstats_path, index=None)
248
+
249
+ log.write(" -Exporting locus sumstats to: {}...".format(matched_sumstats_path))
250
+ log.write(" -Exported columns: {}...".format(["SNPID"]+to_export_columns))
251
+ matched_sumstats[ ["SNPID"]+to_export_columns].to_csv(matched_sumstats_path, index=None)
234
252
  return matched_snp_list_path, matched_sumstats_path
235
253
 
236
254
  def _check_snpid_order(snplist_path, matched_sumstats_snpid,log):
@@ -127,7 +127,7 @@ def gwascatalog_trait(efo,source="NCBI",sig_level=5e-8,verbose=True,log=Log()):
127
127
  #rsid locations
128
128
  gwascatalog_lead_snps = pd.DataFrame(records,columns=["SNPID","CHR","POS","REPORT_GENENAME","CLOSEST_GENENAMES","FUNCTION_CLASS","OR","BETA","SE","P","TRAIT","STUDY","PUBMEDID","AUTHOR"])
129
129
  if verbose: log.write(" -Loading retrieved data into gwaslab Sumstats object ...")
130
- sigs = gl.Sumstats(gwascatalog_lead_snps,fmt="gwaslab",other=['REPORT_GENENAME', 'CLOSEST_GENENAMES','TRAIT', 'STUDY', 'PUBMEDID','AUTHOR'],verbose=False)
130
+ sigs = gl.Sumstats(gwascatalog_lead_snps.copy(),fmt="gwaslab",other=['REPORT_GENENAME', 'CLOSEST_GENENAMES','TRAIT', 'STUDY', 'PUBMEDID','AUTHOR'],verbose=False)
131
131
  sigs.fix_pos(verbose=False)
132
132
  sigs.fix_chr(verbose=False)
133
133
  sigs.sort_coordinate(verbose=False)
@@ -46,7 +46,7 @@ def _extract_with_ld_proxy( snplist=None,
46
46
  log=Log(),
47
47
  verbose=True,
48
48
  windowsizekb=100,
49
- ld_threshold=0.8,
49
+ ld_threshold=0.8
50
50
  ):
51
51
  ### Load vcf#######################################################################################
52
52
  if verbose: log.write("Start to load reference genotype...")
@@ -89,7 +89,7 @@ def _load_single_bim_to_ref_bims(bpfile_prefix, ref_bims, log):
89
89
  sep="\s+",
90
90
  usecols=[0,1,3,4,5],
91
91
  header=None,
92
- dtype={1:"string",0:"category", 3:"int", 4:"string", 5:"string"}).rename(columns={1:"SNPID",0:"CHR_bim",3:"POS_bim",4:"NEA_bim",5:"EA_bim"})
92
+ dtype={1:"string",0:"category", 3:"int", 4:"string", 5:"string"}).rename(columns={1:"SNPID",0:"CHR_bim",3:"POS_bim",4:"EA_bim",5:"NEA_bim"})
93
93
  log.write(" -#variants in ref file: {}".format(len(single_bim)))
94
94
  ref_bims.append(single_bim)
95
95
  return ref_bims
@@ -104,7 +104,7 @@ def _load_single_pvar_to_ref_bims(bpfile_prefix, ref_bims, log):
104
104
  usecols=[0,1,2,3,4],
105
105
  header=None,
106
106
  comment="#",
107
- dtype={2:"string",0:"category", 1:"int", 3:"string", 4:"string"}).rename(columns={2:"SNPID",0:"CHR_bim",1:"POS_bim",3:"NEA_bim",4:"EA_bim"})
107
+ dtype={2:"string",0:"category", 1:"int", 3:"string", 4:"string"}).rename(columns={2:"SNPID",0:"CHR_bim",1:"POS_bim",3:"EA_bim",4:"NEA_bim"})
108
108
  log.write(" -#variants in ref file: {}".format(len(single_bim)))
109
109
  ref_bims.append(single_bim)
110
110
  return ref_bims
@@ -265,7 +265,7 @@ def _process_vcf(ref_file_prefix,
265
265
  except subprocess.CalledProcessError as e:
266
266
  log.write(e.output)
267
267
  else:
268
- log.write(" -Plink {} for CHR {} exists. Skipping...".format(convert ,i))
268
+ log.write(" -Plink {} for CHR {} exists: {}. Skipping...".format(convert ,i, bpfile_prefix))
269
269
 
270
270
  if load_bim == True:
271
271
  if convert == "bfile":
@@ -68,12 +68,16 @@ def _run_coloc_susie(filepath, r="Rscript",
68
68
  D1 <- list( "LD"=R, "beta"=df[,"BETA_1"],"varbeta"=df[,"SE_1"]**2,"snp"=df[,"SNPID"],"position"=df[,"POS"],"type"="{type1}","N"={n1}{d1_args})
69
69
  D2 <- list( "LD"=R, "beta"=df[,"BETA_2"],"varbeta"=df[,"SE_2"]**2,"snp"=df[,"SNPID"],"position"=df[,"POS"],"type"="{type2}","N"={n2}{d2_args})
70
70
 
71
+ abf <- coloc.abf(dataset1=D1,dataset2=D2)
72
+ write.csv(t(data.frame(abf$summary)) , "{output_prefix}.coloc.abf", row.names = FALSE)
73
+
71
74
  S1=runsusie(D1{susie_args})
72
75
  S2=runsusie(D2{susie_args})
73
76
 
74
77
  susie.res=coloc.susie(S1,S2{coloc_args})
75
78
 
76
79
  write.csv(susie.res$summary, "{output_prefix}.coloc.susie", row.names = FALSE)
80
+
77
81
  '''.format(sumstats_path = sumstats,
78
82
  ld_r_matrix_path = ld_r_matrix,
79
83
  fillna_script = "R[is.na(R)] <- 0" if fillldna==True else "",
@@ -87,7 +91,9 @@ def _run_coloc_susie(filepath, r="Rscript",
87
91
  coloc_args = coloc_args,
88
92
  output_prefix = output_prefix)
89
93
 
90
- log.write(" -coloc script: {}".format("coloc.susie(S1,S2)"), verbose=verbose)
94
+ log.write(" -coloc abf script: {}".format("coloc.abf(dataset1=D1,dataset2=D2)"), verbose=verbose)
95
+ log.write(" -coloc susie script: {}".format("coloc.susie(S1,S2)"), verbose=verbose)
96
+
91
97
  with open("_{}_{}_gwaslab_coloc_susie_temp.R".format(study,row["SNPID"]),"w") as file:
92
98
  file.write(rscript)
93
99
 
@@ -101,21 +107,37 @@ def _run_coloc_susie(filepath, r="Rscript",
101
107
  #plink_process.kill()
102
108
  log.write(" Running coloc.SuSieR from command line...", verbose=verbose)
103
109
  r_log+= output + "\n"
110
+
111
+ pip_cs = pd.read_csv("{}.coloc.abf".format(output_prefix))
112
+ if len(pip_cs)==0:
113
+ log.write(" -SuSieR result for {} is empty. Please check parameters.".format(output_prefix), verbose=verbose)
114
+ else:
115
+ pip_cs["Locus"] = row["SNPID"]
116
+ pip_cs["STUDY"] = row["study"]
117
+ pip_cs["hit1"] = row["SNPID"]
118
+ pip_cs["METHOD"] = "abf"
119
+ locus_pip_cs = pd.concat([locus_pip_cs,pip_cs],ignore_index=True)
120
+
104
121
  pip_cs = pd.read_csv("{}.coloc.susie".format(output_prefix))
105
122
  if len(pip_cs)==0:
106
123
  log.write(" -SuSieR result for {} is empty. Please check parameters.".format(output_prefix), verbose=verbose)
107
124
  else:
108
125
  pip_cs["Locus"] = row["SNPID"]
109
126
  pip_cs["STUDY"] = row["study"]
127
+ pip_cs["METHOD"] = "susie"
110
128
  locus_pip_cs = pd.concat([locus_pip_cs,pip_cs],ignore_index=True)
129
+
111
130
  os.remove("_{}_{}_gwaslab_coloc_susie_temp.R".format(study,row["SNPID"]))
131
+
112
132
  if delete == True:
113
- os.remove("{}.pipcs".format(output_prefix))
133
+ os.remove("{}.coloc.susie".format(output_prefix))
134
+ os.remove("{}.coloc.abf".format(output_prefix))
114
135
  else:
115
- log.write(" -SuSieR result summary to: {}".format("{}.pipcs".format(output_prefix)), verbose=verbose)
136
+ log.write(" -coloc-abf result summary to: {}".format("{}.coloc.abf".format(output_prefix)), verbose=verbose)
137
+ log.write(" -coloc-susie result summary to: {}".format("{}.coloc.susie".format(output_prefix)), verbose=verbose)
116
138
 
117
139
  except subprocess.CalledProcessError as e:
118
140
  log.write(e.output)
119
141
  os.remove("_{}_{}_gwaslab_coloc_susie_temp.R".format(study,row["SNPID"]))
120
- log.write("Finished finemapping using SuSieR.", verbose=verbose)
142
+ log.write("Finished clocalization using coloc and SuSiE.", verbose=verbose)
121
143
  return locus_pip_cs
@@ -121,7 +121,7 @@ def _get_per_snp_r2(sumstats,
121
121
  if verbose: log.write(" -For r2, {} is used.".format(snpr2))
122
122
  sumstats["F"] = sumstats[snpr2]*(sumstats[n]-1 -k)/((1-sumstats[snpr2]) * k)
123
123
 
124
- if verbose: log.write("Finished calculating per-SNP heritibility!")
124
+ if verbose: log.write("Finished calculating per-SNP heritability!")
125
125
  return sumstats
126
126
  #
127
127
  def get_population_allele_frequency(af, prop, odds_ratio, prevalence,eps=1e-15):
@@ -9,7 +9,7 @@ from gwaslab.g_version import _get_version
9
9
  from gwaslab.qc_check_datatype import check_datatype
10
10
 
11
11
  def filldata(
12
- sumstats,
12
+ insumstats,
13
13
  to_fill=None,
14
14
  df=None,
15
15
  overwrite=False,
@@ -23,7 +23,7 @@ def filldata(
23
23
  # if a string is passed to to_fill, convert it to list
24
24
  if type(to_fill) is str:
25
25
  to_fill = [to_fill]
26
-
26
+ sumstats = insumstats.copy()
27
27
  if verbose: log.write("Start filling data using existing columns...{}".format(_get_version()))
28
28
 
29
29
  check_datatype(sumstats,verbose=verbose,log=log)
@@ -8,6 +8,8 @@ from gwaslab.bd_common_data import get_chr_to_number
8
8
  from gwaslab.g_Log import Log
9
9
  from gwaslab.g_vchange_status import vchange_status
10
10
  from gwaslab.qc_fix_sumstats import sortcoordinate
11
+ from gwaslab.qc_fix_sumstats import start_to
12
+ from gwaslab.qc_fix_sumstats import finished
11
13
 
12
14
  import gc
13
15
  def filtervalues(sumstats,expr,remove=False,verbose=True,log=Log()):
@@ -214,6 +216,24 @@ def filterregionout(sumstats, path=None, chrom="CHR",pos="POS", high_ld=False, b
214
216
  return sumstats
215
217
 
216
218
  def inferbuild(sumstats,status="STATUS",chrom="CHR", pos="POS", ea="EA", nea="NEA",build="19", verbose=True,log=Log()):
219
+ ##start function with col checking##########################################################
220
+ _start_line = "infer genome build version using hapmap3 SNPs"
221
+ _end_line = "inferring genome build version using hapmap3 SNPs"
222
+ _start_cols = [chrom,pos]
223
+ _start_function = ".infer_build()"
224
+ _must_args ={}
225
+
226
+ is_enough_info = start_to(sumstats=sumstats,
227
+ log=log,
228
+ verbose=verbose,
229
+ start_line=_start_line,
230
+ end_line=_end_line,
231
+ start_cols=_start_cols,
232
+ start_function=_start_function,
233
+ **_must_args)
234
+ if is_enough_info == False: return sumstats
235
+ ############################################################################################
236
+
217
237
  inferred_build="Unknown"
218
238
  if verbose:log.write("Start to infer genome build version using hapmap3 SNPs...")
219
239
  data_path_19 = path.dirname(__file__) + '/data/hapmap3_SNPs/hapmap3_db150_hg19.snplist.gz'
@@ -222,42 +242,39 @@ def inferbuild(sumstats,status="STATUS",chrom="CHR", pos="POS", ea="EA", nea="NE
222
242
  hapmap3_ref_19 = pd.read_csv(data_path_19,sep="\s+",usecols=["#CHROM","POS"],dtype={"#CHROM":"string","POS":"string"})
223
243
  hapmap3_ref_38 = pd.read_csv(data_path_38,sep="\s+",usecols=["#CHROM","POS"],dtype={"#CHROM":"string","POS":"string"})
224
244
 
225
- if chrom in sumstats.columns and pos in sumstats.columns:
226
- if verbose: log.write(" -CHR:POS will be used for matching...")
227
- raw_chrpos = sumstats[chrom].astype("string")+":"+sumstats[pos].astype("string")
228
-
229
- hapmap3_ref_19["chr:pos"] = hapmap3_ref_19["#CHROM"]+":"+hapmap3_ref_19["POS"]
230
- hapmap3_ref_38["chr:pos"] = hapmap3_ref_38["#CHROM"]+":"+hapmap3_ref_38["POS"]
231
-
232
- match_count_for_19 = sum(raw_chrpos.isin(hapmap3_ref_19["chr:pos"].values))
233
- match_count_for_38 = sum(raw_chrpos.isin(hapmap3_ref_38["chr:pos"].values))
234
-
235
- if verbose:log.write(" -Matching variants for hg19: num_hg19 = ",match_count_for_19)
236
- if verbose:log.write(" -Matching variants for hg38: num_hg38 = ",match_count_for_38)
237
-
238
- if max(match_count_for_19, match_count_for_38)<10000:
239
- if verbose:log.write(" -Warning: please be cautious due to the limited number of variants.")
240
-
241
- if match_count_for_19 > match_count_for_38:
242
- if verbose:log.write(" -Since num_hg19 >> num_hg38, assigning genome build hg19...")
243
- sumstats.loc[:,status] = vchange_status(sumstats.loc[:,status],1,"9","1")
244
- sumstats.loc[:,status] = vchange_status(sumstats.loc[:,status],2,"9","9")
245
- inferred_build="19"
246
- elif match_count_for_19 < match_count_for_38:
247
- if verbose:log.write(" -Since num_hg19 << num_hg38, assigning genome build hg38...")
248
- sumstats.loc[:,status] = vchange_status(sumstats.loc[:,status],1,"9","3")
249
- sumstats.loc[:,status] = vchange_status(sumstats.loc[:,status],2,"9","8")
250
- inferred_build="38"
251
- else:
252
- if verbose:log.write(" -Since num_hg19 = num_hg38, unable to infer...")
253
- gc.collect()
254
- if verbose:log.write("Finished inferring genome build version using hapmap3 SNPs...")
255
- return sumstats, inferred_build
245
+ if verbose: log.write(" -CHR:POS will be used for matching...")
246
+ raw_chrpos = sumstats[chrom].astype("string")+":"+sumstats[pos].astype("string")
247
+
248
+ hapmap3_ref_19["chr:pos"] = hapmap3_ref_19["#CHROM"]+":"+hapmap3_ref_19["POS"]
249
+ hapmap3_ref_38["chr:pos"] = hapmap3_ref_38["#CHROM"]+":"+hapmap3_ref_38["POS"]
250
+
251
+ match_count_for_19 = sum(raw_chrpos.isin(hapmap3_ref_19["chr:pos"].values))
252
+ match_count_for_38 = sum(raw_chrpos.isin(hapmap3_ref_38["chr:pos"].values))
253
+
254
+ if verbose:log.write(" -Matching variants for hg19: num_hg19 = ",match_count_for_19)
255
+ if verbose:log.write(" -Matching variants for hg38: num_hg38 = ",match_count_for_38)
256
+
257
+ if max(match_count_for_19, match_count_for_38)<10000:
258
+ if verbose:log.write(" -Warning: please be cautious due to the limited number of variants.")
259
+
260
+ if match_count_for_19 > match_count_for_38:
261
+ if verbose:log.write(" -Since num_hg19 >> num_hg38, assigning genome build hg19...")
262
+ sumstats[status] = vchange_status(sumstats[status],1,"9","1")
263
+ sumstats[status] = vchange_status(sumstats[status],2,"9","9")
264
+ inferred_build="19"
265
+ elif match_count_for_19 < match_count_for_38:
266
+ if verbose:log.write(" -Since num_hg19 << num_hg38, assigning genome build hg38...")
267
+ sumstats[status] = vchange_status(sumstats[status],1,"9","3")
268
+ sumstats[status] = vchange_status(sumstats[status],2,"9","8")
269
+ inferred_build="38"
256
270
  else:
257
- gc.collect()
258
- raise ValueError("Not enough information to match SNPs. Please check if CHR and POS columns are in your sumstats...")
271
+ if verbose:log.write(" -Since num_hg19 = num_hg38, unable to infer...")
272
+
273
+ finished(log,verbose,_end_line)
274
+ return sumstats, inferred_build
259
275
 
260
276
  def sampling(sumstats,n=1, p=None, verbose=True,log=Log(),**args):
277
+
261
278
  if verbose:log.write("Start to randomly select variants from the sumstats...")
262
279
  if p is None:
263
280
  if verbose:log.write(" -Number of variants selected from the sumstats:",n)
@@ -301,4 +318,75 @@ def _get_flanking(sumstats, snpid, windowsizekb=500, verbose=True,log=Log(),**ar
301
318
  log.write("Finished extracting variants in the flanking regions.",verbose=verbose)
302
319
 
303
320
  return flanking
304
-
321
+
322
+ def _get_flanking_by_id(sumstats, snpid, windowsizekb=500, verbose=True,log=Log(),**args):
323
+
324
+ log.write("Start to extract variants in the flanking regions using rsID or SNPID...",verbose=verbose)
325
+ log.write(" - Central variants: {}".format(snpid))
326
+ log.write(" - Flanking windowsize in kb: {}".format(windowsizekb))
327
+
328
+ if type(snpid) == str:
329
+ snpid = [snpid]
330
+
331
+ if "rsID" in sumstats.columns and "SNPID" not in sumstats.columns:
332
+ is_specified = sumstats["rsID"].isin(snpid)
333
+ elif "rsID" not in sumstats.columns and "SNPID" in sumstats.columns:
334
+ is_specified = sumstats["SNPID"].isin(snpid)
335
+ else:
336
+ is_specified = sumstats["rsID"].isin(snpid) | sumstats["SNPID"].isin(snpid)
337
+
338
+ row = sumstats.loc[is_specified,:]
339
+
340
+ is_flanking = None
341
+ for index, row in row.iterrows():
342
+ chrom = row["CHR"]
343
+ left = row["POS"] - 1000 * windowsizekb
344
+ right = row["POS"] + 1000 * windowsizekb
345
+ is_flancking_in_this_region = (sumstats["CHR"] == chrom) & (sumstats["POS"] >= left) & (sumstats["POS"] <= right)
346
+
347
+ log.write(" - Variants in flanking region {}:{}-{} : {}".format(chrom, left, right, sum(is_flancking_in_this_region) ))
348
+
349
+ if is_flanking is None:
350
+ is_flanking = is_flancking_in_this_region
351
+ else:
352
+ is_flanking = is_flanking | is_flancking_in_this_region
353
+
354
+ flanking = sumstats.loc[is_flanking,:]
355
+
356
+ log.write(" - Extracted {} variants in the regions.".format(len(flanking)),verbose=verbose)
357
+ log.write("Finished extracting variants in the flanking regions.",verbose=verbose)
358
+
359
+ return flanking
360
+
361
+ def _get_flanking_by_chrpos(sumstats, chrpos, windowsizekb=500, verbose=True,log=Log(),**args):
362
+
363
+ log.write("Start to extract variants in the flanking regions using CHR and POS...",verbose=verbose)
364
+ log.write(" - Central positions: {}".format(chrpos))
365
+ log.write(" - Flanking windowsize in kb: {}".format(windowsizekb))
366
+
367
+ if type(chrpos) == tuple:
368
+ chrpos_to_check = [chrpos]
369
+ else:
370
+ chrpos_to_check = chrpos
371
+
372
+ is_flanking = None
373
+
374
+ for index, row in enumerate(chrpos_to_check):
375
+ chrom = row[0]
376
+ left = row[1] - 1000 * windowsizekb
377
+ right = row[1] + 1000 * windowsizekb
378
+ is_flancking_in_this_region = (sumstats["CHR"] == chrom) & (sumstats["POS"] >= left) & (sumstats["POS"] <= right)
379
+
380
+ log.write(" - Variants in flanking region {}:{}-{} : {}".format(chrom, left, right, sum(is_flancking_in_this_region) ))
381
+
382
+ if is_flanking is None:
383
+ is_flanking = is_flancking_in_this_region
384
+ else:
385
+ is_flanking = is_flanking | is_flancking_in_this_region
386
+
387
+ flanking = sumstats.loc[is_flanking,:]
388
+
389
+ log.write(" - Extracted {} variants in the regions.".format(len(flanking)),verbose=verbose)
390
+ log.write("Finished extracting variants in the flanking regions.",verbose=verbose)
391
+
392
+ return flanking
@@ -6,7 +6,7 @@ import gc
6
6
 
7
7
  def getsignaldensity(insumstats, id="SNPID", chrom="CHR",pos="POS", bwindowsizekb=100,log=Log(),verbose=True):
8
8
  if verbose:log.write("Start to calculate signal DENSITY...")
9
- sumstats = insumstats.loc[:,[id,chrom,pos]].copy()
9
+ sumstats = insumstats[[id,chrom,pos]].copy()
10
10
  if verbose:log.write(" -Calculating DENSITY with windowsize of ",bwindowsizekb ," kb")
11
11
  #stack=[]
12
12
 
@@ -81,7 +81,7 @@ def assigndensity(insumstats,
81
81
  large_number = int(large_number * 10)
82
82
  else:
83
83
  break
84
- sumstats = insumstats.loc[:,[id,chrom,pos]].copy()
84
+ sumstats = insumstats[[id,chrom,pos]].copy()
85
85
  sumstats["DENSITY"] = 0
86
86
  sumstats["TCHR+POS"] = sumstats[chrom]*large_number + sumstats[pos]
87
87
  sig_sumstats["TCHR+POS"] = sig_sumstats[chrom]*large_number + sig_sumstats[pos]
@@ -13,8 +13,9 @@ from gwaslab.bd_common_data import get_chr_to_NC
13
13
  from gwaslab.bd_common_data import gtf_to_protein_coding
14
14
  from gwaslab.bd_download import check_and_download
15
15
  from gwaslab.util_ex_gwascatalog import gwascatalog_trait
16
-
17
-
16
+ from gwaslab.qc_fix_sumstats import check_dataframe_shape
17
+ from gwaslab.qc_fix_sumstats import start_to
18
+ from gwaslab.qc_fix_sumstats import finished
18
19
  # getsig
19
20
  # closest_gene
20
21
  # annogene
@@ -39,8 +40,24 @@ def getsig(insumstats,
39
40
  """
40
41
  Extract the lead variants using a sliding window. P or MLOG10P will be used and converted to SCALEDP for sorting.
41
42
  """
43
+ ##start function with col checking##########################################################
44
+ _start_line = "extract lead variants"
45
+ _end_line = "extracting lead variants"
46
+ _start_cols = [chrom,pos]
47
+ _start_function = ".get_lead()"
48
+ _must_args ={}
49
+
50
+ is_enough_info = start_to(sumstats=insumstats,
51
+ log=log,
52
+ verbose=verbose,
53
+ start_line=_start_line,
54
+ end_line=_end_line,
55
+ start_cols=_start_cols,
56
+ start_function=_start_function,
57
+ **_must_args)
58
+ if is_enough_info == False: return None
59
+ ############################################################################################
42
60
 
43
- if verbose: log.write("Start to extract lead variants...")
44
61
  if verbose: log.write(" -Processing "+str(len(insumstats))+" variants...")
45
62
  if verbose: log.write(" -Significance threshold :", sig_level)
46
63
  if verbose: log.write(" -Sliding window size:", str(windowsizekb) ," kb")
@@ -155,11 +172,9 @@ def getsig(insumstats,
155
172
  source=source,
156
173
  verbose=verbose)
157
174
 
158
- # Finishing
159
- if verbose: log.write("Finished extracting lead variants successfully!")
160
175
  # drop internal id
161
176
  output = output.drop("__ID",axis=1)
162
- gc.collect()
177
+ finished(log,verbose,_end_line)
163
178
  return output.copy()
164
179
 
165
180
 
@@ -329,7 +344,24 @@ def getnovel(insumstats,
329
344
  gwascatalog_source="NCBI",
330
345
  output_known=False,
331
346
  verbose=True):
332
- if verbose: log.write("Start to check if lead variants are known...")
347
+ ##start function with col checking##########################################################
348
+ _start_line = "check if lead variants are known"
349
+ _end_line = "checking if lead variants are known"
350
+ _start_cols = [chrom,pos]
351
+ _start_function = ".get_novel()"
352
+ _must_args ={}
353
+
354
+ is_enough_info = start_to(sumstats=insumstats,
355
+ log=log,
356
+ verbose=verbose,
357
+ start_line=_start_line,
358
+ end_line=_end_line,
359
+ start_cols=_start_cols,
360
+ start_function=_start_function,
361
+ **_must_args)
362
+ if is_enough_info == False: return None
363
+ ############################################################################################
364
+
333
365
  allsig = getsig(insumstats=insumstats,
334
366
  id=id,chrom=chrom,pos=pos,p=p,use_p=use_p,windowsizekb=windowsizekb,sig_level=sig_level,log=log,
335
367
  xymt=xymt,anno=anno,build=build, source=source,verbose=verbose)
@@ -438,8 +470,8 @@ def getnovel(insumstats,
438
470
 
439
471
  if verbose: log.write(" -Identified ",len(allsig)-sum(allsig["NOVEL"])," known vairants in current sumstats...")
440
472
  if verbose: log.write(" -Identified ",sum(allsig["NOVEL"])," novel vairants in current sumstats...")
441
- if verbose: log.write("Finished checking known or novel successfully!")
442
- gc.collect()
473
+
474
+ finished(log,verbose,_end_line)
443
475
 
444
476
  # how to return
445
477
  if only_novel is True: