gwaslab 3.4.37__py3-none-any.whl → 3.4.38__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of gwaslab might be problematic. Click here for more details.
- gwaslab/data/formatbook.json +722 -721
- gwaslab/g_Log.py +8 -0
- gwaslab/g_Sumstats.py +26 -147
- gwaslab/g_SumstatsPair.py +6 -2
- gwaslab/g_Sumstats_summary.py +3 -3
- gwaslab/g_version.py +2 -2
- gwaslab/hm_casting.py +29 -15
- gwaslab/hm_harmonize_sumstats.py +291 -163
- gwaslab/hm_rsid_to_chrpos.py +1 -1
- gwaslab/io_preformat_input.py +43 -37
- gwaslab/io_to_formats.py +428 -295
- gwaslab/qc_check_datatype.py +3 -3
- gwaslab/qc_fix_sumstats.py +793 -682
- gwaslab/util_ex_calculate_ldmatrix.py +29 -11
- gwaslab/util_ex_gwascatalog.py +1 -1
- gwaslab/util_ex_ldproxyfinder.py +1 -1
- gwaslab/util_ex_process_ref.py +3 -3
- gwaslab/util_ex_run_coloc.py +26 -4
- gwaslab/util_in_convert_h2.py +1 -1
- gwaslab/util_in_fill_data.py +2 -2
- gwaslab/util_in_filter_value.py +122 -34
- gwaslab/util_in_get_density.py +2 -2
- gwaslab/util_in_get_sig.py +41 -9
- gwaslab/viz_aux_quickfix.py +24 -19
- gwaslab/viz_aux_reposition_text.py +7 -4
- gwaslab/viz_aux_save_figure.py +6 -5
- gwaslab/viz_plot_compare_af.py +5 -5
- gwaslab/viz_plot_miamiplot2.py +28 -20
- gwaslab/viz_plot_mqqplot.py +109 -72
- gwaslab/viz_plot_qqplot.py +11 -8
- gwaslab/viz_plot_regionalplot.py +3 -1
- gwaslab/viz_plot_trumpetplot.py +15 -6
- {gwaslab-3.4.37.dist-info → gwaslab-3.4.38.dist-info}/METADATA +2 -2
- {gwaslab-3.4.37.dist-info → gwaslab-3.4.38.dist-info}/RECORD +37 -37
- {gwaslab-3.4.37.dist-info → gwaslab-3.4.38.dist-info}/LICENSE +0 -0
- {gwaslab-3.4.37.dist-info → gwaslab-3.4.38.dist-info}/WHEEL +0 -0
- {gwaslab-3.4.37.dist-info → gwaslab-3.4.38.dist-info}/top_level.txt +0 -0
|
@@ -12,6 +12,7 @@ def tofinemapping(sumstats,
|
|
|
12
12
|
study=None,
|
|
13
13
|
bfile=None,
|
|
14
14
|
vcf=None,
|
|
15
|
+
loci=None,
|
|
15
16
|
out="./",
|
|
16
17
|
windowsizekb=1000,
|
|
17
18
|
n_cores=1,
|
|
@@ -27,8 +28,13 @@ def tofinemapping(sumstats,
|
|
|
27
28
|
suffixes=[""]
|
|
28
29
|
if getlead_args is None:
|
|
29
30
|
getlead_args={"windowsizekb":1000}
|
|
30
|
-
|
|
31
|
-
|
|
31
|
+
|
|
32
|
+
if loci is None:
|
|
33
|
+
log.write(" -Loci were not provided. All significant loci will be automatically extracted...")
|
|
34
|
+
sig_df = getsig(sumstats,id="SNPID",chrom="CHR",pos="POS",p="P"+suffixes[0],**getlead_args)
|
|
35
|
+
else:
|
|
36
|
+
sig_df = sumstats.loc[sumstats["SNPID"].isin(loci),:]
|
|
37
|
+
|
|
32
38
|
# Drop duplicate!!!!
|
|
33
39
|
log.write(" -Dropping duplicated SNPIDs...")
|
|
34
40
|
sumstats = sumstats.drop_duplicates(subset=["SNPID"]).copy()
|
|
@@ -170,6 +176,7 @@ def _calculate_ld_r(study, matched_sumstats_snpid, row, bfile_prefix, n_cores, w
|
|
|
170
176
|
def _align_sumstats_with_bim(row, locus_sumstats, ref_bim, log=Log(),suffixes=None):
|
|
171
177
|
if suffixes is None:
|
|
172
178
|
suffixes=[""]
|
|
179
|
+
|
|
173
180
|
log.write(" -#variants in locus ({}): {}".format(row["SNPID"],len(locus_sumstats)))
|
|
174
181
|
# convert category to string
|
|
175
182
|
locus_sumstats["EA"] = locus_sumstats["EA"].astype("string")
|
|
@@ -180,28 +187,35 @@ def _align_sumstats_with_bim(row, locus_sumstats, ref_bim, log=Log(),suffixes=No
|
|
|
180
187
|
combined_df = pd.merge(ref_bim, locus_sumstats, on="SNPID",how="inner")
|
|
181
188
|
|
|
182
189
|
# match allele
|
|
183
|
-
|
|
184
|
-
log.write(" -#Variants with matched alleles:{}".format(sum(
|
|
190
|
+
perfect_match = ((combined_df["EA"] == combined_df["EA_bim"]) & (combined_df["NEA"] == combined_df["NEA_bim"]) )
|
|
191
|
+
log.write(" -#Variants with perfect matched alleles:{}".format(sum(perfect_match)))
|
|
185
192
|
|
|
186
193
|
# fliipped allele
|
|
187
|
-
ea_mis_match = combined_df["EA"] != combined_df["EA_bim"]
|
|
188
|
-
|
|
194
|
+
#ea_mis_match = combined_df["EA"] != combined_df["EA_bim"]
|
|
195
|
+
flipped_match = ((combined_df["EA"] == combined_df["NEA_bim"])& (combined_df["NEA"] == combined_df["EA_bim"]))
|
|
196
|
+
log.write(" -#Variants with flipped alleles:{}".format(sum(flipped_match)))
|
|
189
197
|
|
|
190
|
-
|
|
198
|
+
allele_match = perfect_match | flipped_match
|
|
199
|
+
log.write(" -#Total Variants matched:{}".format(sum(allele_match)))
|
|
200
|
+
|
|
201
|
+
if row["SNPID"] not in combined_df.loc[perfect_match,"SNPID"].values:
|
|
191
202
|
log.write(" -Warning: Lead variant was not available in reference!!!!!!!!!!!!!!!")
|
|
192
203
|
|
|
193
204
|
# adjust statistics
|
|
194
205
|
output_columns=["SNPID","CHR","POS","EA_bim","NEA_bim"]
|
|
195
206
|
for suffix in suffixes:
|
|
196
207
|
if ("BETA"+suffix in locus_sumstats.columns) and ("SE"+suffix in locus_sumstats.columns):
|
|
197
|
-
|
|
208
|
+
log.write(" -Flipping BETA{} for variants with flipped alleles...".format(suffix))
|
|
209
|
+
combined_df.loc[flipped_match,"BETA"+suffix] = - combined_df.loc[flipped_match,"BETA"+suffix]
|
|
198
210
|
output_columns.append("BETA"+suffix)
|
|
199
211
|
output_columns.append("SE"+suffix)
|
|
200
212
|
if "Z" in locus_sumstats.columns:
|
|
201
|
-
|
|
213
|
+
log.write(" -Flipping Z{} for variants with flipped alleles...".format(suffix))
|
|
214
|
+
combined_df.loc[flipped_match,"Z"+suffix] = - combined_df.loc[flipped_match,"Z"+suffix]
|
|
202
215
|
output_columns.append("Z"+suffix)
|
|
203
216
|
if "EAF" in locus_sumstats.columns:
|
|
204
|
-
|
|
217
|
+
log.write(" -Flipping EAF{} for variants with flipped alleles...".format(suffix))
|
|
218
|
+
combined_df.loc[flipped_match,"EAF"+suffix] = 1 - combined_df.loc[flipped_match,"EAF"+suffix]
|
|
205
219
|
output_columns.append("EAF"+suffix)
|
|
206
220
|
if "N" in locus_sumstats.columns:
|
|
207
221
|
output_columns.append("N"+suffix)
|
|
@@ -215,6 +229,7 @@ def _export_snplist_and_locus_sumstats(matched_sumstats, out, study, row, window
|
|
|
215
229
|
matched_snp_list_path = "{}/{}_{}_{}.snplist.raw".format(out.rstrip("/"), study, row["SNPID"] ,windowsizekb)
|
|
216
230
|
|
|
217
231
|
matched_sumstats["SNPID"].to_csv(matched_snp_list_path, index=None, header=None)
|
|
232
|
+
log.write(" -Exporting SNP list of {} to: {}...".format(len(matched_sumstats) ,matched_snp_list_path))
|
|
218
233
|
|
|
219
234
|
# create locus-sumstats EA, NEA, (BETA, SE), Z
|
|
220
235
|
matched_sumstats_path = "{}/{}_{}_{}.sumstats.gz".format(out.rstrip("/"), study, row["SNPID"] ,windowsizekb)
|
|
@@ -230,7 +245,10 @@ def _export_snplist_and_locus_sumstats(matched_sumstats, out, study, row, window
|
|
|
230
245
|
to_export_columns.append("EAF"+suffix)
|
|
231
246
|
if "N"+suffix in matched_sumstats.columns:
|
|
232
247
|
to_export_columns.append("N"+suffix)
|
|
233
|
-
|
|
248
|
+
|
|
249
|
+
log.write(" -Exporting locus sumstats to: {}...".format(matched_sumstats_path))
|
|
250
|
+
log.write(" -Exported columns: {}...".format(["SNPID"]+to_export_columns))
|
|
251
|
+
matched_sumstats[ ["SNPID"]+to_export_columns].to_csv(matched_sumstats_path, index=None)
|
|
234
252
|
return matched_snp_list_path, matched_sumstats_path
|
|
235
253
|
|
|
236
254
|
def _check_snpid_order(snplist_path, matched_sumstats_snpid,log):
|
gwaslab/util_ex_gwascatalog.py
CHANGED
|
@@ -127,7 +127,7 @@ def gwascatalog_trait(efo,source="NCBI",sig_level=5e-8,verbose=True,log=Log()):
|
|
|
127
127
|
#rsid locations
|
|
128
128
|
gwascatalog_lead_snps = pd.DataFrame(records,columns=["SNPID","CHR","POS","REPORT_GENENAME","CLOSEST_GENENAMES","FUNCTION_CLASS","OR","BETA","SE","P","TRAIT","STUDY","PUBMEDID","AUTHOR"])
|
|
129
129
|
if verbose: log.write(" -Loading retrieved data into gwaslab Sumstats object ...")
|
|
130
|
-
sigs = gl.Sumstats(gwascatalog_lead_snps,fmt="gwaslab",other=['REPORT_GENENAME', 'CLOSEST_GENENAMES','TRAIT', 'STUDY', 'PUBMEDID','AUTHOR'],verbose=False)
|
|
130
|
+
sigs = gl.Sumstats(gwascatalog_lead_snps.copy(),fmt="gwaslab",other=['REPORT_GENENAME', 'CLOSEST_GENENAMES','TRAIT', 'STUDY', 'PUBMEDID','AUTHOR'],verbose=False)
|
|
131
131
|
sigs.fix_pos(verbose=False)
|
|
132
132
|
sigs.fix_chr(verbose=False)
|
|
133
133
|
sigs.sort_coordinate(verbose=False)
|
gwaslab/util_ex_ldproxyfinder.py
CHANGED
|
@@ -46,7 +46,7 @@ def _extract_with_ld_proxy( snplist=None,
|
|
|
46
46
|
log=Log(),
|
|
47
47
|
verbose=True,
|
|
48
48
|
windowsizekb=100,
|
|
49
|
-
ld_threshold=0.8
|
|
49
|
+
ld_threshold=0.8
|
|
50
50
|
):
|
|
51
51
|
### Load vcf#######################################################################################
|
|
52
52
|
if verbose: log.write("Start to load reference genotype...")
|
gwaslab/util_ex_process_ref.py
CHANGED
|
@@ -89,7 +89,7 @@ def _load_single_bim_to_ref_bims(bpfile_prefix, ref_bims, log):
|
|
|
89
89
|
sep="\s+",
|
|
90
90
|
usecols=[0,1,3,4,5],
|
|
91
91
|
header=None,
|
|
92
|
-
dtype={1:"string",0:"category", 3:"int", 4:"string", 5:"string"}).rename(columns={1:"SNPID",0:"CHR_bim",3:"POS_bim",4:"
|
|
92
|
+
dtype={1:"string",0:"category", 3:"int", 4:"string", 5:"string"}).rename(columns={1:"SNPID",0:"CHR_bim",3:"POS_bim",4:"EA_bim",5:"NEA_bim"})
|
|
93
93
|
log.write(" -#variants in ref file: {}".format(len(single_bim)))
|
|
94
94
|
ref_bims.append(single_bim)
|
|
95
95
|
return ref_bims
|
|
@@ -104,7 +104,7 @@ def _load_single_pvar_to_ref_bims(bpfile_prefix, ref_bims, log):
|
|
|
104
104
|
usecols=[0,1,2,3,4],
|
|
105
105
|
header=None,
|
|
106
106
|
comment="#",
|
|
107
|
-
dtype={2:"string",0:"category", 1:"int", 3:"string", 4:"string"}).rename(columns={2:"SNPID",0:"CHR_bim",1:"POS_bim",3:"
|
|
107
|
+
dtype={2:"string",0:"category", 1:"int", 3:"string", 4:"string"}).rename(columns={2:"SNPID",0:"CHR_bim",1:"POS_bim",3:"EA_bim",4:"NEA_bim"})
|
|
108
108
|
log.write(" -#variants in ref file: {}".format(len(single_bim)))
|
|
109
109
|
ref_bims.append(single_bim)
|
|
110
110
|
return ref_bims
|
|
@@ -265,7 +265,7 @@ def _process_vcf(ref_file_prefix,
|
|
|
265
265
|
except subprocess.CalledProcessError as e:
|
|
266
266
|
log.write(e.output)
|
|
267
267
|
else:
|
|
268
|
-
log.write(" -Plink {} for CHR {} exists. Skipping...".format(convert ,i))
|
|
268
|
+
log.write(" -Plink {} for CHR {} exists: {}. Skipping...".format(convert ,i, bpfile_prefix))
|
|
269
269
|
|
|
270
270
|
if load_bim == True:
|
|
271
271
|
if convert == "bfile":
|
gwaslab/util_ex_run_coloc.py
CHANGED
|
@@ -68,12 +68,16 @@ def _run_coloc_susie(filepath, r="Rscript",
|
|
|
68
68
|
D1 <- list( "LD"=R, "beta"=df[,"BETA_1"],"varbeta"=df[,"SE_1"]**2,"snp"=df[,"SNPID"],"position"=df[,"POS"],"type"="{type1}","N"={n1}{d1_args})
|
|
69
69
|
D2 <- list( "LD"=R, "beta"=df[,"BETA_2"],"varbeta"=df[,"SE_2"]**2,"snp"=df[,"SNPID"],"position"=df[,"POS"],"type"="{type2}","N"={n2}{d2_args})
|
|
70
70
|
|
|
71
|
+
abf <- coloc.abf(dataset1=D1,dataset2=D2)
|
|
72
|
+
write.csv(t(data.frame(abf$summary)) , "{output_prefix}.coloc.abf", row.names = FALSE)
|
|
73
|
+
|
|
71
74
|
S1=runsusie(D1{susie_args})
|
|
72
75
|
S2=runsusie(D2{susie_args})
|
|
73
76
|
|
|
74
77
|
susie.res=coloc.susie(S1,S2{coloc_args})
|
|
75
78
|
|
|
76
79
|
write.csv(susie.res$summary, "{output_prefix}.coloc.susie", row.names = FALSE)
|
|
80
|
+
|
|
77
81
|
'''.format(sumstats_path = sumstats,
|
|
78
82
|
ld_r_matrix_path = ld_r_matrix,
|
|
79
83
|
fillna_script = "R[is.na(R)] <- 0" if fillldna==True else "",
|
|
@@ -87,7 +91,9 @@ def _run_coloc_susie(filepath, r="Rscript",
|
|
|
87
91
|
coloc_args = coloc_args,
|
|
88
92
|
output_prefix = output_prefix)
|
|
89
93
|
|
|
90
|
-
log.write(" -coloc script: {}".format("coloc.
|
|
94
|
+
log.write(" -coloc abf script: {}".format("coloc.abf(dataset1=D1,dataset2=D2)"), verbose=verbose)
|
|
95
|
+
log.write(" -coloc susie script: {}".format("coloc.susie(S1,S2)"), verbose=verbose)
|
|
96
|
+
|
|
91
97
|
with open("_{}_{}_gwaslab_coloc_susie_temp.R".format(study,row["SNPID"]),"w") as file:
|
|
92
98
|
file.write(rscript)
|
|
93
99
|
|
|
@@ -101,21 +107,37 @@ def _run_coloc_susie(filepath, r="Rscript",
|
|
|
101
107
|
#plink_process.kill()
|
|
102
108
|
log.write(" Running coloc.SuSieR from command line...", verbose=verbose)
|
|
103
109
|
r_log+= output + "\n"
|
|
110
|
+
|
|
111
|
+
pip_cs = pd.read_csv("{}.coloc.abf".format(output_prefix))
|
|
112
|
+
if len(pip_cs)==0:
|
|
113
|
+
log.write(" -SuSieR result for {} is empty. Please check parameters.".format(output_prefix), verbose=verbose)
|
|
114
|
+
else:
|
|
115
|
+
pip_cs["Locus"] = row["SNPID"]
|
|
116
|
+
pip_cs["STUDY"] = row["study"]
|
|
117
|
+
pip_cs["hit1"] = row["SNPID"]
|
|
118
|
+
pip_cs["METHOD"] = "abf"
|
|
119
|
+
locus_pip_cs = pd.concat([locus_pip_cs,pip_cs],ignore_index=True)
|
|
120
|
+
|
|
104
121
|
pip_cs = pd.read_csv("{}.coloc.susie".format(output_prefix))
|
|
105
122
|
if len(pip_cs)==0:
|
|
106
123
|
log.write(" -SuSieR result for {} is empty. Please check parameters.".format(output_prefix), verbose=verbose)
|
|
107
124
|
else:
|
|
108
125
|
pip_cs["Locus"] = row["SNPID"]
|
|
109
126
|
pip_cs["STUDY"] = row["study"]
|
|
127
|
+
pip_cs["METHOD"] = "susie"
|
|
110
128
|
locus_pip_cs = pd.concat([locus_pip_cs,pip_cs],ignore_index=True)
|
|
129
|
+
|
|
111
130
|
os.remove("_{}_{}_gwaslab_coloc_susie_temp.R".format(study,row["SNPID"]))
|
|
131
|
+
|
|
112
132
|
if delete == True:
|
|
113
|
-
os.remove("{}.
|
|
133
|
+
os.remove("{}.coloc.susie".format(output_prefix))
|
|
134
|
+
os.remove("{}.coloc.abf".format(output_prefix))
|
|
114
135
|
else:
|
|
115
|
-
log.write(" -
|
|
136
|
+
log.write(" -coloc-abf result summary to: {}".format("{}.coloc.abf".format(output_prefix)), verbose=verbose)
|
|
137
|
+
log.write(" -coloc-susie result summary to: {}".format("{}.coloc.susie".format(output_prefix)), verbose=verbose)
|
|
116
138
|
|
|
117
139
|
except subprocess.CalledProcessError as e:
|
|
118
140
|
log.write(e.output)
|
|
119
141
|
os.remove("_{}_{}_gwaslab_coloc_susie_temp.R".format(study,row["SNPID"]))
|
|
120
|
-
log.write("Finished
|
|
142
|
+
log.write("Finished clocalization using coloc and SuSiE.", verbose=verbose)
|
|
121
143
|
return locus_pip_cs
|
gwaslab/util_in_convert_h2.py
CHANGED
|
@@ -121,7 +121,7 @@ def _get_per_snp_r2(sumstats,
|
|
|
121
121
|
if verbose: log.write(" -For r2, {} is used.".format(snpr2))
|
|
122
122
|
sumstats["F"] = sumstats[snpr2]*(sumstats[n]-1 -k)/((1-sumstats[snpr2]) * k)
|
|
123
123
|
|
|
124
|
-
if verbose: log.write("Finished calculating per-SNP
|
|
124
|
+
if verbose: log.write("Finished calculating per-SNP heritability!")
|
|
125
125
|
return sumstats
|
|
126
126
|
#
|
|
127
127
|
def get_population_allele_frequency(af, prop, odds_ratio, prevalence,eps=1e-15):
|
gwaslab/util_in_fill_data.py
CHANGED
|
@@ -9,7 +9,7 @@ from gwaslab.g_version import _get_version
|
|
|
9
9
|
from gwaslab.qc_check_datatype import check_datatype
|
|
10
10
|
|
|
11
11
|
def filldata(
|
|
12
|
-
|
|
12
|
+
insumstats,
|
|
13
13
|
to_fill=None,
|
|
14
14
|
df=None,
|
|
15
15
|
overwrite=False,
|
|
@@ -23,7 +23,7 @@ def filldata(
|
|
|
23
23
|
# if a string is passed to to_fill, convert it to list
|
|
24
24
|
if type(to_fill) is str:
|
|
25
25
|
to_fill = [to_fill]
|
|
26
|
-
|
|
26
|
+
sumstats = insumstats.copy()
|
|
27
27
|
if verbose: log.write("Start filling data using existing columns...{}".format(_get_version()))
|
|
28
28
|
|
|
29
29
|
check_datatype(sumstats,verbose=verbose,log=log)
|
gwaslab/util_in_filter_value.py
CHANGED
|
@@ -8,6 +8,8 @@ from gwaslab.bd_common_data import get_chr_to_number
|
|
|
8
8
|
from gwaslab.g_Log import Log
|
|
9
9
|
from gwaslab.g_vchange_status import vchange_status
|
|
10
10
|
from gwaslab.qc_fix_sumstats import sortcoordinate
|
|
11
|
+
from gwaslab.qc_fix_sumstats import start_to
|
|
12
|
+
from gwaslab.qc_fix_sumstats import finished
|
|
11
13
|
|
|
12
14
|
import gc
|
|
13
15
|
def filtervalues(sumstats,expr,remove=False,verbose=True,log=Log()):
|
|
@@ -214,6 +216,24 @@ def filterregionout(sumstats, path=None, chrom="CHR",pos="POS", high_ld=False, b
|
|
|
214
216
|
return sumstats
|
|
215
217
|
|
|
216
218
|
def inferbuild(sumstats,status="STATUS",chrom="CHR", pos="POS", ea="EA", nea="NEA",build="19", verbose=True,log=Log()):
|
|
219
|
+
##start function with col checking##########################################################
|
|
220
|
+
_start_line = "infer genome build version using hapmap3 SNPs"
|
|
221
|
+
_end_line = "inferring genome build version using hapmap3 SNPs"
|
|
222
|
+
_start_cols = [chrom,pos]
|
|
223
|
+
_start_function = ".infer_build()"
|
|
224
|
+
_must_args ={}
|
|
225
|
+
|
|
226
|
+
is_enough_info = start_to(sumstats=sumstats,
|
|
227
|
+
log=log,
|
|
228
|
+
verbose=verbose,
|
|
229
|
+
start_line=_start_line,
|
|
230
|
+
end_line=_end_line,
|
|
231
|
+
start_cols=_start_cols,
|
|
232
|
+
start_function=_start_function,
|
|
233
|
+
**_must_args)
|
|
234
|
+
if is_enough_info == False: return sumstats
|
|
235
|
+
############################################################################################
|
|
236
|
+
|
|
217
237
|
inferred_build="Unknown"
|
|
218
238
|
if verbose:log.write("Start to infer genome build version using hapmap3 SNPs...")
|
|
219
239
|
data_path_19 = path.dirname(__file__) + '/data/hapmap3_SNPs/hapmap3_db150_hg19.snplist.gz'
|
|
@@ -222,42 +242,39 @@ def inferbuild(sumstats,status="STATUS",chrom="CHR", pos="POS", ea="EA", nea="NE
|
|
|
222
242
|
hapmap3_ref_19 = pd.read_csv(data_path_19,sep="\s+",usecols=["#CHROM","POS"],dtype={"#CHROM":"string","POS":"string"})
|
|
223
243
|
hapmap3_ref_38 = pd.read_csv(data_path_38,sep="\s+",usecols=["#CHROM","POS"],dtype={"#CHROM":"string","POS":"string"})
|
|
224
244
|
|
|
225
|
-
if
|
|
226
|
-
|
|
227
|
-
|
|
228
|
-
|
|
229
|
-
|
|
230
|
-
|
|
231
|
-
|
|
232
|
-
|
|
233
|
-
|
|
234
|
-
|
|
235
|
-
|
|
236
|
-
|
|
237
|
-
|
|
238
|
-
if
|
|
239
|
-
|
|
240
|
-
|
|
241
|
-
if
|
|
242
|
-
|
|
243
|
-
|
|
244
|
-
|
|
245
|
-
|
|
246
|
-
|
|
247
|
-
|
|
248
|
-
|
|
249
|
-
|
|
250
|
-
inferred_build="38"
|
|
251
|
-
else:
|
|
252
|
-
if verbose:log.write(" -Since num_hg19 = num_hg38, unable to infer...")
|
|
253
|
-
gc.collect()
|
|
254
|
-
if verbose:log.write("Finished inferring genome build version using hapmap3 SNPs...")
|
|
255
|
-
return sumstats, inferred_build
|
|
245
|
+
if verbose: log.write(" -CHR:POS will be used for matching...")
|
|
246
|
+
raw_chrpos = sumstats[chrom].astype("string")+":"+sumstats[pos].astype("string")
|
|
247
|
+
|
|
248
|
+
hapmap3_ref_19["chr:pos"] = hapmap3_ref_19["#CHROM"]+":"+hapmap3_ref_19["POS"]
|
|
249
|
+
hapmap3_ref_38["chr:pos"] = hapmap3_ref_38["#CHROM"]+":"+hapmap3_ref_38["POS"]
|
|
250
|
+
|
|
251
|
+
match_count_for_19 = sum(raw_chrpos.isin(hapmap3_ref_19["chr:pos"].values))
|
|
252
|
+
match_count_for_38 = sum(raw_chrpos.isin(hapmap3_ref_38["chr:pos"].values))
|
|
253
|
+
|
|
254
|
+
if verbose:log.write(" -Matching variants for hg19: num_hg19 = ",match_count_for_19)
|
|
255
|
+
if verbose:log.write(" -Matching variants for hg38: num_hg38 = ",match_count_for_38)
|
|
256
|
+
|
|
257
|
+
if max(match_count_for_19, match_count_for_38)<10000:
|
|
258
|
+
if verbose:log.write(" -Warning: please be cautious due to the limited number of variants.")
|
|
259
|
+
|
|
260
|
+
if match_count_for_19 > match_count_for_38:
|
|
261
|
+
if verbose:log.write(" -Since num_hg19 >> num_hg38, assigning genome build hg19...")
|
|
262
|
+
sumstats[status] = vchange_status(sumstats[status],1,"9","1")
|
|
263
|
+
sumstats[status] = vchange_status(sumstats[status],2,"9","9")
|
|
264
|
+
inferred_build="19"
|
|
265
|
+
elif match_count_for_19 < match_count_for_38:
|
|
266
|
+
if verbose:log.write(" -Since num_hg19 << num_hg38, assigning genome build hg38...")
|
|
267
|
+
sumstats[status] = vchange_status(sumstats[status],1,"9","3")
|
|
268
|
+
sumstats[status] = vchange_status(sumstats[status],2,"9","8")
|
|
269
|
+
inferred_build="38"
|
|
256
270
|
else:
|
|
257
|
-
|
|
258
|
-
|
|
271
|
+
if verbose:log.write(" -Since num_hg19 = num_hg38, unable to infer...")
|
|
272
|
+
|
|
273
|
+
finished(log,verbose,_end_line)
|
|
274
|
+
return sumstats, inferred_build
|
|
259
275
|
|
|
260
276
|
def sampling(sumstats,n=1, p=None, verbose=True,log=Log(),**args):
|
|
277
|
+
|
|
261
278
|
if verbose:log.write("Start to randomly select variants from the sumstats...")
|
|
262
279
|
if p is None:
|
|
263
280
|
if verbose:log.write(" -Number of variants selected from the sumstats:",n)
|
|
@@ -301,4 +318,75 @@ def _get_flanking(sumstats, snpid, windowsizekb=500, verbose=True,log=Log(),**ar
|
|
|
301
318
|
log.write("Finished extracting variants in the flanking regions.",verbose=verbose)
|
|
302
319
|
|
|
303
320
|
return flanking
|
|
304
|
-
|
|
321
|
+
|
|
322
|
+
def _get_flanking_by_id(sumstats, snpid, windowsizekb=500, verbose=True,log=Log(),**args):
    """Extract variants lying within a window around the given central variants.

    Central variants are looked up by matching ``snpid`` against the "rsID"
    and/or "SNPID" columns (whichever are present). For every matched row, all
    variants on the same "CHR" with "POS" inside
    ``[POS - 1000*windowsizekb, POS + 1000*windowsizekb]`` are selected; the
    union over all matched rows is returned.

    Parameters
    ----------
    sumstats : pandas.DataFrame
        Must contain "CHR", "POS" and at least one of "rsID" / "SNPID".
    snpid : str or list-like of str
        Identifier(s) of the central variant(s). A single string is wrapped
        into a one-element list.
    windowsizekb : int or float
        Half-width of the flanking window in kb.
    verbose : bool
        Forwarded to every ``log.write`` call.
    log : Log
        Logger used for progress messages.
    **args
        Accepted for interface compatibility; not used here.

    Returns
    -------
    pandas.DataFrame
        Rows of ``sumstats`` inside any flanking window; empty when no central
        variant was found.

    Raises
    ------
    KeyError
        If neither "rsID" nor "SNPID" is a column of ``sumstats``.
    """
    log.write("Start to extract variants in the flanking regions using rsID or SNPID...",verbose=verbose)
    log.write(" - Central variants: {}".format(snpid),verbose=verbose)
    log.write(" - Flanking windowsize in kb: {}".format(windowsizekb),verbose=verbose)

    if isinstance(snpid, str):
        snpid = [snpid]

    # Match against whichever identifier columns exist.
    id_columns = [col for col in ("rsID", "SNPID") if col in sumstats.columns]
    if not id_columns:
        raise KeyError("Neither 'rsID' nor 'SNPID' was found in sumstats columns.")

    is_specified = pd.Series(False, index=sumstats.index, dtype=bool)
    for col in id_columns:
        is_specified = is_specified | sumstats[col].isin(snpid)

    central_variants = sumstats.loc[is_specified,:]

    # Start from an all-False mask so that "no central variant found" yields
    # an empty result instead of indexing with None.
    is_flanking = pd.Series(False, index=sumstats.index, dtype=bool)
    for _, central in central_variants.iterrows():
        chrom = central["CHR"]
        left = central["POS"] - 1000 * windowsizekb
        right = central["POS"] + 1000 * windowsizekb
        is_flanking_in_this_region = (sumstats["CHR"] == chrom) & (sumstats["POS"] >= left) & (sumstats["POS"] <= right)

        log.write(" - Variants in flanking region {}:{}-{} : {}".format(chrom, left, right, is_flanking_in_this_region.sum() ),verbose=verbose)

        is_flanking = is_flanking | is_flanking_in_this_region

    flanking = sumstats.loc[is_flanking,:]

    log.write(" - Extracted {} variants in the regions.".format(len(flanking)),verbose=verbose)
    log.write("Finished extracting variants in the flanking regions.",verbose=verbose)

    return flanking
|
|
360
|
+
|
|
361
|
+
def _get_flanking_by_chrpos(sumstats, chrpos, windowsizekb=500, verbose=True,log=Log(),**args):
    """Extract variants lying within a window around the given positions.

    For every ``(chrom, pos)`` pair, all variants with "CHR" equal to
    ``chrom`` and "POS" inside
    ``[pos - 1000*windowsizekb, pos + 1000*windowsizekb]`` are selected; the
    union over all pairs is returned.

    Parameters
    ----------
    sumstats : pandas.DataFrame
        Must contain "CHR" and "POS".
    chrpos : tuple or iterable of tuples
        A single ``(chrom, pos)`` tuple, or an iterable of such pairs.
    windowsizekb : int or float
        Half-width of the flanking window in kb.
    verbose : bool
        Forwarded to every ``log.write`` call.
    log : Log
        Logger used for progress messages.
    **args
        Accepted for interface compatibility; not used here.

    Returns
    -------
    pandas.DataFrame
        Rows of ``sumstats`` inside any flanking window; empty when ``chrpos``
        contains no positions.
    """
    log.write("Start to extract variants in the flanking regions using CHR and POS...",verbose=verbose)
    log.write(" - Central positions: {}".format(chrpos),verbose=verbose)
    log.write(" - Flanking windowsize in kb: {}".format(windowsizekb),verbose=verbose)

    # A bare tuple is treated as one (chrom, pos) pair.
    if isinstance(chrpos, tuple):
        chrpos_to_check = [chrpos]
    else:
        chrpos_to_check = chrpos

    # Start from an all-False mask so that an empty list of positions yields
    # an empty result instead of indexing with None.
    is_flanking = pd.Series(False, index=sumstats.index, dtype=bool)

    for position in chrpos_to_check:
        chrom = position[0]
        left = position[1] - 1000 * windowsizekb
        right = position[1] + 1000 * windowsizekb
        is_flanking_in_this_region = (sumstats["CHR"] == chrom) & (sumstats["POS"] >= left) & (sumstats["POS"] <= right)

        log.write(" - Variants in flanking region {}:{}-{} : {}".format(chrom, left, right, is_flanking_in_this_region.sum() ),verbose=verbose)

        is_flanking = is_flanking | is_flanking_in_this_region

    flanking = sumstats.loc[is_flanking,:]

    log.write(" - Extracted {} variants in the regions.".format(len(flanking)),verbose=verbose)
    log.write("Finished extracting variants in the flanking regions.",verbose=verbose)

    return flanking
|
gwaslab/util_in_get_density.py
CHANGED
|
@@ -6,7 +6,7 @@ import gc
|
|
|
6
6
|
|
|
7
7
|
def getsignaldensity(insumstats, id="SNPID", chrom="CHR",pos="POS", bwindowsizekb=100,log=Log(),verbose=True):
|
|
8
8
|
if verbose:log.write("Start to calculate signal DENSITY...")
|
|
9
|
-
sumstats = insumstats
|
|
9
|
+
sumstats = insumstats[[id,chrom,pos]].copy()
|
|
10
10
|
if verbose:log.write(" -Calculating DENSITY with windowsize of ",bwindowsizekb ," kb")
|
|
11
11
|
#stack=[]
|
|
12
12
|
|
|
@@ -81,7 +81,7 @@ def assigndensity(insumstats,
|
|
|
81
81
|
large_number = int(large_number * 10)
|
|
82
82
|
else:
|
|
83
83
|
break
|
|
84
|
-
sumstats = insumstats
|
|
84
|
+
sumstats = insumstats[[id,chrom,pos]].copy()
|
|
85
85
|
sumstats["DENSITY"] = 0
|
|
86
86
|
sumstats["TCHR+POS"] = sumstats[chrom]*large_number + sumstats[pos]
|
|
87
87
|
sig_sumstats["TCHR+POS"] = sig_sumstats[chrom]*large_number + sig_sumstats[pos]
|
gwaslab/util_in_get_sig.py
CHANGED
|
@@ -13,8 +13,9 @@ from gwaslab.bd_common_data import get_chr_to_NC
|
|
|
13
13
|
from gwaslab.bd_common_data import gtf_to_protein_coding
|
|
14
14
|
from gwaslab.bd_download import check_and_download
|
|
15
15
|
from gwaslab.util_ex_gwascatalog import gwascatalog_trait
|
|
16
|
-
|
|
17
|
-
|
|
16
|
+
from gwaslab.qc_fix_sumstats import check_dataframe_shape
|
|
17
|
+
from gwaslab.qc_fix_sumstats import start_to
|
|
18
|
+
from gwaslab.qc_fix_sumstats import finished
|
|
18
19
|
# getsig
|
|
19
20
|
# closest_gene
|
|
20
21
|
# annogene
|
|
@@ -39,8 +40,24 @@ def getsig(insumstats,
|
|
|
39
40
|
"""
|
|
40
41
|
Extract the lead variants using a sliding window. P or MLOG10P will be used and converted to SCALEDP for sorting.
|
|
41
42
|
"""
|
|
43
|
+
##start function with col checking##########################################################
|
|
44
|
+
_start_line = "extract lead variants"
|
|
45
|
+
_end_line = "extracting lead variants"
|
|
46
|
+
_start_cols = [chrom,pos]
|
|
47
|
+
_start_function = ".get_lead()"
|
|
48
|
+
_must_args ={}
|
|
49
|
+
|
|
50
|
+
is_enough_info = start_to(sumstats=insumstats,
|
|
51
|
+
log=log,
|
|
52
|
+
verbose=verbose,
|
|
53
|
+
start_line=_start_line,
|
|
54
|
+
end_line=_end_line,
|
|
55
|
+
start_cols=_start_cols,
|
|
56
|
+
start_function=_start_function,
|
|
57
|
+
**_must_args)
|
|
58
|
+
if is_enough_info == False: return None
|
|
59
|
+
############################################################################################
|
|
42
60
|
|
|
43
|
-
if verbose: log.write("Start to extract lead variants...")
|
|
44
61
|
if verbose: log.write(" -Processing "+str(len(insumstats))+" variants...")
|
|
45
62
|
if verbose: log.write(" -Significance threshold :", sig_level)
|
|
46
63
|
if verbose: log.write(" -Sliding window size:", str(windowsizekb) ," kb")
|
|
@@ -155,11 +172,9 @@ def getsig(insumstats,
|
|
|
155
172
|
source=source,
|
|
156
173
|
verbose=verbose)
|
|
157
174
|
|
|
158
|
-
# Finishing
|
|
159
|
-
if verbose: log.write("Finished extracting lead variants successfully!")
|
|
160
175
|
# drop internal id
|
|
161
176
|
output = output.drop("__ID",axis=1)
|
|
162
|
-
|
|
177
|
+
finished(log,verbose,_end_line)
|
|
163
178
|
return output.copy()
|
|
164
179
|
|
|
165
180
|
|
|
@@ -329,7 +344,24 @@ def getnovel(insumstats,
|
|
|
329
344
|
gwascatalog_source="NCBI",
|
|
330
345
|
output_known=False,
|
|
331
346
|
verbose=True):
|
|
332
|
-
|
|
347
|
+
##start function with col checking##########################################################
|
|
348
|
+
_start_line = "check if lead variants are known"
|
|
349
|
+
_end_line = "checking if lead variants are known"
|
|
350
|
+
_start_cols = [chrom,pos]
|
|
351
|
+
_start_function = ".get_novel()"
|
|
352
|
+
_must_args ={}
|
|
353
|
+
|
|
354
|
+
is_enough_info = start_to(sumstats=insumstats,
|
|
355
|
+
log=log,
|
|
356
|
+
verbose=verbose,
|
|
357
|
+
start_line=_start_line,
|
|
358
|
+
end_line=_end_line,
|
|
359
|
+
start_cols=_start_cols,
|
|
360
|
+
start_function=_start_function,
|
|
361
|
+
**_must_args)
|
|
362
|
+
if is_enough_info == False: return None
|
|
363
|
+
############################################################################################
|
|
364
|
+
|
|
333
365
|
allsig = getsig(insumstats=insumstats,
|
|
334
366
|
id=id,chrom=chrom,pos=pos,p=p,use_p=use_p,windowsizekb=windowsizekb,sig_level=sig_level,log=log,
|
|
335
367
|
xymt=xymt,anno=anno,build=build, source=source,verbose=verbose)
|
|
@@ -438,8 +470,8 @@ def getnovel(insumstats,
|
|
|
438
470
|
|
|
439
471
|
if verbose: log.write(" -Identified ",len(allsig)-sum(allsig["NOVEL"])," known vairants in current sumstats...")
|
|
440
472
|
if verbose: log.write(" -Identified ",sum(allsig["NOVEL"])," novel vairants in current sumstats...")
|
|
441
|
-
|
|
442
|
-
|
|
473
|
+
|
|
474
|
+
finished(log,verbose,_end_line)
|
|
443
475
|
|
|
444
476
|
# how to return
|
|
445
477
|
if only_novel is True:
|