gwaslab 3.4.47__py3-none-any.whl → 3.4.49__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of gwaslab might be problematic. Click here for more details.
- gwaslab/bd_common_data.py +3 -1
- gwaslab/data/reference.json +10 -2
- gwaslab/g_Sumstats.py +22 -2
- gwaslab/g_vchange_status.py +1 -1
- gwaslab/g_version.py +2 -2
- gwaslab/hm_harmonize_sumstats.py +23 -7
- gwaslab/io_preformat_input.py +73 -8
- gwaslab/io_read_ldsc.py +16 -2
- gwaslab/io_to_formats.py +5 -5
- gwaslab/qc_fix_sumstats.py +109 -7
- gwaslab/util_abf_finemapping.py +67 -0
- gwaslab/util_ex_ldsc.py +8 -1
- gwaslab/util_ex_run_clumping.py +6 -6
- gwaslab/util_in_fill_data.py +20 -2
- gwaslab/viz_aux_annotate_plot.py +2 -1
- gwaslab/viz_aux_quickfix.py +2 -1
- gwaslab/viz_plot_compare_effect.py +4 -2
- gwaslab/viz_plot_miamiplot2.py +10 -9
- gwaslab/viz_plot_mqqplot.py +42 -21
- gwaslab/viz_plot_regional2.py +75 -29
- gwaslab/viz_plot_stackedregional.py +37 -16
- {gwaslab-3.4.47.dist-info → gwaslab-3.4.49.dist-info}/METADATA +15 -15
- {gwaslab-3.4.47.dist-info → gwaslab-3.4.49.dist-info}/RECORD +27 -26
- {gwaslab-3.4.47.dist-info → gwaslab-3.4.49.dist-info}/WHEEL +1 -1
- {gwaslab-3.4.47.dist-info → gwaslab-3.4.49.dist-info}/LICENSE +0 -0
- {gwaslab-3.4.47.dist-info → gwaslab-3.4.49.dist-info}/LICENSE_before_v3.4.39 +0 -0
- {gwaslab-3.4.47.dist-info → gwaslab-3.4.49.dist-info}/top_level.txt +0 -0
gwaslab/util_ex_ldsc.py
CHANGED
|
@@ -304,9 +304,16 @@ def _estimate_h2_by_ldsc(insumstats, log, verbose=True, munge=False, munge_args=
|
|
|
304
304
|
log.write(" -LDSC log:", verbose=verbose)
|
|
305
305
|
summary = estimate_h2(sumstats, default_args, log)
|
|
306
306
|
|
|
307
|
+
results_table = None
|
|
308
|
+
if type(summary) is tuple:
|
|
309
|
+
results_table = summary[1]
|
|
310
|
+
summary = summary[0]
|
|
311
|
+
log.write(" -Coefficient results have been stored in .ldsc_h2_results", verbose=verbose)
|
|
312
|
+
|
|
313
|
+
|
|
307
314
|
log.write(" -Results have been stored in .ldsc_h2", verbose=verbose)
|
|
308
315
|
finished(log=log,verbose=verbose,end_line=_end_line)
|
|
309
|
-
return parse_ldsc_summary(summary)
|
|
316
|
+
return parse_ldsc_summary(summary), results_table
|
|
310
317
|
|
|
311
318
|
|
|
312
319
|
####################################################################################################################
|
gwaslab/util_ex_run_clumping.py
CHANGED
|
@@ -11,7 +11,7 @@ from gwaslab.g_version import _checking_plink_version
|
|
|
11
11
|
def _clump(insumstats, vcf=None, scaled=False, out="clumping_plink2",
|
|
12
12
|
p="P",mlog10p="MLOG10P", overwrite=False, study=None, bfile=None,
|
|
13
13
|
n_cores=1, memory=None, chrom=None, clump_p1=5e-8, clump_p2=5e-8, clump_r2=0.01, clump_kb=250,
|
|
14
|
-
log=Log(),verbose=True):
|
|
14
|
+
log=Log(),verbose=True,plink="plink",plink2="plink2"):
|
|
15
15
|
##start function with col checking##########################################################
|
|
16
16
|
_start_line = "perfrom clumping"
|
|
17
17
|
_end_line = "clumping"
|
|
@@ -111,7 +111,7 @@ def _clump(insumstats, vcf=None, scaled=False, out="clumping_plink2",
|
|
|
111
111
|
bfile_to_use = bfile
|
|
112
112
|
|
|
113
113
|
log.write(" -Performing clumping for CHR {}...".format(i),verbose=verbose)
|
|
114
|
-
log = _checking_plink_version(
|
|
114
|
+
log = _checking_plink_version(plink2=plink2, log=log)
|
|
115
115
|
if memory is not None:
|
|
116
116
|
memory_flag = "--memory {}".format(memory)
|
|
117
117
|
|
|
@@ -123,7 +123,7 @@ def _clump(insumstats, vcf=None, scaled=False, out="clumping_plink2",
|
|
|
123
123
|
if scaled == True:
|
|
124
124
|
# clumping using LOG10P
|
|
125
125
|
script = """
|
|
126
|
-
|
|
126
|
+
{} \
|
|
127
127
|
{}\
|
|
128
128
|
--chr {} \
|
|
129
129
|
--clump {} \
|
|
@@ -136,11 +136,11 @@ def _clump(insumstats, vcf=None, scaled=False, out="clumping_plink2",
|
|
|
136
136
|
--clump-kb {} \
|
|
137
137
|
--threads {} {}\
|
|
138
138
|
--out {}
|
|
139
|
-
""".format(file_flag, chrom, clump, mlog10p,clump_log10_p1, clump_log10_p2, clump_r2, clump_kb, n_cores, memory_flag if memory is not None else "", out_single_chr)
|
|
139
|
+
""".format(plink2, file_flag, chrom, clump, mlog10p,clump_log10_p1, clump_log10_p2, clump_r2, clump_kb, n_cores, memory_flag if memory is not None else "", out_single_chr)
|
|
140
140
|
else:
|
|
141
141
|
# clumping using P
|
|
142
142
|
script = """
|
|
143
|
-
|
|
143
|
+
{} \
|
|
144
144
|
{}\
|
|
145
145
|
--chr {} \
|
|
146
146
|
--clump {} \
|
|
@@ -152,7 +152,7 @@ def _clump(insumstats, vcf=None, scaled=False, out="clumping_plink2",
|
|
|
152
152
|
--clump-kb {} \
|
|
153
153
|
--threads {} {}\
|
|
154
154
|
--out {}
|
|
155
|
-
""".format(file_flag, chrom, clump, p, clump_p1, clump_p2, clump_r2, clump_kb, n_cores,memory_flag if memory is not None else "", out_single_chr)
|
|
155
|
+
""".format(plink2,file_flag, chrom, clump, p, clump_p1, clump_p2, clump_r2, clump_kb, n_cores,memory_flag if memory is not None else "", out_single_chr)
|
|
156
156
|
|
|
157
157
|
try:
|
|
158
158
|
output = subprocess.check_output(script, stderr=subprocess.STDOUT, shell=True,text=True)
|
gwaslab/util_in_fill_data.py
CHANGED
|
@@ -184,7 +184,8 @@ def fill_mlog10p(sumstats,log,verbose=True,filled_count=0):
|
|
|
184
184
|
else:
|
|
185
185
|
return 0,filled_count
|
|
186
186
|
return 1,filled_count
|
|
187
|
-
|
|
187
|
+
|
|
188
|
+
def fill_extreme_mlog10p(sumstats,df,log,verbose=True,filled_count=0):
|
|
188
189
|
# ref: https://stackoverflow.com/questions/46416027/how-to-compute-p-values-from-z-scores-in-r-when-the-z-score-is-large-pvalue-muc/46416222#46416222
|
|
189
190
|
if "Z" in sumstats.columns:
|
|
190
191
|
# P -> MLOG10P
|
|
@@ -198,6 +199,10 @@ def fill_extreme_mlog10p(sumstats,log,verbose=True,filled_count=0):
|
|
|
198
199
|
log.write(" - Filling MLOG10P using Z column...", verbose=verbose)
|
|
199
200
|
sumstats = fill_extreme_mlog10(sumstats, "Z")
|
|
200
201
|
filled_count +=1
|
|
202
|
+
elif "CHISQ" in sumstats.columns and "DOF" in sumstats.columns:
|
|
203
|
+
log.write(" - Filling MLOG10P using CHISQ and DOF column...", verbose=verbose)
|
|
204
|
+
sumstats = fill_extreme_mlog10_chisq(sumstats, "CHISQ", df)
|
|
205
|
+
filled_count +=1
|
|
201
206
|
else:
|
|
202
207
|
return 0,filled_count
|
|
203
208
|
return 1,filled_count
|
|
@@ -223,6 +228,19 @@ def fill_extreme_mlog10(sumstats, z):
|
|
|
223
228
|
sumstats["P_EXPONENT"]= exponent
|
|
224
229
|
return sumstats
|
|
225
230
|
|
|
231
|
+
def fill_extreme_mlog10_chisq(sumstats, chisq, df):
    """Fill MLOG10P, P_MANTISSA and P_EXPONENT from a chi-square statistic.

    The p-value is obtained in log space with ``ss.chi2.logsf`` and converted
    to base 10, then split into a mantissa/exponent pair so that
    ``P = P_MANTISSA * 10**P_EXPONENT``.
    NOTE(review): how far into the tail this stays finite depends on the
    installed SciPy's ``chi2.logsf`` implementation -- confirm for very large
    statistics.

    Parameters
    ----------
    sumstats : pandas.DataFrame
        Table holding the chi-square column; modified in place and returned.
    chisq : str
        Name of the column holding the chi-square statistics.
    df : str, number, array-like or None
        Degrees of freedom. A column label is looked up in ``sumstats``;
        ``None`` falls back to the ``"DOF"`` column (the column the caller
        checks for before dispatching here); any other value is passed to
        SciPy as the degrees of freedom directly.

    Returns
    -------
    pandas.DataFrame
        ``sumstats`` with MLOG10P, P_MANTISSA and P_EXPONENT assigned.

    Raises
    ------
    KeyError
        If ``chisq`` or the resolved degrees-of-freedom column is missing.
    """
    # ref: https://stackoverflow.com/a/46416222/199475
    if df is None:
        # Previously sumstats[None] raised KeyError even though a DOF column was present.
        df = "DOF"
    if isinstance(df, str) or (np.isscalar(df) and df in sumstats.columns):
        dof = sumstats[df]
    else:
        # Numeric (or array-like) degrees of freedom supplied directly.
        dof = df

    log_pvalue = ss.chi2.logsf(sumstats[chisq], dof)
    log10_pvalue = log_pvalue / np.log(10)

    # log10(P) = exponent + fractional part, with the fractional part in [0, 1),
    # which puts the mantissa in [1, 10).
    sumstats["MLOG10P"] = -log10_pvalue
    sumstats["P_MANTISSA"] = 10 ** (log10_pvalue % 1)
    sumstats["P_EXPONENT"] = log10_pvalue // 1
    return sumstats
|
|
243
|
+
|
|
226
244
|
####################################################################################################################
|
|
227
245
|
def fill_iteratively(sumstats,raw_to_fill,log,only_sig,df,extreme,verbose,sig_level):
|
|
228
246
|
to_fill = raw_to_fill.copy()
|
|
@@ -260,7 +278,7 @@ def fill_iteratively(sumstats,raw_to_fill,log,only_sig,df,extreme,verbose,sig_le
|
|
|
260
278
|
# p to -log10(P) ###############################################################################################
|
|
261
279
|
if "MLOG10P" in to_fill:
|
|
262
280
|
if extreme==True:
|
|
263
|
-
status,filled_count = fill_extreme_mlog10p(sumstats,log,verbose=verbose,filled_count=filled_count)
|
|
281
|
+
status,filled_count = fill_extreme_mlog10p(sumstats,df, log,verbose=verbose,filled_count=filled_count)
|
|
264
282
|
filled_count +=1
|
|
265
283
|
elif "P" not in sumstats.columns:
|
|
266
284
|
fill_p(sumstats,log,verbose=verbose)
|
gwaslab/viz_aux_annotate_plot.py
CHANGED
|
@@ -38,6 +38,7 @@ def annotate_single(
|
|
|
38
38
|
region,
|
|
39
39
|
region_anno_bbox_args,
|
|
40
40
|
skip,
|
|
41
|
+
anno_height=1,
|
|
41
42
|
amode="int",
|
|
42
43
|
snpid="SNPID",
|
|
43
44
|
chrom="CHR",
|
|
@@ -131,7 +132,7 @@ def annotate_single(
|
|
|
131
132
|
|
|
132
133
|
#xy=(row["i"],row["scaled_P"]+0.2)
|
|
133
134
|
xy=(row["i"],row["scaled_P"]+0.01*maxy)
|
|
134
|
-
xytext=(last_pos,1.15*maxy*arm_scale)
|
|
135
|
+
xytext=(last_pos,1.15*maxy*arm_scale*anno_height)
|
|
135
136
|
|
|
136
137
|
if anno_fixed_arm_length is not None:
|
|
137
138
|
armB_length_in_point = anno_fixed_arm_length
|
gwaslab/viz_aux_quickfix.py
CHANGED
|
@@ -286,8 +286,9 @@ def _cut(series, mode,cutfactor,cut,skip, ylabels, cut_log, verbose, lines_to_pl
|
|
|
286
286
|
log.write(" -Converting data above cut line...",verbose=verbose)
|
|
287
287
|
if ylabels is not None:
|
|
288
288
|
ylabels = pd.Series(ylabels)
|
|
289
|
-
maxy = series.max()
|
|
290
289
|
series = series.copy()
|
|
290
|
+
|
|
291
|
+
maxy = series.max()
|
|
291
292
|
if "b" not in mode:
|
|
292
293
|
log.write(" -Maximum -log10(P) value is "+str(maxy) +" .", verbose=verbose)
|
|
293
294
|
elif "b" in mode:
|
|
@@ -77,8 +77,10 @@ def compare_effect(path1,
|
|
|
77
77
|
scaled2 = True
|
|
78
78
|
if is_q_mc=="fdr" or is_q_mc=="bon":
|
|
79
79
|
is_q = True
|
|
80
|
-
|
|
81
|
-
|
|
80
|
+
|
|
81
|
+
if is_q == True:
|
|
82
|
+
if is_q_mc not in [False,"fdr","bon","non"]:
|
|
83
|
+
raise ValueError("Please select either fdr or bon or non for is_q_mc.")
|
|
82
84
|
if save_args is None:
|
|
83
85
|
save_args = {"dpi":300,"facecolor":"white"}
|
|
84
86
|
if reg_box is None:
|
gwaslab/viz_plot_miamiplot2.py
CHANGED
|
@@ -247,7 +247,10 @@ def plot_miami2(
|
|
|
247
247
|
plt.subplots_adjust(hspace=region_hspace)
|
|
248
248
|
else:
|
|
249
249
|
fig, ax1, ax5 = figax
|
|
250
|
-
|
|
250
|
+
|
|
251
|
+
#if same_ylim==True:
|
|
252
|
+
#maxy = merged_sumstats[["scaled_P_1","scaled_P_2"]].max().max()
|
|
253
|
+
|
|
251
254
|
log.write("Start to create Manhattan plot for sumstats1...", verbose=verbose)
|
|
252
255
|
fig,log = mqqplot(merged_sumstats,
|
|
253
256
|
chrom="CHR",
|
|
@@ -284,16 +287,14 @@ def plot_miami2(
|
|
|
284
287
|
_if_quick_qc=False,
|
|
285
288
|
**mqq_args2)
|
|
286
289
|
log.write("Finished creating Manhattan plot for sumstats2".format(_get_version()), verbose=verbose)
|
|
290
|
+
|
|
287
291
|
|
|
288
|
-
if same_ylim==True:
|
|
289
|
-
ylim1_converted = ax1.get_ylim()
|
|
290
|
-
ylim2_converted = ax5.get_ylim()
|
|
291
|
-
if ylim1_converted > ylim2_converted:
|
|
292
|
-
ax5.set_ylim(ylim1_converted)
|
|
293
|
-
else:
|
|
294
|
-
ax1.set_ylim(ylim2_converted)
|
|
295
292
|
#####################################################################################################################
|
|
296
|
-
|
|
293
|
+
ax1l, ax1r = ax5.get_xlim()
|
|
294
|
+
ax5l, ax5r = ax1.get_xlim()
|
|
295
|
+
ax1.set_xlim([min(ax1l,ax5l), max(ax1r,ax5r)])
|
|
296
|
+
ax5.set_xlim([min(ax1l,ax5l), max(ax1r,ax5r)])
|
|
297
|
+
#####################################################################################################################
|
|
297
298
|
ax5.set_xlabel("")
|
|
298
299
|
#ax5.set_xticks(chrom_df)
|
|
299
300
|
ax5.set_xticklabels([])
|
gwaslab/viz_plot_mqqplot.py
CHANGED
|
@@ -141,6 +141,7 @@ def mqqplot(insumstats,
|
|
|
141
141
|
anno_max_iter=100,
|
|
142
142
|
arm_offset=50,
|
|
143
143
|
arm_scale=1,
|
|
144
|
+
anno_height=1,
|
|
144
145
|
arm_scale_d=None,
|
|
145
146
|
cut=0,
|
|
146
147
|
skip=0,
|
|
@@ -180,6 +181,7 @@ def mqqplot(insumstats,
|
|
|
180
181
|
xpad=None,
|
|
181
182
|
xpadl=None,
|
|
182
183
|
xpadr=None,
|
|
184
|
+
xtight=False,
|
|
183
185
|
chrpad=0.03,
|
|
184
186
|
drop_chr_start=False,
|
|
185
187
|
title =None,
|
|
@@ -552,7 +554,8 @@ def mqqplot(insumstats,
|
|
|
552
554
|
cut_log = cut_log,
|
|
553
555
|
verbose =verbose,
|
|
554
556
|
lines_to_plot=lines_to_plot,
|
|
555
|
-
log = log
|
|
557
|
+
log = log
|
|
558
|
+
)
|
|
556
559
|
except:
|
|
557
560
|
log.warning("No valid data! Please check the input.")
|
|
558
561
|
return None
|
|
@@ -596,19 +599,23 @@ def mqqplot(insumstats,
|
|
|
596
599
|
sumstats.loc[sumstats["scaled_P"]>-np.log10(sig_level_plot),"s"]=4
|
|
597
600
|
sumstats["chr_hue"]=sumstats[chrom].astype("string")
|
|
598
601
|
|
|
599
|
-
if
|
|
602
|
+
if "r" in mode:
|
|
603
|
+
if vcf_path is None:
|
|
604
|
+
sumstats["LD"]=100
|
|
605
|
+
sumstats["SHAPE"]=1
|
|
600
606
|
sumstats["chr_hue"]=sumstats["LD"]
|
|
607
|
+
|
|
601
608
|
## default seetings
|
|
602
609
|
|
|
603
610
|
palette = sns.color_palette(colors,n_colors=sumstats[chrom].nunique())
|
|
604
|
-
|
|
605
611
|
|
|
606
612
|
legend = None
|
|
607
613
|
style=None
|
|
608
614
|
linewidth=0
|
|
609
615
|
edgecolor="black"
|
|
610
616
|
# if regional plot assign colors
|
|
611
|
-
if
|
|
617
|
+
if "r" in mode:
|
|
618
|
+
#if vcf_path is not None:
|
|
612
619
|
legend=None
|
|
613
620
|
linewidth=1
|
|
614
621
|
if len(region_ref) == 1:
|
|
@@ -631,10 +638,9 @@ def mqqplot(insumstats,
|
|
|
631
638
|
palette[(i+1)*100 + j ] = hex_color
|
|
632
639
|
|
|
633
640
|
edgecolor="none"
|
|
634
|
-
scatter_args["markers"]= region_marker_shapes[:len(region_ref)]
|
|
641
|
+
scatter_args["markers"]= {(i+1):m for i,m in enumerate(region_marker_shapes[:len(region_ref)])}
|
|
635
642
|
style="SHAPE"
|
|
636
|
-
|
|
637
|
-
|
|
643
|
+
|
|
638
644
|
|
|
639
645
|
## if highlight
|
|
640
646
|
highlight_i = pd.DataFrame()
|
|
@@ -977,6 +983,7 @@ def mqqplot(insumstats,
|
|
|
977
983
|
region=region,
|
|
978
984
|
region_anno_bbox_args=region_anno_bbox_args,
|
|
979
985
|
skip=skip,
|
|
986
|
+
anno_height=anno_height,
|
|
980
987
|
snpid=snpid,
|
|
981
988
|
chrom=chrom,
|
|
982
989
|
pos=pos,
|
|
@@ -1040,7 +1047,7 @@ def mqqplot(insumstats,
|
|
|
1040
1047
|
if "qq" in mode:
|
|
1041
1048
|
ax2.set_ylim(ylim)
|
|
1042
1049
|
|
|
1043
|
-
ax1 = _add_pad_to_x_axis(ax1, xpad, xpadl, xpadr, sumstats)
|
|
1050
|
+
ax1 = _add_pad_to_x_axis(ax1, xpad, xpadl, xpadr, sumstats, pos, chrpad, xtight, log = log, verbose=verbose)
|
|
1044
1051
|
|
|
1045
1052
|
# Titles
|
|
1046
1053
|
if title and anno and len(to_annotate)>0:
|
|
@@ -1065,20 +1072,34 @@ def mqqplot(insumstats,
|
|
|
1065
1072
|
|
|
1066
1073
|
|
|
1067
1074
|
|
|
1068
|
-
def _add_pad_to_x_axis(ax1, xpad, xpadl, xpadr, sumstats):
|
|
1075
|
+
def _add_pad_to_x_axis(ax1, xpad, xpadl, xpadr, sumstats, pos, chrpad, xtight, log, verbose):
|
|
1069
1076
|
|
|
1070
|
-
if
|
|
1071
|
-
|
|
1072
|
-
|
|
1073
|
-
|
|
1074
|
-
|
|
1075
|
-
|
|
1076
|
-
|
|
1077
|
-
|
|
1078
|
-
|
|
1079
|
-
|
|
1080
|
-
|
|
1081
|
-
|
|
1077
|
+
if xtight==True:
|
|
1078
|
+
log.write(" -Adjusting X padding on both side : tight mode", verbose=verbose)
|
|
1079
|
+
xmax = sumstats["i"].max()
|
|
1080
|
+
xmin= sumstats["i"].min()
|
|
1081
|
+
ax1.set_xlim([xmin, xmax])
|
|
1082
|
+
|
|
1083
|
+
else:
|
|
1084
|
+
chrpad_to_remove = sumstats[pos].max()*chrpad
|
|
1085
|
+
if ax1 is not None:
|
|
1086
|
+
xmin, xmax = ax1.get_xlim()
|
|
1087
|
+
length = xmax - xmin
|
|
1088
|
+
|
|
1089
|
+
if xpad is not None:
|
|
1090
|
+
log.write(" -Adjusting X padding on both side: {}".format(xpad), verbose=verbose)
|
|
1091
|
+
pad = xpad* length #sumstats["i"].max()
|
|
1092
|
+
ax1.set_xlim([xmin - pad + chrpad_to_remove, xmax + pad - chrpad_to_remove])
|
|
1093
|
+
if xpad is None and xpadl is not None:
|
|
1094
|
+
log.write(" -Adjusting X padding on left side: {}".format(xpadl), verbose=verbose)
|
|
1095
|
+
xmin, xmax = ax1.get_xlim()
|
|
1096
|
+
pad = xpadl*length # sumstats["i"].max()
|
|
1097
|
+
ax1.set_xlim([xmin - pad + chrpad_to_remove ,xmax])
|
|
1098
|
+
if xpad is None and xpadr is not None:
|
|
1099
|
+
log.write(" -Adjusting X padding on right side: {}".format(xpadr), verbose=verbose)
|
|
1100
|
+
xmin, xmax = ax1.get_xlim()
|
|
1101
|
+
pad = xpadr*length # sumstats["i"].max()
|
|
1102
|
+
ax1.set_xlim([xmin, xmax + pad - chrpad_to_remove])
|
|
1082
1103
|
|
|
1083
1104
|
return ax1
|
|
1084
1105
|
|
gwaslab/viz_plot_regional2.py
CHANGED
|
@@ -5,6 +5,7 @@ import matplotlib.patches as patches
|
|
|
5
5
|
import seaborn as sns
|
|
6
6
|
import numpy as np
|
|
7
7
|
import copy
|
|
8
|
+
import re
|
|
8
9
|
import scipy as sp
|
|
9
10
|
from pyensembl import EnsemblRelease
|
|
10
11
|
from allel import GenotypeArray
|
|
@@ -96,19 +97,20 @@ def _plot_regional(
|
|
|
96
97
|
marker_size= marker_size,
|
|
97
98
|
region_marker_shapes=region_marker_shapes,
|
|
98
99
|
log=log,verbose=verbose)
|
|
99
|
-
if lead_id_single is not None:
|
|
100
|
-
|
|
100
|
+
#if lead_id_single is not None:
|
|
101
|
+
lead_ids.append(lead_id_single)
|
|
101
102
|
|
|
102
103
|
# update region_ref to variant rsID or variantID / skip NAs
|
|
103
104
|
new_region_ref = []
|
|
104
105
|
for i in range(len(lead_ids)):
|
|
105
106
|
if lead_ids[i] is None:
|
|
107
|
+
new_region_ref.append(region_ref[i])
|
|
106
108
|
continue
|
|
107
109
|
if region_ref[i] is None:
|
|
108
|
-
if "
|
|
109
|
-
new_name = sumstats.loc[lead_ids[i],"rsID"]
|
|
110
|
-
elif "SNPID" in sumstats.columns:
|
|
110
|
+
if "SNPID" in sumstats.columns:
|
|
111
111
|
new_name = sumstats.loc[lead_ids[i],"SNPID"]
|
|
112
|
+
elif "rsID" in sumstats.columns:
|
|
113
|
+
new_name = sumstats.loc[lead_ids[i],"rsID"]
|
|
112
114
|
else:
|
|
113
115
|
new_name = "chr{}:{}".format(sumstats.loc[lead_ids[i],"CHR"] , sumstats.loc[lead_ids[i],"POS"])
|
|
114
116
|
new_region_ref.append(new_name)
|
|
@@ -162,7 +164,6 @@ def _plot_regional(
|
|
|
162
164
|
lead_snp_ys = []
|
|
163
165
|
lead_snp_is = []
|
|
164
166
|
lead_snp_is_colors = []
|
|
165
|
-
|
|
166
167
|
for i,lead_id_single in enumerate(lead_ids):
|
|
167
168
|
if lead_id_single is not None:
|
|
168
169
|
lead_snp_ys.append(sumstats.loc[lead_id_single,"scaled_P"] )
|
|
@@ -258,11 +259,35 @@ def _get_lead_id(sumstats=None, region_ref=None, log=None, verbose=True):
|
|
|
258
259
|
if type(lead_id) is list:
|
|
259
260
|
if len(lead_id)>0:
|
|
260
261
|
lead_id = int(lead_id[0])
|
|
261
|
-
|
|
262
|
+
|
|
262
263
|
if region_ref_to_check is not None:
|
|
263
264
|
if type(lead_id) is list:
|
|
264
265
|
if len(lead_id)==0 :
|
|
265
|
-
|
|
266
|
+
#try:
|
|
267
|
+
matched_snpid = re.match("(chr)?[0-9]+:[0-9]+:[ATCG]+:[ATCG]+", region_ref_to_check, re.IGNORECASE)
|
|
268
|
+
if matched_snpid is None:
|
|
269
|
+
pass
|
|
270
|
+
else:
|
|
271
|
+
lead_snpid = matched_snpid.group(0).split(":")
|
|
272
|
+
if len(lead_snpid)==4:
|
|
273
|
+
lead_chr= int(lead_snpid[0])
|
|
274
|
+
lead_pos= int(lead_snpid[1])
|
|
275
|
+
lead_ea= lead_snpid[2]
|
|
276
|
+
lead_nea= lead_snpid[3]
|
|
277
|
+
chrpos_match = (sumstats["CHR"] == lead_chr) & (sumstats["POS"] == lead_pos)
|
|
278
|
+
eanea_match = ((sumstats["EA"] == lead_ea) & (sumstats["NEA"] == lead_nea)) | ((sumstats["EA"] == lead_nea) & (sumstats["NEA"] == lead_ea))
|
|
279
|
+
if "rsID" in sumstats.columns:
|
|
280
|
+
lead_id = sumstats.index[chrpos_match&eanea_match].to_list()
|
|
281
|
+
if "SNPID" in sumstats.columns:
|
|
282
|
+
lead_id = sumstats.index[chrpos_match&eanea_match].to_list()
|
|
283
|
+
if type(lead_id) is list:
|
|
284
|
+
if len(lead_id)>0:
|
|
285
|
+
lead_id = int(lead_id[0])
|
|
286
|
+
log.warning("Trying matching variant {} using CHR:POS:EA:NEA to {}... ".format(region_ref_to_check,lead_id))
|
|
287
|
+
|
|
288
|
+
if type(lead_id) is list:
|
|
289
|
+
if len(lead_id)==0 :
|
|
290
|
+
log.warning("Extracting variant: {} not found in sumstats.. Skipping..".format(region_ref_to_check))
|
|
266
291
|
#lead_id = sumstats["scaled_P"].idxmax()
|
|
267
292
|
lead_id = None
|
|
268
293
|
return lead_id
|
|
@@ -275,7 +300,7 @@ def _get_lead_id(sumstats=None, region_ref=None, log=None, verbose=True):
|
|
|
275
300
|
|
|
276
301
|
return lead_id
|
|
277
302
|
|
|
278
|
-
def _pinpoint_lead(sumstats,ax1,region_ref, lead_color, marker_size, log, verbose,region_marker_shapes):
|
|
303
|
+
def _pinpoint_lead(sumstats,ax1,region_ref, lead_color, marker_size, log, verbose, region_marker_shapes):
|
|
279
304
|
|
|
280
305
|
if region_ref is None:
|
|
281
306
|
log.write(" -Extracting lead variant..." , verbose=verbose)
|
|
@@ -416,6 +441,11 @@ def _plot_gene_track(
|
|
|
416
441
|
texts_to_adjust_left = []
|
|
417
442
|
texts_to_adjust_middle = []
|
|
418
443
|
texts_to_adjust_right = []
|
|
444
|
+
|
|
445
|
+
|
|
446
|
+
sig_gene_names=[]
|
|
447
|
+
sig_gene_lefts=[]
|
|
448
|
+
sig_gene_rights=[]
|
|
419
449
|
for index,row in uniq_gene_region.iterrows():
|
|
420
450
|
|
|
421
451
|
gene_color="#020080"
|
|
@@ -426,21 +456,18 @@ def _plot_gene_track(
|
|
|
426
456
|
gene_anno = "<-" + row["name"]
|
|
427
457
|
|
|
428
458
|
|
|
429
|
-
|
|
430
|
-
sig_gene_lefts=[]
|
|
431
|
-
sig_gene_rights=[]
|
|
459
|
+
|
|
432
460
|
for lead_snp_i in lead_snp_is:
|
|
433
461
|
if region_lead_grid is True and lead_snp_i > gene_track_start_i+row["start"] and lead_snp_i < gene_track_start_i+row["end"] :
|
|
434
|
-
|
|
435
|
-
|
|
436
|
-
|
|
437
|
-
|
|
462
|
+
gene_color=region_lead_grid_line["color"]
|
|
463
|
+
sig_gene_names.append(row["name"])
|
|
464
|
+
sig_gene_lefts.append(gene_track_start_i+row["start"])
|
|
465
|
+
sig_gene_rights.append(gene_track_start_i+row["end"])
|
|
438
466
|
|
|
439
467
|
# plot gene line
|
|
440
468
|
ax3.plot((gene_track_start_i+row["start"],gene_track_start_i+row["end"]),
|
|
441
469
|
(row["stack"]*2,row["stack"]*2),color=gene_color,linewidth=linewidth_in_points/10)
|
|
442
470
|
|
|
443
|
-
|
|
444
471
|
# plot gene name
|
|
445
472
|
if row["end"] >= region[2]:
|
|
446
473
|
#right side
|
|
@@ -459,6 +486,7 @@ def _plot_gene_track(
|
|
|
459
486
|
for index,row in exons.iterrows():
|
|
460
487
|
exon_color="#020080"
|
|
461
488
|
for sig_gene_name, sig_gene_left, sig_gene_right in zip(sig_gene_names,sig_gene_lefts,sig_gene_rights):
|
|
489
|
+
|
|
462
490
|
if not pd.isnull(row["name"]):
|
|
463
491
|
if (region_lead_grid is True) and row["name"]==sig_gene_name:
|
|
464
492
|
exon_color = region_lead_grid_line["color"]
|
|
@@ -468,7 +496,7 @@ def _plot_gene_track(
|
|
|
468
496
|
exon_color = region_lead_grid_line["color"]
|
|
469
497
|
else:
|
|
470
498
|
exon_color="#020080"
|
|
471
|
-
|
|
499
|
+
|
|
472
500
|
ax3.plot((gene_track_start_i+row["start"],gene_track_start_i+row["end"]),
|
|
473
501
|
(row["stack"]*2,row["stack"]*2),linewidth=linewidth_in_points*taf[3],color=exon_color,solid_capstyle="butt")
|
|
474
502
|
|
|
@@ -550,24 +578,42 @@ def process_vcf(sumstats,
|
|
|
550
578
|
# figure out lead variant
|
|
551
579
|
lead_id = _get_lead_id(sumstats, region_ref_single, log, verbose)
|
|
552
580
|
|
|
553
|
-
if lead_id is None:
|
|
554
|
-
sumstats[rsq] = None
|
|
555
|
-
sumstats[rsq] = sumstats[rsq].astype("float")
|
|
556
|
-
sumstats[ld_single] = 0
|
|
557
|
-
continue
|
|
558
581
|
|
|
559
|
-
|
|
582
|
+
lead_series = None
|
|
583
|
+
if lead_id is None:
|
|
584
|
+
|
|
585
|
+
matched_snpid = re.match("(chr)?[0-9]+:[0-9]+:[ATCG]+:[ATCG]+",region_ref_single, re.IGNORECASE)
|
|
586
|
+
|
|
587
|
+
if matched_snpid is None:
|
|
588
|
+
sumstats[rsq] = None
|
|
589
|
+
sumstats[rsq] = sumstats[rsq].astype("float")
|
|
590
|
+
sumstats[ld_single] = 0
|
|
591
|
+
continue
|
|
592
|
+
else:
|
|
593
|
+
|
|
594
|
+
lead_snpid = matched_snpid.group(0).split(":")[1:]
|
|
595
|
+
lead_pos = int(lead_snpid[0])
|
|
596
|
+
lead_snpid[0]= int(lead_snpid[0])
|
|
597
|
+
lead_series = pd.Series(lead_snpid)
|
|
598
|
+
else:
|
|
599
|
+
lead_pos = sumstats.loc[lead_id,pos]
|
|
560
600
|
|
|
601
|
+
|
|
561
602
|
# if lead pos is available:
|
|
562
603
|
if lead_pos in ref_genotype["variants/POS"]:
|
|
563
604
|
|
|
564
605
|
# get ref index for lead snp
|
|
565
|
-
|
|
566
|
-
|
|
606
|
+
if lead_series is None:
|
|
607
|
+
lead_snp_ref_index = match_varaint(sumstats.loc[lead_id,[pos,nea,ea]])
|
|
608
|
+
#lead_snp_ref_index = np.where(ref_genotype["variants/POS"] == lead_pos)[0][0]
|
|
609
|
+
else:
|
|
610
|
+
log.warning("Computing LD: {} not found in sumstats but found in reference...Still Computing...".format(region_ref_single))
|
|
611
|
+
lead_snp_ref_index = match_varaint(lead_series)
|
|
567
612
|
|
|
568
613
|
# non-na other snp index
|
|
569
614
|
other_snps_ref_index = sumstats["REFINDEX"].dropna().astype("int").values
|
|
570
615
|
# get genotype
|
|
616
|
+
|
|
571
617
|
lead_snp_genotype = GenotypeArray([ref_genotype["calldata/GT"][lead_snp_ref_index]]).to_n_alt()
|
|
572
618
|
try:
|
|
573
619
|
if len(set(lead_snp_genotype[0]))==1:
|
|
@@ -604,10 +650,10 @@ def process_vcf(sumstats,
|
|
|
604
650
|
sumstats.loc[to_change_color,ld_single] = 1
|
|
605
651
|
to_change_color = sumstats[rsq]>ld_threshold
|
|
606
652
|
sumstats.loc[to_change_color,ld_single] = index+2
|
|
607
|
-
|
|
608
|
-
sumstats.loc[lead_id,ld_single] = len(region_ld_threshold)+2
|
|
609
653
|
|
|
610
|
-
|
|
654
|
+
if lead_series is None:
|
|
655
|
+
sumstats.loc[lead_id,ld_single] = len(region_ld_threshold)+2
|
|
656
|
+
sumstats.loc[lead_id,lead] = 1
|
|
611
657
|
|
|
612
658
|
####################################################################################################
|
|
613
659
|
final_shape_col = "SHAPE"
|