biopipen 0.28.1__py3-none-any.whl → 0.29.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of biopipen might be problematic. Click here for more details.
- biopipen/__init__.py +1 -1
- biopipen/core/config.toml +8 -0
- biopipen/ns/bam.py +0 -2
- biopipen/ns/bed.py +35 -0
- biopipen/ns/cellranger_pipeline.py +5 -5
- biopipen/ns/cnv.py +18 -2
- biopipen/ns/cnvkit_pipeline.py +16 -11
- biopipen/ns/gene.py +68 -23
- biopipen/ns/misc.py +2 -15
- biopipen/ns/plot.py +146 -0
- biopipen/ns/regulation.py +214 -0
- biopipen/ns/scrna.py +15 -3
- biopipen/ns/snp.py +516 -8
- biopipen/ns/stats.py +74 -2
- biopipen/ns/vcf.py +196 -0
- biopipen/reports/snp/PlinkCallRate.svelte +24 -0
- biopipen/reports/snp/PlinkFreq.svelte +18 -0
- biopipen/reports/snp/PlinkHWE.svelte +18 -0
- biopipen/reports/snp/PlinkHet.svelte +18 -0
- biopipen/reports/snp/PlinkIBD.svelte +18 -0
- biopipen/scripts/bam/CNVpytor.py +144 -46
- biopipen/scripts/bed/BedtoolsIntersect.py +54 -0
- biopipen/scripts/bed/BedtoolsMerge.py +1 -1
- biopipen/scripts/cnv/AneuploidyScore.R +30 -7
- biopipen/scripts/cnv/AneuploidyScoreSummary.R +5 -2
- biopipen/scripts/cnv/TMADScore.R +21 -5
- biopipen/scripts/cnv/TMADScoreSummary.R +6 -2
- biopipen/scripts/cnvkit/CNVkitAccess.py +2 -1
- biopipen/scripts/cnvkit/CNVkitAutobin.py +3 -2
- biopipen/scripts/cnvkit/CNVkitBatch.py +1 -1
- biopipen/scripts/cnvkit/CNVkitCoverage.py +2 -1
- biopipen/scripts/cnvkit/CNVkitGuessBaits.py +1 -1
- biopipen/scripts/cnvkit/CNVkitHeatmap.py +1 -1
- biopipen/scripts/cnvkit/CNVkitReference.py +2 -1
- biopipen/scripts/gene/GeneNameConversion.R +65 -0
- biopipen/scripts/gene/GenePromoters.R +61 -0
- biopipen/scripts/misc/Shell.sh +15 -0
- biopipen/scripts/plot/Manhattan.R +140 -0
- biopipen/scripts/plot/QQPlot.R +62 -0
- biopipen/scripts/regulation/MotifAffinityTest.R +226 -0
- biopipen/scripts/regulation/MotifAffinityTest_AtSNP.R +126 -0
- biopipen/scripts/regulation/MotifAffinityTest_MotifBreakR.R +96 -0
- biopipen/scripts/regulation/MotifScan.py +159 -0
- biopipen/scripts/regulation/atSNP.R +33 -0
- biopipen/scripts/regulation/motifBreakR.R +1594 -0
- biopipen/scripts/scrna/MarkersFinder.R +59 -67
- biopipen/scripts/scrna/SeuratClustering.R +63 -29
- biopipen/scripts/scrna/SeuratMap2Ref.R +20 -0
- biopipen/scripts/scrna/SeuratSubClustering.R +76 -27
- biopipen/scripts/snp/MatrixEQTL.R +84 -43
- biopipen/scripts/snp/Plink2GTMat.py +133 -0
- biopipen/scripts/snp/PlinkCallRate.R +190 -0
- biopipen/scripts/snp/PlinkFilter.py +100 -0
- biopipen/scripts/snp/PlinkFreq.R +298 -0
- biopipen/scripts/snp/PlinkFromVcf.py +78 -0
- biopipen/scripts/snp/PlinkHWE.R +80 -0
- biopipen/scripts/snp/PlinkHet.R +92 -0
- biopipen/scripts/snp/PlinkIBD.R +197 -0
- biopipen/scripts/snp/PlinkUpdateName.py +124 -0
- biopipen/scripts/stats/MetaPvalue.R +2 -1
- biopipen/scripts/stats/MetaPvalue1.R +70 -0
- biopipen/scripts/tcr/TCRClusterStats.R +12 -7
- biopipen/scripts/vcf/BcftoolsAnnotate.py +91 -0
- biopipen/scripts/vcf/BcftoolsFilter.py +90 -0
- biopipen/scripts/vcf/BcftoolsSort.py +113 -0
- biopipen/scripts/vcf/BcftoolsView.py +73 -0
- biopipen/scripts/vcf/VcfFix_utils.py +1 -1
- biopipen/scripts/vcf/bcftools_utils.py +52 -0
- biopipen/utils/gene.R +83 -37
- biopipen/utils/gene.py +108 -60
- biopipen/utils/misc.R +56 -0
- biopipen/utils/misc.py +5 -2
- biopipen/utils/reference.py +54 -10
- {biopipen-0.28.1.dist-info → biopipen-0.29.0.dist-info}/METADATA +2 -2
- {biopipen-0.28.1.dist-info → biopipen-0.29.0.dist-info}/RECORD +77 -49
- {biopipen-0.28.1.dist-info → biopipen-0.29.0.dist-info}/entry_points.txt +1 -1
- biopipen/ns/bcftools.py +0 -111
- biopipen/scripts/bcftools/BcftoolsAnnotate.py +0 -42
- biopipen/scripts/bcftools/BcftoolsFilter.py +0 -79
- biopipen/scripts/bcftools/BcftoolsSort.py +0 -19
- biopipen/scripts/gene/GeneNameConversion.py +0 -66
- {biopipen-0.28.1.dist-info → biopipen-0.29.0.dist-info}/WHEEL +0 -0
|
@@ -0,0 +1,298 @@
|
|
|
1
|
+
source("{{biopipen_dir}}/utils/misc.R")
|
|
2
|
+
source("{{biopipen_dir}}/utils/plot.R")
|
|
3
|
+
library(rlang)
|
|
4
|
+
library(ggprism)
|
|
5
|
+
theme_set(theme_prism())
|
|
6
|
+
|
|
7
|
+
# ---- Job parameters (rendered by the pipeline's template engine) ----
indir <- {{in.indir | r}}
outdir <- {{out.outdir | r}}
plink <- {{envs.plink | r}}
ncores <- {{envs.ncores | r}}
modifier <- {{envs.modifier | r}}
gz <- {{envs.gz | r}}
cutoffs <- {{envs.cutoff | r}}
filters <- {{envs.filter | r}}
doplot <- {{envs.plot | r}}
devpars <- {{envs.devpars | r}}

# ---- Locate the plink fileset: exactly one *.bed is expected in indir ----
bedfile <- Sys.glob(file.path(indir, '*.bed'))
n_bed <- length(bedfile)
if (n_bed == 0) {
    stop("No bed files found in the input directory.")
}
if (n_bed > 1) {
    log_warn("Multiple bed files found in the input directory. Using the first one.")
    bedfile <- bedfile[1]
}
# plink takes the fileset prefix, i.e. the path without the .bed extension
input <- tools::file_path_sans_ext(bedfile)
output <- file.path(outdir, basename(input))

modifier <- match.arg(modifier, c("none", "counts", "x"))

# ---- Per-modifier plink flags and the metric a bare (non-list) cutoff refers to ----
# A "case-control" modifier ("--freq case-control", metric MAF_A, report
# ".frq.cc") was sketched by the author but is currently disabled.
freq_setup <- switch(
    modifier,
    counts = list(flags = c("--freq", "counts"), metric = "ALT1_CT"),
    x = list(flags = "--geno-counts", metric = "HOM_ALT1_CT"),
    list(flags = "--freq", metric = "MAF")
)

cmd <- c(
    plink,
    "--threads", ncores,
    "--bfile", input,
    "--out", output,
    freq_setup$flags,
    if (isTRUE(gz)) "gz"
)

# A scalar cutoff is shorthand for a one-element list keyed by the default metric
if (!is.list(cutoffs)) {
    cutoffs <- setNames(list(cutoffs), freq_setup$metric)
}

# ---- Normalise filters to a list keyed like cutoffs ----
if (is.list(filters)) {
    # every explicitly filtered metric needs a cutoff to compare against
    without_cutoff <- Filter(function(nm) is.null(cutoffs[[nm]]), names(filters))
    if (length(without_cutoff) > 0) {
        stop(paste0("Cutoff for filter ", without_cutoff[1], " is not provided."))
    }
} else {
    # a single filter mode applies to every metric
    filters <- as.list(rep(filters, length(cutoffs)))
    names(filters) <- names(cutoffs)
}

run_command(cmd, fg = TRUE)
|
|
63
|
+
|
|
64
|
+
# Post-process a plink frequency/count report against the configured cutoffs.
#
# Reads `<output><suffix>`, optionally augments the table via `modifier`, then
# handles every metric named in the global `cutoffs` list according to the
# matching entry of the global `filters` list (defaulting to "no"):
#   * "no": variants are only flagged; their IDs are written to
#     `<output><suffix>.<metric>.ge` and `<output><suffix>.<metric>.lt`.
#   * "gt" / "lt" / "ge" / "le": variants whose metric compares that way
#     against the cutoff fail; their IDs are written to
#     `<output><suffix>.<metric>.fail` and excluded with plink into
#     `<outdir>/<metric>_filtered/`. Filters are chained: each one starts from
#     the fileset produced by the previous filter.
# When at least one filter ran, the last filtered .bed/.bim/.fam are
# symlinked to `<output>.*`. A histogram per metric is drawn when the global
# `doplot` is TRUE.
#
# Relies on the script-level variables `input`, `output`, `outdir`, `cutoffs`,
# `filters`, `plink`, `ncores`, `doplot` and `devpars`.
#
# @param suffix Extension of the plink report to read (e.g. ".afreq").
# @param snp_col Name of the column holding the variant IDs.
# @param sep Field separator of the report.
# @param modifier Optional function taking and returning the report data
#   frame, applied before the metric columns are looked up.
post_process <- function(suffix, snp_col = "ID", sep = "\t", modifier = NULL) {
    freq <- read.table(
        paste0(output, suffix),
        header = TRUE,
        check.names = FALSE,
        row.names = NULL,
        sep = sep,
        comment.char = ""
    )
    # the header line starts with "#"; drop it from the first column name
    colnames(freq)[1] <- sub("#", "", colnames(freq)[1])
    if (!is.null(modifier)) {
        freq <- modifier(freq)
    }

    # Write a plain list of variant IDs, one per line
    write_ids <- function(ids, file) {
        write.table(ids, file = file, col.names = FALSE, row.names = FALSE, quote = FALSE)
    }

    # Histogram of one metric coloured by Flag, with the cutoff marked.
    # `tail_ggs` holds the branch-specific trailing ggplot layers.
    plot_metric <- function(data, metric_col, cutoff, vline_color, tail_ggs) {
        plotGG(
            data = data,
            geom = "histogram",
            outfile = paste0(output, suffix, ".", metric_col, ".png"),
            args = list(aes(x = !!sym(metric_col), fill = Flag), alpha = 0.8, bins = 50),
            ggs = c(
                sprintf('xlab("%s")', metric_col),
                'ylab("Count")',
                sprintf(
                    'geom_vline(xintercept = %.3f, color = "%s", linetype="dashed")',
                    cutoff, vline_color
                ),
                sprintf(
                    'geom_text(aes(x = %.3f, y = Inf, label = as.character(%.3f)), colour="blue", vjust = 1.5, hjust = -.1)',
                    cutoff, cutoff
                ),
                tail_ggs
            ),
            devpars = devpars
        )
    }

    iter_in <- input   # fileset the next filter starts from
    n_filtered <- 0    # number of metrics that actually filtered variants
    for (metric_col in names(cutoffs)) {
        if (is.null(cutoffs[[metric_col]])) {
            stop(paste0(
                "Cutoff for metric ",
                metric_col,
                " is not provided in ",
                suffix, "(x) file."))
        }
        if (!metric_col %in% colnames(freq)) {
            stop(paste0("Metric ", metric_col, " is not found in ", suffix, "(x) file."))
        }

        freq[[metric_col]] <- as.numeric(freq[[metric_col]])
        cutoff <- cutoffs[[metric_col]]
        filter <- filters[[metric_col]] %||% "no"

        if (filter == "no") {
            # Flag only: split the variants at the cutoff without touching the fileset
            ge_flag <- paste0(metric_col, " >= ", cutoff)
            lt_flag <- paste0(metric_col, " < ", cutoff)
            freq$GE <- freq[[metric_col]] >= cutoff
            freq$Flag <- ifelse(freq$GE, ge_flag, lt_flag)
            freq$Flag <- factor(freq$Flag, levels = c(ge_flag, lt_flag))
            # which() keeps variants with a missing metric out of both lists
            write_ids(
                freq[[snp_col]][which(freq$GE)],
                paste0(output, suffix, ".", metric_col, ".ge")
            )
            write_ids(
                freq[[snp_col]][which(!freq$GE)],
                paste0(output, suffix, ".", metric_col, ".lt")
            )

            if (doplot) {
                plot_metric(
                    freq, metric_col, cutoff, "red",
                    sprintf(
                        'scale_fill_manual(values = c("%s" = "blue3", "%s" = "green3"))',
                        ge_flag, lt_flag
                    )
                )
            }
        } else {
            iter_dir <- file.path(outdir, paste0(metric_col, "_filtered"))
            dir.create(iter_dir, showWarnings = FALSE)
            iter_out <- file.path(iter_dir, basename(output))

            filter <- match.arg(filter, c("gt", "lt", "ge", "le"))
            # TRUE for variants that fail, i.e. satisfy the filter comparison
            is_fail <- switch(
                filter,
                gt = freq[[metric_col]] > cutoff,
                lt = freq[[metric_col]] < cutoff,
                ge = freq[[metric_col]] >= cutoff,
                le = freq[[metric_col]] <= cutoff
            )
            freq$Flag <- ifelse(is_fail, "Fail", "Pass")
            failfile <- paste0(output, suffix, ".", metric_col, ".fail")
            # which() keeps variants with a missing metric out of the fail list
            write_ids(freq[[snp_col]][which(is_fail)], failfile)

            if (doplot) {
                plot_metric(
                    freq, metric_col, cutoff, "blue",
                    c(
                        'theme(legend.position = "none")',
                        'scale_fill_manual(values = c("Pass" = "blue3", "Fail" = "red3"))'
                    )
                )
            }

            filter_cmd <- c(
                plink,
                "--threads", ncores,
                "--bfile", shQuote(iter_in),
                "--exclude", shQuote(failfile),
                "--make-bed",
                "--out", shQuote(iter_out)
            )
            run_command(filter_cmd, fg = TRUE)

            iter_in <- iter_out
            n_filtered <- n_filtered + 1
        }
    }

    # Publish the last filtered fileset under the output prefix. This is done
    # after the loop so that it also happens when only some metrics filter.
    if (n_filtered > 0) {
        for (ext in c(".bed", ".bim", ".fam")) {
            file.symlink(paste0(iter_in, ext), paste0(output, ext))
        }
    }
}
|
|
203
|
+
|
|
204
|
+
# Split comma-separated fields and optionally summarise each field.
#
# @param x Vector whose elements (coerced to character) are split on ",".
# @param agg Optional function applied to the pieces of every element.
# @return The list of split pieces when `agg` is NULL; otherwise the
#   `sapply()` result of applying `agg` to each element's pieces.
splitup <- function(x, agg = NULL) {
    pieces <- strsplit(as.character(x), ",")
    if (!is.null(agg)) {
        pieces <- sapply(pieces, agg)
    }
    pieces
}
|
|
211
|
+
# First piece of a split-up, comma-separated field (i.e. the ALT1 value)
first_of <- function(s) s[1]

# Save an augmented plink report as `<output><ext>` (tab-separated, with header)
write_extended <- function(freq, ext) {
    write.table(
        freq,
        file = paste0(output, ext),
        col.names = TRUE,
        row.names = FALSE,
        quote = FALSE,
        sep = "\t"
    )
}

# Each `mod` below adds the per-variant summary columns the cutoffs refer to,
# unless the report already has them, and saves the augmented table with an
# "x"-suffixed extension whenever a column was added. Columns are looked up by
# exact name so that `$` partial matching cannot mistake e.g. ALT1_CT for ALT1.
if (modifier == "none") {
    mod <- function(freq) {
        # Add ALT1, ALT1_FREQ, REF_FREQ and MAF columns
        n_before <- ncol(freq)
        if (!"ALT1" %in% colnames(freq)) {
            # should be the first allele of ALT
            freq$ALT1 <- splitup(freq[["ALT"]], agg = first_of)
        }
        if (!"ALT1_FREQ" %in% colnames(freq)) {
            freq$ALT1_FREQ <- as.double(splitup(freq[["ALT_FREQS"]], agg = first_of))
        }
        if (!"REF_FREQ" %in% colnames(freq)) {
            # whatever is not an ALT allele is the REF allele
            freq$REF_FREQ <- 1 - splitup(freq[["ALT_FREQS"]], agg = function(s) sum(as.double(s)))
        }
        if (!"MAF" %in% colnames(freq)) {
            # rarest allele among REF and all ALTs
            min_alt_freqs <- splitup(freq[["ALT_FREQS"]], agg = function(s) min(as.double(s)))
            freq$MAF <- pmin(freq[["REF_FREQ"]], min_alt_freqs)
        }
        if (ncol(freq) > n_before) {
            write_extended(freq, ".afreqx")
        }
        freq
    }
    post_process(".afreq", modifier = mod)
} else if (modifier == "counts") {
    mod <- function(freq) {
        # Add ALT1, ALT1_CT, and REF_CT columns
        n_before <- ncol(freq)
        if (!"ALT1" %in% colnames(freq)) {
            # should be the first allele of ALT
            freq$ALT1 <- splitup(freq[["ALT"]], agg = first_of)
        }
        if (!"ALT1_CT" %in% colnames(freq)) {
            freq$ALT1_CT <- as.integer(splitup(freq[["ALT_CTS"]], agg = first_of))
        }
        if (!"REF_CT" %in% colnames(freq)) {
            # observations that are not any ALT allele
            freq$REF_CT <- freq[["OBS_CT"]] - splitup(freq[["ALT_CTS"]], agg = function(s) sum(as.integer(s)))
        }
        if (ncol(freq) > n_before) {
            write_extended(freq, ".acountx")
        }
        freq
    }
    post_process(".acount", modifier = mod)
# A "case-control" modifier (post_process(".frq.cc")) is currently disabled.
} else if (modifier == "x") {
    mod <- function(freq) {
        # Add ALT1, HET_REF_ALT1_CT, HOM_ALT1_CT
        n_before <- ncol(freq)
        if (!"ALT1" %in% colnames(freq)) {
            # should be the first allele of ALT
            freq$ALT1 <- splitup(freq[["ALT"]], agg = first_of)
        }
        if (!"HET_REF_ALT1_CT" %in% colnames(freq)) {
            freq$HET_REF_ALT1_CT <- as.integer(splitup(freq[["HET_REF_ALT_CTS"]], agg = first_of))
        }
        if (!"HOM_ALT1_CT" %in% colnames(freq)) {
            freq$HOM_ALT1_CT <- as.integer(splitup(freq[["TWO_ALT_GENO_CTS"]], agg = first_of))
        }
        # Save the augmented table like the other modifiers do
        if (ncol(freq) > n_before) {
            write_extended(freq, ".gcountx")
        }
        freq
    }
    post_process(".gcount", modifier = mod)
}
|
|
@@ -0,0 +1,78 @@
|
|
|
1
|
+
from os import path
|
|
2
|
+
from biopipen.core.filters import dict_to_cli_args
|
|
3
|
+
from biopipen.utils.reference import tabix_index
|
|
4
|
+
from biopipen.utils.misc import run_command
|
|
5
|
+
|
|
6
|
+
invcf = {{in.invcf | repr}} # noqa: E999 # pyright: ignore
|
|
7
|
+
outprefix = {{in.invcf | stem0 | repr}} # pyright: ignore
|
|
8
|
+
outdir = {{out.outdir | repr}} # pyright: ignore
|
|
9
|
+
args = {{envs | dict | repr}} # pyright: ignore
|
|
10
|
+
|
|
11
|
+
plink = args.pop("plink")
|
|
12
|
+
tabix = args.pop("tabix")
|
|
13
|
+
ncores = args.pop("ncores")
|
|
14
|
+
|
|
15
|
+
# normalize vcf-filter
|
|
16
|
+
args.setdefault("vcf_filter", True)
|
|
17
|
+
if isinstance(args["vcf_filter"], str):
|
|
18
|
+
args["vcf_filter"] = args["vcf_filter"].split()
|
|
19
|
+
|
|
20
|
+
# normalize biallelic-only
|
|
21
|
+
args.setdefault("max_alleles", 2)
|
|
22
|
+
|
|
23
|
+
# This makes it possible to keep the allele order in the output
|
|
24
|
+
# no need for plink2
|
|
25
|
+
# args["keep_allele_order"] = True
|
|
26
|
+
|
|
27
|
+
# resolve plink 1.x --set-missing-var-ids doesn't distinguish $1, $2,...
|
|
28
|
+
# for ref and alts
|
|
29
|
+
# if (
|
|
30
|
+
# "set_missing_var_ids" in args
|
|
31
|
+
# and args["set_missing_var_ids"]
|
|
32
|
+
# and ("$" in args["set_missing_var_ids"] or "%" in args["set_missing_var_ids"])
|
|
33
|
+
# ):
|
|
34
|
+
# tmpfile = path.join(outdir, 'with_var_ids.vcf')
|
|
35
|
+
# set_missing_var_ids = args.pop("set_missing_var_ids")
|
|
36
|
+
# set_missing_var_ids = (
|
|
37
|
+
# set_missing_var_ids
|
|
38
|
+
# .replace("@", "%CHROM")
|
|
39
|
+
# .replace("#", "%POS")
|
|
40
|
+
# .replace("$1", "%REF")
|
|
41
|
+
# .replace("$2", "%ALT{0}")
|
|
42
|
+
# .replace("$3", "%ALT{1}")
|
|
43
|
+
# .replace("$4", "%ALT{2}")
|
|
44
|
+
# .replace("$5", "%ALT{3}")
|
|
45
|
+
# .replace("$6", "%ALT{4}")
|
|
46
|
+
# .replace("%CHROM_", "%CHROM\\_")
|
|
47
|
+
# .replace("%POS_", "%POS\\_")
|
|
48
|
+
# .replace("%REF_", "%REF\\_")
|
|
49
|
+
# )
|
|
50
|
+
# set_vid_cmd = [
|
|
51
|
+
# bcftools,
|
|
52
|
+
# "annotate",
|
|
53
|
+
# "--set-id",
|
|
54
|
+
# f"+{set_missing_var_ids}",
|
|
55
|
+
# "--output-type",
|
|
56
|
+
# "z",
|
|
57
|
+
# "--output",
|
|
58
|
+
# tmpfile,
|
|
59
|
+
# "--threads",
|
|
60
|
+
# ncores,
|
|
61
|
+
# invcf,
|
|
62
|
+
# ]
|
|
63
|
+
|
|
64
|
+
# run_command(set_vid_cmd, fg=True, env={"cwd": outdir})
|
|
65
|
+
# invcf = tmpfile
|
|
66
|
+
|
|
67
|
+
invcf = tabix_index(invcf, "vcf", tabix=tabix)
|
|
68
|
+
args["vcf"] = invcf
|
|
69
|
+
args["out"] = path.join(outdir, outprefix)
|
|
70
|
+
args["threads"] = ncores
|
|
71
|
+
|
|
72
|
+
cmd = [
|
|
73
|
+
plink,
|
|
74
|
+
"--make-bed",
|
|
75
|
+
*dict_to_cli_args(args, dup_key=False, dashify = True),
|
|
76
|
+
]
|
|
77
|
+
|
|
78
|
+
run_command(cmd, fg=True, env={"cwd": outdir})
|
|
@@ -0,0 +1,80 @@
|
|
|
1
|
+
source("{{biopipen_dir}}/utils/misc.R")
|
|
2
|
+
source("{{biopipen_dir}}/utils/plot.R")
|
|
3
|
+
library(ggprism)
|
|
4
|
+
theme_set(theme_prism())
|
|
5
|
+
|
|
6
|
+
# ---- Job parameters (rendered by the pipeline's template engine) ----
indir <- {{in.indir | r}}
outdir <- {{out.outdir | r}}
plink <- {{envs.plink | r}}
ncores <- {{envs.ncores | r}}
cutoff <- {{envs.cutoff | r}}
doplot <- {{envs.plot | r}}
devpars <- {{envs.devpars | r}}

# ---- Locate the plink fileset: exactly one *.bed is expected in indir ----
bedfile <- Sys.glob(file.path(indir, '*.bed'))
if (length(bedfile) == 0) {
    stop("No bed files found in the input directory.")
}
if (length(bedfile) > 1) {
    log_warn("Multiple bed files found in the input directory. Using the first one.")
    bedfile <- bedfile[1]
}
# plink takes the fileset prefix, i.e. the path without the .bed extension
input <- tools::file_path_sans_ext(bedfile)
output <- file.path(outdir, basename(input))

# ---- Run the Hardy-Weinberg equilibrium test; writes <output>.hardy ----
cmd <- c(
    plink,
    "--threads", ncores,
    "--bfile", input,
    "--hardy",
    "--out", output
)
run_command(cmd, fg = TRUE)

hardy <- read.table(
    paste0(output, '.hardy'),
    header = TRUE,
    row.names = NULL,
    check.names = FALSE,
    comment.char = ""
)

# ---- Variants with a HWE p-value below the cutoff fail ----
# which() drops rows whose p-value is missing
hardy.fail <- hardy[which(hardy$P < cutoff), 'ID', drop = FALSE]
write.table(
    hardy.fail,
    paste0(output, '.hardy.fail'),
    col.names = FALSE,
    row.names = FALSE,
    sep = "\t",
    quote = FALSE
)

if (doplot) {
    hardy$Pval <- -log10(hardy$P)
    # Variants are identified by the ID column, the only column kept in
    # hardy.fail. (Matching on a non-existent SNP column marked nothing.)
    hardy$Status <- ifelse(hardy$ID %in% hardy.fail$ID, "Fail", "Pass")

    plotGG(
        data = hardy,
        geom = "histogram",
        outfile = paste0(output, '.hardy.png'),
        args = list(aes(x = Pval, fill = Status), alpha = 0.8, bins = 50),
        ggs = c(
            'xlab("-log10(HWE p-value)")',
            'ylab("Count")',
            'geom_vline(xintercept = -log10(cutoff), color = "red", linetype="dashed")',
            'theme(legend.position = "none")',
            'geom_text(aes(x = -log10(cutoff), y = Inf, label = cutoff), colour="red", angle=90, vjust = 1.2, hjust = 1.2)',
            # failing variants are drawn in red, passing ones in blue
            'scale_fill_manual(values = c("Pass" = "blue3", "Fail" = "red3"))'
        ),
        devpars = devpars
    )
}

# ---- Write the fileset without the failing variants ----
cmd <- c(
    plink,
    "--threads", ncores,
    "--bfile", input,
    "--exclude", paste0(output, '.hardy.fail'),
    "--make-bed",
    "--out", output
)
run_command(cmd, fg = TRUE)
|
|
@@ -0,0 +1,92 @@
|
|
|
1
|
+
source("{{biopipen_dir}}/utils/misc.R")
|
|
2
|
+
source("{{biopipen_dir}}/utils/plot.R")
|
|
3
|
+
library(ggprism)
|
|
4
|
+
theme_set(theme_prism())
|
|
5
|
+
|
|
6
|
+
# ---- Job parameters (rendered by the pipeline's template engine) ----
indir <- {{in.indir | r}}
outdir <- {{out.outdir | r}}
plink <- {{envs.plink | r}}
ncores <- {{envs.ncores | r}}
cutoff <- {{envs.cutoff | r}}
doplot <- {{envs.plot | r}}
devpars <- {{envs.devpars | r}}

# ---- Locate the plink fileset: exactly one *.bed is expected in indir ----
bedfile <- Sys.glob(file.path(indir, '*.bed'))
if (length(bedfile) == 0) {
    stop("No bed files found in the input directory.")
}
if (length(bedfile) > 1) {
    log_warn("Multiple bed files found in the input directory. Using the first one.")
    bedfile <- bedfile[1]
}
# plink takes the fileset prefix, i.e. the path without the .bed extension
input <- tools::file_path_sans_ext(bedfile)
output <- file.path(outdir, basename(input))

# ---- Allele frequencies ----
# need .afreq for --het for plink2
freq_cmd <- c(
    plink,
    "--threads", ncores,
    "--bfile", input,
    "--freq",
    "--out", output
)
run_command(freq_cmd, fg = TRUE)

# ---- Per-sample heterozygosity; writes <output>.het ----
cmd <- c(
    plink,
    "--threads", ncores,
    "--bfile", input,
    "--het",
    "--out", output,
    "--read-freq", paste0(output, '.afreq')
)
run_command(cmd, fg = TRUE)

phet <- read.table(
    paste0(output, '.het'),
    header = TRUE,
    row.names = NULL,
    check.names = FALSE,
    comment.char = ""
)
# The header line starts with "#"; strip it so the first ID column can be
# addressed by its plain name (FID, or IID when no FID column is reported).
colnames(phet)[1] <- sub("#", "", colnames(phet)[1])

# Observed heterozygosity = 1 - homozygous genotypes / observed genotypes
het <- data.frame(Het = 1 - phet[, "O(HOM)"] / phet[, "OBS_CT"])
# Row names double as the sample-ID lines of the --remove file:
# "FID<TAB>IID" when an FID column exists, the bare IID otherwise.
if ("FID" %in% colnames(phet)) {
    rownames(het) <- paste(phet[["FID"]], phet[["IID"]], sep = "\t")
} else {
    rownames(het) <- as.character(phet[["IID"]])
}

# ---- Samels beyond `cutoff` standard deviations from the mean fail ----
het.mean <- mean(het$Het, na.rm = TRUE)
het.sd <- sd(het$Het, na.rm = TRUE)
# which() drops samples whose heterozygosity (or the bounds) is missing
het.fail <- rownames(het)[which(
    het$Het < het.mean - cutoff * het.sd | het$Het > het.mean + cutoff * het.sd
)]
# Passing the path (not an explicit file() connection) lets writeLines open
# and close the file itself, so no connection is left dangling.
writeLines(het.fail, paste0(output, '.het.fail'))

if (doplot) {
    het$Status <- "Pass"
    het[het.fail, "Status"] <- "Fail"

    plotGG(
        data = het,
        geom = "histogram",
        outfile = paste0(output, '.het.png'),
        args = list(aes(fill = Status, x = Het), alpha = 0.8, bins = 50),
        ggs = c(
            'xlab("Sample Heterozygosity")',
            'ylab("Count")',
            'geom_vline(xintercept = c(het.mean-cutoff*het.sd, het.mean+cutoff*het.sd), color = "red", linetype="dashed")',
            'geom_vline(xintercept = het.mean, color = "blue", linetype="dashed")',
            'theme(legend.position = "none")',
            'geom_text(aes(x = het.mean-cutoff*het.sd, y = Inf, label = sprintf("mean - %ssd (%.3f)", cutoff, het.mean - cutoff*het.sd)), colour="red", angle=90, vjust = 1.2, hjust = 1.2)',
            'geom_text(aes(x = het.mean+cutoff*het.sd, y = Inf, label = sprintf("mean + %ssd (%.3f)", cutoff, het.mean + cutoff*het.sd)), colour="red", angle=90, vjust = 1.2, hjust = 1.2)',
            'geom_text(aes(x = het.mean, y = Inf, label = sprintf("mean (%.3f)", het.mean)), colour="blue", vjust = 1.5, hjust = -.1)',
            'scale_fill_manual(values = c("Pass" = "blue3", "Fail" = "red3"))'
        ),
        devpars = devpars
    )
}

# ---- Write the fileset without the failing samples ----
cmd <- c(
    plink,
    "--threads", ncores,
    "--bfile", input,
    "--remove", paste0(output, '.het.fail'),
    "--make-bed",
    "--out", output
)
run_command(cmd, fg = TRUE)
|