biopipen 0.31.4__py3-none-any.whl → 0.31.6__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of biopipen might be problematic. Click here for more details.
- biopipen/__init__.py +1 -1
- biopipen/ns/bam.py +41 -0
- biopipen/ns/protein.py +84 -0
- biopipen/ns/regulatory.py +72 -0
- biopipen/ns/vcf.py +7 -3
- biopipen/reports/protein/ProdigySummary.svelte +16 -0
- biopipen/scripts/bam/BamMerge.py +10 -14
- biopipen/scripts/bam/BamSampling.py +90 -0
- biopipen/scripts/protein/Prodigy.py +119 -0
- biopipen/scripts/protein/ProdigySummary.R +133 -0
- biopipen/scripts/regulatory/MotifAffinityTest.R +5 -143
- biopipen/scripts/regulatory/MotifAffinityTest_AtSNP.R +31 -37
- biopipen/scripts/regulatory/MotifAffinityTest_MotifBreakR.R +25 -26
- biopipen/scripts/regulatory/VariantMotifPlot.R +76 -0
- biopipen/scripts/regulatory/motifs-common.R +322 -0
- biopipen/scripts/vcf/TruvariBench.sh +14 -7
- biopipen/scripts/vcf/TruvariBenchSummary.R +1 -2
- {biopipen-0.31.4.dist-info → biopipen-0.31.6.dist-info}/METADATA +1 -1
- {biopipen-0.31.4.dist-info → biopipen-0.31.6.dist-info}/RECORD +21 -16
- {biopipen-0.31.4.dist-info → biopipen-0.31.6.dist-info}/entry_points.txt +1 -0
- biopipen/scripts/regulatory/atSNP.R +0 -33
- biopipen/scripts/regulatory/motifBreakR.R +0 -1594
- {biopipen-0.31.4.dist-info → biopipen-0.31.6.dist-info}/WHEEL +0 -0
|
@@ -0,0 +1,133 @@
|
|
|
1
|
+
{{ biopipen_dir | joinpaths: "utils", "misc.R" | source_r }}
|
|
2
|
+
|
|
3
|
+
library(rlang)
|
|
4
|
+
library(dplyr)
|
|
5
|
+
library(ggplot2)
|
|
6
|
+
library(ggprism)
|
|
7
|
+
|
|
8
|
+
theme_set(theme_prism())
|
|
9
|
+
|
|
10
|
+
infiles <- {{in.infiles | r}}
|
|
11
|
+
outdir <- {{out.outdir | r}}
|
|
12
|
+
joboutdir <- {{job.outdir | r}}
|
|
13
|
+
group <- {{envs.group | r}}
|
|
14
|
+
|
|
15
|
+
if (is.character(group)) {
|
|
16
|
+
group <- read.csv(group, header = FALSE, row.names = NULL)
|
|
17
|
+
colnames(group) <- c("Sample", "Group")
|
|
18
|
+
} else if (is.list(group)) {
|
|
19
|
+
group <- do_call(
|
|
20
|
+
rbind,
|
|
21
|
+
lapply(names(group), function(n) data.frame(Sample = group[[n]], Group = n))
|
|
22
|
+
)
|
|
23
|
+
} else if (!is.null(group)) {
|
|
24
|
+
stop(paste0("Invalid group: ", paste0(group, collapse = ", ")))
|
|
25
|
+
}
|
|
26
|
+
|
|
27
|
+
log_info("Reading and merging metrics for each sample ...")
|
|
28
|
+
metrics <- NULL
|
|
29
|
+
|
|
30
|
+
for (infile in infiles) {
|
|
31
|
+
sample <- sub("_prodigy$", "", basename(dirname(infile)))
|
|
32
|
+
log_debug("- Reading metrics from {sample}")
|
|
33
|
+
metric <- read.table(
|
|
34
|
+
infile,
|
|
35
|
+
header = TRUE,
|
|
36
|
+
sep = "\t",
|
|
37
|
+
stringsAsFactors = FALSE,
|
|
38
|
+
check.names = FALSE,
|
|
39
|
+
row.names = NULL)
|
|
40
|
+
metric$Sample <- sample
|
|
41
|
+
metric <- metric %>% select(Sample, everything())
|
|
42
|
+
if (is.null(metrics)) {
|
|
43
|
+
metrics <- metric
|
|
44
|
+
} else {
|
|
45
|
+
metrics <- rbind(metrics, metric)
|
|
46
|
+
}
|
|
47
|
+
}
|
|
48
|
+
|
|
49
|
+
# Save metrics
|
|
50
|
+
write.table(
|
|
51
|
+
metrics,
|
|
52
|
+
file.path(outdir, "metrics.txt"),
|
|
53
|
+
sep = "\t",
|
|
54
|
+
quote = FALSE,
|
|
55
|
+
row.names = FALSE
|
|
56
|
+
)
|
|
57
|
+
|
|
58
|
+
add_report(
|
|
59
|
+
list(kind = "descr", content = "Metrics for all samples"),
|
|
60
|
+
list(kind = "table", src = file.path(outdir, "metrics.txt")),
|
|
61
|
+
h1 = "Metrics of all samples"
|
|
62
|
+
)
|
|
63
|
+
|
|
64
|
+
METRIC_DESCR = list(
|
|
65
|
+
nIC = "No. of intermolecular contacts",
|
|
66
|
+
nCCC = "No. of charged-charged contacts",
|
|
67
|
+
nCPC = "No. of charged-polar contacts",
|
|
68
|
+
nCAPC = "No. of charged-apolar contacts",
|
|
69
|
+
nPPC = "No. of polar-polar contacts",
|
|
70
|
+
nAPPC = "No. of apolar-polar contacts",
|
|
71
|
+
nAPAPC = "No. of apolar-apolar contacts",
|
|
72
|
+
pANISR = "Percentage of apolar NIS residues",
|
|
73
|
+
pCNISR = "Percentage of charged NIS residues",
|
|
74
|
+
BindingAffinity = "Predicted binding affinity (kcal.mol^-1)",
|
|
75
|
+
DissociationConstant = "Predicted dissociation constant (M)"
|
|
76
|
+
)
|
|
77
|
+
|
|
78
|
+
if (!is.null(group)) {
|
|
79
|
+
log_info("Merging group information ...")
|
|
80
|
+
metrics <- group %>%
|
|
81
|
+
left_join(metrics, by = "Sample") %>%
|
|
82
|
+
mutate(Group = factor(Group, levels = unique(Group)))
|
|
83
|
+
}
|
|
84
|
+
|
|
85
|
+
log_info("Plotting Prodigy metrics ...")
|
|
86
|
+
for (metric in names(METRIC_DESCR)) {
|
|
87
|
+
log_info("- {metric}: {METRIC_DESCR[[metric]]}")
|
|
88
|
+
|
|
89
|
+
add_report(
|
|
90
|
+
list(
|
|
91
|
+
kind = "descr",
|
|
92
|
+
content = METRIC_DESCR[[metric]] %||% paste0("Metric: ", metric)
|
|
93
|
+
),
|
|
94
|
+
h1 = metric
|
|
95
|
+
)
|
|
96
|
+
|
|
97
|
+
# barplot
|
|
98
|
+
p <- ggplot(metrics, aes(x = Sample, y = !!sym(metric))) +
|
|
99
|
+
geom_bar(stat = "identity", fill = "steelblue") +
|
|
100
|
+
labs(x = "Sample", y = metric) +
|
|
101
|
+
theme(axis.text.x = element_text(angle = 90, hjust = 1))
|
|
102
|
+
|
|
103
|
+
figfile <- file.path(outdir, paste0(slugify(metric), ".barplot.png"))
|
|
104
|
+
png(figfile, height = 600, res = 100, width = nrow(metrics) * 30 + 200)
|
|
105
|
+
print(p)
|
|
106
|
+
dev.off()
|
|
107
|
+
|
|
108
|
+
add_report(
|
|
109
|
+
list(src = figfile, name = "By Sample"),
|
|
110
|
+
ui = "table_of_images",
|
|
111
|
+
h1 = metric
|
|
112
|
+
)
|
|
113
|
+
|
|
114
|
+
if (is.null(group)) { next }
|
|
115
|
+
# group: Sample, Group
|
|
116
|
+
p <- ggplot(metrics, aes(x = Group, y = !!sym(metric))) +
|
|
117
|
+
geom_boxplot(fill = "steelblue") +
|
|
118
|
+
labs(x = "Group", y = metric) +
|
|
119
|
+
theme(axis.text.x = element_text(angle = 90, hjust = 1))
|
|
120
|
+
|
|
121
|
+
figfile <- file.path(outdir, paste0(slugify(metric), ".boxplot.png"))
|
|
122
|
+
png(figfile, height = 600, res = 100, width = length(unique(metrics$Group)) * 30 + 200)
|
|
123
|
+
print(p)
|
|
124
|
+
dev.off()
|
|
125
|
+
|
|
126
|
+
add_report(
|
|
127
|
+
list(src = figfile, name = "By Group"),
|
|
128
|
+
ui = "table_of_images",
|
|
129
|
+
h1 = metric
|
|
130
|
+
)
|
|
131
|
+
}
|
|
132
|
+
|
|
133
|
+
save_report(joboutdir)
|
|
@@ -1,9 +1,9 @@
|
|
|
1
1
|
# Script for regulatory.MotifAffinityTest
|
|
2
2
|
{{ biopipen_dir | joinpaths: "utils", "misc.R" | source_r }}
|
|
3
|
+
{{ biopipen_dir | joinpaths: "scripts", "regulatory", "motifs-common.R" | source_r }}
|
|
3
4
|
|
|
4
5
|
library(BiocParallel)
|
|
5
6
|
library(BSgenome)
|
|
6
|
-
library(universalmotif)
|
|
7
7
|
|
|
8
8
|
motiffile <- {{in.motiffile | r}}
|
|
9
9
|
varfile <- {{in.varfile | r}}
|
|
@@ -45,63 +45,9 @@ if (is.null(motif_col) && is.null(regulator_col)) {
|
|
|
45
45
|
log_info("Reading input regulator/motif file ...")
|
|
46
46
|
in_motifs <- read.table(motiffile, header=TRUE, sep="\t", stringsAsFactors=FALSE, check.names = FALSE)
|
|
47
47
|
|
|
48
|
-
|
|
49
|
-
|
|
50
|
-
|
|
51
|
-
stop("Regulator motifs (envs.regmotifs) is required and must exist when no motif column (envs.motif_col) is provided")
|
|
52
|
-
}
|
|
53
|
-
regmotifs <- read.table(regmotifs, header=TRUE, sep="\t", stringsAsFactors=FALSE, check.names = FALSE)
|
|
54
|
-
rm_motif_col <- c('Motif', 'motif', 'MOTIF', 'Model', 'model', 'MODEL')
|
|
55
|
-
rm_reg_col <- c('Regulator', 'regulator', 'REGULATOR', 'TF', 'tf', 'TF', 'Transcription factor', 'transcription factor', 'Transcription Factor')
|
|
56
|
-
rm_motif_col <- intersect(rm_motif_col, colnames(regmotifs))
|
|
57
|
-
rm_reg_col <- intersect(rm_reg_col, colnames(regmotifs))
|
|
58
|
-
if (length(rm_motif_col) == 0) {
|
|
59
|
-
stop("No motif column found in envs.regmotifs, provide one of: ", paste(rm_motif_col, collapse = ", "))
|
|
60
|
-
}
|
|
61
|
-
if (length(rm_reg_col) == 0) {
|
|
62
|
-
stop("No regulator column found in envs.regmotifs, provide one of: ", paste(rm_reg_col, collapse = ", "))
|
|
63
|
-
}
|
|
64
|
-
rm_motif_col <- rm_motif_col[1]
|
|
65
|
-
rm_reg_col <- rm_reg_col[1]
|
|
66
|
-
# check regulators
|
|
67
|
-
rm_regs <- regmotifs[, rm_reg_col, drop = TRUE]
|
|
68
|
-
regulators <- in_motifs[, regulator_col, drop = TRUE]
|
|
69
|
-
notfound_regs <- setdiff(regulators, rm_regs)
|
|
70
|
-
if (length(notfound_regs) > 0 && notfound == "error") {
|
|
71
|
-
first_notfound <- head(notfound_regs, 3)
|
|
72
|
-
if (length(notfound_regs) > 3) {
|
|
73
|
-
first_notfound <- c(first_notfound, "...")
|
|
74
|
-
notfound_file <- file.path(outdir, "notfound_regulators.txt")
|
|
75
|
-
writeLines(notfound_regs, notfound_file)
|
|
76
|
-
msg1 <- paste0("The following regulators were not found in the envs.regmotifs file: ", paste(first_notfound, collapse = ", "))
|
|
77
|
-
msg2 <- paste0("Check the full list in ", notfound_file)
|
|
78
|
-
stop(msg1, "\n", msg2)
|
|
79
|
-
} else {
|
|
80
|
-
msg <- paste0("The following regulators were not found in the regmotifs file: ", paste(first_notfound, collapse = ", "))
|
|
81
|
-
stop(msg)
|
|
82
|
-
}
|
|
83
|
-
}
|
|
84
|
-
in_motifs <- in_motifs[in_motifs[, regulator_col] %in% rm_regs, , drop = FALSE]
|
|
85
|
-
# add motif column
|
|
86
|
-
in_motifs <- merge(in_motifs, regmotifs, by.x = regulator_col, by.y = rm_reg_col, all.x = TRUE, suffixes = c("", "_db"))
|
|
87
|
-
motif_col <- rm_motif_col
|
|
88
|
-
}
|
|
89
|
-
if (is.null(regulator_col)) {
|
|
90
|
-
# make motifs unique
|
|
91
|
-
in_moitfs <- in_motifs[!duplicated(in_motifs[, motif_col]), , drop = FALSE]
|
|
92
|
-
} else {
|
|
93
|
-
in_motifs <- in_motifs[!duplicated(in_motifs[, c(regulator_col, motif_col)]), , drop = FALSE]
|
|
94
|
-
}
|
|
95
|
-
|
|
96
|
-
|
|
97
|
-
if (!grepl(".", genome, fixed = TRUE)) {
|
|
98
|
-
genome_pkg = sprintf("BSgenome.Hsapiens.UCSC.%s", genome)
|
|
99
|
-
} else {
|
|
100
|
-
genome_pkg = genome
|
|
101
|
-
}
|
|
102
|
-
if (!requireNamespace(genome_pkg, quietly = TRUE)) {
|
|
103
|
-
stop(sprintf("Genome package %s is not installed", genome_pkg))
|
|
104
|
-
}
|
|
48
|
+
log_info("Ensuring motifs and regulators in the input data ...")
|
|
49
|
+
in_motifs <- ensure_regulator_motifs(in_motifs, outdir, motif_col, regulator_col, regmotifs, notfound = notfound)
|
|
50
|
+
genome_pkg <- get_genome_pkg(genome)
|
|
105
51
|
|
|
106
52
|
log_info("Reading variant file ...")
|
|
107
53
|
if (grepl("\\.vcf$", varfile) || grepl("\\.vcf\\.gz$", varfile)) {
|
|
@@ -124,91 +70,7 @@ snpinfo <- read.table(varfile, header=FALSE, stringsAsFactors=FALSE)
|
|
|
124
70
|
colnames(snpinfo) <- c("chrom", "start", "end", "name", "score", "strand", "ref", "alt")
|
|
125
71
|
|
|
126
72
|
log_info("Reading motif database ...")
|
|
127
|
-
|
|
128
|
-
|
|
129
|
-
check_motifs <- function(motifdb_names) {
|
|
130
|
-
motifs <- in_motifs[, motif_col, drop = TRUE]
|
|
131
|
-
notfound_motifs <- setdiff(motifs, motifdb_names)
|
|
132
|
-
if (length(notfound_motifs) > 0) {
|
|
133
|
-
first_notfound <- head(notfound_motifs, 3)
|
|
134
|
-
if (length(notfound_motifs) > 3) {
|
|
135
|
-
first_notfound <- c(first_notfound, "...")
|
|
136
|
-
notfound_file <- file.path(outdir, "notfound_motifs.txt")
|
|
137
|
-
writeLines(notfound_motifs, notfound_file)
|
|
138
|
-
msg1 <- paste0("The following motifs were not found in the motif database: ", paste(first_notfound, collapse = ", "))
|
|
139
|
-
msg2 <- paste0("Check the full list in ", notfound_file)
|
|
140
|
-
|
|
141
|
-
if (notfound == "error") {
|
|
142
|
-
stop(msg1, "\n", msg2)
|
|
143
|
-
} else if (notfound == "ignore") {
|
|
144
|
-
log_warn(msg1)
|
|
145
|
-
log_warn(msg2)
|
|
146
|
-
}
|
|
147
|
-
} else {
|
|
148
|
-
msg <- paste0("The following motifs were not found in the motif database: ", paste(first_notfound, collapse = ", "))
|
|
149
|
-
if (notfound == "error") {
|
|
150
|
-
stop(msg)
|
|
151
|
-
} else if (notfound == "ignore") {
|
|
152
|
-
log_warn(msg)
|
|
153
|
-
}
|
|
154
|
-
}
|
|
155
|
-
|
|
156
|
-
motifs <- setdiff(motifs, notfound_motifs)
|
|
157
|
-
}
|
|
158
|
-
return(motifs)
|
|
159
|
-
}
|
|
160
|
-
|
|
161
|
-
plot_variant <- function(motifbreakr_results) {
|
|
162
|
-
log_info("Plotting variants ...")
|
|
163
|
-
plotdir <- file.path(outdir, "plots")
|
|
164
|
-
dir.create(plotdir, showWarnings = FALSE)
|
|
165
|
-
results <- motifbreakr_results
|
|
166
|
-
if (is.null(plots) || length(plots) == 0) {
|
|
167
|
-
results <- results[order(-abs(results$alleleDiff)), , drop = FALSE]
|
|
168
|
-
results <- results[1:min(plot_nvars, length(results)), , drop = FALSE]
|
|
169
|
-
variants <- unique(results$SNP_id)
|
|
170
|
-
} else {
|
|
171
|
-
variants <- names(plots)
|
|
172
|
-
}
|
|
173
|
-
for (variant in variants) {
|
|
174
|
-
log_info("- Variant: {variant}")
|
|
175
|
-
if (is.null(plots[[variant]])) {
|
|
176
|
-
plots[[variant]] <- list(devpars = devpars, which = "TRUE")
|
|
177
|
-
}
|
|
178
|
-
if (is.null(plots[[variant]]$which)) {
|
|
179
|
-
plots[[variant]]$which <- "TRUE"
|
|
180
|
-
}
|
|
181
|
-
if (is.null(plots[[variant]]$devpars)) {
|
|
182
|
-
plots[[variant]]$devpars <- devpars
|
|
183
|
-
}
|
|
184
|
-
if (is.null(plots[[variant]]$devpars$res)) {
|
|
185
|
-
plots[[variant]]$devpars$res <- 100
|
|
186
|
-
}
|
|
187
|
-
res <- results[results$SNP_id == variant, , drop = FALSE]
|
|
188
|
-
if (length(res) == 0) {
|
|
189
|
-
stop(sprintf("Variant %s not found in results", variant))
|
|
190
|
-
}
|
|
191
|
-
res <- subset(res, subset = eval(parse(text = plots[[variant]]$which)))
|
|
192
|
-
if (length(res) == 0) {
|
|
193
|
-
stop(sprintf("No variants to plot for %s", variant))
|
|
194
|
-
}
|
|
195
|
-
plotfile <- file.path(plotdir, sprintf("%s.png", slugify(variant)))
|
|
196
|
-
# fix motifBreakR 2.12 using names to filter in plotMB
|
|
197
|
-
names(res) <- res$SNP_id
|
|
198
|
-
dv <- plots[[variant]]$devpars
|
|
199
|
-
if (is.null(dv$height)) {
|
|
200
|
-
dv$height <- 2.4 * dv$res + length(res) * 1.2 * dv$res
|
|
201
|
-
}
|
|
202
|
-
if (is.null(dv$width)) {
|
|
203
|
-
left <- min(sapply(res$motifPos, `[`, 1))
|
|
204
|
-
right <- max(sapply(res$motifPos, `[`, 2))
|
|
205
|
-
dv$width <- 1.5 * dv$res + (right - left) * 0.3 * dv$res
|
|
206
|
-
}
|
|
207
|
-
png(plotfile, width = dv$width, height = dv$height, res = dv$res)
|
|
208
|
-
motifbreakR::plotMB(res, variant)
|
|
209
|
-
dev.off()
|
|
210
|
-
}
|
|
211
|
-
}
|
|
73
|
+
mdb <- read_meme_to_motifdb(motifdb, in_motifs, motif_col, regulator_col, notfound, outdir)
|
|
212
74
|
|
|
213
75
|
tool <- tolower(tool)
|
|
214
76
|
tool <- match.arg(tool, c("motifbreakr", "atsnp"))
|
|
@@ -1,36 +1,6 @@
|
|
|
1
1
|
library(atSNP)
|
|
2
2
|
library(rtracklayer)
|
|
3
3
|
|
|
4
|
-
log_info("Converting universalmotif object to motif_library ...")
|
|
5
|
-
|
|
6
|
-
motifdb_names <- sapply(meme, function(m) m@name)
|
|
7
|
-
motifs <- check_motifs(motifdb_names)
|
|
8
|
-
meme <- filter_motifs(meme, name = motifs)
|
|
9
|
-
# Get the right order of motif names
|
|
10
|
-
motifs <- sapply(meme, function(m) m@name)
|
|
11
|
-
|
|
12
|
-
# used for atSNP
|
|
13
|
-
mdb <- lapply(meme, function(m) t(m@motif))
|
|
14
|
-
names(mdb) <- motifs
|
|
15
|
-
|
|
16
|
-
# compose one used for plotting using motifbreakR
|
|
17
|
-
motifdb_matrices <- lapply(meme, function(m) m@motif)
|
|
18
|
-
names(motifdb_matrices) <- motifs
|
|
19
|
-
motifdb_meta <- do.call(rbind, lapply(meme, function(m) {
|
|
20
|
-
ats <- attributes(m)
|
|
21
|
-
ats$dataSource <- basename(motifdb)
|
|
22
|
-
ats$class <- NULL
|
|
23
|
-
ats$motif <- NULL
|
|
24
|
-
ats$gapinfo <- NULL
|
|
25
|
-
ats$sequenceCount <- ats$nsites
|
|
26
|
-
ats$providerId <- ats$name
|
|
27
|
-
ats$providerName <- ats$name
|
|
28
|
-
ats$organism <- if (is.null(ats$organism) || length(ats$organism) == 0) "Unknown" else ats$organism
|
|
29
|
-
unlist(ats)
|
|
30
|
-
}))
|
|
31
|
-
rownames(motifdb_meta) <- motifs
|
|
32
|
-
pmotifs <- MotifDb:::MotifList(motifdb_matrices, tbl.metadata = motifdb_meta)
|
|
33
|
-
|
|
34
4
|
log_info("Converting snpinfo to atSNP object ...")
|
|
35
5
|
|
|
36
6
|
# c("chrom", "start", "end", "name", "score", "strand", "ref", "alt", "ref_seq", "alt_seq")
|
|
@@ -53,7 +23,9 @@ write.table(
|
|
|
53
23
|
file = atsnp_bed,
|
|
54
24
|
sep = "\t", quote = FALSE, row.names = FALSE, col.names = TRUE
|
|
55
25
|
)
|
|
56
|
-
|
|
26
|
+
|
|
27
|
+
motif_lib <- motifdb_to_motiflib(mdb)
|
|
28
|
+
k <- max(sapply(motif_lib, nrow))
|
|
57
29
|
snps <- LoadSNPData(
|
|
58
30
|
atsnp_bed,
|
|
59
31
|
genome.lib = genome_pkg,
|
|
@@ -62,13 +34,12 @@ snps <- LoadSNPData(
|
|
|
62
34
|
half.window.size = k
|
|
63
35
|
)
|
|
64
36
|
|
|
65
|
-
# run motifbreakR
|
|
66
37
|
log_info("Running atSNP ...")
|
|
67
|
-
atsnp_scores <- ComputeMotifScore(
|
|
38
|
+
atsnp_scores <- ComputeMotifScore(motif_lib, snps, ncores = ncores)
|
|
68
39
|
|
|
69
40
|
log_info("Calculating p values ...")
|
|
70
41
|
atsnp_result <- ComputePValues(
|
|
71
|
-
motif.lib =
|
|
42
|
+
motif.lib = motif_lib,
|
|
72
43
|
snp.info = snps,
|
|
73
44
|
motif.scores = atsnp_scores$motif.scores,
|
|
74
45
|
ncores = ncores,
|
|
@@ -101,7 +72,7 @@ atsnp_result$motifPos <- sapply(1:nrow(atsnp_result), function(i) {
|
|
|
101
72
|
paste(c(atsnp_result$ref_start[i] - k, atsnp_result$ref_end[i] - k), collapse = ",")
|
|
102
73
|
})
|
|
103
74
|
if (!is.null(regulator_col)) {
|
|
104
|
-
atsnp_result$Regulator <- in_motifs[
|
|
75
|
+
atsnp_result$geneSymbol <- atsnp_result$Regulator <- in_motifs[
|
|
105
76
|
match(atsnp_result$providerId, in_motifs[[motif_col]]),
|
|
106
77
|
regulator_col,
|
|
107
78
|
drop = TRUE
|
|
@@ -120,7 +91,30 @@ atsnp_result$alleleDiff <- -atsnp_result[[cutoff_col]]
|
|
|
120
91
|
atsnp_result$effect <- "strong"
|
|
121
92
|
atsnp_result$motifPos <- lapply(atsnp_result$motifPos, function(x) as.integer(unlist(strsplit(x, ","))))
|
|
122
93
|
atsnp_result <- makeGRangesFromDataFrame(atsnp_result, keep.extra.columns = TRUE, starts.in.df.are.0based = TRUE)
|
|
94
|
+
genome(atsnp_result) <- genome
|
|
123
95
|
attributes(atsnp_result)$genome.package <- genome_pkg
|
|
124
|
-
attributes(atsnp_result)$motifs <-
|
|
96
|
+
attributes(atsnp_result)$motifs <- mdb
|
|
97
|
+
|
|
98
|
+
if (is.null(plots) || length(plots) == 0) {
|
|
99
|
+
atsnp_result <- atsnp_result[order(-abs(atsnp_result$alleleDiff)), , drop = FALSE]
|
|
100
|
+
atsnp_result <- atsnp_result[1:min(plot_nvars, length(atsnp_result)), , drop = FALSE]
|
|
101
|
+
variants <- unique(atsnp_result$SNP_id)
|
|
102
|
+
} else {
|
|
103
|
+
variants <- names(plots)
|
|
104
|
+
}
|
|
105
|
+
for (variant in variants) {
|
|
106
|
+
log_info("- Variant: {variant}")
|
|
107
|
+
if (is.null(plots[[variant]])) {
|
|
108
|
+
plots[[variant]] <- list(devpars = devpars, which = "TRUE")
|
|
109
|
+
}
|
|
110
|
+
if (is.null(plots[[variant]]$which)) {
|
|
111
|
+
plots[[variant]]$which <- "TRUE"
|
|
112
|
+
}
|
|
113
|
+
if (is.null(plots[[variant]]$devpars)) {
|
|
114
|
+
plots[[variant]]$devpars <- devpars
|
|
115
|
+
}
|
|
116
|
+
res <- atsnp_result[atsnp_result$SNP_id == variant, , drop = FALSE]
|
|
117
|
+
res <- subset(res, subset = eval(parse(text = plots[[variant]]$which)))
|
|
125
118
|
|
|
126
|
-
|
|
119
|
+
plot_variant_motifs(res, variant, plots[[variant]]$devpars, outdir)
|
|
120
|
+
}
|
|
@@ -1,30 +1,6 @@
|
|
|
1
1
|
library(motifbreakR)
|
|
2
|
-
bsgenome <- getBSgenome(genome_pkg)
|
|
3
|
-
|
|
4
|
-
log_info("Converting universalmotif object to MotifDb object ...")
|
|
5
|
-
|
|
6
|
-
motifdb_names <- sapply(meme, function(m) m@name)
|
|
7
|
-
motifs <- check_motifs(motifdb_names)
|
|
8
|
-
meme <- filter_motifs(meme, name = motifs)
|
|
9
|
-
# Get the right order of motif names
|
|
10
|
-
motifs <- sapply(meme, function(m) m@name)
|
|
11
|
-
motifdb_matrices <- lapply(meme, function(m) m@motif)
|
|
12
|
-
names(motifdb_matrices) <- motifs
|
|
13
2
|
|
|
14
|
-
|
|
15
|
-
ats <- attributes(m)
|
|
16
|
-
ats$dataSource <- basename(motifdb)
|
|
17
|
-
ats$class <- NULL
|
|
18
|
-
ats$motif <- NULL
|
|
19
|
-
ats$gapinfo <- NULL
|
|
20
|
-
ats$sequenceCount <- ats$nsites
|
|
21
|
-
ats$providerId <- ats$name
|
|
22
|
-
ats$providerName <- ats$name
|
|
23
|
-
ats$organism <- if (is.null(ats$organism) || length(ats$organism) == 0) "Unknown" else ats$organism
|
|
24
|
-
unlist(ats)
|
|
25
|
-
}))
|
|
26
|
-
rownames(motifdb_meta) <- motifs
|
|
27
|
-
mdb <- MotifDb:::MotifList(motifdb_matrices, tbl.metadata = motifdb_meta)
|
|
3
|
+
bsgenome <- getBSgenome(genome_pkg)
|
|
28
4
|
|
|
29
5
|
# `chrom`, `start`, `end`, `name`, `score`, `strand`, `ref`, `alt`.
|
|
30
6
|
is_indel <- nchar(snpinfo$ref) != 1 | nchar(snpinfo$alt) != 1
|
|
@@ -93,4 +69,27 @@ write.table(
|
|
|
93
69
|
)
|
|
94
70
|
rm(results_to_save)
|
|
95
71
|
|
|
96
|
-
|
|
72
|
+
log_info("Plotting variants ...")
|
|
73
|
+
if (is.null(plots) || length(plots) == 0) {
|
|
74
|
+
results <- results[order(-abs(results$alleleDiff)), , drop = FALSE]
|
|
75
|
+
results <- results[1:min(plot_nvars, length(results)), , drop = FALSE]
|
|
76
|
+
variants <- unique(results$SNP_id)
|
|
77
|
+
} else {
|
|
78
|
+
variants <- names(plots)
|
|
79
|
+
}
|
|
80
|
+
for (variant in variants) {
|
|
81
|
+
log_info("- Variant: {variant}")
|
|
82
|
+
if (is.null(plots[[variant]])) {
|
|
83
|
+
plots[[variant]] <- list(devpars = devpars, which = "TRUE")
|
|
84
|
+
}
|
|
85
|
+
if (is.null(plots[[variant]]$which)) {
|
|
86
|
+
plots[[variant]]$which <- "TRUE"
|
|
87
|
+
}
|
|
88
|
+
if (is.null(plots[[variant]]$devpars)) {
|
|
89
|
+
plots[[variant]]$devpars <- devpars
|
|
90
|
+
}
|
|
91
|
+
res <- results[results$SNP_id == variant, , drop = FALSE]
|
|
92
|
+
res <- subset(res, subset = eval(parse(text = plots[[variant]]$which)))
|
|
93
|
+
|
|
94
|
+
plot_variant_motifs(res, variant, plots[[variant]]$devpars, outdir)
|
|
95
|
+
}
|
|
@@ -0,0 +1,76 @@
|
|
|
1
|
+
{{ biopipen_dir | joinpaths: "utils", "misc.R" | source_r }}
|
|
2
|
+
{{ biopipen_dir | joinpaths: "scripts", "regulatory", "motifs-common.R" | source_r }}
|
|
3
|
+
|
|
4
|
+
library(BSgenome)
|
|
5
|
+
library(GenomicRanges)
|
|
6
|
+
|
|
7
|
+
infile <- {{in.infile | r}}
|
|
8
|
+
outdir <- {{out.outdir | r}}
|
|
9
|
+
genome <- {{envs.genome | r}}
|
|
10
|
+
motifdb <- {{envs.motifdb | r}}
|
|
11
|
+
motif_col <- {{envs.motif_col | r}}
|
|
12
|
+
regulator_col <- {{envs.regulator_col | r}}
|
|
13
|
+
regmotifs <- {{envs.regmotifs | r}}
|
|
14
|
+
notfound <- {{envs.notfound | r}}
|
|
15
|
+
devpars <- {{envs.devpars | r}}
|
|
16
|
+
plot_vars <- {{envs.plot_vars | r}}
|
|
17
|
+
|
|
18
|
+
if (is.null(motifdb) || !file.exists(motifdb)) {
|
|
19
|
+
stop("Motif database (envs.motifdb) is required and must exist")
|
|
20
|
+
}
|
|
21
|
+
|
|
22
|
+
if (is.null(genome)) {
|
|
23
|
+
stop("Reference genome (envs.ref) is required and must exist")
|
|
24
|
+
}
|
|
25
|
+
|
|
26
|
+
if (is.null(motif_col) && is.null(regulator_col)) {
|
|
27
|
+
stop("Either motif (envs.motif_col) or regulator (envs.regulator_col) column must be provided")
|
|
28
|
+
}
|
|
29
|
+
|
|
30
|
+
log_info("Reading input data ...")
|
|
31
|
+
indata <- read.table(infile, header=TRUE, sep="\t", stringsAsFactors=FALSE, check.names = FALSE)
|
|
32
|
+
|
|
33
|
+
log_info("Ensuring regulators in the input data ...")
|
|
34
|
+
indata <- ensure_regulator_motifs(indata, outdir, motif_col, regulator_col, regmotifs, notfound = notfound)
|
|
35
|
+
genome_pkg <- get_genome_pkg(genome)
|
|
36
|
+
|
|
37
|
+
log_info("Reading motif database ...")
|
|
38
|
+
meme <- read_meme_to_motifdb(motifdb, indata, motif_col, regulator_col, notfound, outdir)
|
|
39
|
+
|
|
40
|
+
log_info("Composing motifbreakR results from input data ...")
|
|
41
|
+
indata$chr <- indata$chrom %||% indata$chr %||% indata$seqnames
|
|
42
|
+
indata$seqnames <- NULL
|
|
43
|
+
indata$strand <- indata$strand %||% "+"
|
|
44
|
+
indata$varType <- indata$varType %||% "SNV"
|
|
45
|
+
indata$geneSymbol <- indata$geneSymbol %||% indata$Regulator
|
|
46
|
+
indata$providerId <- indata$providerId %||% indata$motif
|
|
47
|
+
indata$providerName <- indata$providerName %||% indata$providerId
|
|
48
|
+
indata$dataSource <- indata$dataSource %||% strsplit(basename(motifdb), "\\.")[[1]][1]
|
|
49
|
+
indata$effect <- indata$effect %||% "strong"
|
|
50
|
+
indata$altPos <- indata$altPos %||% 1
|
|
51
|
+
indata$alleleDiff <- indata$alleleDiff %||% indata$score %||% 0
|
|
52
|
+
|
|
53
|
+
# check other required columns
|
|
54
|
+
for (col in c("start", "end", "SNP_id", "REF", "ALT", "motifPos")) {
|
|
55
|
+
if (!(col %in% colnames(indata))) {
|
|
56
|
+
stop("Column '", col, "' is required in the input data")
|
|
57
|
+
}
|
|
58
|
+
}
|
|
59
|
+
indata$motifPos <- lapply(indata$motifPos, function(x) as.integer(unlist(strsplit(x, ","))))
|
|
60
|
+
indata <- makeGRangesFromDataFrame(indata, keep.extra.columns = TRUE, starts.in.df.are.0based = TRUE)
|
|
61
|
+
genome(indata) <- genome
|
|
62
|
+
attributes(indata)$genome.package <- genome_pkg
|
|
63
|
+
attributes(indata)$motifs <- meme
|
|
64
|
+
|
|
65
|
+
log_info("Plotting variants ...")
|
|
66
|
+
if (is.null(plot_vars)) {
|
|
67
|
+
plot_vars <- unique(indata$SNP_id)
|
|
68
|
+
} else if (length(plot_vars) > 1) {
|
|
69
|
+
plot_vars <- unique(plot_vars)
|
|
70
|
+
} else {
|
|
71
|
+
plot_vars <- strsplit(plot_vars, ",")[[1]]
|
|
72
|
+
}
|
|
73
|
+
for (pvar in plot_vars) {
|
|
74
|
+
log_info("- Variant: {pvar}")
|
|
75
|
+
plot_variant_motifs(indata, pvar, devpars, outdir)
|
|
76
|
+
}
|