biopipen 0.31.5__py3-none-any.whl → 0.31.7__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of biopipen might be problematic. Click here for more details.
- biopipen/__init__.py +1 -1
- biopipen/ns/bam.py +28 -0
- biopipen/ns/bed.py +40 -0
- biopipen/ns/regulatory.py +72 -0
- biopipen/ns/vcf.py +6 -2
- biopipen/scripts/bam/BamSubsetByBed.py +38 -0
- biopipen/scripts/bed/BedtoolsMakeWindows.py +47 -0
- biopipen/scripts/regulatory/MotifAffinityTest.R +5 -143
- biopipen/scripts/regulatory/MotifAffinityTest_AtSNP.R +31 -37
- biopipen/scripts/regulatory/MotifAffinityTest_MotifBreakR.R +25 -26
- biopipen/scripts/regulatory/VariantMotifPlot.R +76 -0
- biopipen/scripts/regulatory/motifs-common.R +322 -0
- biopipen/scripts/vcf/TruvariBench.sh +14 -7
- biopipen/scripts/vcf/TruvariBenchSummary.R +1 -2
- {biopipen-0.31.5.dist-info → biopipen-0.31.7.dist-info}/METADATA +1 -1
- {biopipen-0.31.5.dist-info → biopipen-0.31.7.dist-info}/RECORD +18 -16
- biopipen/scripts/regulatory/atSNP.R +0 -33
- biopipen/scripts/regulatory/motifBreakR.R +0 -1594
- {biopipen-0.31.5.dist-info → biopipen-0.31.7.dist-info}/WHEEL +0 -0
- {biopipen-0.31.5.dist-info → biopipen-0.31.7.dist-info}/entry_points.txt +0 -0
|
@@ -0,0 +1,76 @@
|
|
|
1
|
+
# Plot the motifs overlapping given variants, from a table of variant-motif
# records. The `{{ ... }}` placeholders are template expressions that are
# rendered before this script is executed.
{{ biopipen_dir | joinpaths: "utils", "misc.R" | source_r }}
{{ biopipen_dir | joinpaths: "scripts", "regulatory", "motifs-common.R" | source_r }}

library(BSgenome)
library(GenomicRanges)

infile <- {{in.infile | r}}
outdir <- {{out.outdir | r}}
genome <- {{envs.genome | r}}
motifdb <- {{envs.motifdb | r}}
motif_col <- {{envs.motif_col | r}}
regulator_col <- {{envs.regulator_col | r}}
regmotifs <- {{envs.regmotifs | r}}
notfound <- {{envs.notfound | r}}
devpars <- {{envs.devpars | r}}
plot_vars <- {{envs.plot_vars | r}}

# Validate the options before doing any work
if (is.null(motifdb) || !file.exists(motifdb)) {
    stop("Motif database (envs.motifdb) is required and must exist")
}

if (is.null(genome)) {
    # The value is read from envs.genome (see above), so name that option
    stop("Reference genome (envs.genome) is required")
}

if (is.null(motif_col) && is.null(regulator_col)) {
    stop("Either motif (envs.motif_col) or regulator (envs.regulator_col) column must be provided")
}

log_info("Reading input data ...")
indata <- read.table(infile, header=TRUE, sep="\t", stringsAsFactors=FALSE, check.names = FALSE)

log_info("Ensuring regulators in the input data ...")
# May update `motif_col`/`regulator_col` (ensure_regulator_motifs assigns them with `<<-`)
indata <- ensure_regulator_motifs(indata, outdir, motif_col, regulator_col, regmotifs, notfound = notfound)
genome_pkg <- get_genome_pkg(genome)

log_info("Reading motif database ...")
meme <- read_meme_to_motifdb(motifdb, indata, motif_col, regulator_col, notfound, outdir)

log_info("Composing motifbreakR results from input data ...")
# Fill in the columns used for plotting, falling back to defaults when the
# input does not provide them (`x %||% y` yields `y` when `x` is NULL)
indata$chr <- indata$chrom %||% indata$chr %||% indata$seqnames
indata$seqnames <- NULL
indata$strand <- indata$strand %||% "+"
indata$varType <- indata$varType %||% "SNV"
indata$geneSymbol <- indata$geneSymbol %||% indata$Regulator
indata$providerId <- indata$providerId %||% indata$motif
indata$providerName <- indata$providerName %||% indata$providerId
# Default data source: file name of the motif database up to the first dot
indata$dataSource <- indata$dataSource %||% strsplit(basename(motifdb), "\\.")[[1]][1]
indata$effect <- indata$effect %||% "strong"
indata$altPos <- indata$altPos %||% 1
indata$alleleDiff <- indata$alleleDiff %||% indata$score %||% 0

# check other required columns
for (col in c("start", "end", "SNP_id", "REF", "ALT", "motifPos")) {
    if (!(col %in% colnames(indata))) {
        stop("Column '", col, "' is required in the input data")
    }
}
# motifPos is a comma-separated string per row; parse it into integer vectors
indata$motifPos <- lapply(indata$motifPos, function(x) as.integer(unlist(strsplit(x, ","))))
# Start positions in the input are treated as 0-based
indata <- makeGRangesFromDataFrame(indata, keep.extra.columns = TRUE, starts.in.df.are.0based = TRUE)
genome(indata) <- genome
# plot_variant_motifs() documents these two attributes as part of its input
attributes(indata)$genome.package <- genome_pkg
attributes(indata)$motifs <- meme

log_info("Plotting variants ...")
if (is.null(plot_vars)) {
    # No selection: plot every variant in the input
    plot_vars <- unique(indata$SNP_id)
} else if (length(plot_vars) > 1) {
    plot_vars <- unique(plot_vars)
} else {
    # A single string may hold several comma-separated variant IDs
    plot_vars <- strsplit(plot_vars, ",")[[1]]
}
for (pvar in plot_vars) {
    log_info("- Variant: {pvar}")
    plot_variant_motifs(indata, pvar, devpars, outdir)
}
|
|
@@ -0,0 +1,322 @@
|
|
|
1
|
+
# make sure biopipen/utils/misc.R is loaded, log_warn is defined, and slugify is defined
|
|
2
|
+
|
|
3
|
+
library(rlang)
|
|
4
|
+
library(universalmotif)
|
|
5
|
+
library(MotifDb)
|
|
6
|
+
|
|
7
|
+
#' @title Common functions for regulatory analysis
|
|
8
|
+
#' @name regulatory-common
|
|
9
|
+
#' @author Panwen Wang
|
|
10
|
+
|
|
11
|
+
#' Read a regulator-motif mapping file
#'
#' The file is read as a tab-delimited table with a header line. It must
#' contain exactly one of the allowed motif column names and exactly one of the
#' allowed regulator column names; any other column is dropped.
#'
#' @param rmfile Path to the regulator-motif mapping file
#' @param motif_cols_allowed Column names recognized as the motif column
#' @param reg_cols_allowed Column names recognized as the regulator column
#' @return Data frame with two columns, named as in the file: the motif column
#'   first and the regulator column second
.read_regmotifs <- function(
    rmfile,
    motif_cols_allowed = c("Motif", "motif", "MOTIF", "Model", "model", "MODEL"),
    reg_cols_allowed = c("Regulator", "regulator", "REGULATOR", "TF", "tf")
) {
    # A NULL path would make `if (!file.exists(NULL))` fail with an unrelated
    # "argument is of length zero" error, so check for it explicitly
    if (is.null(rmfile) || !file.exists(rmfile)) {
        stop("Regulator-motif mapping file does not exist.")
    }
    regmotifs <- read.table(rmfile, header = TRUE, sep = "\t", stringsAsFactors = FALSE, check.names = FALSE)
    rm_motif_col <- intersect(motif_cols_allowed, colnames(regmotifs))
    rm_reg_col <- intersect(reg_cols_allowed, colnames(regmotifs))
    if (length(rm_motif_col) == 0) {
        stop(
            "No motif column found in the regulator-motif mapping file, provide one of: ",
            paste(motif_cols_allowed, collapse = ", ")
        )
    }
    if (length(rm_reg_col) == 0) {
        stop(
            "No regulator column found in the regulator-motif mapping file, provide one of: ",
            paste(reg_cols_allowed, collapse = ", ")
        )
    }
    if (length(rm_motif_col) > 1) {
        stop(
            "Multiple motif columns found (", paste(rm_motif_col, collapse = ", "),
            ") in the regulator-motif mapping file, provide only one"
        )
    }
    if (length(rm_reg_col) > 1) {
        stop(
            "Multiple regulator columns found (", paste(rm_reg_col, collapse = ", "),
            ") in the regulator-motif mapping file, provide only one"
        )
    }

    # Exactly one column of each kind is left at this point.
    # Callers rely on the order: motif column first, regulator column second.
    regmotifs[, c(rm_motif_col, rm_reg_col), drop = FALSE]
}
|
|
46
|
+
|
|
47
|
+
#' Report items that could not be found
#'
#' Nothing happens when `notfound_items` is empty. Otherwise a message listing
#' up to the first three items is composed. When there are more than three
#' items, the complete list is also written to `notfound_file`, one item per
#' line, and a second message pointing to that file is added.
#'
#' @param notfound_items Items that were not found
#' @param log_warn Function to log warnings; called with one message at a time
#' @param msg Message to display; the items are appended after ": "
#' @param notfound Action to take if items are not found: "error" raises an
#'   error carrying the message(s), "ignore" passes them to `log_warn`.
#'   Any other value reports nothing.
#' @param notfound_file File to save the full list of not found items
#' @param log_indent Indentation for log messages
.handle_notfound_items <- function (notfound_items, log_warn, msg, notfound, notfound_file, log_indent = "") {
    n_missing <- length(notfound_items)
    if (n_missing == 0) {
        return(invisible(NULL))
    }

    truncated <- n_missing > 3
    shown <- head(notfound_items, 3)
    if (truncated) {
        shown <- c(shown, "...")
        # Only a long list is saved; a short one is fully visible in the message
        writeLines(notfound_items, notfound_file)
    }

    report <- paste0(log_indent, msg, ": ", paste(shown, collapse = ", "))
    if (truncated) {
        report <- c(report, paste0(log_indent, "Check the full list in ", notfound_file))
    }

    if (notfound == "error") {
        stop(paste(report, collapse = "\n"))
    } else if (notfound == "ignore") {
        for (line in report) {
            log_warn(line)
        }
    }
}
|
|
79
|
+
|
|
80
|
+
#' Read a MEME file into a MotifDb motif list
#'
#' The motifs are read from the MEME file, restricted to those named in the
#' motif column of the input data (see `check_motifs()` for how missing ones
#' are handled), and returned together with one row of metadata per motif.
#'
#' @param motifdb MEME file
#' @param indata Input data frame
#' @param motif_col Column name for the motif
#' @param regulator_col Column name for the regulator. When not NULL, the
#'   regulator(s) of each motif in `indata` are stored as `geneSymbol`
#' @param notfound Action to take if motifs are not found
#' @param outdir Output directory, used to save un-matched motifs
#' @return MotifDb object
#' @export
read_meme_to_motifdb <- function(motifdb, indata, motif_col, regulator_col, notfound, outdir) {
    motif_names <- function(motif_list) sapply(motif_list, function(m) m@name)

    meme <- read_meme(motifdb)
    wanted <- check_motifs(indata[[motif_col]], motif_names(meme), notfound, outdir)
    meme <- filter_motifs(meme, name = wanted)
    # Take the names again so they follow the order of the filtered motifs
    kept_names <- motif_names(meme)

    pwms <- lapply(meme, function(m) m@motif)
    names(pwms) <- kept_names

    # Data source: file name of the database up to the first dot
    data_source <- strsplit(basename(motifdb), "\\.")[[1]][1]

    # Flatten the attributes of one motif into a metadata vector
    describe_motif <- function(m) {
        ats <- attributes(m)
        ats$dataSource <- data_source
        # Drop the entries that are not wanted in the metadata
        ats$class <- NULL
        ats$motif <- NULL
        ats$gapinfo <- NULL
        ats$sequenceCount <- ats$nsites
        ats$providerId <- ats$name
        ats$providerName <- ats$name
        if (is.null(ats$organism) || length(ats$organism) == 0) {
            ats$organism <- "Unknown"
        } else {
            ats$organism <- ats$organism
        }
        if (!is.null(regulator_col)) {
            from_this_motif <- indata[[motif_col]] == ats$name
            ats$geneSymbol <- indata[from_this_motif, regulator_col, drop = TRUE]
        }
        unlist(ats)
    }

    meta <- do.call(rbind, lapply(meme, describe_motif))
    rownames(meta) <- kept_names
    MotifDb:::MotifList(pwms, tbl.metadata = meta)
}
|
|
125
|
+
|
|
126
|
+
#' Convert a MotifDb object to a motif library
#'
#' Every element of `motifdb` is transposed with `t()`; the result is a list
#' with the same names as the input.
#'
#' @param motifdb MotifDb object
#' @return Motif library: a list of transposed matrices
#' @export
motifdb_to_motiflib <- function(motifdb) {
    lapply(motifdb, function(pwm) t(pwm))
}
|
|
135
|
+
|
|
136
|
+
#' Make sure the input data has the regulators and motifs it needs
#'
#' What is done depends on which of `motif_col` and `regulator_col` is given:
#' - only `regulator_col`: the mapping file is required; rows whose regulator
#'   is not in it are reported and removed, and the motif column of the
#'   mapping is merged into the data
#' - only `motif_col`, without a mapping file: rows are de-duplicated by motif
#' - only `motif_col`, with a mapping file: rows whose motif is not in it are
#'   reported and removed, and the regulator column of the mapping is merged
#'   into the data
#' - both: rows are de-duplicated by regulator/motif pair
#'
#' Side effect: when a column is merged in from the mapping file, the variable
#' `motif_col` (or `regulator_col`) is updated with `<<-`, i.e. outside of this
#' function, so that code which keeps using that variable after the call sees
#' the name of the new column.
#'
#' @param indata Input data frame
#' @param outdir Output directory, used to save un-matched regulators/motifs
#' @param motif_col Column name for the motif, or NULL
#' @param regulator_col Column name for the regulator, or NULL
#' @param regmotifs Regulator-motif mapping file; NULL or an empty string
#'   means no mapping file
#' @param log_indent Indentation for log messages
#' @param notfound Action to take if regulators/motifs are not found in the
#'   mapping file
#' @return Data frame with regulators and motifs
#' @export
ensure_regulator_motifs <- function (indata, outdir, motif_col, regulator_col, regmotifs, log_indent = "", notfound = "error") {
    # Both branches below must agree on what "no mapping file" means
    no_regmotifs <- function() {
        is.null(regmotifs) ||
            (is.character(regmotifs) && length(regmotifs) == 1 && nchar(regmotifs) == 0)
    }

    if (is.null(motif_col)) {
        if (no_regmotifs()) {
            stop("Regulator-motif mapping file (envs.regmotifs) is required when no motif column (envs.motif_col) is provided")
        }
        regmotifs <- .read_regmotifs(regmotifs)
        rm_motif_col <- colnames(regmotifs)[1]
        rm_reg_col <- colnames(regmotifs)[2]
        # check regulators
        rm_regs <- regmotifs[[rm_reg_col]]
        .handle_notfound_items(
            setdiff(indata[[regulator_col]], rm_regs),
            log_warn,
            "The following regulators were not found in the regulator-motif mapping file",
            notfound,
            file.path(outdir, "notfound_regulators.txt"),
            log_indent
        )
        indata <- indata[indata[[regulator_col]] %in% rm_regs, , drop = FALSE]
        # add motif column
        indata <- merge(indata, regmotifs, by.x = regulator_col, by.y = rm_reg_col, all.x = TRUE, suffixes = c("", "_db"))
        # update motif column (deliberately outside of this function, see above)
        motif_col <<- rm_motif_col
    } else if (is.null(regulator_col)) {
        if (no_regmotifs()) {
            # make motifs unique
            indata <- indata[!duplicated(indata[[motif_col]]), , drop = FALSE]
        } else if (!file.exists(regmotifs)) {
            stop("Regulator-motif mapping file (envs.regmotifs) does not exist.")
        } else {
            # map the regulators
            regmotifs <- .read_regmotifs(regmotifs)
            rm_motif_col <- colnames(regmotifs)[1]
            rm_reg_col <- colnames(regmotifs)[2]
            rm_motifs <- regmotifs[[rm_motif_col]]
            .handle_notfound_items(
                setdiff(indata[[motif_col]], rm_motifs),
                log_warn,
                "The following motifs were not found in the regulator-motif mapping file",
                notfound,
                file.path(outdir, "notfound_motifs.txt"),
                log_indent
            )
            indata <- indata[indata[[motif_col]] %in% rm_motifs, , drop = FALSE]
            # add regulator column
            indata <- merge(indata, regmotifs, by.x = motif_col, by.y = rm_motif_col, all.x = TRUE, suffixes = c("", "_db"))
            # update regulator column (deliberately outside of this function, see above)
            regulator_col <<- rm_reg_col
        }
    } else {
        # Both columns are given: keep one row per regulator/motif pair
        pairs <- indata[, c(regulator_col, motif_col), drop = FALSE]
        indata <- indata[!duplicated(pairs), , drop = FALSE]
    }

    indata
}
|
|
206
|
+
|
|
207
|
+
#' Resolve, check and attach the genome package for a genome
#'
#' A name without a dot (e.g. "hg19") is expanded to
#' "BSgenome.Hsapiens.UCSC.<genome>"; a name containing a dot is taken to be
#' the package name itself. The package is then attached with `library()`.
#'
#' @param genome Genome name, or full name of the genome package
#' @return Genome package name
#' @export
get_genome_pkg <- function(genome) {
    is_pkg_name <- grepl(".", genome, fixed = TRUE)
    genome_pkg <- if (is_pkg_name) genome else sprintf("BSgenome.Hsapiens.UCSC.%s", genome)

    installed <- requireNamespace(genome_pkg, quietly = TRUE)
    if (!installed) {
        stop(sprintf("Genome package %s is not installed", genome_pkg))
    }

    library(package = genome_pkg, character.only = TRUE)
    genome_pkg
}
|
|
225
|
+
|
|
226
|
+
#' Check if motifs are in the motif database
#' and return the motifs that are found
#'
#' Motifs missing from the database are reported through
#' `.handle_notfound_items()`, which also writes the full list to
#' "<outdir>/notfound_motifs.txt" when there are more than three of them.
#'
#' @param motifs Motifs to check
#' @param all_motifs All motifs in the motif database
#' @param notfound Action to take if motifs are not found
#' @param outdir Output directory, used to save un-matched motifs
#' @return Motifs that are found. `motifs` is returned unchanged when nothing
#'   is missing; otherwise the found motifs are returned without duplicates
#' @export
check_motifs <- function(motifs, all_motifs, notfound, outdir) {
    notfound_motifs <- setdiff(motifs, all_motifs)
    if (length(notfound_motifs) == 0) {
        return(motifs)
    }

    # Same reporting as for the regulator-motif mapping file, so use the same helper
    .handle_notfound_items(
        notfound_motifs,
        log_warn,
        "The following motifs were not found in the motif database",
        notfound,
        file.path(outdir, "notfound_motifs.txt")
    )

    setdiff(motifs, notfound_motifs)
}
|
|
265
|
+
|
|
266
|
+
#' Plot a genomic region surrounding a genomic variant, and
#' potentially disrupted motifs.
#'
#' The plot is drawn with `motifbreakR::plotMB()` into
#' "<outdir>/plots/<slugified variant>.png".
#'
#' @param results The motifbreakR results.
#'   A GRanges object with the following columns:
#'   - seqnames: Chromosome
#'   - ranges: Start and end positions
#'   - strand: Strand
#'   -------------------
#'   - SNP_id: Variant ID
#'   - REF: Reference allele
#'   - ALT: Alternative allele
#'   - varType: Variant type. By default, "SNV"
#'   - motifPos: Motif positions
#'   - geneSymbol: Gene symbol, if not provided, try to get from the Regulator column
#'   - dataSource: Motif database source
#'   - providerName: Motif name
#'   - providerId: Motif ID
#'   - effect: Effect of the variant. By default, "strong"
#'   - altPos: Alternative allele position. By default, 1
#'   - alleleDiff: Allele difference, default 0, does not affect the plot for SNVs
#'
#'   Attributes:
#'   - genome.package: Genome package name
#'   - motifs: Motif database, in MotifDb::MotifList format
#' @param variant Variant ID to be plotted
#' @param devpars List of device parameters, passed to `png()`
#'   - res: Resolution, default 100
#'   - width: Width of the plot, default NULL, calculated based on sequence length
#'   - height: Height of the plot, default NULL, calculated based on the number of motifs
#' @param outdir Output directory. Plots will be saved in the sub-directory "<outdir>/plots/"
#' @return The path of the plot file, invisibly
#' @export
plot_variant_motifs <- function(results, variant, devpars, outdir) {
    plotdir <- file.path(outdir, "plots")
    dir.create(plotdir, showWarnings = FALSE)

    res <- results[results$SNP_id == variant, , drop = FALSE]
    if (length(res) == 0) {
        stop(sprintf("Variant %s not found in results", variant))
    }

    devpars <- devpars %||% list(res = 100, width = NULL, height = NULL)
    devpars$res <- devpars$res %||% 100
    # `%||%` binds tighter than `*` and `+`: without the parentheses a given
    # height would be multiplied by the resolution and have the computed
    # height added to it, instead of being used as is.
    devpars$height <- devpars$height %||% (2.4 * devpars$res + length(res) * 1.2 * devpars$res)
    if (is.null(devpars$width)) {
        # Scale the width with the span covered by the motifs ...
        left <- min(sapply(res$motifPos, `[`, 1))
        right <- max(sapply(res$motifPos, `[`, 2))
        span_width <- 1.5 * devpars$res + (right - left) * 0.3 * devpars$res
        # ... but never go below a minimal width
        devpars$width <- max(span_width, 5 * devpars$res)
    }

    plotfile <- file.path(plotdir, sprintf("%s.png", slugify(variant)))
    # fix motifBreakR 2.12 using names to filter in plotMB
    names(res) <- res$SNP_id
    png(plotfile, width = devpars$width, height = devpars$height, res = devpars$res)
    # Close the device even when plotting fails, so it is not left open
    on.exit(dev.off(), add = TRUE)
    motifbreakR::plotMB(res, variant)

    invisible(plotfile)
}
|
|
@@ -1,13 +1,15 @@
|
|
|
1
|
+
# shellcheck disable=SC1083
|
|
1
2
|
compvcf={{in.compvcf | quote}}
|
|
2
3
|
basevcf={{in.basevcf | quote}}
|
|
3
4
|
outdir={{out.outdir | quote}}
|
|
4
5
|
truvari={{envs.truvari | quote}}
|
|
5
6
|
ref={{envs.ref | quote}}
|
|
6
7
|
refdist={{envs.refdist | quote}}
|
|
7
|
-
|
|
8
|
+
pctseq={{envs.pctseq | quote}}
|
|
8
9
|
pctsize={{envs.pctsize | quote}}
|
|
9
10
|
pctovl={{envs.pctovl | quote}}
|
|
10
11
|
sizemax={{envs.sizemax | default: 50000 | quote}}
|
|
12
|
+
# shellcheck disable=SC1054
|
|
11
13
|
{% if envs.typeignore %}
|
|
12
14
|
typeignore="--typeignore"
|
|
13
15
|
{% else %}
|
|
@@ -15,20 +17,25 @@ typeignore=""
|
|
|
15
17
|
{% endif %}
|
|
16
18
|
{% if envs.multimatch %}
|
|
17
19
|
multimatch="--multimatch"
|
|
20
|
+
# shellcheck disable=SC1009
|
|
18
21
|
{% else %}
|
|
19
22
|
multimatch=""
|
|
23
|
+
# shellcheck disable=SC1073
|
|
20
24
|
{% endif %}
|
|
21
25
|
|
|
22
26
|
rm -rf $outdir
|
|
23
|
-
$truvari bench \
|
|
24
|
-
-c
|
|
25
|
-
-b
|
|
26
|
-
-f
|
|
27
|
+
cmd="$truvari bench \
|
|
28
|
+
-c '$compvcf' \
|
|
29
|
+
-b '$basevcf' \
|
|
30
|
+
-f '$ref' \
|
|
27
31
|
--refdist $refdist \
|
|
28
|
-
--
|
|
32
|
+
--pctseq $pctseq \
|
|
29
33
|
--pctsize $pctsize \
|
|
30
34
|
--pctovl $pctovl \
|
|
31
35
|
--sizemax $sizemax \
|
|
32
36
|
$typeignore \
|
|
33
37
|
$multimatch \
|
|
34
|
-
-o $outdir
|
|
38
|
+
-o $outdir"
|
|
39
|
+
|
|
40
|
+
echo "$cmd"
|
|
41
|
+
eval "$cmd"
|
|
@@ -17,7 +17,7 @@ read_summary = function() {
|
|
|
17
17
|
|
|
18
18
|
summaries = NULL
|
|
19
19
|
for (indir in indirs) {
|
|
20
|
-
summary = fromJSON(file=file.path(indir, "summary.
|
|
20
|
+
summary = fromJSON(file=file.path(indir, "summary.json"))
|
|
21
21
|
summary$gt_matrix = NULL
|
|
22
22
|
summary$Sample = sub(".truvari_bench", "", basename(indir), fixed=T)
|
|
23
23
|
summaries = bind_rows(summaries, summary)
|
|
@@ -43,7 +43,6 @@ plot_summary = function(col) {
|
|
|
43
43
|
summaries,
|
|
44
44
|
"col",
|
|
45
45
|
list(mapping = aes_string(x = "Sample", y = bQuote(col), fill = "Sample")),
|
|
46
|
-
|
|
47
46
|
devpars = get_devpars(),
|
|
48
47
|
outfile = outfile
|
|
49
48
|
)
|
|
@@ -1,4 +1,4 @@
|
|
|
1
|
-
biopipen/__init__.py,sha256=
|
|
1
|
+
biopipen/__init__.py,sha256=APQVRwZptBFPacKCHqg_tW4g4--qdUtMJoA6GprSuSI,23
|
|
2
2
|
biopipen/core/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
|
3
3
|
biopipen/core/config.py,sha256=edK5xnDhM8j27srDzsxubi934NMrglLoKrdcC8qsEPk,1069
|
|
4
4
|
biopipen/core/config.toml,sha256=7IXvviRicZ2D1h6x3BVgbLJ96nsh-ikvZ0sVlQepqFE,1944
|
|
@@ -7,8 +7,8 @@ biopipen/core/filters.py,sha256=5bZsbpdW7DCxqiteRdb2gelmXvfqWPmPsFxrpHdWsoE,1298
|
|
|
7
7
|
biopipen/core/proc.py,sha256=60lUP3PcUAaKbDETo9N5PEIoeOYrLgcSmuytmrhcx8g,912
|
|
8
8
|
biopipen/core/testing.py,sha256=lZ_R5ZbYPO2NPuLHdbzg6HbD_f4j8paVVbyeUqwg6FE,3411
|
|
9
9
|
biopipen/ns/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
|
10
|
-
biopipen/ns/bam.py,sha256
|
|
11
|
-
biopipen/ns/bed.py,sha256=
|
|
10
|
+
biopipen/ns/bam.py,sha256=OtvzEadx-zpr98PoZoozbQszcQBvtFDMuAtuwH-3JUw,11394
|
|
11
|
+
biopipen/ns/bed.py,sha256=EqpSa7Hx6GImvJNghtV4uDo2PnPXeUt1Yq9AFWJP9_8,8159
|
|
12
12
|
biopipen/ns/cellranger.py,sha256=yPBoNzVSY74J7uyVucaob5lqZKKru5-hYSM4f4Nr2OY,5553
|
|
13
13
|
biopipen/ns/cellranger_pipeline.py,sha256=EWkPJTujamNSMQoRnKfhUiIj6TkMfRmCSUbPfd8Tv8E,4011
|
|
14
14
|
biopipen/ns/cnv.py,sha256=ssQAHf2MB675avoXVRkYy0vGiqIpRpRExywjhbymmBI,7811
|
|
@@ -20,7 +20,7 @@ biopipen/ns/gsea.py,sha256=EsNRAPYsagaV2KYgr4Jv0KCnZGqayM209v4yOGGTIOI,7423
|
|
|
20
20
|
biopipen/ns/misc.py,sha256=qXcm0RdR6W-xpYGgQn3v7JBeYRWwVm5gtgSj2tdVxx4,2935
|
|
21
21
|
biopipen/ns/plot.py,sha256=XzLq0A8qCIQRbxhPEdWhEfbRZ8g3e4KriVz0RP8enNY,18078
|
|
22
22
|
biopipen/ns/protein.py,sha256=33pzM-gvBTw0jH60mvfqnriM6uw2zj3katZ82nC9owI,3309
|
|
23
|
-
biopipen/ns/regulatory.py,sha256=
|
|
23
|
+
biopipen/ns/regulatory.py,sha256=gJjGVpJrdv-rg2t5UjK4AGuvtLNymaNYNvoD8PhlbvE,15929
|
|
24
24
|
biopipen/ns/rnaseq.py,sha256=bKAa6friFWof4yDTWZQahm1MS-lrdetO1GqDKdfxXYc,7708
|
|
25
25
|
biopipen/ns/scrna.py,sha256=fXP_h7gchcuk_Jwos0IgY_P8ON6Q995OgKHgdrxfvAY,112868
|
|
26
26
|
biopipen/ns/scrna_metabolic_landscape.py,sha256=6AhaynGG3lNRi96N2tReVT46BJMuEwooSSd2irBoN80,28347
|
|
@@ -28,7 +28,7 @@ biopipen/ns/snp.py,sha256=-Jx5Hsv_7KV7TqLU0nHCaPkMEN0CFdi4tNVlyq0rUZ4,27259
|
|
|
28
28
|
biopipen/ns/stats.py,sha256=DlPyK5Vsg6ZEkV9SDS3aAw21eXzvOHgqeZDkXPhg7go,20509
|
|
29
29
|
biopipen/ns/tcgamaf.py,sha256=AFbUJIxiMSvsVY3RcHgjRFuMnNh2DG3Mr5slLNEyz6o,1455
|
|
30
30
|
biopipen/ns/tcr.py,sha256=0PCF8iPZ629z6P3RHoAWEpMWmuDslomTGcMopjqvXmE,88304
|
|
31
|
-
biopipen/ns/vcf.py,sha256=
|
|
31
|
+
biopipen/ns/vcf.py,sha256=OYWuAWADba1xLwvHmIPwXYin_rUaAFQq7N38DQvoYzs,22746
|
|
32
32
|
biopipen/ns/web.py,sha256=4itJzaju8VBARIyZjDeh5rsVKpafFq_whixnvL8sXes,5368
|
|
33
33
|
biopipen/reports/bam/CNAClinic.svelte,sha256=D4IxQcgDCPQZMbXog-aZP5iJEQTK2N4i0C60e_iXyfs,213
|
|
34
34
|
biopipen/reports/bam/CNVpytor.svelte,sha256=s03SlhbEPd8-_44Dy_cqE8FSErhUdqStLK39te5o7ZE,1364
|
|
@@ -79,6 +79,7 @@ biopipen/reports/vcf/TruvariConsistency.svelte,sha256=BBvtxi1EPmGH7j5M5zMOcLEhKW
|
|
|
79
79
|
biopipen/scripts/bam/BamMerge.py,sha256=Gd5P8V-CSsTAA8ZrUxetR-I49GjJ3VJNjrqu7-EZwXQ,3642
|
|
80
80
|
biopipen/scripts/bam/BamSampling.py,sha256=Pi6CXAbBFVRGh8-0WrkB-3v3oxinfahQk11H0IdBNmQ,2312
|
|
81
81
|
biopipen/scripts/bam/BamSplitChroms.py,sha256=b7GS2I4X0dLOhlPg_r9-buoIHTWlq6zHI3Rox94LXR8,4893
|
|
82
|
+
biopipen/scripts/bam/BamSubsetByBed.py,sha256=QpY6WDJfbO3k2FdMyfgstFKgTdtOc1beGoUF5FI5EAc,1027
|
|
82
83
|
biopipen/scripts/bam/CNAClinic.R,sha256=mQXwtShL54HZXGCPqgPKPrU74_6K_8PqtOtG0mgA-F0,5062
|
|
83
84
|
biopipen/scripts/bam/CNVpytor.py,sha256=hOUli9BDMOoth0or-tjUYC1AP3yNOuxUS6G3Rhcg99s,18000
|
|
84
85
|
biopipen/scripts/bam/ControlFREEC.py,sha256=oX6iWsos-CfiT_ViDBrKeMOOIVdCKWrB-_MqzLgEF9s,3267
|
|
@@ -86,6 +87,7 @@ biopipen/scripts/bed/Bed2Vcf.py,sha256=u0mp_2Y4UtEA839zq9UENesH6Gyiwd4sZQW9wFnBV
|
|
|
86
87
|
biopipen/scripts/bed/BedConsensus.py,sha256=gfAxuIalvCEpS0tiOyAJGPYGgHN0L-hm0K37Iteh5yw,2386
|
|
87
88
|
biopipen/scripts/bed/BedLiftOver.sh,sha256=Y4gBsz9w4zhE29UmWojO6F4PXMMMWC1uCzjrxa19eOs,256
|
|
88
89
|
biopipen/scripts/bed/BedtoolsIntersect.py,sha256=sFyXPL3kG59xa5eJwHumcQLw3lfabOXsq2-k8IgIqt4,1722
|
|
90
|
+
biopipen/scripts/bed/BedtoolsMakeWindows.py,sha256=Ip4U0ORXti65g7znZeHEbvw2PGlCxoEfeucZmw4wb1o,1428
|
|
89
91
|
biopipen/scripts/bed/BedtoolsMerge.py,sha256=7mt307V_wWa_ME0VfuMsVX0HgEwfDcZtY_bDvOPjFiQ,368
|
|
90
92
|
biopipen/scripts/cellranger/CellRangerCount.py,sha256=b9kkHPpq-bFh-3XCUdxdMKg3SsZmEzZFrG3dbtb4fX8,2875
|
|
91
93
|
biopipen/scripts/cellranger/CellRangerSummary.R,sha256=mVOCIHngEpJIKVD3tMG5UWqS0OQGGjY6yx6ikRcqQU4,11067
|
|
@@ -126,12 +128,12 @@ biopipen/scripts/plot/Scatter.R,sha256=fg4H5rgdr6IePTMAIysiElnZme0vCh1T0wrwH2Q9x
|
|
|
126
128
|
biopipen/scripts/plot/VennDiagram.R,sha256=Am9umSGr2QxZc2MIMGMBhpoEqta3qt_ItF-9_Y53SXE,704
|
|
127
129
|
biopipen/scripts/protein/Prodigy.py,sha256=W56e51SkaWqthrkCSr2HUqhE9NfJQWZj4y0HXIqaYRA,4459
|
|
128
130
|
biopipen/scripts/protein/ProdigySummary.R,sha256=1s3ofk6Kvs--GAAvzV8SdAyq5LrYozWtIlL32b6ZarE,3806
|
|
129
|
-
biopipen/scripts/regulatory/MotifAffinityTest.R,sha256=
|
|
130
|
-
biopipen/scripts/regulatory/MotifAffinityTest_AtSNP.R,sha256=
|
|
131
|
-
biopipen/scripts/regulatory/MotifAffinityTest_MotifBreakR.R,sha256=
|
|
131
|
+
biopipen/scripts/regulatory/MotifAffinityTest.R,sha256=McAnbduE_6SMD_4RuftBemPdfJD9LeFYUYwqL3fzfjU,3047
|
|
132
|
+
biopipen/scripts/regulatory/MotifAffinityTest_AtSNP.R,sha256=lzP3EtmpMucWviLDgXLeP_JvG4VADykBOl49CkftiR8,4366
|
|
133
|
+
biopipen/scripts/regulatory/MotifAffinityTest_MotifBreakR.R,sha256=WpkPVNvWonmZqQ8khdDg2VHhda7ZHexvLFGRz4qgv88,3304
|
|
132
134
|
biopipen/scripts/regulatory/MotifScan.py,sha256=WtSbs8z08oeTgzjr0LfIDmjbUdknAh1raa_QPQ_NCFg,5336
|
|
133
|
-
biopipen/scripts/regulatory/
|
|
134
|
-
biopipen/scripts/regulatory/
|
|
135
|
+
biopipen/scripts/regulatory/VariantMotifPlot.R,sha256=RNnBc0bboGfrJOPk5CsUbFRBMBvVX8zgGsrI5eybNyo,2874
|
|
136
|
+
biopipen/scripts/regulatory/motifs-common.R,sha256=7deFrEzZKYzNhmYTsBqqb91CbIj2vtF7lRiPX0yGkO8,13277
|
|
135
137
|
biopipen/scripts/rnaseq/Simulation-ESCO.R,sha256=68cEHDdJclX8P8Q7ey9yBOfK09M_kxlL6zgYXsEL2Rs,6378
|
|
136
138
|
biopipen/scripts/rnaseq/Simulation-RUVcorr.R,sha256=6C6Ke5RLF0fC2V9WQPoFEdqoDabCnhslZBIyB6zhIxc,1155
|
|
137
139
|
biopipen/scripts/rnaseq/Simulation.R,sha256=PK9tZS88AcBPStcFalZlMU0KE0gSqFSQvhUoQ-8eg90,871
|
|
@@ -250,8 +252,8 @@ biopipen/scripts/vcf/BcftoolsAnnotate.py,sha256=iS-T6IhumqePW5kdyi_Tb6rubyIiCMjS
|
|
|
250
252
|
biopipen/scripts/vcf/BcftoolsFilter.py,sha256=AdQMXFTNLCS5eqYWMNIMbkK8qXJ5j8GYm7HdPopVk0c,2573
|
|
251
253
|
biopipen/scripts/vcf/BcftoolsSort.py,sha256=tU0pTrEIB-7x6iOSfU-KpYY1rEidi6Q4179NntY3cGc,3778
|
|
252
254
|
biopipen/scripts/vcf/BcftoolsView.py,sha256=Sj3KkYPpwQFo5kmZC5MRxItrSE5KVZi0jNYrRFck3Ow,2465
|
|
253
|
-
biopipen/scripts/vcf/TruvariBench.sh,sha256=
|
|
254
|
-
biopipen/scripts/vcf/TruvariBenchSummary.R,sha256=
|
|
255
|
+
biopipen/scripts/vcf/TruvariBench.sh,sha256=5M7lZhO4laNJvCVCHudf8DYArKNXoiPWuSkXgRi2t_A,908
|
|
256
|
+
biopipen/scripts/vcf/TruvariBenchSummary.R,sha256=rdNNIPoiwqnK6oEOlQUUhnL1MF958W_nDjRCkA5ubz4,1516
|
|
255
257
|
biopipen/scripts/vcf/TruvariConsistency.R,sha256=6h20v8qztbl1KZInJwoSK_t5XwqhKMTMzPWNPhoAjlM,2314
|
|
256
258
|
biopipen/scripts/vcf/Vcf2Bed.py,sha256=LzyJ9qW1s5mbfF8maLc77_0rE98KMc2lq1R94_NFbSU,855
|
|
257
259
|
biopipen/scripts/vcf/VcfAnno.py,sha256=FW626rAs_WSU7AHQMKjfkYoByUGh_gVyJM97neGfOMo,802
|
|
@@ -284,7 +286,7 @@ biopipen/utils/reference.py,sha256=oi5evicLwHxF0KAIPNZohBeHJLJQNWFJH0cr2y5pgcg,5
|
|
|
284
286
|
biopipen/utils/rnaseq.R,sha256=Ro2B2dG-Z2oVaT5tkwp9RHBz4dp_RF-JcizlM5GYXFs,1298
|
|
285
287
|
biopipen/utils/single_cell.R,sha256=pJjYP8bIZpNAtTQ32rOXhZxaM1Y-6D-xUcK3pql9tbk,4316
|
|
286
288
|
biopipen/utils/vcf.py,sha256=ajXs0M_QghEctlvUlSRjWQIABVF02wPdYd-0LP4mIsU,9377
|
|
287
|
-
biopipen-0.31.
|
|
288
|
-
biopipen-0.31.
|
|
289
|
-
biopipen-0.31.
|
|
290
|
-
biopipen-0.31.
|
|
289
|
+
biopipen-0.31.7.dist-info/METADATA,sha256=dmsDb7Q7iTWvkZjChqUgVqhb2CiQP4j8OA0jBzRYgGY,882
|
|
290
|
+
biopipen-0.31.7.dist-info/WHEEL,sha256=sP946D7jFCHeNz5Iq4fL4Lu-PrWrFsgfLXbbkciIZwg,88
|
|
291
|
+
biopipen-0.31.7.dist-info/entry_points.txt,sha256=BYqHGBQJxyFDNLYqgH64ycI5PYwnlqwYcCFsMvJgzAU,653
|
|
292
|
+
biopipen-0.31.7.dist-info/RECORD,,
|
|
@@ -1,33 +0,0 @@
|
|
|
1
|
-
#' Convert SNP information into the list structure used by atSNP
#'
#' @param snpinfo Data frame with (at least) the columns `name`, `ref`, `alt`
#'   and `ref_seq`. The full set of expected columns is:
#'   c("chrom", "start", "end", "name", "score", "strand", "ref", "alt", "ref_seq", "alt_seq")
#' @return A list with the integer-encoded reference sequences
#'   (`sequence_matrix`, one column per SNP when all sequences have the same
#'   length), the encoded reference and alternative bases, the SNP ids, a
#'   fixed transition matrix and prior, and four `rsid.*` entries set to NULL
snpinfo2atsnp <- function(snpinfo) {
    has_indel <- any(nchar(snpinfo$ref) != 1) || any(nchar(snpinfo$alt) != 1)
    if (has_indel) {
        stop("Only SNVs are supported by atSNP. Consider using motifbreakR instead if you have indels.")
    }

    # Bases are encoded as integers: A = 1, C = 2, G = 3, T = 4
    base_codes <- c(A = 1, C = 2, G = 3, T = 4)
    bases <- names(base_codes)
    encode <- function(x) as.integer(base_codes[x])

    # Fixed 4 x 4 transition matrix, rows and columns in the order A, C, G, T
    transition <- matrix(
        c(
            0.3225035, 0.1738422, 0.24915044, 0.2545039,
            0.3451410, 0.2642147, 0.05245011, 0.3381942,
            0.2813089, 0.2136604, 0.26749171, 0.2375390,
            0.2149776, 0.2071733, 0.25309238, 0.3247568
        ),
        nrow = 4,
        byrow = TRUE,
        dimnames = list(bases, bases)
    )

    encoded_seqs <- sapply(
        snpinfo$ref_seq,
        function(s) encode(strsplit(s, "")[[1]])
    )

    list(
        sequence_matrix = unname(encoded_seqs),
        ref_base = encode(snpinfo$ref),
        snp_base = encode(snpinfo$alt),
        snpids = snpinfo$name,
        transition = transition,
        prior = c(A = 0.287, C = 0.211, G = 0.213, T = 0.289),
        rsid.na = NULL,
        rsid.rm = NULL,
        rsid.duplicate = NULL,
        rsid.missing = NULL
    )
}
|