biopipen 0.28.1__py3-none-any.whl → 0.29.1__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of biopipen might be problematic. Click here for more details.
- biopipen/__init__.py +1 -1
- biopipen/core/config.toml +8 -0
- biopipen/ns/bam.py +0 -2
- biopipen/ns/bed.py +35 -0
- biopipen/ns/cellranger_pipeline.py +5 -5
- biopipen/ns/cnv.py +18 -2
- biopipen/ns/cnvkit_pipeline.py +16 -11
- biopipen/ns/gene.py +68 -23
- biopipen/ns/misc.py +2 -15
- biopipen/ns/plot.py +204 -0
- biopipen/ns/regulatory.py +214 -0
- biopipen/ns/scrna.py +31 -5
- biopipen/ns/snp.py +516 -8
- biopipen/ns/stats.py +167 -3
- biopipen/ns/vcf.py +196 -0
- biopipen/reports/snp/PlinkCallRate.svelte +24 -0
- biopipen/reports/snp/PlinkFreq.svelte +18 -0
- biopipen/reports/snp/PlinkHWE.svelte +18 -0
- biopipen/reports/snp/PlinkHet.svelte +18 -0
- biopipen/reports/snp/PlinkIBD.svelte +18 -0
- biopipen/scripts/bam/CNVpytor.py +144 -46
- biopipen/scripts/bed/BedtoolsIntersect.py +54 -0
- biopipen/scripts/bed/BedtoolsMerge.py +1 -1
- biopipen/scripts/cnv/AneuploidyScore.R +30 -7
- biopipen/scripts/cnv/AneuploidyScoreSummary.R +5 -2
- biopipen/scripts/cnv/TMADScore.R +21 -5
- biopipen/scripts/cnv/TMADScoreSummary.R +6 -2
- biopipen/scripts/cnvkit/CNVkitAccess.py +2 -1
- biopipen/scripts/cnvkit/CNVkitAutobin.py +3 -2
- biopipen/scripts/cnvkit/CNVkitBatch.py +1 -1
- biopipen/scripts/cnvkit/CNVkitCoverage.py +2 -1
- biopipen/scripts/cnvkit/CNVkitGuessBaits.py +1 -1
- biopipen/scripts/cnvkit/CNVkitHeatmap.py +1 -1
- biopipen/scripts/cnvkit/CNVkitReference.py +2 -1
- biopipen/scripts/delim/SampleInfo.R +10 -5
- biopipen/scripts/gene/GeneNameConversion.R +65 -0
- biopipen/scripts/gene/GenePromoters.R +61 -0
- biopipen/scripts/misc/Shell.sh +15 -0
- biopipen/scripts/plot/Manhattan.R +146 -0
- biopipen/scripts/plot/QQPlot.R +146 -0
- biopipen/scripts/regulatory/MotifAffinityTest.R +226 -0
- biopipen/scripts/regulatory/MotifAffinityTest_AtSNP.R +126 -0
- biopipen/scripts/regulatory/MotifAffinityTest_MotifBreakR.R +96 -0
- biopipen/scripts/regulatory/MotifScan.py +159 -0
- biopipen/scripts/regulatory/atSNP.R +33 -0
- biopipen/scripts/regulatory/motifBreakR.R +1594 -0
- biopipen/scripts/scrna/MarkersFinder.R +69 -67
- biopipen/scripts/scrna/SeuratClustering.R +71 -29
- biopipen/scripts/scrna/SeuratMap2Ref.R +20 -0
- biopipen/scripts/scrna/SeuratPreparing.R +252 -122
- biopipen/scripts/scrna/SeuratSubClustering.R +76 -27
- biopipen/scripts/snp/MatrixEQTL.R +85 -44
- biopipen/scripts/snp/Plink2GTMat.py +133 -0
- biopipen/scripts/snp/PlinkCallRate.R +190 -0
- biopipen/scripts/snp/PlinkFilter.py +100 -0
- biopipen/scripts/snp/PlinkFreq.R +298 -0
- biopipen/scripts/snp/PlinkFromVcf.py +78 -0
- biopipen/scripts/snp/PlinkHWE.R +80 -0
- biopipen/scripts/snp/PlinkHet.R +92 -0
- biopipen/scripts/snp/PlinkIBD.R +200 -0
- biopipen/scripts/snp/PlinkUpdateName.py +124 -0
- biopipen/scripts/stats/Mediation.R +94 -0
- biopipen/scripts/stats/MetaPvalue.R +2 -1
- biopipen/scripts/stats/MetaPvalue1.R +70 -0
- biopipen/scripts/tcr/TCRClusterStats.R +12 -7
- biopipen/scripts/vcf/BcftoolsAnnotate.py +91 -0
- biopipen/scripts/vcf/BcftoolsFilter.py +90 -0
- biopipen/scripts/vcf/BcftoolsSort.py +113 -0
- biopipen/scripts/vcf/BcftoolsView.py +73 -0
- biopipen/scripts/vcf/VcfFix_utils.py +1 -1
- biopipen/scripts/vcf/bcftools_utils.py +52 -0
- biopipen/utils/gene.R +83 -37
- biopipen/utils/gene.py +108 -60
- biopipen/utils/misc.R +56 -0
- biopipen/utils/misc.py +5 -2
- biopipen/utils/reference.py +54 -10
- {biopipen-0.28.1.dist-info → biopipen-0.29.1.dist-info}/METADATA +2 -2
- {biopipen-0.28.1.dist-info → biopipen-0.29.1.dist-info}/RECORD +80 -51
- {biopipen-0.28.1.dist-info → biopipen-0.29.1.dist-info}/entry_points.txt +1 -1
- biopipen/ns/bcftools.py +0 -111
- biopipen/scripts/bcftools/BcftoolsAnnotate.py +0 -42
- biopipen/scripts/bcftools/BcftoolsFilter.py +0 -79
- biopipen/scripts/bcftools/BcftoolsSort.py +0 -19
- biopipen/scripts/gene/GeneNameConversion.py +0 -66
- {biopipen-0.28.1.dist-info → biopipen-0.29.1.dist-info}/WHEEL +0 -0
|
@@ -0,0 +1,61 @@
|
|
|
1
|
+
library(rlang)
|
|
2
|
+
library(rtracklayer)
|
|
3
|
+
|
|
4
|
+
infile <- {{in.infile | r}}
|
|
5
|
+
outfile <- {{out.outfile | r}}
|
|
6
|
+
up <- {{envs.up | r}}
|
|
7
|
+
down <- {{envs.down | r}}
|
|
8
|
+
notfound <- {{envs.notfound | r}}
|
|
9
|
+
refgene <- {{envs.refgene | r}}
|
|
10
|
+
header <- {{envs.header | r}}
|
|
11
|
+
genecol <- {{envs.genecol | r}}
|
|
12
|
+
match_id <- {{envs.match_id | r}}
|
|
13
|
+
sort_ <- {{envs.sort | r}}
|
|
14
|
+
chrsize <- {{envs.chrsize | r}}
|
|
15
|
+
|
|
16
|
+
down <- down %||% up
|
|
17
|
+
|
|
18
|
+
refgenes <- readGFF(refgene)
|
|
19
|
+
refcol <- ifelse(match_id, "gene_id", "gene_name")
|
|
20
|
+
|
|
21
|
+
if (infile == "/dev/null") {
|
|
22
|
+
genes <- unique(refgenes[[refcol]])
|
|
23
|
+
} else {
|
|
24
|
+
data <- read.table(infile, header=header, sep="\t", stringsAsFactors=FALSE, check.names=FALSE)
|
|
25
|
+
genes <- data[[genecol]]
|
|
26
|
+
rm(data)
|
|
27
|
+
}
|
|
28
|
+
|
|
29
|
+
notfound_genes <- setdiff(genes, refgenes[[refcol]])
|
|
30
|
+
if (notfound == "error" && length(notfound_genes) > 0) {
|
|
31
|
+
stop(paste(
|
|
32
|
+
"The following genes were not found in the reference annotation:",
|
|
33
|
+
paste(notfound_genes, collapse=", ")
|
|
34
|
+
))
|
|
35
|
+
} else if (notfound == 'skip') {
|
|
36
|
+
genes <- genes[!genes %in% notfound_genes]
|
|
37
|
+
}
|
|
38
|
+
|
|
39
|
+
# Select the genes that are in the reference annotation and keep the order
|
|
40
|
+
# of the records in genes
|
|
41
|
+
refgenes <- refgenes[match(genes, refgenes[[refcol]]), , drop = FALSE]
|
|
42
|
+
refgenes <- unique(makeGRangesFromDataFrame(refgenes, keep.extra.columns=TRUE))
|
|
43
|
+
|
|
44
|
+
proms <- promoters(refgenes, up=up, down=down)
|
|
45
|
+
# Scores must be non-NA numeric values
|
|
46
|
+
elementMetadata(proms)$name <- elementMetadata(proms)[[refcol]]
|
|
47
|
+
score(proms) <- 0
|
|
48
|
+
start(proms) <- pmax(1, start(proms))
|
|
49
|
+
|
|
50
|
+
if (sort_) {
|
|
51
|
+
chrom_sizes <- read.table(chrsize, header=FALSE, stringsAsFactors=FALSE, sep="\t")
|
|
52
|
+
common_chroms <- intersect(chrom_sizes$V1, seqlevels(proms))
|
|
53
|
+
if (length(common_chroms) == 0) {
|
|
54
|
+
stop("No common chromosomes found between the promoters and the chromosome sizes. Do you use the correct chromosome sizes file?")
|
|
55
|
+
}
|
|
56
|
+
proms <- keepSeqlevels(proms, common_chroms, pruning.mode="coarse")
|
|
57
|
+
seqlevels(proms) <- common_chroms
|
|
58
|
+
proms <- sort(proms, ignore.strand = TRUE)
|
|
59
|
+
}
|
|
60
|
+
|
|
61
|
+
export.bed(proms, outfile)
|
|
@@ -0,0 +1,15 @@
|
|
|
1
|
+
# shellcheck disable=all
|
|
2
|
+
export infile={{in.infile | quote}}
|
|
3
|
+
export outfile={{out.outfile | quote}}
|
|
4
|
+
is_outdir={{envs.outdir | int}}
|
|
5
|
+
cmd_given={{envs.cmd | bool | int}}
|
|
6
|
+
{% set _ = out.outfile | dirname | joinpath: "cmd.sh" | as_path | attr: 'write_text' | call: envs.cmd %}
|
|
7
|
+
cmd="{{proc.lang}} {{out.outfile | dirname | joinpath: 'cmd.sh'}}"
|
|
8
|
+
if [[ "$cmd_given" -eq 0 ]]; then
|
|
9
|
+
echo "No command given." 1>&2
|
|
10
|
+
exit 1
|
|
11
|
+
fi
|
|
12
|
+
if [[ $is_outdir -eq 1 ]]; then
|
|
13
|
+
mkdir -p "$outfile"
|
|
14
|
+
fi
|
|
15
|
+
eval "$cmd"
|
|
@@ -0,0 +1,146 @@
|
|
|
1
|
+
source("{{biopipen_dir}}/utils/misc.R")
|
|
2
|
+
library(rlang)
|
|
3
|
+
library(ggmanh)
|
|
4
|
+
|
|
5
|
+
infile <- {{in.infile | r}}
|
|
6
|
+
outfile <- {{out.outfile | r}}
|
|
7
|
+
chrom_col <- {{envs.chrom_col | r}}
|
|
8
|
+
pos_col <- {{envs.pos_col | r}}
|
|
9
|
+
pval_col <- {{envs.pval_col | r}}
|
|
10
|
+
label_col <- {{envs.label_col | r}}
|
|
11
|
+
devpars <- {{envs.devpars | r}}
|
|
12
|
+
title <- {{envs.title | r}}
|
|
13
|
+
ylabel <- {{envs.ylabel | r}}
|
|
14
|
+
rescale <- {{envs.rescale | r}}
|
|
15
|
+
rescale_ratio_threshold <- {{envs.rescale_ratio_threshold | r}}
|
|
16
|
+
signif <- {{envs.signif | r}}
|
|
17
|
+
hicolors <- {{envs.hicolors | r}}
|
|
18
|
+
thin_n <- {{envs.thin_n | r}}
|
|
19
|
+
thin_bins <- {{envs.thin_bins | r}}
|
|
20
|
+
zoom <- {{envs.zoom | r}}
|
|
21
|
+
zoom_devpars <- {{envs.zoom_devpars | r}}
|
|
22
|
+
chroms <- {{envs.chroms | r}}
|
|
23
|
+
args <- {{envs.args | r: todot="-"}}
|
|
24
|
+
|
|
25
|
+
data <- read.table(infile, header=TRUE, sep="\t", stringsAsFactors=FALSE, check.names = FALSE)
|
|
26
|
+
|
|
27
|
+
# normalize columns
|
|
28
|
+
cnames <- colnames(data)
|
|
29
|
+
if (is.numeric(chrom_col)) { chrom_col <- cnames[chrom_col] }
|
|
30
|
+
if (is.numeric(pos_col)) { pos_col <- cnames[pos_col] }
|
|
31
|
+
if (is.numeric(pval_col)) { pval_col <- cnames[pval_col] }
|
|
32
|
+
if (is.numeric(label_col)) { label_col <- cnames[label_col] }
|
|
33
|
+
|
|
34
|
+
# normalize chroms
|
|
35
|
+
norm_chroms <- function(chrs) {
|
|
36
|
+
chrs <- as.character(chrs)
|
|
37
|
+
if (length(chrs) == 1 && grepl(",", chrs)) {
|
|
38
|
+
chrs <- trimws(unlist(strsplit(chrs, ",")))
|
|
39
|
+
}
|
|
40
|
+
if (length(chrs) > 1) {
|
|
41
|
+
return(unique(unlist(sapply(chrs, function(chr) norm_chroms(chr)))))
|
|
42
|
+
}
|
|
43
|
+
if (!grepl("-", chrs)) { return(chrs) }
|
|
44
|
+
|
|
45
|
+
# expand chr1-22 -> chr1, chr2, ..., chr22
|
|
46
|
+
# chr1-22 -> 'chr1', '22'
|
|
47
|
+
chrs <- unlist(strsplit(chrs, "-"))
|
|
48
|
+
if (length(chrs) != 2) {
|
|
49
|
+
stop(paste0("Invalid chroms: ", chrs))
|
|
50
|
+
}
|
|
51
|
+
# detect prefix
|
|
52
|
+
prefix1 <- gsub("[0-9]", "", chrs[1])
|
|
53
|
+
prefix2 <- gsub("[0-9]", "", chrs[2])
|
|
54
|
+
if (nchar(prefix2) > 0 && prefix1 != prefix2) {
|
|
55
|
+
stop(paste0("Invalid chroms: ", chrs, " (prefix mismatch)"))
|
|
56
|
+
}
|
|
57
|
+
chr_a <- as.integer(substring(chrs[1], nchar(prefix1) + 1))
|
|
58
|
+
chr_b <- as.integer(substring(chrs[2], nchar(prefix2) + 1))
|
|
59
|
+
chr_min <- min(chr_a, chr_b)
|
|
60
|
+
chr_max <- max(chr_a, chr_b)
|
|
61
|
+
return(paste0(prefix1, chr_min:chr_max))
|
|
62
|
+
}
|
|
63
|
+
|
|
64
|
+
log_info("Preparing data for plotting ...")
|
|
65
|
+
if (length(chroms) == 1 && chroms == "auto") {
|
|
66
|
+
chroms <- unique(data[[chrom_col]])
|
|
67
|
+
} else {
|
|
68
|
+
chroms <- norm_chroms(chroms)
|
|
69
|
+
}
|
|
70
|
+
|
|
71
|
+
# prepare data
|
|
72
|
+
mp_prep_args = list()
|
|
73
|
+
if (length(signif) == 1 && is.character(signif)) {
|
|
74
|
+
signif <- as.numeric(trimws(unlist(strsplit(signif, ","))))
|
|
75
|
+
}
|
|
76
|
+
siglevel <- min(signif)
|
|
77
|
+
if (!is.null(label_col)) {
|
|
78
|
+
data$.label <- ifelse(data[[pval_col]] < siglevel, data[[label_col]], "")
|
|
79
|
+
}
|
|
80
|
+
if (!is.null(hicolors)) {
|
|
81
|
+
sig_str <- "Significant"
|
|
82
|
+
nsig_str <- "Not significant"
|
|
83
|
+
data$.highlight <- ifelse(data[[pval_col]] < siglevel, sig_str, nsig_str)
|
|
84
|
+
if (length(hicolors) == 1) { hicolors <- c(hicolors, "grey") }
|
|
85
|
+
names(hicolors) <- c(sig_str, nsig_str)
|
|
86
|
+
mp_prep_args$highlight.colname <- ".highlight"
|
|
87
|
+
mp_prep_args$highlight.col <- hicolors
|
|
88
|
+
}
|
|
89
|
+
mp_prep_args$x <- data
|
|
90
|
+
mp_prep_args$chr.colname <- chrom_col
|
|
91
|
+
mp_prep_args$pos.colname <- pos_col
|
|
92
|
+
mp_prep_args$pval.colname <- pval_col
|
|
93
|
+
mp_prep_args$chr.order <- chroms
|
|
94
|
+
if (!is.null(thin_n) && thin_n > 0) {
|
|
95
|
+
mp_prep_args$thin.n <- thin_n
|
|
96
|
+
mp_prep_args$thin.bins <- thin_bins
|
|
97
|
+
}
|
|
98
|
+
|
|
99
|
+
mpdata <- do_call(manhattan_data_preprocess, mp_prep_args)
|
|
100
|
+
|
|
101
|
+
# plot
|
|
102
|
+
log_info("Plotting Manhattan plot ...")
|
|
103
|
+
args$x <- mpdata
|
|
104
|
+
args$signif <- signif
|
|
105
|
+
args$plot.title <- title
|
|
106
|
+
args$rescale <- rescale
|
|
107
|
+
args$rescale.ratio.threshold <- rescale_ratio_threshold
|
|
108
|
+
args$y.label <- ylabel
|
|
109
|
+
if (!is.null(hicolors)) { args$color.by.highlight <- TRUE }
|
|
110
|
+
if (!is.null(label_col)) { args$label.colname <- ".label" }
|
|
111
|
+
g <- do_call(manhattan_plot, args)
|
|
112
|
+
|
|
113
|
+
png(outfile, width=devpars$width, height=devpars$height, res=devpars$res)
|
|
114
|
+
print(g)
|
|
115
|
+
dev.off()
|
|
116
|
+
|
|
117
|
+
# zoom into chromosomes
|
|
118
|
+
all_chroms <- as.character(unique(mpdata$data[[mpdata$chr.colname]]))
|
|
119
|
+
if (!is.null(zoom)) {
|
|
120
|
+
log_info("Zooming into chromosomes ...")
|
|
121
|
+
zoom <- norm_chroms(zoom)
|
|
122
|
+
for (z in zoom) {
|
|
123
|
+
if (!z %in% all_chroms) {
|
|
124
|
+
log_warn("- {z}: not found in data")
|
|
125
|
+
next
|
|
126
|
+
}
|
|
127
|
+
log_info("- {z}")
|
|
128
|
+
args_z <- args
|
|
129
|
+
args_z$chromosome <- z
|
|
130
|
+
args_z$plot.title <- paste0(title, " (", z, ")")
|
|
131
|
+
args_z$x.label <- "Position"
|
|
132
|
+
g_z <- do_call(manhattan_plot, args_z)
|
|
133
|
+
outfile_z <- gsub("\\.png$", paste0("-", z, ".png"), outfile)
|
|
134
|
+
zm_devpars <- zoom_devpars
|
|
135
|
+
zm_devpars$res <- zm_devpars$res %||% devpars$res
|
|
136
|
+
zm_devpars$height <- zm_devpars$height %||% devpars$height
|
|
137
|
+
png(
|
|
138
|
+
outfile_z,
|
|
139
|
+
width=zm_devpars$width,
|
|
140
|
+
height=zm_devpars$height,
|
|
141
|
+
res=zm_devpars$res
|
|
142
|
+
)
|
|
143
|
+
print(g_z)
|
|
144
|
+
dev.off()
|
|
145
|
+
}
|
|
146
|
+
}
|
|
@@ -0,0 +1,146 @@
|
|
|
1
|
+
source("{{biopipen_dir}}/utils/misc.R")
|
|
2
|
+
|
|
3
|
+
library(rlang)
|
|
4
|
+
library(stats)
|
|
5
|
+
library(ggplot2)
|
|
6
|
+
library(ggprism)
|
|
7
|
+
library(qqplotr)
|
|
8
|
+
|
|
9
|
+
theme_set(theme_prism())
|
|
10
|
+
|
|
11
|
+
infile <- {{in.infile | r}}
|
|
12
|
+
theorfile <- {{in.theorfile | r}}
|
|
13
|
+
outfile <- {{out.outfile | r}}
|
|
14
|
+
val_col <- {{envs.val_col | r}}
|
|
15
|
+
theor_col <- {{envs.theor_col | r}}
|
|
16
|
+
theor_trans <- {{envs.theor_trans | r}}
|
|
17
|
+
theor_funs <- {{envs.theor_funs | r}}
|
|
18
|
+
devpars <- {{envs.devpars | r}}
|
|
19
|
+
title <- {{envs.title | r}}
|
|
20
|
+
xlabel <- {{envs.xlabel | r}}
|
|
21
|
+
ylabel <- {{envs.ylabel | r}}
|
|
22
|
+
kind <- {{envs.kind | r}}
|
|
23
|
+
trans <- {{envs.trans | r}}
|
|
24
|
+
args <- {{envs.args | r}}
|
|
25
|
+
band_args <- {{envs.band | r}}
|
|
26
|
+
line_args <- {{envs.line | r}}
|
|
27
|
+
point_args <- {{envs.point | r}}
|
|
28
|
+
ggs <- {{envs.ggs | r}}
|
|
29
|
+
|
|
30
|
+
.eval_fun <- function(fun) {
|
|
31
|
+
if (is.character(fun)) {
|
|
32
|
+
fun <- trimws(fun)
|
|
33
|
+
if (grepl("^-\\s*[a-zA-Z\\.][0-9a-zA-Z\\._]*$", fun)) {
|
|
34
|
+
fun <- trimws(substring(fun, 2))
|
|
35
|
+
fun <- eval(parse(text = fun))
|
|
36
|
+
return(function(x) -fun(x))
|
|
37
|
+
} else {
|
|
38
|
+
return(eval(parse(text = fun)))
|
|
39
|
+
}
|
|
40
|
+
} else {
|
|
41
|
+
return(fun)
|
|
42
|
+
}
|
|
43
|
+
}
|
|
44
|
+
|
|
45
|
+
indata <- read.table(infile, header=TRUE, sep="\t", stringsAsFactors=FALSE, check.names = FALSE)
|
|
46
|
+
if (is.numeric(val_col)) {
|
|
47
|
+
val_col <- colnames(indata)[val_col]
|
|
48
|
+
}
|
|
49
|
+
if (!is.null(trans)) {
|
|
50
|
+
trans <- .eval_fun(trans)
|
|
51
|
+
indata[[val_col]] <- trans(indata[[val_col]])
|
|
52
|
+
}
|
|
53
|
+
|
|
54
|
+
if (!is.null(theor_col)) {
|
|
55
|
+
if (is.numeric(theor_col)) {
|
|
56
|
+
theor_col <- colnames(theor)[theor_col]
|
|
57
|
+
}
|
|
58
|
+
|
|
59
|
+
if (!is.null(theorfile)) {
|
|
60
|
+
theor <- read.table(theorfile, header=TRUE, sep="\t", stringsAsFactors=FALSE, check.names = FALSE)
|
|
61
|
+
theor_vals <- theor[[theor_col]]
|
|
62
|
+
} else {
|
|
63
|
+
theor_vals <- indata[[theor_col]]
|
|
64
|
+
}
|
|
65
|
+
|
|
66
|
+
if (!is.null(theor_trans)) {
|
|
67
|
+
theor_trans <- .eval_fun(theor_trans)
|
|
68
|
+
theor_vals <- theor_trans(theor_vals)
|
|
69
|
+
}
|
|
70
|
+
theor_vals <- sort(na.omit(theor_vals))
|
|
71
|
+
}
|
|
72
|
+
|
|
73
|
+
band_fun <- ifelse(kind == "pp", stat_pp_band, stat_qq_band)
|
|
74
|
+
line_fun <- ifelse(kind == "pp", stat_pp_line, stat_qq_line)
|
|
75
|
+
point_fun <- ifelse(kind == "pp", stat_pp_point, stat_qq_point)
|
|
76
|
+
|
|
77
|
+
for (fun in names(theor_funs)) {
|
|
78
|
+
assign(fun, .eval_fun(theor_funs[[fun]]))
|
|
79
|
+
}
|
|
80
|
+
|
|
81
|
+
if (!is.null(band_args) || isFALSE(band_args)) {
|
|
82
|
+
if (isTRUE(band_args$disabled)) {
|
|
83
|
+
band_args <- NULL
|
|
84
|
+
} else {
|
|
85
|
+
band_args$disabled <- NULL
|
|
86
|
+
band_args <- list_update(band_args, args)
|
|
87
|
+
if (band_args$distribution == "custom") {
|
|
88
|
+
band_args$dparams <- band_args$dparams %||% list()
|
|
89
|
+
band_args$dparams$values <- theor_vals
|
|
90
|
+
}
|
|
91
|
+
}
|
|
92
|
+
}
|
|
93
|
+
if (!is.null(line_args) || isFALSE(line_args)) {
|
|
94
|
+
if (isTRUE(line_args$disabled)) {
|
|
95
|
+
line_args <- NULL
|
|
96
|
+
} else {
|
|
97
|
+
line_args$disabled <- NULL
|
|
98
|
+
line_args <- list_update(line_args, args)
|
|
99
|
+
if (line_args$distribution == "custom") {
|
|
100
|
+
line_args$dparams <- line_args$dparams %||% list()
|
|
101
|
+
line_args$dparams$values <- theor_vals
|
|
102
|
+
}
|
|
103
|
+
}
|
|
104
|
+
}
|
|
105
|
+
if (!is.null(point_args) || isFALSE(point_args)) {
|
|
106
|
+
if (isTRUE(point_args$disabled)) {
|
|
107
|
+
point_args <- NULL
|
|
108
|
+
} else {
|
|
109
|
+
point_args$disabled <- NULL
|
|
110
|
+
point_args <- list_update(point_args, args)
|
|
111
|
+
if (point_args$distribution == "custom") {
|
|
112
|
+
point_args$dparams <- point_args$dparams %||% list()
|
|
113
|
+
point_args$dparams$values <- theor_vals
|
|
114
|
+
}
|
|
115
|
+
}
|
|
116
|
+
}
|
|
117
|
+
|
|
118
|
+
title <- title %||% waiver()
|
|
119
|
+
xlabel <- xlabel %||% waiver()
|
|
120
|
+
ylabel <- ylabel %||% waiver()
|
|
121
|
+
|
|
122
|
+
indata <- indata[complete.cases(indata), , drop = FALSE]
|
|
123
|
+
indata <- indata[order(indata[[val_col]]), , drop = FALSE]
|
|
124
|
+
|
|
125
|
+
p <- ggplot(data = indata, mapping = aes(sample = !!sym(val_col))) +
|
|
126
|
+
labs(title = title, x = xlabel, y = ylabel)
|
|
127
|
+
|
|
128
|
+
if (!is.null(band_args)) {
|
|
129
|
+
p <- p + do_call(band_fun, band_args)
|
|
130
|
+
}
|
|
131
|
+
if (!is.null(line_args)) {
|
|
132
|
+
p <- p + do_call(line_fun, line_args)
|
|
133
|
+
}
|
|
134
|
+
if (!is.null(point_args)) {
|
|
135
|
+
p <- p + do_call(point_fun, point_args)
|
|
136
|
+
}
|
|
137
|
+
|
|
138
|
+
if (!is.null(ggs)) {
|
|
139
|
+
for (gg in ggs) {
|
|
140
|
+
p <- p + eval(parse(text = gg))
|
|
141
|
+
}
|
|
142
|
+
}
|
|
143
|
+
|
|
144
|
+
png(outfile, width=devpars$width, height=devpars$height, res=devpars$res)
|
|
145
|
+
print(p)
|
|
146
|
+
dev.off()
|
|
@@ -0,0 +1,226 @@
|
|
|
1
|
+
# Script for regulatory.MotifAffinityTest
|
|
2
|
+
|
|
3
|
+
source("{{biopipen_dir}}/utils/misc.R")
|
|
4
|
+
library(BiocParallel)
|
|
5
|
+
library(BSgenome)
|
|
6
|
+
library(universalmotif)
|
|
7
|
+
|
|
8
|
+
motiffile <- {{in.motiffile | r}}
|
|
9
|
+
varfile <- {{in.varfile | r}}
|
|
10
|
+
outdir <- {{out.outdir | r}}
|
|
11
|
+
ncores <- {{envs.ncores | r}}
|
|
12
|
+
tool <- {{envs.tool | r}}
|
|
13
|
+
bcftools <- {{envs.bcftools | r}}
|
|
14
|
+
genome <- {{envs.genome | r}}
|
|
15
|
+
motif_col <- {{envs.motif_col | r}}
|
|
16
|
+
regulator_col <- {{envs.regulator_col | r}}
|
|
17
|
+
notfound <- {{envs.notfound | r}}
|
|
18
|
+
motifdb <- {{envs.motifdb | r}}
|
|
19
|
+
regmotifs <- {{envs.regmotifs | r}}
|
|
20
|
+
devpars <- {{envs.devpars | r}}
|
|
21
|
+
plot_nvars <- {{envs.plot_nvars | r}}
|
|
22
|
+
plots <- {{envs.plots | r}}
|
|
23
|
+
cutoff <- {{envs.cutoff | r}}
|
|
24
|
+
|
|
25
|
+
if (is.null(motifdb) || !file.exists(motifdb)) {
|
|
26
|
+
stop("Motif database (envs.motifdb) is required and must exist")
|
|
27
|
+
}
|
|
28
|
+
|
|
29
|
+
if (is.null(genome)) {
|
|
30
|
+
stop("Reference genome (envs.ref) is required and must exist")
|
|
31
|
+
}
|
|
32
|
+
|
|
33
|
+
if (is.null(motiffile) || !file.exists(motiffile)) {
|
|
34
|
+
stop("Motif file (in.motiffile) is required and must exist")
|
|
35
|
+
}
|
|
36
|
+
|
|
37
|
+
if (is.null(varfile) || !file.exists(varfile)) {
|
|
38
|
+
stop("Variant file (in.varfile) is required and must exist")
|
|
39
|
+
}
|
|
40
|
+
|
|
41
|
+
if (is.null(motif_col) && is.null(regulator_col)) {
|
|
42
|
+
stop("Either motif (envs.motif_col) or regulator (envs.regulator_col) column must be provided")
|
|
43
|
+
}
|
|
44
|
+
|
|
45
|
+
log_info("Reading input regulator/motif file ...")
|
|
46
|
+
in_motifs <- read.table(motiffile, header=TRUE, sep="\t", stringsAsFactors=FALSE, check.names = FALSE)
|
|
47
|
+
|
|
48
|
+
if (is.null(motif_col)) {
|
|
49
|
+
log_info("Inferring motifs from regulators ...")
|
|
50
|
+
if (is.null(regmotifs) || !file.exists(regmotifs)) {
|
|
51
|
+
stop("Regulator motifs (envs.regmotifs) is required and must exist when no motif column (envs.motif_col) is provided")
|
|
52
|
+
}
|
|
53
|
+
regmotifs <- read.table(regmotifs, header=TRUE, sep="\t", stringsAsFactors=FALSE, check.names = FALSE)
|
|
54
|
+
rm_motif_col <- c('Motif', 'motif', 'MOTIF', 'Model', 'model', 'MODEL')
|
|
55
|
+
rm_reg_col <- c('Regulator', 'regulator', 'REGULATOR', 'TF', 'tf', 'TF', 'Transcription factor', 'transcription factor', 'Transcription Factor')
|
|
56
|
+
rm_motif_col <- intersect(rm_motif_col, colnames(regmotifs))
|
|
57
|
+
rm_reg_col <- intersect(rm_reg_col, colnames(regmotifs))
|
|
58
|
+
if (length(rm_motif_col) == 0) {
|
|
59
|
+
stop("No motif column found in envs.regmotifs, provide one of: ", paste(rm_motif_col, collapse = ", "))
|
|
60
|
+
}
|
|
61
|
+
if (length(rm_reg_col) == 0) {
|
|
62
|
+
stop("No regulator column found in envs.regmotifs, provide one of: ", paste(rm_reg_col, collapse = ", "))
|
|
63
|
+
}
|
|
64
|
+
rm_motif_col <- rm_motif_col[1]
|
|
65
|
+
rm_reg_col <- rm_reg_col[1]
|
|
66
|
+
# check regulators
|
|
67
|
+
rm_regs <- regmotifs[, rm_reg_col, drop = TRUE]
|
|
68
|
+
regulators <- in_motifs[, regulator_col, drop = TRUE]
|
|
69
|
+
notfound_regs <- setdiff(regulators, rm_regs)
|
|
70
|
+
if (length(notfound_regs) > 0 && notfound == "error") {
|
|
71
|
+
first_notfound <- head(notfound_regs, 3)
|
|
72
|
+
if (length(notfound_regs) > 3) {
|
|
73
|
+
first_notfound <- c(first_notfound, "...")
|
|
74
|
+
notfound_file <- file.path(outdir, "notfound_regulators.txt")
|
|
75
|
+
writeLines(notfound_regs, notfound_file)
|
|
76
|
+
msg1 <- paste0("The following regulators were not found in the envs.regmotifs file: ", paste(first_notfound, collapse = ", "))
|
|
77
|
+
msg2 <- paste0("Check the full list in ", notfound_file)
|
|
78
|
+
stop(msg1, "\n", msg2)
|
|
79
|
+
} else {
|
|
80
|
+
msg <- paste0("The following regulators were not found in the regmotifs file: ", paste(first_notfound, collapse = ", "))
|
|
81
|
+
stop(msg)
|
|
82
|
+
}
|
|
83
|
+
}
|
|
84
|
+
in_motifs <- in_motifs[in_motifs[, regulator_col] %in% rm_regs, , drop = FALSE]
|
|
85
|
+
# add motif column
|
|
86
|
+
in_motifs <- merge(in_motifs, regmotifs, by.x = regulator_col, by.y = rm_reg_col, all.x = TRUE, suffixes = c("", "_db"))
|
|
87
|
+
motif_col <- rm_motif_col
|
|
88
|
+
}
|
|
89
|
+
if (is.null(regulator_col)) {
|
|
90
|
+
# make motifs unique
|
|
91
|
+
in_moitfs <- in_motifs[!duplicated(in_motifs[, motif_col]), , drop = FALSE]
|
|
92
|
+
} else {
|
|
93
|
+
in_motifs <- in_motifs[!duplicated(in_motifs[, c(regulator_col, motif_col)]), , drop = FALSE]
|
|
94
|
+
}
|
|
95
|
+
|
|
96
|
+
|
|
97
|
+
if (!grepl(".", genome, fixed = TRUE)) {
|
|
98
|
+
genome_pkg = sprintf("BSgenome.Hsapiens.UCSC.%s", genome)
|
|
99
|
+
} else {
|
|
100
|
+
genome_pkg = genome
|
|
101
|
+
}
|
|
102
|
+
if (!requireNamespace(genome_pkg, quietly = TRUE)) {
|
|
103
|
+
stop(sprintf("Genome package %s is not installed", genome_pkg))
|
|
104
|
+
}
|
|
105
|
+
|
|
106
|
+
log_info("Reading variant file ...")
|
|
107
|
+
if (grepl("\\.vcf$", varfile) || grepl("\\.vcf\\.gz$", varfile)) {
|
|
108
|
+
log_info("Converting VCF file to BED file ...")
|
|
109
|
+
varfile_bed <- file.path(outdir, gsub("\\.vcf(\\.gz)?$", ".bed", basename(varfile)))
|
|
110
|
+
cmd <- c(
|
|
111
|
+
bcftools, "query",
|
|
112
|
+
"-f", "%CHROM\\t%POS0\\t%END\\t%ID\\t0\\t+\\t%REF\\t%ALT{0}\\n",
|
|
113
|
+
"-i", 'FILTER="PASS" || FILTER="." || FILTER=""',
|
|
114
|
+
"-o", varfile_bed,
|
|
115
|
+
varfile
|
|
116
|
+
)
|
|
117
|
+
run_command(cmd, fg = TRUE)
|
|
118
|
+
|
|
119
|
+
varfile <- varfile_bed
|
|
120
|
+
}
|
|
121
|
+
|
|
122
|
+
# `chrom`, `start`, `end`, `name`, `score`, `strand`, `ref`, `alt`.
|
|
123
|
+
snpinfo <- read.table(varfile, header=FALSE, stringsAsFactors=FALSE)
|
|
124
|
+
colnames(snpinfo) <- c("chrom", "start", "end", "name", "score", "strand", "ref", "alt")
|
|
125
|
+
|
|
126
|
+
log_info("Reading motif database ...")
|
|
127
|
+
meme <- read_meme(motifdb)
|
|
128
|
+
|
|
129
|
+
check_motifs <- function(motifdb_names) {
|
|
130
|
+
motifs <- in_motifs[, motif_col, drop = TRUE]
|
|
131
|
+
notfound_motifs <- setdiff(motifs, motifdb_names)
|
|
132
|
+
if (length(notfound_motifs) > 0) {
|
|
133
|
+
first_notfound <- head(notfound_motifs, 3)
|
|
134
|
+
if (length(notfound_motifs) > 3) {
|
|
135
|
+
first_notfound <- c(first_notfound, "...")
|
|
136
|
+
notfound_file <- file.path(outdir, "notfound_motifs.txt")
|
|
137
|
+
writeLines(notfound_motifs, notfound_file)
|
|
138
|
+
msg1 <- paste0("The following motifs were not found in the motif database: ", paste(first_notfound, collapse = ", "))
|
|
139
|
+
msg2 <- paste0("Check the full list in ", notfound_file)
|
|
140
|
+
|
|
141
|
+
if (notfound == "error") {
|
|
142
|
+
stop(msg1, "\n", msg2)
|
|
143
|
+
} else if (notfound == "ignore") {
|
|
144
|
+
log_warn(msg1)
|
|
145
|
+
log_warn(msg2)
|
|
146
|
+
}
|
|
147
|
+
} else {
|
|
148
|
+
msg <- paste0("The following motifs were not found in the motif database: ", paste(first_notfound, collapse = ", "))
|
|
149
|
+
if (notfound == "error") {
|
|
150
|
+
stop(msg)
|
|
151
|
+
} else if (notfound == "ignore") {
|
|
152
|
+
log_warn(msg)
|
|
153
|
+
}
|
|
154
|
+
}
|
|
155
|
+
|
|
156
|
+
motifs <- setdiff(motifs, notfound_motifs)
|
|
157
|
+
}
|
|
158
|
+
return(motifs)
|
|
159
|
+
}
|
|
160
|
+
|
|
161
|
+
plot_variant <- function(motifbreakr_results) {
|
|
162
|
+
log_info("Plotting variants ...")
|
|
163
|
+
plotdir <- file.path(outdir, "plots")
|
|
164
|
+
dir.create(plotdir, showWarnings = FALSE)
|
|
165
|
+
results <- motifbreakr_results
|
|
166
|
+
if (is.null(plots) || length(plots) == 0) {
|
|
167
|
+
results <- results[order(-abs(results$alleleDiff)), , drop = FALSE]
|
|
168
|
+
results <- results[1:min(plot_nvars, length(results)), , drop = FALSE]
|
|
169
|
+
variants <- unique(results$SNP_id)
|
|
170
|
+
} else {
|
|
171
|
+
variants <- names(plots)
|
|
172
|
+
}
|
|
173
|
+
for (variant in variants) {
|
|
174
|
+
log_info("- Variant: {variant}")
|
|
175
|
+
if (is.null(plots[[variant]])) {
|
|
176
|
+
plots[[variant]] <- list(devpars = devpars, which = "TRUE")
|
|
177
|
+
}
|
|
178
|
+
if (is.null(plots[[variant]]$which)) {
|
|
179
|
+
plots[[variant]]$which <- "TRUE"
|
|
180
|
+
}
|
|
181
|
+
if (is.null(plots[[variant]]$devpars)) {
|
|
182
|
+
plots[[variant]]$devpars <- devpars
|
|
183
|
+
}
|
|
184
|
+
if (is.null(plots[[variant]]$devpars$res)) {
|
|
185
|
+
plots[[variant]]$devpars$res <- 100
|
|
186
|
+
}
|
|
187
|
+
res <- results[results$SNP_id == variant, , drop = FALSE]
|
|
188
|
+
if (length(res) == 0) {
|
|
189
|
+
stop(sprintf("Variant %s not found in results", variant))
|
|
190
|
+
}
|
|
191
|
+
res <- subset(res, subset = eval(parse(text = plots[[variant]]$which)))
|
|
192
|
+
if (length(res) == 0) {
|
|
193
|
+
stop(sprintf("No variants to plot for %s", variant))
|
|
194
|
+
}
|
|
195
|
+
plotfile <- file.path(plotdir, sprintf("%s.png", slugify(variant)))
|
|
196
|
+
# fix motifBreakR 2.12 using names to filter in plotMB
|
|
197
|
+
names(res) <- res$SNP_id
|
|
198
|
+
dv <- plots[[variant]]$devpars
|
|
199
|
+
if (is.null(dv$height)) {
|
|
200
|
+
dv$height <- 2.4 * dv$res + length(res) * 1.2 * dv$res
|
|
201
|
+
}
|
|
202
|
+
if (is.null(dv$width)) {
|
|
203
|
+
left <- min(sapply(res$motifPos, `[`, 1))
|
|
204
|
+
right <- max(sapply(res$motifPos, `[`, 2))
|
|
205
|
+
dv$width <- 1.5 * dv$res + (right - left) * 0.3 * dv$res
|
|
206
|
+
}
|
|
207
|
+
png(plotfile, width = dv$width, height = dv$height, res = dv$res)
|
|
208
|
+
motifbreakR::plotMB(res, variant)
|
|
209
|
+
dev.off()
|
|
210
|
+
}
|
|
211
|
+
}
|
|
212
|
+
|
|
213
|
+
tool <- tolower(tool)
|
|
214
|
+
tool <- match.arg(tool, c("motifbreakr", "atsnp"))
|
|
215
|
+
|
|
216
|
+
if (tool == "motifbreakr") {
|
|
217
|
+
motifbreakr_args <- {{envs.motifbreakr_args | r}}
|
|
218
|
+
{% set sourcefile = biopipen_dir | joinpaths: "scripts", "regulatory", "MotifAffinityTest_MotifBreakR.R" %}
|
|
219
|
+
# {{ sourcefile | getmtime }}
|
|
220
|
+
source("{{sourcefile}}")
|
|
221
|
+
} else { # atsnp
|
|
222
|
+
atsnp_args <- {{envs.atsnp_args | r}}
|
|
223
|
+
{% set sourcefile = biopipen_dir | joinpaths: "scripts", "regulatory", "MotifAffinityTest_AtSNP.R" %}
|
|
224
|
+
# {{ sourcefile | getmtime }}
|
|
225
|
+
source("{{sourcefile}}")
|
|
226
|
+
}
|