biopipen 0.28.1__py3-none-any.whl → 0.29.1__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of biopipen might be problematic. Click here for more details.

Files changed (85) hide show
  1. biopipen/__init__.py +1 -1
  2. biopipen/core/config.toml +8 -0
  3. biopipen/ns/bam.py +0 -2
  4. biopipen/ns/bed.py +35 -0
  5. biopipen/ns/cellranger_pipeline.py +5 -5
  6. biopipen/ns/cnv.py +18 -2
  7. biopipen/ns/cnvkit_pipeline.py +16 -11
  8. biopipen/ns/gene.py +68 -23
  9. biopipen/ns/misc.py +2 -15
  10. biopipen/ns/plot.py +204 -0
  11. biopipen/ns/regulatory.py +214 -0
  12. biopipen/ns/scrna.py +31 -5
  13. biopipen/ns/snp.py +516 -8
  14. biopipen/ns/stats.py +167 -3
  15. biopipen/ns/vcf.py +196 -0
  16. biopipen/reports/snp/PlinkCallRate.svelte +24 -0
  17. biopipen/reports/snp/PlinkFreq.svelte +18 -0
  18. biopipen/reports/snp/PlinkHWE.svelte +18 -0
  19. biopipen/reports/snp/PlinkHet.svelte +18 -0
  20. biopipen/reports/snp/PlinkIBD.svelte +18 -0
  21. biopipen/scripts/bam/CNVpytor.py +144 -46
  22. biopipen/scripts/bed/BedtoolsIntersect.py +54 -0
  23. biopipen/scripts/bed/BedtoolsMerge.py +1 -1
  24. biopipen/scripts/cnv/AneuploidyScore.R +30 -7
  25. biopipen/scripts/cnv/AneuploidyScoreSummary.R +5 -2
  26. biopipen/scripts/cnv/TMADScore.R +21 -5
  27. biopipen/scripts/cnv/TMADScoreSummary.R +6 -2
  28. biopipen/scripts/cnvkit/CNVkitAccess.py +2 -1
  29. biopipen/scripts/cnvkit/CNVkitAutobin.py +3 -2
  30. biopipen/scripts/cnvkit/CNVkitBatch.py +1 -1
  31. biopipen/scripts/cnvkit/CNVkitCoverage.py +2 -1
  32. biopipen/scripts/cnvkit/CNVkitGuessBaits.py +1 -1
  33. biopipen/scripts/cnvkit/CNVkitHeatmap.py +1 -1
  34. biopipen/scripts/cnvkit/CNVkitReference.py +2 -1
  35. biopipen/scripts/delim/SampleInfo.R +10 -5
  36. biopipen/scripts/gene/GeneNameConversion.R +65 -0
  37. biopipen/scripts/gene/GenePromoters.R +61 -0
  38. biopipen/scripts/misc/Shell.sh +15 -0
  39. biopipen/scripts/plot/Manhattan.R +146 -0
  40. biopipen/scripts/plot/QQPlot.R +146 -0
  41. biopipen/scripts/regulatory/MotifAffinityTest.R +226 -0
  42. biopipen/scripts/regulatory/MotifAffinityTest_AtSNP.R +126 -0
  43. biopipen/scripts/regulatory/MotifAffinityTest_MotifBreakR.R +96 -0
  44. biopipen/scripts/regulatory/MotifScan.py +159 -0
  45. biopipen/scripts/regulatory/atSNP.R +33 -0
  46. biopipen/scripts/regulatory/motifBreakR.R +1594 -0
  47. biopipen/scripts/scrna/MarkersFinder.R +69 -67
  48. biopipen/scripts/scrna/SeuratClustering.R +71 -29
  49. biopipen/scripts/scrna/SeuratMap2Ref.R +20 -0
  50. biopipen/scripts/scrna/SeuratPreparing.R +252 -122
  51. biopipen/scripts/scrna/SeuratSubClustering.R +76 -27
  52. biopipen/scripts/snp/MatrixEQTL.R +85 -44
  53. biopipen/scripts/snp/Plink2GTMat.py +133 -0
  54. biopipen/scripts/snp/PlinkCallRate.R +190 -0
  55. biopipen/scripts/snp/PlinkFilter.py +100 -0
  56. biopipen/scripts/snp/PlinkFreq.R +298 -0
  57. biopipen/scripts/snp/PlinkFromVcf.py +78 -0
  58. biopipen/scripts/snp/PlinkHWE.R +80 -0
  59. biopipen/scripts/snp/PlinkHet.R +92 -0
  60. biopipen/scripts/snp/PlinkIBD.R +200 -0
  61. biopipen/scripts/snp/PlinkUpdateName.py +124 -0
  62. biopipen/scripts/stats/Mediation.R +94 -0
  63. biopipen/scripts/stats/MetaPvalue.R +2 -1
  64. biopipen/scripts/stats/MetaPvalue1.R +70 -0
  65. biopipen/scripts/tcr/TCRClusterStats.R +12 -7
  66. biopipen/scripts/vcf/BcftoolsAnnotate.py +91 -0
  67. biopipen/scripts/vcf/BcftoolsFilter.py +90 -0
  68. biopipen/scripts/vcf/BcftoolsSort.py +113 -0
  69. biopipen/scripts/vcf/BcftoolsView.py +73 -0
  70. biopipen/scripts/vcf/VcfFix_utils.py +1 -1
  71. biopipen/scripts/vcf/bcftools_utils.py +52 -0
  72. biopipen/utils/gene.R +83 -37
  73. biopipen/utils/gene.py +108 -60
  74. biopipen/utils/misc.R +56 -0
  75. biopipen/utils/misc.py +5 -2
  76. biopipen/utils/reference.py +54 -10
  77. {biopipen-0.28.1.dist-info → biopipen-0.29.1.dist-info}/METADATA +2 -2
  78. {biopipen-0.28.1.dist-info → biopipen-0.29.1.dist-info}/RECORD +80 -51
  79. {biopipen-0.28.1.dist-info → biopipen-0.29.1.dist-info}/entry_points.txt +1 -1
  80. biopipen/ns/bcftools.py +0 -111
  81. biopipen/scripts/bcftools/BcftoolsAnnotate.py +0 -42
  82. biopipen/scripts/bcftools/BcftoolsFilter.py +0 -79
  83. biopipen/scripts/bcftools/BcftoolsSort.py +0 -19
  84. biopipen/scripts/gene/GeneNameConversion.py +0 -66
  85. {biopipen-0.28.1.dist-info → biopipen-0.29.1.dist-info}/WHEEL +0 -0
@@ -0,0 +1,61 @@
1
+ library(rlang)
2
+ library(rtracklayer)
3
+
4
+ infile <- {{in.infile | r}}
5
+ outfile <- {{out.outfile | r}}
6
+ up <- {{envs.up | r}}
7
+ down <- {{envs.down | r}}
8
+ notfound <- {{envs.notfound | r}}
9
+ refgene <- {{envs.refgene | r}}
10
+ header <- {{envs.header | r}}
11
+ genecol <- {{envs.genecol | r}}
12
+ match_id <- {{envs.match_id | r}}
13
+ sort_ <- {{envs.sort | r}}
14
+ chrsize <- {{envs.chrsize | r}}
15
+
16
+ down <- down %||% up
17
+
18
+ refgenes <- readGFF(refgene)
19
+ refcol <- ifelse(match_id, "gene_id", "gene_name")
20
+
21
+ if (infile == "/dev/null") {
22
+ genes <- unique(refgenes[[refcol]])
23
+ } else {
24
+ data <- read.table(infile, header=header, sep="\t", stringsAsFactors=FALSE, check.names=FALSE)
25
+ genes <- data[[genecol]]
26
+ rm(data)
27
+ }
28
+
29
+ notfound_genes <- setdiff(genes, refgenes[[refcol]])
30
+ if (notfound == "error" && length(notfound_genes) > 0) {
31
+ stop(paste(
32
+ "The following genes were not found in the reference annotation:",
33
+ paste(notfound_genes, collapse=", ")
34
+ ))
35
+ } else if (notfound == 'skip') {
36
+ genes <- genes[!genes %in% notfound_genes]
37
+ }
38
+
39
+ # Select the genes that are in the reference annotation and keep the order
40
+ # of the records in genes
41
+ refgenes <- refgenes[match(genes, refgenes[[refcol]]), , drop = FALSE]
42
+ refgenes <- unique(makeGRangesFromDataFrame(refgenes, keep.extra.columns=TRUE))
43
+
44
+ proms <- promoters(refgenes, up=up, down=down)
45
+ # Scores must be non-NA numeric values
46
+ elementMetadata(proms)$name <- elementMetadata(proms)[[refcol]]
47
+ score(proms) <- 0
48
+ start(proms) <- pmax(1, start(proms))
49
+
50
+ if (sort_) {
51
+ chrom_sizes <- read.table(chrsize, header=FALSE, stringsAsFactors=FALSE, sep="\t")
52
+ common_chroms <- intersect(chrom_sizes$V1, seqlevels(proms))
53
+ if (length(common_chroms) == 0) {
54
+ stop("No common chromosomes found between the promoters and the chromosome sizes. Do you use the correct chromosome sizes file?")
55
+ }
56
+ proms <- keepSeqlevels(proms, common_chroms, pruning.mode="coarse")
57
+ seqlevels(proms) <- common_chroms
58
+ proms <- sort(proms, ignore.strand = TRUE)
59
+ }
60
+
61
+ export.bed(proms, outfile)
@@ -0,0 +1,15 @@
1
+ # shellcheck disable=all
2
+ export infile={{in.infile | quote}}
3
+ export outfile={{out.outfile | quote}}
4
+ is_outdir={{envs.outdir | int}}
5
+ cmd_given={{envs.cmd | bool | int}}
6
+ {% set _ = out.outfile | dirname | joinpath: "cmd.sh" | as_path | attr: 'write_text' | call: envs.cmd %}
7
+ cmd="{{proc.lang}} {{out.outfile | dirname | joinpath: 'cmd.sh'}}"
8
+ if [[ "$cmd_given" -eq 0 ]]; then
9
+ echo "No command given." 1>&2
10
+ exit 1
11
+ fi
12
+ if [[ $is_outdir -eq 1 ]]; then
13
+ mkdir -p "$outfile"
14
+ fi
15
+ eval "$cmd"
@@ -0,0 +1,146 @@
1
+ source("{{biopipen_dir}}/utils/misc.R")
2
+ library(rlang)
3
+ library(ggmanh)
4
+
5
+ infile <- {{in.infile | r}}
6
+ outfile <- {{out.outfile | r}}
7
+ chrom_col <- {{envs.chrom_col | r}}
8
+ pos_col <- {{envs.pos_col | r}}
9
+ pval_col <- {{envs.pval_col | r}}
10
+ label_col <- {{envs.label_col | r}}
11
+ devpars <- {{envs.devpars | r}}
12
+ title <- {{envs.title | r}}
13
+ ylabel <- {{envs.ylabel | r}}
14
+ rescale <- {{envs.rescale | r}}
15
+ rescale_ratio_threshold <- {{envs.rescale_ratio_threshold | r}}
16
+ signif <- {{envs.signif | r}}
17
+ hicolors <- {{envs.hicolors | r}}
18
+ thin_n <- {{envs.thin_n | r}}
19
+ thin_bins <- {{envs.thin_bins | r}}
20
+ zoom <- {{envs.zoom | r}}
21
+ zoom_devpars <- {{envs.zoom_devpars | r}}
22
+ chroms <- {{envs.chroms | r}}
23
+ args <- {{envs.args | r: todot="-"}}
24
+
25
+ data <- read.table(infile, header=TRUE, sep="\t", stringsAsFactors=FALSE, check.names = FALSE)
26
+
27
+ # normalize columns
28
+ cnames <- colnames(data)
29
+ if (is.numeric(chrom_col)) { chrom_col <- cnames[chrom_col] }
30
+ if (is.numeric(pos_col)) { pos_col <- cnames[pos_col] }
31
+ if (is.numeric(pval_col)) { pval_col <- cnames[pval_col] }
32
+ if (is.numeric(label_col)) { label_col <- cnames[label_col] }
33
+
34
+ # normalize chroms
35
+ norm_chroms <- function(chrs) {
36
+ chrs <- as.character(chrs)
37
+ if (length(chrs) == 1 && grepl(",", chrs)) {
38
+ chrs <- trimws(unlist(strsplit(chrs, ",")))
39
+ }
40
+ if (length(chrs) > 1) {
41
+ return(unique(unlist(sapply(chrs, function(chr) norm_chroms(chr)))))
42
+ }
43
+ if (!grepl("-", chrs)) { return(chrs) }
44
+
45
+ # expand chr1-22 -> chr1, chr2, ..., chr22
46
+ # chr1-22 -> 'chr1', '22'
47
+ chrs <- unlist(strsplit(chrs, "-"))
48
+ if (length(chrs) != 2) {
49
+ stop(paste0("Invalid chroms: ", chrs))
50
+ }
51
+ # detect prefix
52
+ prefix1 <- gsub("[0-9]", "", chrs[1])
53
+ prefix2 <- gsub("[0-9]", "", chrs[2])
54
+ if (nchar(prefix2) > 0 && prefix1 != prefix2) {
55
+ stop(paste0("Invalid chroms: ", chrs, " (prefix mismatch)"))
56
+ }
57
+ chr_a <- as.integer(substring(chrs[1], nchar(prefix1) + 1))
58
+ chr_b <- as.integer(substring(chrs[2], nchar(prefix2) + 1))
59
+ chr_min <- min(chr_a, chr_b)
60
+ chr_max <- max(chr_a, chr_b)
61
+ return(paste0(prefix1, chr_min:chr_max))
62
+ }
63
+
64
+ log_info("Preparing data for plotting ...")
65
+ if (length(chroms) == 1 && chroms == "auto") {
66
+ chroms <- unique(data[[chrom_col]])
67
+ } else {
68
+ chroms <- norm_chroms(chroms)
69
+ }
70
+
71
+ # prepare data
72
+ mp_prep_args = list()
73
+ if (length(signif) == 1 && is.character(signif)) {
74
+ signif <- as.numeric(trimws(unlist(strsplit(signif, ","))))
75
+ }
76
+ siglevel <- min(signif)
77
+ if (!is.null(label_col)) {
78
+ data$.label <- ifelse(data[[pval_col]] < siglevel, data[[label_col]], "")
79
+ }
80
+ if (!is.null(hicolors)) {
81
+ sig_str <- "Significant"
82
+ nsig_str <- "Not significant"
83
+ data$.highlight <- ifelse(data[[pval_col]] < siglevel, sig_str, nsig_str)
84
+ if (length(hicolors) == 1) { hicolors <- c(hicolors, "grey") }
85
+ names(hicolors) <- c(sig_str, nsig_str)
86
+ mp_prep_args$highlight.colname <- ".highlight"
87
+ mp_prep_args$highlight.col <- hicolors
88
+ }
89
+ mp_prep_args$x <- data
90
+ mp_prep_args$chr.colname <- chrom_col
91
+ mp_prep_args$pos.colname <- pos_col
92
+ mp_prep_args$pval.colname <- pval_col
93
+ mp_prep_args$chr.order <- chroms
94
+ if (!is.null(thin_n) && thin_n > 0) {
95
+ mp_prep_args$thin.n <- thin_n
96
+ mp_prep_args$thin.bins <- thin_bins
97
+ }
98
+
99
+ mpdata <- do_call(manhattan_data_preprocess, mp_prep_args)
100
+
101
+ # plot
102
+ log_info("Plotting Manhattan plot ...")
103
+ args$x <- mpdata
104
+ args$signif <- signif
105
+ args$plot.title <- title
106
+ args$rescale <- rescale
107
+ args$rescale.ratio.threshold <- rescale_ratio_threshold
108
+ args$y.label <- ylabel
109
+ if (!is.null(hicolors)) { args$color.by.highlight <- TRUE }
110
+ if (!is.null(label_col)) { args$label.colname <- ".label" }
111
+ g <- do_call(manhattan_plot, args)
112
+
113
+ png(outfile, width=devpars$width, height=devpars$height, res=devpars$res)
114
+ print(g)
115
+ dev.off()
116
+
117
+ # zoom into chromosomes
118
+ all_chroms <- as.character(unique(mpdata$data[[mpdata$chr.colname]]))
119
+ if (!is.null(zoom)) {
120
+ log_info("Zooming into chromosomes ...")
121
+ zoom <- norm_chroms(zoom)
122
+ for (z in zoom) {
123
+ if (!z %in% all_chroms) {
124
+ log_warn("- {z}: not found in data")
125
+ next
126
+ }
127
+ log_info("- {z}")
128
+ args_z <- args
129
+ args_z$chromosome <- z
130
+ args_z$plot.title <- paste0(title, " (", z, ")")
131
+ args_z$x.label <- "Position"
132
+ g_z <- do_call(manhattan_plot, args_z)
133
+ outfile_z <- gsub("\\.png$", paste0("-", z, ".png"), outfile)
134
+ zm_devpars <- zoom_devpars
135
+ zm_devpars$res <- zm_devpars$res %||% devpars$res
136
+ zm_devpars$height <- zm_devpars$height %||% devpars$height
137
+ png(
138
+ outfile_z,
139
+ width=zm_devpars$width,
140
+ height=zm_devpars$height,
141
+ res=zm_devpars$res
142
+ )
143
+ print(g_z)
144
+ dev.off()
145
+ }
146
+ }
@@ -0,0 +1,146 @@
1
+ source("{{biopipen_dir}}/utils/misc.R")
2
+
3
+ library(rlang)
4
+ library(stats)
5
+ library(ggplot2)
6
+ library(ggprism)
7
+ library(qqplotr)
8
+
9
+ theme_set(theme_prism())
10
+
11
+ infile <- {{in.infile | r}}
12
+ theorfile <- {{in.theorfile | r}}
13
+ outfile <- {{out.outfile | r}}
14
+ val_col <- {{envs.val_col | r}}
15
+ theor_col <- {{envs.theor_col | r}}
16
+ theor_trans <- {{envs.theor_trans | r}}
17
+ theor_funs <- {{envs.theor_funs | r}}
18
+ devpars <- {{envs.devpars | r}}
19
+ title <- {{envs.title | r}}
20
+ xlabel <- {{envs.xlabel | r}}
21
+ ylabel <- {{envs.ylabel | r}}
22
+ kind <- {{envs.kind | r}}
23
+ trans <- {{envs.trans | r}}
24
+ args <- {{envs.args | r}}
25
+ band_args <- {{envs.band | r}}
26
+ line_args <- {{envs.line | r}}
27
+ point_args <- {{envs.point | r}}
28
+ ggs <- {{envs.ggs | r}}
29
+
30
+ .eval_fun <- function(fun) {
31
+ if (is.character(fun)) {
32
+ fun <- trimws(fun)
33
+ if (grepl("^-\\s*[a-zA-Z\\.][0-9a-zA-Z\\._]*$", fun)) {
34
+ fun <- trimws(substring(fun, 2))
35
+ fun <- eval(parse(text = fun))
36
+ return(function(x) -fun(x))
37
+ } else {
38
+ return(eval(parse(text = fun)))
39
+ }
40
+ } else {
41
+ return(fun)
42
+ }
43
+ }
44
+
45
+ indata <- read.table(infile, header=TRUE, sep="\t", stringsAsFactors=FALSE, check.names = FALSE)
46
+ if (is.numeric(val_col)) {
47
+ val_col <- colnames(indata)[val_col]
48
+ }
49
+ if (!is.null(trans)) {
50
+ trans <- .eval_fun(trans)
51
+ indata[[val_col]] <- trans(indata[[val_col]])
52
+ }
53
+
54
+ if (!is.null(theor_col)) {
55
+ if (is.numeric(theor_col)) {
56
+ theor_col <- colnames(theor)[theor_col]
57
+ }
58
+
59
+ if (!is.null(theorfile)) {
60
+ theor <- read.table(theorfile, header=TRUE, sep="\t", stringsAsFactors=FALSE, check.names = FALSE)
61
+ theor_vals <- theor[[theor_col]]
62
+ } else {
63
+ theor_vals <- indata[[theor_col]]
64
+ }
65
+
66
+ if (!is.null(theor_trans)) {
67
+ theor_trans <- .eval_fun(theor_trans)
68
+ theor_vals <- theor_trans(theor_vals)
69
+ }
70
+ theor_vals <- sort(na.omit(theor_vals))
71
+ }
72
+
73
+ band_fun <- ifelse(kind == "pp", stat_pp_band, stat_qq_band)
74
+ line_fun <- ifelse(kind == "pp", stat_pp_line, stat_qq_line)
75
+ point_fun <- ifelse(kind == "pp", stat_pp_point, stat_qq_point)
76
+
77
+ for (fun in names(theor_funs)) {
78
+ assign(fun, .eval_fun(theor_funs[[fun]]))
79
+ }
80
+
81
+ if (!is.null(band_args) || isFALSE(band_args)) {
82
+ if (isTRUE(band_args$disabled)) {
83
+ band_args <- NULL
84
+ } else {
85
+ band_args$disabled <- NULL
86
+ band_args <- list_update(band_args, args)
87
+ if (band_args$distribution == "custom") {
88
+ band_args$dparams <- band_args$dparams %||% list()
89
+ band_args$dparams$values <- theor_vals
90
+ }
91
+ }
92
+ }
93
+ if (!is.null(line_args) || isFALSE(line_args)) {
94
+ if (isTRUE(line_args$disabled)) {
95
+ line_args <- NULL
96
+ } else {
97
+ line_args$disabled <- NULL
98
+ line_args <- list_update(line_args, args)
99
+ if (line_args$distribution == "custom") {
100
+ line_args$dparams <- line_args$dparams %||% list()
101
+ line_args$dparams$values <- theor_vals
102
+ }
103
+ }
104
+ }
105
+ if (!is.null(point_args) || isFALSE(point_args)) {
106
+ if (isTRUE(point_args$disabled)) {
107
+ point_args <- NULL
108
+ } else {
109
+ point_args$disabled <- NULL
110
+ point_args <- list_update(point_args, args)
111
+ if (point_args$distribution == "custom") {
112
+ point_args$dparams <- point_args$dparams %||% list()
113
+ point_args$dparams$values <- theor_vals
114
+ }
115
+ }
116
+ }
117
+
118
+ title <- title %||% waiver()
119
+ xlabel <- xlabel %||% waiver()
120
+ ylabel <- ylabel %||% waiver()
121
+
122
+ indata <- indata[complete.cases(indata), , drop = FALSE]
123
+ indata <- indata[order(indata[[val_col]]), , drop = FALSE]
124
+
125
+ p <- ggplot(data = indata, mapping = aes(sample = !!sym(val_col))) +
126
+ labs(title = title, x = xlabel, y = ylabel)
127
+
128
+ if (!is.null(band_args)) {
129
+ p <- p + do_call(band_fun, band_args)
130
+ }
131
+ if (!is.null(line_args)) {
132
+ p <- p + do_call(line_fun, line_args)
133
+ }
134
+ if (!is.null(point_args)) {
135
+ p <- p + do_call(point_fun, point_args)
136
+ }
137
+
138
+ if (!is.null(ggs)) {
139
+ for (gg in ggs) {
140
+ p <- p + eval(parse(text = gg))
141
+ }
142
+ }
143
+
144
+ png(outfile, width=devpars$width, height=devpars$height, res=devpars$res)
145
+ print(p)
146
+ dev.off()
@@ -0,0 +1,226 @@
1
+ # Script for regulatory.MotifAffinityTest
2
+
3
+ source("{{biopipen_dir}}/utils/misc.R")
4
+ library(BiocParallel)
5
+ library(BSgenome)
6
+ library(universalmotif)
7
+
8
+ motiffile <- {{in.motiffile | r}}
9
+ varfile <- {{in.varfile | r}}
10
+ outdir <- {{out.outdir | r}}
11
+ ncores <- {{envs.ncores | r}}
12
+ tool <- {{envs.tool | r}}
13
+ bcftools <- {{envs.bcftools | r}}
14
+ genome <- {{envs.genome | r}}
15
+ motif_col <- {{envs.motif_col | r}}
16
+ regulator_col <- {{envs.regulator_col | r}}
17
+ notfound <- {{envs.notfound | r}}
18
+ motifdb <- {{envs.motifdb | r}}
19
+ regmotifs <- {{envs.regmotifs | r}}
20
+ devpars <- {{envs.devpars | r}}
21
+ plot_nvars <- {{envs.plot_nvars | r}}
22
+ plots <- {{envs.plots | r}}
23
+ cutoff <- {{envs.cutoff | r}}
24
+
25
+ if (is.null(motifdb) || !file.exists(motifdb)) {
26
+ stop("Motif database (envs.motifdb) is required and must exist")
27
+ }
28
+
29
+ if (is.null(genome)) {
30
+ stop("Reference genome (envs.ref) is required and must exist")
31
+ }
32
+
33
+ if (is.null(motiffile) || !file.exists(motiffile)) {
34
+ stop("Motif file (in.motiffile) is required and must exist")
35
+ }
36
+
37
+ if (is.null(varfile) || !file.exists(varfile)) {
38
+ stop("Variant file (in.varfile) is required and must exist")
39
+ }
40
+
41
+ if (is.null(motif_col) && is.null(regulator_col)) {
42
+ stop("Either motif (envs.motif_col) or regulator (envs.regulator_col) column must be provided")
43
+ }
44
+
45
+ log_info("Reading input regulator/motif file ...")
46
+ in_motifs <- read.table(motiffile, header=TRUE, sep="\t", stringsAsFactors=FALSE, check.names = FALSE)
47
+
48
+ if (is.null(motif_col)) {
49
+ log_info("Inferring motifs from regulators ...")
50
+ if (is.null(regmotifs) || !file.exists(regmotifs)) {
51
+ stop("Regulator motifs (envs.regmotifs) is required and must exist when no motif column (envs.motif_col) is provided")
52
+ }
53
+ regmotifs <- read.table(regmotifs, header=TRUE, sep="\t", stringsAsFactors=FALSE, check.names = FALSE)
54
+ rm_motif_col <- c('Motif', 'motif', 'MOTIF', 'Model', 'model', 'MODEL')
55
+ rm_reg_col <- c('Regulator', 'regulator', 'REGULATOR', 'TF', 'tf', 'TF', 'Transcription factor', 'transcription factor', 'Transcription Factor')
56
+ rm_motif_col <- intersect(rm_motif_col, colnames(regmotifs))
57
+ rm_reg_col <- intersect(rm_reg_col, colnames(regmotifs))
58
+ if (length(rm_motif_col) == 0) {
59
+ stop("No motif column found in envs.regmotifs, provide one of: ", paste(rm_motif_col, collapse = ", "))
60
+ }
61
+ if (length(rm_reg_col) == 0) {
62
+ stop("No regulator column found in envs.regmotifs, provide one of: ", paste(rm_reg_col, collapse = ", "))
63
+ }
64
+ rm_motif_col <- rm_motif_col[1]
65
+ rm_reg_col <- rm_reg_col[1]
66
+ # check regulators
67
+ rm_regs <- regmotifs[, rm_reg_col, drop = TRUE]
68
+ regulators <- in_motifs[, regulator_col, drop = TRUE]
69
+ notfound_regs <- setdiff(regulators, rm_regs)
70
+ if (length(notfound_regs) > 0 && notfound == "error") {
71
+ first_notfound <- head(notfound_regs, 3)
72
+ if (length(notfound_regs) > 3) {
73
+ first_notfound <- c(first_notfound, "...")
74
+ notfound_file <- file.path(outdir, "notfound_regulators.txt")
75
+ writeLines(notfound_regs, notfound_file)
76
+ msg1 <- paste0("The following regulators were not found in the envs.regmotifs file: ", paste(first_notfound, collapse = ", "))
77
+ msg2 <- paste0("Check the full list in ", notfound_file)
78
+ stop(msg1, "\n", msg2)
79
+ } else {
80
+ msg <- paste0("The following regulators were not found in the regmotifs file: ", paste(first_notfound, collapse = ", "))
81
+ stop(msg)
82
+ }
83
+ }
84
+ in_motifs <- in_motifs[in_motifs[, regulator_col] %in% rm_regs, , drop = FALSE]
85
+ # add motif column
86
+ in_motifs <- merge(in_motifs, regmotifs, by.x = regulator_col, by.y = rm_reg_col, all.x = TRUE, suffixes = c("", "_db"))
87
+ motif_col <- rm_motif_col
88
+ }
89
+ if (is.null(regulator_col)) {
90
+ # make motifs unique
91
+ in_moitfs <- in_motifs[!duplicated(in_motifs[, motif_col]), , drop = FALSE]
92
+ } else {
93
+ in_motifs <- in_motifs[!duplicated(in_motifs[, c(regulator_col, motif_col)]), , drop = FALSE]
94
+ }
95
+
96
+
97
+ if (!grepl(".", genome, fixed = TRUE)) {
98
+ genome_pkg = sprintf("BSgenome.Hsapiens.UCSC.%s", genome)
99
+ } else {
100
+ genome_pkg = genome
101
+ }
102
+ if (!requireNamespace(genome_pkg, quietly = TRUE)) {
103
+ stop(sprintf("Genome package %s is not installed", genome_pkg))
104
+ }
105
+
106
+ log_info("Reading variant file ...")
107
+ if (grepl("\\.vcf$", varfile) || grepl("\\.vcf\\.gz$", varfile)) {
108
+ log_info("Converting VCF file to BED file ...")
109
+ varfile_bed <- file.path(outdir, gsub("\\.vcf(\\.gz)?$", ".bed", basename(varfile)))
110
+ cmd <- c(
111
+ bcftools, "query",
112
+ "-f", "%CHROM\\t%POS0\\t%END\\t%ID\\t0\\t+\\t%REF\\t%ALT{0}\\n",
113
+ "-i", 'FILTER="PASS" || FILTER="." || FILTER=""',
114
+ "-o", varfile_bed,
115
+ varfile
116
+ )
117
+ run_command(cmd, fg = TRUE)
118
+
119
+ varfile <- varfile_bed
120
+ }
121
+
122
+ # `chrom`, `start`, `end`, `name`, `score`, `strand`, `ref`, `alt`.
123
+ snpinfo <- read.table(varfile, header=FALSE, stringsAsFactors=FALSE)
124
+ colnames(snpinfo) <- c("chrom", "start", "end", "name", "score", "strand", "ref", "alt")
125
+
126
+ log_info("Reading motif database ...")
127
+ meme <- read_meme(motifdb)
128
+
129
+ check_motifs <- function(motifdb_names) {
130
+ motifs <- in_motifs[, motif_col, drop = TRUE]
131
+ notfound_motifs <- setdiff(motifs, motifdb_names)
132
+ if (length(notfound_motifs) > 0) {
133
+ first_notfound <- head(notfound_motifs, 3)
134
+ if (length(notfound_motifs) > 3) {
135
+ first_notfound <- c(first_notfound, "...")
136
+ notfound_file <- file.path(outdir, "notfound_motifs.txt")
137
+ writeLines(notfound_motifs, notfound_file)
138
+ msg1 <- paste0("The following motifs were not found in the motif database: ", paste(first_notfound, collapse = ", "))
139
+ msg2 <- paste0("Check the full list in ", notfound_file)
140
+
141
+ if (notfound == "error") {
142
+ stop(msg1, "\n", msg2)
143
+ } else if (notfound == "ignore") {
144
+ log_warn(msg1)
145
+ log_warn(msg2)
146
+ }
147
+ } else {
148
+ msg <- paste0("The following motifs were not found in the motif database: ", paste(first_notfound, collapse = ", "))
149
+ if (notfound == "error") {
150
+ stop(msg)
151
+ } else if (notfound == "ignore") {
152
+ log_warn(msg)
153
+ }
154
+ }
155
+
156
+ motifs <- setdiff(motifs, notfound_motifs)
157
+ }
158
+ return(motifs)
159
+ }
160
+
161
+ plot_variant <- function(motifbreakr_results) {
162
+ log_info("Plotting variants ...")
163
+ plotdir <- file.path(outdir, "plots")
164
+ dir.create(plotdir, showWarnings = FALSE)
165
+ results <- motifbreakr_results
166
+ if (is.null(plots) || length(plots) == 0) {
167
+ results <- results[order(-abs(results$alleleDiff)), , drop = FALSE]
168
+ results <- results[1:min(plot_nvars, length(results)), , drop = FALSE]
169
+ variants <- unique(results$SNP_id)
170
+ } else {
171
+ variants <- names(plots)
172
+ }
173
+ for (variant in variants) {
174
+ log_info("- Variant: {variant}")
175
+ if (is.null(plots[[variant]])) {
176
+ plots[[variant]] <- list(devpars = devpars, which = "TRUE")
177
+ }
178
+ if (is.null(plots[[variant]]$which)) {
179
+ plots[[variant]]$which <- "TRUE"
180
+ }
181
+ if (is.null(plots[[variant]]$devpars)) {
182
+ plots[[variant]]$devpars <- devpars
183
+ }
184
+ if (is.null(plots[[variant]]$devpars$res)) {
185
+ plots[[variant]]$devpars$res <- 100
186
+ }
187
+ res <- results[results$SNP_id == variant, , drop = FALSE]
188
+ if (length(res) == 0) {
189
+ stop(sprintf("Variant %s not found in results", variant))
190
+ }
191
+ res <- subset(res, subset = eval(parse(text = plots[[variant]]$which)))
192
+ if (length(res) == 0) {
193
+ stop(sprintf("No variants to plot for %s", variant))
194
+ }
195
+ plotfile <- file.path(plotdir, sprintf("%s.png", slugify(variant)))
196
+ # fix motifBreakR 2.12 using names to filter in plotMB
197
+ names(res) <- res$SNP_id
198
+ dv <- plots[[variant]]$devpars
199
+ if (is.null(dv$height)) {
200
+ dv$height <- 2.4 * dv$res + length(res) * 1.2 * dv$res
201
+ }
202
+ if (is.null(dv$width)) {
203
+ left <- min(sapply(res$motifPos, `[`, 1))
204
+ right <- max(sapply(res$motifPos, `[`, 2))
205
+ dv$width <- 1.5 * dv$res + (right - left) * 0.3 * dv$res
206
+ }
207
+ png(plotfile, width = dv$width, height = dv$height, res = dv$res)
208
+ motifbreakR::plotMB(res, variant)
209
+ dev.off()
210
+ }
211
+ }
212
+
213
+ tool <- tolower(tool)
214
+ tool <- match.arg(tool, c("motifbreakr", "atsnp"))
215
+
216
+ if (tool == "motifbreakr") {
217
+ motifbreakr_args <- {{envs.motifbreakr_args | r}}
218
+ {% set sourcefile = biopipen_dir | joinpaths: "scripts", "regulatory", "MotifAffinityTest_MotifBreakR.R" %}
219
+ # {{ sourcefile | getmtime }}
220
+ source("{{sourcefile}}")
221
+ } else { # atsnp
222
+ atsnp_args <- {{envs.atsnp_args | r}}
223
+ {% set sourcefile = biopipen_dir | joinpaths: "scripts", "regulatory", "MotifAffinityTest_AtSNP.R" %}
224
+ # {{ sourcefile | getmtime }}
225
+ source("{{sourcefile}}")
226
+ }