biopipen 0.28.1__py3-none-any.whl → 0.29.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of biopipen might be problematic. Click here for more details.

Files changed (82) hide show
  1. biopipen/__init__.py +1 -1
  2. biopipen/core/config.toml +8 -0
  3. biopipen/ns/bam.py +0 -2
  4. biopipen/ns/bed.py +35 -0
  5. biopipen/ns/cellranger_pipeline.py +5 -5
  6. biopipen/ns/cnv.py +18 -2
  7. biopipen/ns/cnvkit_pipeline.py +16 -11
  8. biopipen/ns/gene.py +68 -23
  9. biopipen/ns/misc.py +2 -15
  10. biopipen/ns/plot.py +146 -0
  11. biopipen/ns/regulation.py +214 -0
  12. biopipen/ns/scrna.py +15 -3
  13. biopipen/ns/snp.py +516 -8
  14. biopipen/ns/stats.py +74 -2
  15. biopipen/ns/vcf.py +196 -0
  16. biopipen/reports/snp/PlinkCallRate.svelte +24 -0
  17. biopipen/reports/snp/PlinkFreq.svelte +18 -0
  18. biopipen/reports/snp/PlinkHWE.svelte +18 -0
  19. biopipen/reports/snp/PlinkHet.svelte +18 -0
  20. biopipen/reports/snp/PlinkIBD.svelte +18 -0
  21. biopipen/scripts/bam/CNVpytor.py +144 -46
  22. biopipen/scripts/bed/BedtoolsIntersect.py +54 -0
  23. biopipen/scripts/bed/BedtoolsMerge.py +1 -1
  24. biopipen/scripts/cnv/AneuploidyScore.R +30 -7
  25. biopipen/scripts/cnv/AneuploidyScoreSummary.R +5 -2
  26. biopipen/scripts/cnv/TMADScore.R +21 -5
  27. biopipen/scripts/cnv/TMADScoreSummary.R +6 -2
  28. biopipen/scripts/cnvkit/CNVkitAccess.py +2 -1
  29. biopipen/scripts/cnvkit/CNVkitAutobin.py +3 -2
  30. biopipen/scripts/cnvkit/CNVkitBatch.py +1 -1
  31. biopipen/scripts/cnvkit/CNVkitCoverage.py +2 -1
  32. biopipen/scripts/cnvkit/CNVkitGuessBaits.py +1 -1
  33. biopipen/scripts/cnvkit/CNVkitHeatmap.py +1 -1
  34. biopipen/scripts/cnvkit/CNVkitReference.py +2 -1
  35. biopipen/scripts/gene/GeneNameConversion.R +65 -0
  36. biopipen/scripts/gene/GenePromoters.R +61 -0
  37. biopipen/scripts/misc/Shell.sh +15 -0
  38. biopipen/scripts/plot/Manhattan.R +140 -0
  39. biopipen/scripts/plot/QQPlot.R +62 -0
  40. biopipen/scripts/regulation/MotifAffinityTest.R +226 -0
  41. biopipen/scripts/regulation/MotifAffinityTest_AtSNP.R +126 -0
  42. biopipen/scripts/regulation/MotifAffinityTest_MotifBreakR.R +96 -0
  43. biopipen/scripts/regulation/MotifScan.py +159 -0
  44. biopipen/scripts/regulation/atSNP.R +33 -0
  45. biopipen/scripts/regulation/motifBreakR.R +1594 -0
  46. biopipen/scripts/scrna/MarkersFinder.R +59 -67
  47. biopipen/scripts/scrna/SeuratClustering.R +63 -29
  48. biopipen/scripts/scrna/SeuratMap2Ref.R +20 -0
  49. biopipen/scripts/scrna/SeuratSubClustering.R +76 -27
  50. biopipen/scripts/snp/MatrixEQTL.R +84 -43
  51. biopipen/scripts/snp/Plink2GTMat.py +133 -0
  52. biopipen/scripts/snp/PlinkCallRate.R +190 -0
  53. biopipen/scripts/snp/PlinkFilter.py +100 -0
  54. biopipen/scripts/snp/PlinkFreq.R +298 -0
  55. biopipen/scripts/snp/PlinkFromVcf.py +78 -0
  56. biopipen/scripts/snp/PlinkHWE.R +80 -0
  57. biopipen/scripts/snp/PlinkHet.R +92 -0
  58. biopipen/scripts/snp/PlinkIBD.R +197 -0
  59. biopipen/scripts/snp/PlinkUpdateName.py +124 -0
  60. biopipen/scripts/stats/MetaPvalue.R +2 -1
  61. biopipen/scripts/stats/MetaPvalue1.R +70 -0
  62. biopipen/scripts/tcr/TCRClusterStats.R +12 -7
  63. biopipen/scripts/vcf/BcftoolsAnnotate.py +91 -0
  64. biopipen/scripts/vcf/BcftoolsFilter.py +90 -0
  65. biopipen/scripts/vcf/BcftoolsSort.py +113 -0
  66. biopipen/scripts/vcf/BcftoolsView.py +73 -0
  67. biopipen/scripts/vcf/VcfFix_utils.py +1 -1
  68. biopipen/scripts/vcf/bcftools_utils.py +52 -0
  69. biopipen/utils/gene.R +83 -37
  70. biopipen/utils/gene.py +108 -60
  71. biopipen/utils/misc.R +56 -0
  72. biopipen/utils/misc.py +5 -2
  73. biopipen/utils/reference.py +54 -10
  74. {biopipen-0.28.1.dist-info → biopipen-0.29.0.dist-info}/METADATA +2 -2
  75. {biopipen-0.28.1.dist-info → biopipen-0.29.0.dist-info}/RECORD +77 -49
  76. {biopipen-0.28.1.dist-info → biopipen-0.29.0.dist-info}/entry_points.txt +1 -1
  77. biopipen/ns/bcftools.py +0 -111
  78. biopipen/scripts/bcftools/BcftoolsAnnotate.py +0 -42
  79. biopipen/scripts/bcftools/BcftoolsFilter.py +0 -79
  80. biopipen/scripts/bcftools/BcftoolsSort.py +0 -19
  81. biopipen/scripts/gene/GeneNameConversion.py +0 -66
  82. {biopipen-0.28.1.dist-info → biopipen-0.29.0.dist-info}/WHEEL +0 -0
@@ -0,0 +1,226 @@
# Script for regulation.MotifAffinityTest
#
# Template script: the `{{ ... }}` placeholders are substituted before the
# script is executed (presumably rendered to R literals by the `r` filter --
# confirm against the template engine in use).
# This section loads helpers and packages, binds the job inputs, outputs and
# options to R variables, and fails fast when a mandatory one is missing.

source("{{biopipen_dir}}/utils/misc.R")
library(BiocParallel)
library(BSgenome)
library(universalmotif)

motiffile <- {{in.motiffile | r}}
varfile <- {{in.varfile | r}}
outdir <- {{out.outdir | r}}
ncores <- {{envs.ncores | r}}
tool <- {{envs.tool | r}}
bcftools <- {{envs.bcftools | r}}
genome <- {{envs.genome | r}}
motif_col <- {{envs.motif_col | r}}
regulator_col <- {{envs.regulator_col | r}}
notfound <- {{envs.notfound | r}}
motifdb <- {{envs.motifdb | r}}
regmotifs <- {{envs.regmotifs | r}}
devpars <- {{envs.devpars | r}}
plot_nvars <- {{envs.plot_nvars | r}}
plots <- {{envs.plots | r}}
cutoff <- {{envs.cutoff | r}}

if (is.null(motifdb) || !file.exists(motifdb)) {
    stop("Motif database (envs.motifdb) is required and must exist")
}

# Only presence is checked here: the value is read from envs.genome and is
# not tested with file.exists(), so the message names that option and does
# not claim a file must exist.
if (is.null(genome)) {
    stop("Reference genome (envs.genome) is required")
}

if (is.null(motiffile) || !file.exists(motiffile)) {
    stop("Motif file (in.motiffile) is required and must exist")
}

if (is.null(varfile) || !file.exists(varfile)) {
    stop("Variant file (in.varfile) is required and must exist")
}

# At least one of the two columns is needed to know which motifs to test
if (is.null(motif_col) && is.null(regulator_col)) {
    stop("Either motif (envs.motif_col) or regulator (envs.regulator_col) column must be provided")
}
44
+
# Read the regulator/motif table. When no motif column is configured, the
# motifs are looked up from the regulators through the envs.regmotifs mapping.
log_info("Reading input regulator/motif file ...")
in_motifs <- read.table(motiffile, header=TRUE, sep="\t", stringsAsFactors=FALSE, check.names = FALSE)

if (is.null(motif_col)) {
    log_info("Inferring motifs from regulators ...")
    if (is.null(regmotifs) || !file.exists(regmotifs)) {
        stop("Regulator motifs (envs.regmotifs) is required and must exist when no motif column (envs.motif_col) is provided")
    }
    regmotifs <- read.table(regmotifs, header=TRUE, sep="\t", stringsAsFactors=FALSE, check.names = FALSE)
    # Accepted column names. They are kept in separate variables so the error
    # messages below can still list them after the intersection.
    motif_col_choices <- c('Motif', 'motif', 'MOTIF', 'Model', 'model', 'MODEL')
    reg_col_choices <- c('Regulator', 'regulator', 'REGULATOR', 'TF', 'tf', 'Transcription factor', 'transcription factor', 'Transcription Factor')
    rm_motif_col <- intersect(motif_col_choices, colnames(regmotifs))
    rm_reg_col <- intersect(reg_col_choices, colnames(regmotifs))
    if (length(rm_motif_col) == 0) {
        stop("No motif column found in envs.regmotifs, provide one of: ", paste(motif_col_choices, collapse = ", "))
    }
    if (length(rm_reg_col) == 0) {
        stop("No regulator column found in envs.regmotifs, provide one of: ", paste(reg_col_choices, collapse = ", "))
    }
    # Several accepted names may be present; the first one wins
    rm_motif_col <- rm_motif_col[1]
    rm_reg_col <- rm_reg_col[1]
    # check regulators
    rm_regs <- regmotifs[, rm_reg_col, drop = TRUE]
    regulators <- in_motifs[, regulator_col, drop = TRUE]
    notfound_regs <- setdiff(regulators, rm_regs)
    if (length(notfound_regs) > 0 && notfound == "error") {
        first_notfound <- head(notfound_regs, 3)
        if (length(notfound_regs) > 3) {
            # Too many to print: show the first three and dump the rest to a file
            first_notfound <- c(first_notfound, "...")
            notfound_file <- file.path(outdir, "notfound_regulators.txt")
            writeLines(notfound_regs, notfound_file)
            msg1 <- paste0("The following regulators were not found in the envs.regmotifs file: ", paste(first_notfound, collapse = ", "))
            msg2 <- paste0("Check the full list in ", notfound_file)
            stop(msg1, "\n", msg2)
        } else {
            msg <- paste0("The following regulators were not found in the regmotifs file: ", paste(first_notfound, collapse = ", "))
            stop(msg)
        }
    }
    # Unknown regulators are dropped when they are not treated as an error
    in_motifs <- in_motifs[in_motifs[, regulator_col] %in% rm_regs, , drop = FALSE]
    # add motif column
    in_motifs <- merge(in_motifs, regmotifs, by.x = regulator_col, by.y = rm_reg_col, all.x = TRUE, suffixes = c("", "_db"))
    motif_col <- rm_motif_col
}
if (is.null(regulator_col)) {
    # make motifs unique
    # (the result must be assigned back to `in_motifs`, otherwise the
    # de-duplication is silently lost)
    in_motifs <- in_motifs[!duplicated(in_motifs[, motif_col]), , drop = FALSE]
} else {
    # keep one row per (regulator, motif) pair
    in_motifs <- in_motifs[!duplicated(in_motifs[, c(regulator_col, motif_col)]), , drop = FALSE]
}
95
+
96
+
# Resolve the BSgenome package name: a value containing a dot is used as is,
# anything else is expanded to the UCSC human package for that name.
genome_pkg <- if (grepl(".", genome, fixed = TRUE)) {
    genome
} else {
    sprintf("BSgenome.Hsapiens.UCSC.%s", genome)
}
if (!requireNamespace(genome_pkg, quietly = TRUE)) {
    stop(sprintf("Genome package %s is not installed", genome_pkg))
}

log_info("Reading variant file ...")
is_vcf <- grepl("\\.vcf$", varfile) || grepl("\\.vcf\\.gz$", varfile)
if (is_vcf) {
    # VCF input is first flattened into an 8-column BED-like file with bcftools
    log_info("Converting VCF file to BED file ...")
    bed_basename <- gsub("\\.vcf(\\.gz)?$", ".bed", basename(varfile))
    varfile_bed <- file.path(outdir, bed_basename)
    query_format <- "%CHROM\\t%POS0\\t%END\\t%ID\\t0\\t+\\t%REF\\t%ALT{0}\\n"
    pass_filter <- 'FILTER="PASS" || FILTER="." || FILTER=""'
    cmd <- c(
        bcftools, "query",
        "-f", query_format,
        "-i", pass_filter,
        "-o", varfile_bed,
        varfile
    )
    run_command(cmd, fg = TRUE)

    # From here on the converted file is the variant file
    varfile <- varfile_bed
}

# Expected columns, in order:
# `chrom`, `start`, `end`, `name`, `score`, `strand`, `ref`, `alt`.
snpinfo_columns <- c("chrom", "start", "end", "name", "score", "strand", "ref", "alt")
snpinfo <- read.table(varfile, header=FALSE, stringsAsFactors=FALSE)
colnames(snpinfo) <- snpinfo_columns

log_info("Reading motif database ...")
meme <- read_meme(motifdb)
128
+
# Check the motifs listed in `in_motifs` against the names available in the
# motif database.
#
# Reads the globals `in_motifs`, `motif_col`, `notfound` and `outdir`.
#
# @param motifdb_names Character vector of motif names present in the database
# @return The motifs of `in_motifs[, motif_col]`; when some are missing and
#   no error is raised, the missing ones are removed from the returned vector
#
# With `notfound == "error"` missing motifs stop the script; with
# `notfound == "ignore"` they are only reported through `log_warn`. When more
# than three are missing, the full list is written to `notfound_motifs.txt`
# in `outdir`.
check_motifs <- function(motifdb_names) {
    motifs <- in_motifs[, motif_col, drop = TRUE]
    notfound_motifs <- setdiff(motifs, motifdb_names)
    if (length(notfound_motifs) == 0) {
        return(motifs)
    }

    first_notfound <- head(notfound_motifs, 3)
    if (length(notfound_motifs) > 3) {
        first_notfound <- c(first_notfound, "...")
        notfound_file <- file.path(outdir, "notfound_motifs.txt")
        writeLines(notfound_motifs, notfound_file)
        msgs <- c(
            paste0("The following motifs were not found in the motif database: ", paste(first_notfound, collapse = ", ")),
            paste0("Check the full list in ", notfound_file)
        )
    } else {
        msgs <- paste0("The following motifs were not found in the motif database: ", paste(first_notfound, collapse = ", "))
    }

    if (notfound == "error") {
        stop(paste(msgs, collapse = "\n"))
    } else if (notfound == "ignore") {
        for (msg in msgs) {
            log_warn(msg)
        }
    }

    setdiff(motifs, notfound_motifs)
}
160
+
# Plot motif disruption for selected variants, one PNG per variant, written
# to `<outdir>/plots`.
#
# Reads the globals `outdir`, `plots`, `plot_nvars` and `devpars`.
#
# @param motifbreakr_results Result object that is subset by rows, provides
#   `alleleDiff`, `SNP_id` and `motifPos`, and is handed to
#   `motifbreakR::plotMB` (presumably a GRanges as returned by motifbreakR --
#   confirm against the callers).
#
# When `plots` is empty, the variants of the `plot_nvars` rows with the
# largest absolute `alleleDiff` are plotted; otherwise the names of `plots`
# select the variants, and each entry may carry `which` (a filter expression
# as a string) and `devpars` (device parameters).
plot_variant <- function(motifbreakr_results) {
    log_info("Plotting variants ...")
    plotdir <- file.path(outdir, "plots")
    dir.create(plotdir, showWarnings = FALSE)
    results <- motifbreakr_results
    if (is.null(plots) || length(plots) == 0) {
        results <- results[order(-abs(results$alleleDiff)), , drop = FALSE]
        # seq_len() instead of 1:n, so that empty results select nothing
        # rather than indexing with c(1, 0)
        results <- results[seq_len(min(plot_nvars, length(results))), , drop = FALSE]
        variants <- unique(results$SNP_id)
    } else {
        variants <- names(plots)
    }
    for (variant in variants) {
        log_info("- Variant: {variant}")
        # Fill in per-variant defaults
        if (is.null(plots[[variant]])) {
            plots[[variant]] <- list(devpars = devpars, which = "TRUE")
        }
        if (is.null(plots[[variant]]$which)) {
            plots[[variant]]$which <- "TRUE"
        }
        if (is.null(plots[[variant]]$devpars)) {
            plots[[variant]]$devpars <- devpars
        }
        if (is.null(plots[[variant]]$devpars$res)) {
            plots[[variant]]$devpars$res <- 100
        }
        res <- results[results$SNP_id == variant, , drop = FALSE]
        if (length(res) == 0) {
            stop(sprintf("Variant %s not found in results", variant))
        }
        # `which` is a configuration-supplied expression evaluated against the rows
        res <- subset(res, subset = eval(parse(text = plots[[variant]]$which)))
        if (length(res) == 0) {
            stop(sprintf("No variants to plot for %s", variant))
        }
        plotfile <- file.path(plotdir, sprintf("%s.png", slugify(variant)))
        # fix motifBreakR 2.12 using names to filter in plotMB
        names(res) <- res$SNP_id
        dv <- plots[[variant]]$devpars
        if (is.null(dv$height)) {
            # height grows with the number of rows to draw
            dv$height <- 2.4 * dv$res + length(res) * 1.2 * dv$res
        }
        if (is.null(dv$width)) {
            # width grows with the span covered by the motif positions
            left <- min(sapply(res$motifPos, `[`, 1))
            right <- max(sapply(res$motifPos, `[`, 2))
            dv$width <- 1.5 * dv$res + (right - left) * 0.3 * dv$res
        }
        png(plotfile, width = dv$width, height = dv$height, res = dv$res)
        motifbreakR::plotMB(res, variant)
        dev.off()
    }
}
212
+
# Normalise the tool name (case-insensitive) and hand over to the script of
# the selected tool. The commented `getmtime` lines are kept as they are:
# they embed the modification time of the sourced file in the rendered script.
tool <- match.arg(tolower(tool), c("motifbreakr", "atsnp"))

if (tool == "atsnp") {
    atsnp_args <- {{envs.atsnp_args | r}}
    {% set sourcefile = biopipen_dir | joinpaths: "scripts", "regulation", "MotifAffinityTest_AtSNP.R" %}
    # {{ sourcefile | getmtime }}
    source("{{sourcefile}}")
} else { # motifbreakr
    motifbreakr_args <- {{envs.motifbreakr_args | r}}
    {% set sourcefile = biopipen_dir | joinpaths: "scripts", "regulation", "MotifAffinityTest_MotifBreakR.R" %}
    # {{ sourcefile | getmtime }}
    source("{{sourcefile}}")
}
@@ -0,0 +1,126 @@
library(atSNP)
library(rtracklayer)

log_info("Converting universalmotif object to motif_library ...")

# Name of a single universalmotif object
motif_name_of <- function(m) m@name

# One row of metadata for a motif: its attributes minus the class, motif and
# gapinfo entries, plus the fields added below
motif_metadata_of <- function(m) {
    ats <- attributes(m)
    ats$dataSource <- basename(motifdb)
    ats$class <- NULL
    ats$motif <- NULL
    ats$gapinfo <- NULL
    ats$sequenceCount <- ats$nsites
    ats$providerId <- ats$name
    ats$providerName <- ats$name
    ats$organism <- if (is.null(ats$organism) || length(ats$organism) == 0) "Unknown" else ats$organism
    unlist(ats)
}

# Keep only the requested motifs that exist in the database
motifdb_names <- sapply(meme, motif_name_of)
motifs <- check_motifs(motifdb_names)
meme <- filter_motifs(meme, name = motifs)
# Re-read the names so they follow the order of the filtered list
motifs <- sapply(meme, motif_name_of)

# used for atSNP: transposed motif matrices, named by motif
mdb <- setNames(lapply(meme, function(m) t(m@motif)), motifs)

# compose one used for plotting using motifbreakR
motifdb_matrices <- setNames(lapply(meme, function(m) m@motif), motifs)
motifdb_meta <- do.call(rbind, lapply(meme, motif_metadata_of))
rownames(motifdb_meta) <- motifs
pmotifs <- MotifDb:::MotifList(motifdb_matrices, tbl.metadata = motifdb_meta)
33
+
log_info("Converting snpinfo to atSNP object ...")

# snpinfo columns:
# c("chrom", "start", "end", "name", "score", "strand", "ref", "alt", "ref_seq", "alt_seq")
if (any(nchar(snpinfo$ref) != 1) || any(nchar(snpinfo$alt) != 1)) {
    stop("Only SNVs are supported by atSNP. Consider using motifbreakR instead if you have indels.")
}

atsnp_basename <- gsub("\\.vcf(\\.gz)?$|\\.bed$", ".atsnp.txt", basename(varfile))
atsnp_bed <- file.path(outdir, atsnp_basename)

# Variants without a usable name are named after their position
name_missing <- snpinfo$name == "." | is.na(snpinfo$name) | nchar(snpinfo$name) == 0
position_name <- sprintf("%s:%s", snpinfo$chrom, snpinfo$end)
snpinfo$name <- ifelse(name_missing, position_name, snpinfo$name)

# Columns written to the atSNP input table
snpinfo[["a1"]] <- snpinfo[["ref"]]
snpinfo[["a2"]] <- snpinfo[["alt"]]
snpinfo[["chr"]] <- snpinfo[["chrom"]]
snpinfo[["snp"]] <- snpinfo[["end"]]
snpinfo[["snpid"]] <- snpinfo[["name"]]
atsnp_columns <- c("snpid", "a1", "a2", "chr", "snp")
write.table(
    snpinfo[, atsnp_columns],
    file = atsnp_bed,
    sep = "\t", quote = FALSE, row.names = FALSE, col.names = TRUE
)

# Largest row count among the atSNP motif matrices, used as half window size
k <- max(sapply(mdb, nrow))
use_default_par <- nrow(snpinfo) < 1000
snps <- LoadSNPData(
    atsnp_bed,
    genome.lib = genome_pkg,
    mutation = TRUE,  # force using given ref and alt
    default.par = use_default_par,
    half.window.size = k
)
64
+
# run atSNP
log_info("Running atSNP ...")
atsnp_scores <- ComputeMotifScore(mdb, snps, ncores = ncores)

log_info("Calculating p values ...")
atsnp_result <- ComputePValues(
    motif.lib = mdb,
    snp.info = snps,
    motif.scores = atsnp_scores$motif.scores,
    ncores = ncores,
    testing.mc = TRUE
)

# Adjust the configured p-value column and filter on either the raw or the
# adjusted values, depending on atsnp_args$padj_cutoff
padj_col <- paste0(atsnp_args$p, "_adj")
atsnp_result[[padj_col]] <- p.adjust(atsnp_result[[atsnp_args$p]], method = atsnp_args$padj)
cutoff_col <- if (atsnp_args$padj_cutoff) padj_col else atsnp_args$p
atsnp_result <- atsnp_result[atsnp_result[[cutoff_col]] < cutoff, , drop = FALSE]
# order by p value
atsnp_result <- atsnp_result[order(atsnp_result[[cutoff_col]]), , drop = FALSE]

# Number of rows left. Constant columns below are built with rep() so that an
# empty result (nothing passed the cutoff) still yields a valid, empty table.
n_hits <- nrow(atsnp_result)

# Bring the variant annotation in line with the result rows
snpinfo <- snpinfo[match(atsnp_result$snpid, snpinfo$snpid), , drop = FALSE]
atsnp_result$chr <- snpinfo$chr
atsnp_result$start <- snpinfo$start
atsnp_result$end <- snpinfo$end
atsnp_result$SNP_id <- snpinfo$snpid
atsnp_result$snpid <- NULL
atsnp_result$REF <- snpinfo$ref
atsnp_result$ALT <- snpinfo$alt
atsnp_result$providerId <- atsnp_result$providerName <- atsnp_result$motif
atsnp_result$motif <- NULL
atsnp_result$strand <- snpinfo$strand
atsnp_result$score <- snpinfo$score
atsnp_result$snpbase <- NULL
atsnp_result$altPos <- rep(1, n_hits)
atsnp_result$varType <- rep("SNV", n_hits)
# "<start>,<end>" of the motif, shifted by the half window size; computed
# vectorised so that zero rows give zero values
atsnp_result$motifPos <- paste(atsnp_result$ref_start - k, atsnp_result$ref_end - k, sep = ",")
if (!is.null(regulator_col)) {
    atsnp_result$Regulator <- in_motifs[
        match(atsnp_result$providerId, in_motifs[[motif_col]]),
        regulator_col,
        drop = TRUE
    ]
}

write.table(
    atsnp_result,
    file = file.path(outdir, "atsnp.txt"),
    sep = "\t", quote = FALSE, row.names = FALSE
)

if (n_hits == 0) {
    log_warn("No results passed the cutoff, skipping plotting")
} else {
    log_info("Plotting variants ...")
    # Convert result to GRanges object
    # The negated p value is stored as alleleDiff, which plot_variant uses
    # for ranking
    atsnp_result$alleleDiff <- -atsnp_result[[cutoff_col]]
    atsnp_result$effect <- rep("strong", n_hits)
    atsnp_result$motifPos <- lapply(atsnp_result$motifPos, function(x) as.integer(unlist(strsplit(x, ","))))
    atsnp_result <- makeGRangesFromDataFrame(atsnp_result, keep.extra.columns = TRUE, starts.in.df.are.0based = TRUE)
    attributes(atsnp_result)$genome.package <- genome_pkg
    attributes(atsnp_result)$motifs <- pmotifs

    plot_variant(atsnp_result)
}
@@ -0,0 +1,96 @@
library(motifbreakR)
bsgenome <- getBSgenome(genome_pkg)

log_info("Converting universalmotif object to MotifDb object ...")

# Name of a single universalmotif object
motif_name_of <- function(m) m@name

# One row of metadata for a motif: its attributes minus the class, motif and
# gapinfo entries, plus the fields added below
motif_metadata_of <- function(m) {
    ats <- attributes(m)
    ats$dataSource <- basename(motifdb)
    ats$class <- NULL
    ats$motif <- NULL
    ats$gapinfo <- NULL
    ats$sequenceCount <- ats$nsites
    ats$providerId <- ats$name
    ats$providerName <- ats$name
    ats$organism <- if (is.null(ats$organism) || length(ats$organism) == 0) "Unknown" else ats$organism
    unlist(ats)
}

# Keep only the requested motifs that exist in the database
motifdb_names <- sapply(meme, motif_name_of)
motifs <- check_motifs(motifdb_names)
meme <- filter_motifs(meme, name = motifs)
# Re-read the names so they follow the order of the filtered list
motifs <- sapply(meme, motif_name_of)
motifdb_matrices <- setNames(lapply(meme, function(m) m@motif), motifs)

motifdb_meta <- do.call(rbind, lapply(meme, motif_metadata_of))
rownames(motifdb_meta) <- motifs
mdb <- MotifDb:::MotifList(motifdb_matrices, tbl.metadata = motifdb_meta)
28
+
# snpinfo columns:
# `chrom`, `start`, `end`, `name`, `score`, `strand`, `ref`, `alt`.
# Variants are written to a BED file whose name field encodes the coordinate
# and alleles, with a range form for indels and a single position for SNVs.
is_indel <- nchar(snpinfo$ref) != 1 | nchar(snpinfo$alt) != 1
snpinfo$coordname <- ifelse(
    is_indel,
    sprintf("%s:%s-%s:%s:%s", snpinfo$chrom, snpinfo$start + 1, snpinfo$end, snpinfo$ref, snpinfo$alt),
    sprintf("%s:%s:%s:%s", snpinfo$chrom, snpinfo$end, snpinfo$ref, snpinfo$alt)
)
motifbreakr_bed <- file.path(outdir, gsub("\\.vcf(\\.gz)?$|\\.bed$", ".motifbreakr.bed", basename(varfile)))
write.table(
    snpinfo[, c("chrom", "start", "end", "coordname", "score", "strand")],
    file = motifbreakr_bed,
    sep = "\t", quote = FALSE, row.names = FALSE, col.names = FALSE
)
snps <- snps.from.file(motifbreakr_bed, search.genome = bsgenome, format = "bed", indels = any(is_indel))

# Align the annotation with the loaded variants by id. An element-wise `==`
# is only correct when snps.from.file keeps every variant in the input order;
# match() also copes with dropped or reordered variants.
loaded_ids <- as.character(snps$SNP_id)
snpinfo <- snpinfo[match(loaded_ids, snpinfo$coordname), , drop = FALSE]
# Prefer the user-supplied variant name; otherwise keep the coordinate-based id
snps@elementMetadata$SNP_id <- ifelse(
    snpinfo$name == "." | is.na(snpinfo$name) | nchar(snpinfo$name) == 0,
    loaded_ids,
    snpinfo$name
)
49
+
# prepare PWMs

# Background frequency of one base.
#
# @param base One of "A", "C", "G", "T"
# @return The first value of the `bkg.<base>` metadata column of `mdb` as a
#   number, or 0.25 when that column is absent, empty or starts with NA
get_bkg <- function(base) {
    freq <- mdb@elementMetadata[[paste0("bkg.", base)]]
    if (is.null(freq) || length(freq) == 0 || is.na(freq[1])) {
        return(0.25)
    }
    as.numeric(freq[1])
}
bkg <- c(A = get_bkg("A"), C = get_bkg("C"), G = get_bkg("G"), T = get_bkg("T"))
61
+
# run motifbreakR
log_info("Running motifbreakR ...")
bpparam <- MulticoreParam(ncores)
results <- motifbreakR(
    snpList = snps,
    pwmList = mdb,
    threshold = cutoff,
    method = motifbreakr_args$method,
    bkg = bkg,
    filterp = TRUE,
    show.neutral = FALSE,
    BPPARAM = bpparam
)

log_info("Calculating p values ...")
results <- calculatePvalue(results)

# Flatten a copy of the results into a plain table for writing; `results`
# itself is kept untouched for plotting
collapse_comma <- function(x) paste(x, collapse = ",")
results_to_save <- as.data.frame(unname(results))
results_to_save$motifPos <- lapply(results_to_save$motifPos, collapse_comma)
results_to_save$altPos <- lapply(results_to_save$altPos, collapse_comma)
if (!is.null(regulator_col)) {
    # Regulator of each motif, looked up from the input motif table
    motif_rows <- match(results_to_save$providerId, in_motifs[[motif_col]])
    results_to_save$Regulator <- in_motifs[motif_rows, regulator_col, drop = TRUE]
}
# NOTE(review): apply() presumably returns a plain vector rather than a
# matrix when there is a single row -- confirm the written table is still
# correct in that case.
results_to_save <- apply(results_to_save, 2, as.character)

write.table(
    results_to_save,
    file = file.path(outdir, "motifbreakr.txt"),
    sep = "\t", quote = FALSE, row.names = FALSE
)
rm(results_to_save)

plot_variant(results)
@@ -0,0 +1,159 @@
1
+ """Script for regulation.MotifScan"""
2
+ import re
3
+
4
+ # Paths may be passed in args or to motifdb
5
+ from pathlib import PosixPath # noqa: F401
6
+ from biopipen.utils.misc import run_command, dict_to_cli_args, logger
7
+
8
+ motiffile = {{in.motiffile | repr}} # pyright: ignore # noqa: #999
9
+ seqfile = {{in.seqfile | repr}} # pyright: ignore
10
+ outdir = {{out.outdir | repr}} # pyright: ignore
11
+
12
+ tool = {{envs.tool | repr}} # pyright: ignore
13
+ fimo = {{envs.fimo | repr}} # pyright: ignore
14
+ motif_col = {{envs.motif_col | repr}} # pyright: ignore
15
+ regulator_col = {{envs.regulator_col | repr}} # pyright: ignore
16
+ notfound = {{envs.notfound | repr}} # pyright: ignore
17
+ motifdb = {{envs.motifdb | repr}} # pyright: ignore
18
+ cutoff = {{envs.cutoff | repr}} # pyright: ignore
19
+ q = {{envs.q | repr}} # pyright: ignore
20
+ q_cutoff = {{envs.q_cutoff | repr}} # pyright: ignore
21
+ args = {{envs.args | dict | repr}} # pyright: ignore
22
+
# Validate the options and inputs before doing any work
from pathlib import Path

# Check if the tool is supported
if tool != "fimo":
    raise ValueError(f"Unsupported tool: {tool}, currently only fimo is supported")

# Check if the motif database is provided
if motifdb is None:
    raise ValueError("The motif database is required")

# Check if the motif file is provided and exists
if not motiffile:
    raise FileNotFoundError("Motif file in.motiffile must be provided")
if not Path(motiffile).is_file():
    raise FileNotFoundError(f"Motif file in.motiffile does not exist: {motiffile}")

# Check if the sequence file is provided and exists
if not seqfile:
    raise FileNotFoundError("Sequence file in.seqfile must be provided")
if not Path(seqfile).is_file():
    raise FileNotFoundError(f"Sequence file in.seqfile does not exist: {seqfile}")
38
+
# Normalize motif_col and regulator_col into 0-based indexes:
# a column name is looked up in the header of the motif file, an integer is
# taken as a 1-based position; any other value (e.g. None) is left untouched.
needs_header = any(isinstance(col, str) for col in (motif_col, regulator_col))
if needs_header:
    with open(motiffile, "r") as f:
        header = f.readline().strip().split("\t")

if isinstance(motif_col, str):
    motif_col = header.index(motif_col)
elif isinstance(motif_col, int):
    motif_col -= 1

if isinstance(regulator_col, str):
    regulator_col = header.index(regulator_col)
elif isinstance(regulator_col, int):
    regulator_col -= 1
51
+
# Check if motif names exist in the database
# Requested motif names: the header line and blank lines are skipped
motif_names = set()
with open(motiffile, "r") as f:
    for i, line in enumerate(f):
        line = line.strip()
        if i == 0 or not line:
            continue
        motif_names.add(line.split("\t")[motif_col])

# Names available in the database. A MEME motif line reads
# "MOTIF <identifier> [<alternate name>]", so both the identifier alone and
# the full text after "MOTIF" are accepted as a name.
motif_db_names = set()
with open(motifdb, "r") as f:
    for line in f:
        if not line.startswith("MOTIF"):
            continue
        full_name = line[6:].strip()
        motif_db_names.add(full_name)
        if full_name:
            motif_db_names.add(full_name.split()[0])

if notfound == "error":
    notfound_motifs = motif_names - motif_db_names
    if notfound_motifs:
        raise ValueError(f"Motifs not found in the database: {notfound_motifs}")

# Make a new motif database with only the motifs in the motiffile
motif_names = motif_names & motif_db_names
motifdb_filtered = f"{outdir}/motif_db.txt"
with open(motifdb, "r") as f, open(motifdb_filtered, "w") as f_out:
    # Everything before the first MOTIF line (the file header) is copied;
    # after that, lines are copied only while inside a requested motif.
    should_write = True
    for line in f:
        if line.startswith("MOTIF"):
            full_name = line[6:].strip()
            identifier = full_name.split()[0] if full_name else full_name
            should_write = full_name in motif_names or identifier in motif_names

        if should_write:
            f_out.write(line)
89
+
# Now run fimo
# Script-controlled options are laid over the user-supplied `args`;
# the "" key holds the executable and "_" the positional arguments.
fimo_options = {
    "": fimo,
    "oc": f"{outdir}",
    "thresh": cutoff,
    "qv_thresh": q_cutoff,
    "no_qvalue": not q,
    "no-pgc": True,
    "_": [motifdb_filtered, seqfile],
}
args.update(fimo_options)

logger.info("Running fimo ...")
fimo_command = dict_to_cli_args(args, dashify=True)
run_command(fimo_command, fg=True)
101
+
logger.info("Adding additional information to the output ...")
# Get the motif to regulator mapping
# (left empty when no regulator column is configured)
motif_regulator_map = {}
if regulator_col is not None:
    with open(motiffile, "r") as f:
        next(f, None)  # skip header; tolerate an empty file
        for line in f:
            line = line.strip()
            if not line:
                # blank lines carry no mapping
                continue
            fields = line.split("\t")
            motif_regulator_map[fields[motif_col]] = fields[regulator_col]
113
+
# Get the sequence name information
# Headers of the form "<name>::<chrom>:<start>-<end>..." are split into the
# name and the region coordinates; any other header is kept as the name with
# no coordinates. The chromosome may be any token without ":" or whitespace,
# so that chrX, chrY, chrM and other contigs are recognised as well.
seq_header_re = re.compile(r"^(.+)::([^:\s]+):(\d+)-(\d+).*$")
seqnames = {}
seqcoords = {}
with open(seqfile, "r") as f:
    for line in f:
        if not line.startswith(">"):
            continue

        seqname = line[1:].strip()
        parsed = seq_header_re.match(seqname)
        if parsed is None:
            seqnames[seqname] = seqname
            seqcoords[seqname] = None
            continue

        sname, chrom, start, end = parsed.groups()
        seqnames[seqname] = sname
        seqcoords[seqname] = (chrom, int(start), int(end))
131
+
# Add additional information to the output
# Every record of fimo.tsv is copied to fimo_output.txt with four extra
# columns: regulator, seqname, seqstart and seqstop.
with open(f"{outdir}/fimo.tsv", "r") as f, open(f"{outdir}/fimo_output.txt", "w") as f_out:
    header = f.readline().strip().split("\t")
    f_out.write(
        "\t".join(header + ["regulator", "seqname", "seqstart", "seqstop"]) + "\n"
    )
    for line in f:
        line = line.strip()
        if not line or line.startswith("#"):
            continue
        line = line.split("\t")
        motif_name = line[0]
        sequence_name = line[2]
        start = int(line[3])
        stop = int(line[4])
        # Without a known regulator the motif name stands in for it
        regulator = motif_regulator_map.get(motif_name, motif_name)
        seqname = seqnames.get(sequence_name, "NA")
        seqcoord = seqcoords.get(sequence_name)
        if not seqcoord:
            seqstart = "NA"
            seqstop = "NA"
        else:
            # Both ends of the hit are offsets within the scanned sequence,
            # so both are shifted by the start of the region (seqcoord[1]).
            # NOTE(review): the "- 1" keeps the original convention for
            # seqstart -- confirm the intended coordinate system.
            seqstart = start + seqcoord[1] - 1
            seqstop = stop + seqcoord[1] - 1

        f_out.write(
            "\t".join(line + [regulator, seqname, str(seqstart), str(seqstop)]) + "\n"
        )
@@ -0,0 +1,33 @@
# Build the list of SNP data that atSNP works on from a variant table.
#
# @param snpinfo Data frame with the columns
#   c("chrom", "start", "end", "name", "score", "strand", "ref", "alt", "ref_seq", "alt_seq")
# @return A list with `sequence_matrix` (integer-encoded `ref_seq`, one
#   sequence per sapply result), `ref_base`, `snp_base`, `snpids`, a fixed
#   `transition` matrix and `prior`, and four `rsid.*` entries set to NULL
#
# Stops when any ref or alt allele is not exactly one character long.
snpinfo2atsnp <- function(snpinfo) {
    if (any(nchar(snpinfo$ref) != 1) || any(nchar(snpinfo$alt) != 1)) {
        stop("Only SNVs are supported by atSNP. Consider using motifbreakR instead if you have indels.")
    }

    # Bases are encoded as integers: A = 1, C = 2, G = 3, T = 4
    base_encodings <- c(A = 1, C = 2, G = 3, T = 4)
    bases <- names(base_encodings)
    encode <- function(x) as.integer(base_encodings[x])

    transition_values <- c(
        0.3225035, 0.1738422, 0.24915044, 0.2545039,
        0.3451410, 0.2642147, 0.05245011, 0.3381942,
        0.2813089, 0.2136604, 0.26749171, 0.2375390,
        0.2149776, 0.2071733, 0.25309238, 0.3247568
    )
    transition <- matrix(
        transition_values,
        nrow = 4,
        byrow = TRUE,
        dimnames = list(bases, bases)
    )

    encoded_sequences <- sapply(
        snpinfo$ref_seq,
        function(s) encode(strsplit(s, "")[[1]])
    )

    list(
        sequence_matrix = unname(encoded_sequences),
        ref_base = encode(snpinfo$ref),
        snp_base = encode(snpinfo$alt),
        snpids = snpinfo$name,
        transition = transition,
        prior = c(A = 0.287, C = 0.211, G = 0.213, T = 0.289),
        rsid.na = NULL,
        rsid.rm = NULL,
        rsid.duplicate = NULL,
        rsid.missing = NULL
    )
}