biopipen 0.28.1__py3-none-any.whl → 0.29.1__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of biopipen might be problematic. Click here for more details.
- biopipen/__init__.py +1 -1
- biopipen/core/config.toml +8 -0
- biopipen/ns/bam.py +0 -2
- biopipen/ns/bed.py +35 -0
- biopipen/ns/cellranger_pipeline.py +5 -5
- biopipen/ns/cnv.py +18 -2
- biopipen/ns/cnvkit_pipeline.py +16 -11
- biopipen/ns/gene.py +68 -23
- biopipen/ns/misc.py +2 -15
- biopipen/ns/plot.py +204 -0
- biopipen/ns/regulatory.py +214 -0
- biopipen/ns/scrna.py +31 -5
- biopipen/ns/snp.py +516 -8
- biopipen/ns/stats.py +167 -3
- biopipen/ns/vcf.py +196 -0
- biopipen/reports/snp/PlinkCallRate.svelte +24 -0
- biopipen/reports/snp/PlinkFreq.svelte +18 -0
- biopipen/reports/snp/PlinkHWE.svelte +18 -0
- biopipen/reports/snp/PlinkHet.svelte +18 -0
- biopipen/reports/snp/PlinkIBD.svelte +18 -0
- biopipen/scripts/bam/CNVpytor.py +144 -46
- biopipen/scripts/bed/BedtoolsIntersect.py +54 -0
- biopipen/scripts/bed/BedtoolsMerge.py +1 -1
- biopipen/scripts/cnv/AneuploidyScore.R +30 -7
- biopipen/scripts/cnv/AneuploidyScoreSummary.R +5 -2
- biopipen/scripts/cnv/TMADScore.R +21 -5
- biopipen/scripts/cnv/TMADScoreSummary.R +6 -2
- biopipen/scripts/cnvkit/CNVkitAccess.py +2 -1
- biopipen/scripts/cnvkit/CNVkitAutobin.py +3 -2
- biopipen/scripts/cnvkit/CNVkitBatch.py +1 -1
- biopipen/scripts/cnvkit/CNVkitCoverage.py +2 -1
- biopipen/scripts/cnvkit/CNVkitGuessBaits.py +1 -1
- biopipen/scripts/cnvkit/CNVkitHeatmap.py +1 -1
- biopipen/scripts/cnvkit/CNVkitReference.py +2 -1
- biopipen/scripts/delim/SampleInfo.R +10 -5
- biopipen/scripts/gene/GeneNameConversion.R +65 -0
- biopipen/scripts/gene/GenePromoters.R +61 -0
- biopipen/scripts/misc/Shell.sh +15 -0
- biopipen/scripts/plot/Manhattan.R +146 -0
- biopipen/scripts/plot/QQPlot.R +146 -0
- biopipen/scripts/regulatory/MotifAffinityTest.R +226 -0
- biopipen/scripts/regulatory/MotifAffinityTest_AtSNP.R +126 -0
- biopipen/scripts/regulatory/MotifAffinityTest_MotifBreakR.R +96 -0
- biopipen/scripts/regulatory/MotifScan.py +159 -0
- biopipen/scripts/regulatory/atSNP.R +33 -0
- biopipen/scripts/regulatory/motifBreakR.R +1594 -0
- biopipen/scripts/scrna/MarkersFinder.R +69 -67
- biopipen/scripts/scrna/SeuratClustering.R +71 -29
- biopipen/scripts/scrna/SeuratMap2Ref.R +20 -0
- biopipen/scripts/scrna/SeuratPreparing.R +252 -122
- biopipen/scripts/scrna/SeuratSubClustering.R +76 -27
- biopipen/scripts/snp/MatrixEQTL.R +85 -44
- biopipen/scripts/snp/Plink2GTMat.py +133 -0
- biopipen/scripts/snp/PlinkCallRate.R +190 -0
- biopipen/scripts/snp/PlinkFilter.py +100 -0
- biopipen/scripts/snp/PlinkFreq.R +298 -0
- biopipen/scripts/snp/PlinkFromVcf.py +78 -0
- biopipen/scripts/snp/PlinkHWE.R +80 -0
- biopipen/scripts/snp/PlinkHet.R +92 -0
- biopipen/scripts/snp/PlinkIBD.R +200 -0
- biopipen/scripts/snp/PlinkUpdateName.py +124 -0
- biopipen/scripts/stats/Mediation.R +94 -0
- biopipen/scripts/stats/MetaPvalue.R +2 -1
- biopipen/scripts/stats/MetaPvalue1.R +70 -0
- biopipen/scripts/tcr/TCRClusterStats.R +12 -7
- biopipen/scripts/vcf/BcftoolsAnnotate.py +91 -0
- biopipen/scripts/vcf/BcftoolsFilter.py +90 -0
- biopipen/scripts/vcf/BcftoolsSort.py +113 -0
- biopipen/scripts/vcf/BcftoolsView.py +73 -0
- biopipen/scripts/vcf/VcfFix_utils.py +1 -1
- biopipen/scripts/vcf/bcftools_utils.py +52 -0
- biopipen/utils/gene.R +83 -37
- biopipen/utils/gene.py +108 -60
- biopipen/utils/misc.R +56 -0
- biopipen/utils/misc.py +5 -2
- biopipen/utils/reference.py +54 -10
- {biopipen-0.28.1.dist-info → biopipen-0.29.1.dist-info}/METADATA +2 -2
- {biopipen-0.28.1.dist-info → biopipen-0.29.1.dist-info}/RECORD +80 -51
- {biopipen-0.28.1.dist-info → biopipen-0.29.1.dist-info}/entry_points.txt +1 -1
- biopipen/ns/bcftools.py +0 -111
- biopipen/scripts/bcftools/BcftoolsAnnotate.py +0 -42
- biopipen/scripts/bcftools/BcftoolsFilter.py +0 -79
- biopipen/scripts/bcftools/BcftoolsSort.py +0 -19
- biopipen/scripts/gene/GeneNameConversion.py +0 -66
- {biopipen-0.28.1.dist-info → biopipen-0.29.1.dist-info}/WHEEL +0 -0
|
@@ -0,0 +1,126 @@
|
|
|
1
|
+
# Motif affinity test using atSNP.
#
# This file is a script fragment. None of the following objects are created
# here; they must already exist when the script runs:
#   meme, motifdb, snpinfo, outdir, varfile, genome_pkg, ncores, atsnp_args,
#   cutoff, regulator_col, motif_col, in_motifs,
#   log_info(), check_motifs(), filter_motifs(), plot_variant()
library(atSNP)
library(rtracklayer)

log_info("Converting universalmotif object to motif_library ...")

motifdb_names <- vapply(meme, function(m) m@name, character(1))
motifs <- check_motifs(motifdb_names)
meme <- filter_motifs(meme, name = motifs)
# Re-read the names so they follow the order of the filtered motifs
motifs <- vapply(meme, function(m) m@name, character(1))

# Transposed motif matrices, used for atSNP
mdb <- lapply(meme, function(m) t(m@motif))
names(mdb) <- motifs

# Compose a MotifDb-style object; it is only used for plotting (see the
# `motifs` attribute set on the result at the end of this script)
motifdb_matrices <- lapply(meme, function(m) m@motif)
names(motifdb_matrices) <- motifs
motifdb_meta <- do.call(rbind, lapply(meme, function(m) {
  ats <- attributes(m)
  ats$dataSource <- basename(motifdb)
  ats$class <- NULL
  ats$motif <- NULL
  ats$gapinfo <- NULL
  ats$sequenceCount <- ats$nsites
  ats$providerId <- ats$name
  ats$providerName <- ats$name
  ats$organism <- if (is.null(ats$organism) || length(ats$organism) == 0) {
    "Unknown"
  } else {
    ats$organism
  }
  unlist(ats)
}))
rownames(motifdb_meta) <- motifs
pmotifs <- MotifDb:::MotifList(motifdb_matrices, tbl.metadata = motifdb_meta)

log_info("Converting snpinfo to atSNP object ...")

# c("chrom", "start", "end", "name", "score", "strand", "ref", "alt", "ref_seq", "alt_seq")
if (any(nchar(snpinfo$ref) != 1) || any(nchar(snpinfo$alt) != 1)) {
  stop("Only SNVs are supported by atSNP. Consider using motifbreakR instead if you have indels.")
}
atsnp_bed <- file.path(outdir, gsub("\\.vcf(\\.gz)?$|\\.bed$", ".atsnp.txt", basename(varfile)))
# Variants without a usable name get "<chrom>:<end>" as their id
snpinfo$name <- ifelse(
  snpinfo$name == "." | is.na(snpinfo$name) | nchar(snpinfo$name) == 0,
  sprintf("%s:%s", snpinfo$chrom, snpinfo$end),
  snpinfo$name
)
# Columns written to the file that LoadSNPData() reads below
snpinfo$a1 <- snpinfo$ref
snpinfo$a2 <- snpinfo$alt
snpinfo$chr <- snpinfo$chrom
snpinfo$snp <- snpinfo$end
snpinfo$snpid <- snpinfo$name
write.table(
  snpinfo[, c("snpid", "a1", "a2", "chr", "snp")],
  file = atsnp_bed,
  sep = "\t", quote = FALSE, row.names = FALSE, col.names = TRUE
)
# Largest number of rows among the transposed motif matrices; used as the
# half window size and, further down, to shift ref_start/ref_end
k <- max(vapply(mdb, nrow, integer(1)))
snps <- LoadSNPData(
  atsnp_bed,
  genome.lib = genome_pkg,
  mutation = TRUE, # force using given ref and alt
  default.par = nrow(snpinfo) < 1000,
  half.window.size = k
)

# run atSNP
log_info("Running atSNP ...")
atsnp_scores <- ComputeMotifScore(mdb, snps, ncores = ncores)

log_info("Calculating p values ...")
atsnp_result <- ComputePValues(
  motif.lib = mdb,
  snp.info = snps,
  motif.scores = atsnp_scores$motif.scores,
  ncores = ncores,
  testing.mc = TRUE
)

padj_col <- paste0(atsnp_args$p, "_adj")
atsnp_result[[padj_col]] <- p.adjust(atsnp_result[[atsnp_args$p]], method = atsnp_args$padj)
cutoff_col <- if (atsnp_args$padj_cutoff) padj_col else atsnp_args$p
# which() drops rows whose p value is NA; a bare logical index would turn
# each of them into an all-NA row
atsnp_result <- atsnp_result[which(atsnp_result[[cutoff_col]] < cutoff), , drop = FALSE]
# order by p value
atsnp_result <- atsnp_result[order(atsnp_result[[cutoff_col]]), , drop = FALSE]

# Line the variant table up with the result rows, then copy the variant
# columns over under the column names the rest of the script uses
n_hits <- nrow(atsnp_result)
snpinfo <- snpinfo[match(atsnp_result$snpid, snpinfo$snpid), , drop = FALSE]
atsnp_result$chr <- snpinfo$chr
atsnp_result$start <- snpinfo$start
atsnp_result$end <- snpinfo$end
atsnp_result$SNP_id <- snpinfo$snpid
atsnp_result$snpid <- NULL
atsnp_result$REF <- snpinfo$ref
atsnp_result$ALT <- snpinfo$alt
atsnp_result$providerId <- atsnp_result$providerName <- atsnp_result$motif
atsnp_result$motif <- NULL
atsnp_result$strand <- snpinfo$strand
atsnp_result$score <- snpinfo$score
atsnp_result$snpbase <- NULL
# Constant columns are expanded with rep() so that the assignments also work
# when no row passed the cutoff (assigning a scalar to a 0-row data frame
# is an error)
atsnp_result$altPos <- rep(1, n_hits)
atsnp_result$varType <- rep("SNV", n_hits)
# "<ref_start - k>,<ref_end - k>" per row; vectorised so that an empty result
# yields an empty column instead of iterating over 1:0
atsnp_result$motifPos <- paste(
  atsnp_result$ref_start - k,
  atsnp_result$ref_end - k,
  sep = ","
)
if (!is.null(regulator_col)) {
  atsnp_result$Regulator <- in_motifs[
    match(atsnp_result$providerId, in_motifs[[motif_col]]),
    regulator_col,
    drop = TRUE
  ]
}

write.table(
  atsnp_result,
  file = file.path(outdir, "atsnp.txt"),
  sep = "\t", quote = FALSE, row.names = FALSE
)

log_info("Plotting variants ...")
# Convert result to GRanges object
atsnp_result$alleleDiff <- -atsnp_result[[cutoff_col]]
atsnp_result$effect <- rep("strong", n_hits)
atsnp_result$motifPos <- lapply(
  atsnp_result$motifPos,
  function(x) as.integer(unlist(strsplit(x, ",")))
)
atsnp_result <- makeGRangesFromDataFrame(
  atsnp_result,
  keep.extra.columns = TRUE,
  starts.in.df.are.0based = TRUE
)
attributes(atsnp_result)$genome.package <- genome_pkg
attributes(atsnp_result)$motifs <- pmotifs

# NOTE(review): plot_variant() is defined elsewhere; confirm it copes with an
# empty GRanges when nothing passed the cutoff
plot_variant(atsnp_result)
|
|
@@ -0,0 +1,96 @@
|
|
|
1
|
+
# Motif affinity test using motifbreakR.
#
# This file is a script fragment. None of the following objects are created
# here; they must already exist when the script runs:
#   meme, motifdb, snpinfo, outdir, varfile, genome_pkg, ncores,
#   motifbreakr_args, cutoff, regulator_col, motif_col, in_motifs,
#   log_info(), check_motifs(), filter_motifs(), plot_variant()
library(motifbreakR)
bsgenome <- getBSgenome(genome_pkg)

log_info("Converting universalmotif object to MotifDb object ...")

motifdb_names <- vapply(meme, function(m) m@name, character(1))
motifs <- check_motifs(motifdb_names)
meme <- filter_motifs(meme, name = motifs)
# Re-read the names so they follow the order of the filtered motifs
motifs <- vapply(meme, function(m) m@name, character(1))
motifdb_matrices <- lapply(meme, function(m) m@motif)
names(motifdb_matrices) <- motifs

motifdb_meta <- do.call(rbind, lapply(meme, function(m) {
  ats <- attributes(m)
  ats$dataSource <- basename(motifdb)
  ats$class <- NULL
  ats$motif <- NULL
  ats$gapinfo <- NULL
  ats$sequenceCount <- ats$nsites
  ats$providerId <- ats$name
  ats$providerName <- ats$name
  ats$organism <- if (is.null(ats$organism) || length(ats$organism) == 0) {
    "Unknown"
  } else {
    ats$organism
  }
  unlist(ats)
}))
rownames(motifdb_meta) <- motifs
mdb <- MotifDb:::MotifList(motifdb_matrices, tbl.metadata = motifdb_meta)

# `chrom`, `start`, `end`, `name`, `score`, `strand`, `ref`, `alt`.
is_indel <- nchar(snpinfo$ref) != 1 | nchar(snpinfo$alt) != 1
# Coordinate-based name written to the BED file:
#   indels: <chrom>:<start + 1>-<end>:<ref>:<alt>
#   SNVs:   <chrom>:<end>:<ref>:<alt>
snpinfo$coordname <- ifelse(
  is_indel,
  sprintf("%s:%s-%s:%s:%s", snpinfo$chrom, snpinfo$start + 1, snpinfo$end, snpinfo$ref, snpinfo$alt),
  sprintf("%s:%s:%s:%s", snpinfo$chrom, snpinfo$end, snpinfo$ref, snpinfo$alt)
)
motifbreakr_bed <- file.path(outdir, gsub("\\.vcf(\\.gz)?$|\\.bed$", ".motifbreakr.bed", basename(varfile)))
write.table(
  snpinfo[, c("chrom", "start", "end", "coordname", "score", "strand")],
  file = motifbreakr_bed,
  sep = "\t", quote = FALSE, row.names = FALSE, col.names = FALSE
)
snps <- snps.from.file(motifbreakr_bed, search.genome = bsgenome, format = "bed", indels = any(is_indel))

# Look up each imported variant in snpinfo by its coordinate name. match()
# keeps the two tables aligned even if the imported set has fewer rows or a
# different order than snpinfo; an element-wise `==` would silently recycle.
snpinfo <- snpinfo[match(snps$SNP_id, snpinfo$coordname), , drop = FALSE]
# Prefer the user-supplied variant name; otherwise keep the coordinate name
has_name <- !is.na(snpinfo$name) & snpinfo$name != "." & nchar(snpinfo$name) > 0
snps@elementMetadata$SNP_id <- ifelse(has_name, snpinfo$name, snps$SNP_id)

# prepare PWMs

# Background frequency of one base: the first value of the `bkg.<base>`
# metadata column of `mdb`, or 0.25 when that column is absent, empty or NA.
get_bkg <- function(base) {
  base_col <- paste0("bkg.", base)
  base_bkg <- mdb@elementMetadata[[base_col]]
  if (is.null(base_bkg) || length(base_bkg) == 0 || is.na(base_bkg[1])) {
    return(0.25)
  }
  as.numeric(base_bkg[1])
}
bkg <- c(A = get_bkg("A"), C = get_bkg("C"), G = get_bkg("G"), T = get_bkg("T"))

# run motifbreakR
log_info("Running motifbreakR ...")
results <- motifbreakR(
  snpList = snps,
  pwmList = mdb,
  threshold = cutoff,
  method = motifbreakr_args$method,
  bkg = bkg,
  filterp = TRUE,
  show.neutral = FALSE,
  BPPARAM = MulticoreParam(ncores)
)

log_info("Calculating p values ...")
results <- calculatePvalue(results)

# Flatten the result for writing: motifPos/altPos are collapsed to
# comma-separated strings
results_to_save <- as.data.frame(unname(results))
collapse_commas <- function(x) paste(x, collapse = ",")
results_to_save$motifPos <- vapply(results_to_save$motifPos, collapse_commas, character(1))
results_to_save$altPos <- vapply(results_to_save$altPos, collapse_commas, character(1))
if (!is.null(regulator_col)) {
  results_to_save$Regulator <- in_motifs[
    match(results_to_save$providerId, in_motifs[[motif_col]]),
    regulator_col,
    drop = TRUE
  ]
}
# Coerce column by column so the object stays a data frame; apply(x, 2, ...)
# returns a plain vector when there is exactly one row, which would no longer
# be written as a table
results_to_save[] <- lapply(results_to_save, as.character)

write.table(
  results_to_save,
  file = file.path(outdir, "motifbreakr.txt"),
  sep = "\t", quote = FALSE, row.names = FALSE
)
rm(results_to_save)

plot_variant(results)
|
|
@@ -0,0 +1,159 @@
|
|
|
1
|
+
"""Script for regulatory.MotifScan"""
|
|
2
|
+
import re
|
|
3
|
+
|
|
4
|
+
# Paths may be passed in args or to motifdb
|
|
5
|
+
from pathlib import PosixPath # noqa: F401
|
|
6
|
+
from biopipen.utils.misc import run_command, dict_to_cli_args, logger
|
|
7
|
+
|
|
8
|
+
motiffile = {{in.motiffile | repr}} # pyright: ignore # noqa: #999
|
|
9
|
+
seqfile = {{in.seqfile | repr}} # pyright: ignore
|
|
10
|
+
outdir = {{out.outdir | repr}} # pyright: ignore
|
|
11
|
+
|
|
12
|
+
tool = {{envs.tool | repr}} # pyright: ignore
|
|
13
|
+
fimo = {{envs.fimo | repr}} # pyright: ignore
|
|
14
|
+
motif_col = {{envs.motif_col | repr}} # pyright: ignore
|
|
15
|
+
regulator_col = {{envs.regulator_col | repr}} # pyright: ignore
|
|
16
|
+
notfound = {{envs.notfound | repr}} # pyright: ignore
|
|
17
|
+
motifdb = {{envs.motifdb | repr}} # pyright: ignore
|
|
18
|
+
cutoff = {{envs.cutoff | repr}} # pyright: ignore
|
|
19
|
+
q = {{envs.q | repr}} # pyright: ignore
|
|
20
|
+
q_cutoff = {{envs.q_cutoff | repr}} # pyright: ignore
|
|
21
|
+
args = {{envs.args | dict | repr}} # pyright: ignore
|
|
22
|
+
|
|
23
|
+
# Check if the tool is supported
|
|
24
|
+
if tool != "fimo":
|
|
25
|
+
raise ValueError(f"Unsupported tool: {tool}, currently only fimo is supported")
|
|
26
|
+
|
|
27
|
+
# Check if the motif database is provided
|
|
28
|
+
if motifdb is None:
|
|
29
|
+
raise ValueError("The motif database is required")
|
|
30
|
+
|
|
31
|
+
# Check if the motif file exists
|
|
32
|
+
if not motiffile:
|
|
33
|
+
raise FileNotFoundError(f"Motif file in.motiffile must be provided")
|
|
34
|
+
|
|
35
|
+
# Check if the sequence file exists
|
|
36
|
+
if not seqfile:
|
|
37
|
+
raise FileNotFoundError(f"Sequence file in.seqfile must be provided")
|
|
38
|
+
|
|
39
|
+
# Normalize motif_col and regulator_col into 0-based indexes
|
|
40
|
+
if isinstance(motif_col, str) or isinstance(regulator_col, str):
|
|
41
|
+
with open(motiffile, "r") as f:
|
|
42
|
+
header = f.readline().strip().split("\t")
|
|
43
|
+
if isinstance(motif_col, str):
|
|
44
|
+
motif_col = header.index(motif_col) + 1
|
|
45
|
+
if isinstance(regulator_col, str):
|
|
46
|
+
regulator_col = header.index(regulator_col) + 1
|
|
47
|
+
if isinstance(motif_col, int):
|
|
48
|
+
motif_col -= 1
|
|
49
|
+
if isinstance(regulator_col, int):
|
|
50
|
+
regulator_col -= 1
|
|
51
|
+
|
|
52
|
+
# Check if motif names exist in the database
|
|
53
|
+
with open(motiffile, "r") as f:
|
|
54
|
+
motif_names = set(
|
|
55
|
+
line.strip().split("\t")[motif_col]
|
|
56
|
+
for i, line in enumerate(f)
|
|
57
|
+
if i > 0 # skip header
|
|
58
|
+
)
|
|
59
|
+
|
|
60
|
+
with open(motifdb, "r") as f:
|
|
61
|
+
motif_db_names = set(
|
|
62
|
+
line[6:].strip()
|
|
63
|
+
for line in f
|
|
64
|
+
if line.startswith("MOTIF")
|
|
65
|
+
)
|
|
66
|
+
|
|
67
|
+
if notfound == "error":
|
|
68
|
+
notfound_motifs = motif_names - motif_db_names
|
|
69
|
+
if notfound_motifs:
|
|
70
|
+
raise ValueError(f"Motifs not found in the database: {notfound_motifs}")
|
|
71
|
+
|
|
72
|
+
# Make a new motif database with only the motifs in the motiffile
|
|
73
|
+
motif_names = motif_names & motif_db_names
|
|
74
|
+
motifdb_filtered = f"{outdir}/motif_db.txt"
|
|
75
|
+
with open(motifdb, "r") as f, open(motifdb_filtered, "w") as f_out:
|
|
76
|
+
should_write = True
|
|
77
|
+
for line in f:
|
|
78
|
+
if line.startswith("MOTIF"):
|
|
79
|
+
motif_name = line[6:].strip()
|
|
80
|
+
if motif_name in motif_names:
|
|
81
|
+
should_write = True
|
|
82
|
+
else:
|
|
83
|
+
should_write = False
|
|
84
|
+
|
|
85
|
+
if should_write:
|
|
86
|
+
f_out.write(line)
|
|
87
|
+
else:
|
|
88
|
+
continue
|
|
89
|
+
|
|
90
|
+
# Now run fimo
|
|
91
|
+
args[""] = fimo
|
|
92
|
+
args["oc"] = f"{outdir}"
|
|
93
|
+
args["thresh"] = cutoff
|
|
94
|
+
args["qv_thresh"] = q_cutoff
|
|
95
|
+
args["no_qvalue"] = not q
|
|
96
|
+
args["no-pgc"] = True
|
|
97
|
+
args["_"] = [motifdb_filtered, seqfile]
|
|
98
|
+
|
|
99
|
+
logger.info("Running fimo ...")
|
|
100
|
+
run_command(dict_to_cli_args(args, dashify=True), fg=True)
|
|
101
|
+
|
|
102
|
+
logger.info("Adding additional information to the output ...")
|
|
103
|
+
# Get the motif to regulator mapping
|
|
104
|
+
motif_regulator_map = {}
|
|
105
|
+
if regulator_col is not None:
|
|
106
|
+
with open(motiffile, "r") as f:
|
|
107
|
+
next(f) # skip header
|
|
108
|
+
for line in f:
|
|
109
|
+
line = line.strip().split("\t")
|
|
110
|
+
motif_name = line[motif_col]
|
|
111
|
+
regulator = line[regulator_col]
|
|
112
|
+
motif_regulator_map[motif_name] = regulator
|
|
113
|
+
|
|
114
|
+
# Get the sequence name information
|
|
115
|
+
seqnames = {}
|
|
116
|
+
seqcoords = {}
|
|
117
|
+
with open(seqfile, "r") as f:
|
|
118
|
+
for line in f:
|
|
119
|
+
if not line.startswith(">"):
|
|
120
|
+
continue
|
|
121
|
+
|
|
122
|
+
seqname = line[1:].strip()
|
|
123
|
+
match = re.match(r"^(.+)::((?:chr)?\d+):(\d+)-(\d+).*$", seqname)
|
|
124
|
+
if not match:
|
|
125
|
+
seqnames[seqname] = seqname
|
|
126
|
+
seqcoords[seqname] = None
|
|
127
|
+
else:
|
|
128
|
+
sname, chrom, start, end = match.groups()
|
|
129
|
+
seqnames[seqname] = sname
|
|
130
|
+
seqcoords[seqname] = (chrom, int(start), int(end))
|
|
131
|
+
|
|
132
|
+
# Add additional information to the output
|
|
133
|
+
with open(f"{outdir}/fimo.tsv", "r") as f, open(f"{outdir}/fimo_output.txt", "w") as f_out:
|
|
134
|
+
header = f.readline().strip().split("\t")
|
|
135
|
+
f_out.write(
|
|
136
|
+
"\t".join(header + ["regulator", "seqname", "seqstart", "seqstop"]) + "\n"
|
|
137
|
+
)
|
|
138
|
+
for line in f:
|
|
139
|
+
line = line.strip()
|
|
140
|
+
if not line or line.startswith("#"):
|
|
141
|
+
continue
|
|
142
|
+
line = line.split("\t")
|
|
143
|
+
motif_name = line[0]
|
|
144
|
+
sequence_name = line[2]
|
|
145
|
+
start = int(line[3])
|
|
146
|
+
stop = int(line[4])
|
|
147
|
+
regulator = motif_regulator_map.get(motif_name, motif_name)
|
|
148
|
+
seqname = seqnames.get(sequence_name, "NA")
|
|
149
|
+
seqcoord = seqcoords.get(sequence_name)
|
|
150
|
+
if not seqcoord:
|
|
151
|
+
seqstart = "NA"
|
|
152
|
+
seqstop = "NA"
|
|
153
|
+
else:
|
|
154
|
+
seqstart = start + seqcoord[1] - 1
|
|
155
|
+
seqstop = stop + seqcoord[2] - 1
|
|
156
|
+
|
|
157
|
+
f_out.write(
|
|
158
|
+
"\t".join(line + [regulator, seqname, str(seqstart), str(seqstop)]) + "\n"
|
|
159
|
+
)
|
|
@@ -0,0 +1,33 @@
|
|
|
1
|
+
#' Build an atSNP-style SNP list from a variant table
#'
#' @param snpinfo A data frame. The columns read here are `ref`, `alt`,
#'   `ref_seq` and `name`.
#' @return A named list with the elements `sequence_matrix` (the integer
#'   codes of every `ref_seq`, A/C/G/T encoded as 1/2/3/4), `ref_base` and
#'   `snp_base` (integer codes of `ref` and `alt`), `snpids` (the `name`
#'   column), `transition` (a fixed 4 x 4 matrix), `prior` (a fixed vector of
#'   four values) and the four `rsid.*` entries, which are given as `NULL`.
#' @details Stops with an error when any `ref` or `alt` is not exactly one
#'   character long.
snpinfo2atsnp <- function(snpinfo) {
  # c("chrom", "start", "end", "name", "score", "strand", "ref", "alt", "ref_seq", "alt_seq")
  if (any(nchar(snpinfo$ref) != 1) || any(nchar(snpinfo$alt) != 1)) {
    stop("Only SNVs are supported by atSNP. Consider using motifbreakR instead if you have indels.")
  }

  base_encodings <- c(A = 1, C = 2, G = 3, T = 4)
  bases <- names(base_encodings)

  # Integer codes for a character vector of single bases
  encode <- function(chars) as.integer(base_encodings[chars])

  # Fixed transition values, filled row by row and labelled with the bases
  transition <- matrix(
    c(
      0.3225035, 0.1738422, 0.24915044, 0.2545039,
      0.3451410, 0.2642147, 0.05245011, 0.3381942,
      0.2813089, 0.2136604, 0.26749171, 0.2375390,
      0.2149776, 0.2071733, 0.25309238, 0.3247568
    ),
    nrow = 4,
    byrow = TRUE,
    dimnames = list(bases, bases)
  )

  # One encoded sequence per variant; sapply() decides the container shape
  encoded_seqs <- sapply(
    snpinfo$ref_seq,
    function(s) encode(strsplit(s, "")[[1]])
  )

  list(
    sequence_matrix = unname(encoded_seqs),
    ref_base = encode(snpinfo$ref),
    snp_base = encode(snpinfo$alt),
    snpids = snpinfo$name,
    transition = transition,
    prior = c(A = 0.287, C = 0.211, G = 0.213, T = 0.289),
    rsid.na = NULL,
    rsid.rm = NULL,
    rsid.duplicate = NULL,
    rsid.missing = NULL
  )
}
|