biopipen 0.31.4__py3-none-any.whl → 0.31.6__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of biopipen might be problematic. Click here for more details.
- biopipen/__init__.py +1 -1
- biopipen/ns/bam.py +41 -0
- biopipen/ns/protein.py +84 -0
- biopipen/ns/regulatory.py +72 -0
- biopipen/ns/vcf.py +7 -3
- biopipen/reports/protein/ProdigySummary.svelte +16 -0
- biopipen/scripts/bam/BamMerge.py +10 -14
- biopipen/scripts/bam/BamSampling.py +90 -0
- biopipen/scripts/protein/Prodigy.py +119 -0
- biopipen/scripts/protein/ProdigySummary.R +133 -0
- biopipen/scripts/regulatory/MotifAffinityTest.R +5 -143
- biopipen/scripts/regulatory/MotifAffinityTest_AtSNP.R +31 -37
- biopipen/scripts/regulatory/MotifAffinityTest_MotifBreakR.R +25 -26
- biopipen/scripts/regulatory/VariantMotifPlot.R +76 -0
- biopipen/scripts/regulatory/motifs-common.R +322 -0
- biopipen/scripts/vcf/TruvariBench.sh +14 -7
- biopipen/scripts/vcf/TruvariBenchSummary.R +1 -2
- {biopipen-0.31.4.dist-info → biopipen-0.31.6.dist-info}/METADATA +1 -1
- {biopipen-0.31.4.dist-info → biopipen-0.31.6.dist-info}/RECORD +21 -16
- {biopipen-0.31.4.dist-info → biopipen-0.31.6.dist-info}/entry_points.txt +1 -0
- biopipen/scripts/regulatory/atSNP.R +0 -33
- biopipen/scripts/regulatory/motifBreakR.R +0 -1594
- {biopipen-0.31.4.dist-info → biopipen-0.31.6.dist-info}/WHEEL +0 -0
|
@@ -1,1594 +0,0 @@
|
|
|
1
|
-
# Patch motifBreakR to avoid getSeq using the bsgenome but a reference fasta file instead
|
|
2
|
-
# This keeps consistent with other non-R processes
|
|
3
|
-
library(stringr)
|
|
4
|
-
library(matrixStats)
|
|
5
|
-
library(TFMPvalue)
|
|
6
|
-
library(motifStack)
|
|
7
|
-
library(Biostrings)
|
|
8
|
-
library(GenomicRanges)
|
|
9
|
-
library(MotifDb)
|
|
10
|
-
|
|
11
|
-
snpInfo2snpList <- function(snpinfo) {
|
|
12
|
-
snpinfo$start <- snpinfo$start + 1
|
|
13
|
-
snpgr <- makeGRangesFromDataFrame(snpinfo)
|
|
14
|
-
snpgr_meta <- snpinfo[, c("name", "score", "ref", "alt", "ref_seq", "alt_seq")]
|
|
15
|
-
colnames(snpgr_meta) <- c("SNP_id", "score", "REF", "ALT", "ref_seq", "alt_seq")
|
|
16
|
-
snpgr_meta$ref_seq <- substring(snpgr_meta$ref_seq, 2)
|
|
17
|
-
snpgr_meta$alt_seq <- substring(snpgr_meta$alt_seq, 2)
|
|
18
|
-
elementMetadata(snpgr) <- snpgr_meta
|
|
19
|
-
snpgr
|
|
20
|
-
}
|
|
21
|
-
|
|
22
|
-
revcom <- function(ltr) {
|
|
23
|
-
rclist <- list(A = "T", C = "G", G = "C", T = "A")
|
|
24
|
-
rclist[[ltr]]
|
|
25
|
-
}
|
|
26
|
-
|
|
27
|
-
prepareVariants <- function(fsnplist) {
|
|
28
|
-
# The sequences are already fetched in fsnplist at $ref_seq and $alt_seq
|
|
29
|
-
ref_len <- nchar(fsnplist$REF)
|
|
30
|
-
alt_len <- nchar(fsnplist$ALT)
|
|
31
|
-
is.indel <- ref_len > 1L | alt_len > 1L
|
|
32
|
-
if (sum(is.indel) < length(is.indel) & sum(is.indel) > 0L) {
|
|
33
|
-
fsnplist.indel <- fsnplist[is.indel]
|
|
34
|
-
fsnplist.snv <- fsnplist[!is.indel]
|
|
35
|
-
} else if (sum(is.indel) == length(is.indel)) {
|
|
36
|
-
fsnplist.indel <- fsnplist
|
|
37
|
-
fsnplist.snv <- NULL
|
|
38
|
-
} else if (sum(is.indel) == 0L) {
|
|
39
|
-
fsnplist.indel <- NULL
|
|
40
|
-
fsnplist.snv <- fsnplist
|
|
41
|
-
}
|
|
42
|
-
if (!is.null(fsnplist.indel)) {
|
|
43
|
-
snp.sequence.ref.indel <- fsnplist$ref_seq
|
|
44
|
-
snp.sequence.alt.indel <- fsnplist$alt_seq
|
|
45
|
-
|
|
46
|
-
need.alignment <- !(lengths(fsnplist.indel$REF) == 1 | lengths(fsnplist.indel$ALT) == 1)
|
|
47
|
-
insertion.var <- lengths(fsnplist.indel$REF) < lengths(fsnplist.indel$ALT)
|
|
48
|
-
fsnplist.indel$ALT_loc <- 1L
|
|
49
|
-
fsnplist.indel[insertion.var]$ALT_loc <- Map(seq,
|
|
50
|
-
from = nchar(fsnplist.indel[insertion.var]$REF) + 1L,
|
|
51
|
-
to = nchar(fsnplist.indel[insertion.var]$ALT)
|
|
52
|
-
)
|
|
53
|
-
fsnplist.indel[!insertion.var]$ALT_loc <- Map(seq,
|
|
54
|
-
from = nchar(fsnplist.indel[!insertion.var]$ALT) + 1L,
|
|
55
|
-
to = nchar(fsnplist.indel[!insertion.var]$REF)
|
|
56
|
-
)
|
|
57
|
-
ref.len <- nchar(fsnplist.indel[nchar(fsnplist.indel$REF) == nchar(fsnplist.indel$ALT)]$REF)
|
|
58
|
-
fsnplist.indel[nchar(fsnplist.indel$REF) == nchar(fsnplist.indel$ALT)]$ALT_loc <- lapply(
|
|
59
|
-
ref.len,
|
|
60
|
-
function(x) {seq(from = 1L, to = x)}
|
|
61
|
-
)
|
|
62
|
-
fsnplist.indel$varType <- "Other"
|
|
63
|
-
if (sum(insertion.var) > 0) {
|
|
64
|
-
fsnplist.indel[insertion.var, ]$varType <- "Insertion"
|
|
65
|
-
}
|
|
66
|
-
if (sum(lengths(fsnplist.indel$REF) > lengths(fsnplist.indel$ALT)) > 0) {
|
|
67
|
-
fsnplist.indel[lengths(fsnplist.indel$REF) > lengths(fsnplist.indel$ALT)]$varType <- "Deletion"
|
|
68
|
-
}
|
|
69
|
-
if (any(need.alignment)) {
|
|
70
|
-
need.del <- any(!insertion.var & need.alignment)
|
|
71
|
-
need.ins <- any(insertion.var & need.alignment)
|
|
72
|
-
if (need.del) {
|
|
73
|
-
pattern.del <- Map(matchPattern,
|
|
74
|
-
fsnplist.indel[!insertion.var & need.alignment]$ALT,
|
|
75
|
-
fsnplist.indel[!insertion.var & need.alignment]$REF,
|
|
76
|
-
with.indels = FALSE, max.mismatch = 0
|
|
77
|
-
)
|
|
78
|
-
names(pattern.del) <- names(fsnplist.indel[!insertion.var & need.alignment])
|
|
79
|
-
pattern.del <- sapply(pattern.del, function(x) {
|
|
80
|
-
slen <- length(subject(x))
|
|
81
|
-
x <- x[start(x) == 1 | end(x) == slen]
|
|
82
|
-
if (length(x) > 0) {
|
|
83
|
-
x <- x[1]
|
|
84
|
-
x <- gaps(x)
|
|
85
|
-
x <- as(x, "IRanges")
|
|
86
|
-
}
|
|
87
|
-
})
|
|
88
|
-
pattern.del.valid <- sapply(pattern.del, function(x) {
|
|
89
|
-
length(x) > 0
|
|
90
|
-
})
|
|
91
|
-
nr.pattern.del <- lapply(pattern.del[pattern.del.valid], function(x) {
|
|
92
|
-
start(x):end(x)
|
|
93
|
-
})
|
|
94
|
-
fsnplist.indel[!insertion.var & need.alignment][names(pattern.del[pattern.del.valid])]$ALT_loc <- nr.pattern.del
|
|
95
|
-
if (any((!insertion.var & need.alignment)[!pattern.del.valid])) {
|
|
96
|
-
alignment.del <- Map(pairwiseAlignment,
|
|
97
|
-
fsnplist.indel[!insertion.var & need.alignment][!pattern.del.valid]$ALT,
|
|
98
|
-
fsnplist.indel[!insertion.var & need.alignment][!pattern.del.valid]$REF,
|
|
99
|
-
type = "global"
|
|
100
|
-
)
|
|
101
|
-
alt.width <- width(fsnplist.indel[!insertion.var & need.alignment][!pattern.del.valid]$ALT)
|
|
102
|
-
ref.width <- width(fsnplist.indel[!insertion.var & need.alignment][!pattern.del.valid]$REF)
|
|
103
|
-
names(alignment.del) <- names(fsnplist.indel[!insertion.var & need.alignment][!pattern.del.valid])
|
|
104
|
-
alignment.ins <- sapply(sapply(alignment.del, insertion), unlist)
|
|
105
|
-
alignment.del <- sapply(sapply(alignment.del, deletion), unlist)
|
|
106
|
-
alignment.del.valid <- sapply(alignment.del, function(x) {
|
|
107
|
-
length(x) > 0
|
|
108
|
-
})
|
|
109
|
-
alignment.ins.valid <- sapply(alignment.ins, function(x) {
|
|
110
|
-
length(x) > 0
|
|
111
|
-
})
|
|
112
|
-
full.replace <- (alignment.del.valid & alignment.ins.valid & alt.width == ref.width)
|
|
113
|
-
alignment.del.valid <- alignment.del.valid & !full.replace
|
|
114
|
-
nr.alignment.del <- lapply(alignment.del[alignment.del.valid], function(x) {
|
|
115
|
-
start(x):end(x)
|
|
116
|
-
})
|
|
117
|
-
fsnplist.indel[!insertion.var & need.alignment][names(alignment.del[alignment.del.valid])]$ALT_loc <- nr.alignment.del
|
|
118
|
-
rm(alignment.del, nr.alignment.del)
|
|
119
|
-
}
|
|
120
|
-
rm(pattern.del, nr.pattern.del, pattern.del.valid, need.del)
|
|
121
|
-
}
|
|
122
|
-
if (need.ins) {
|
|
123
|
-
pattern.ins <- Map(matchPattern,
|
|
124
|
-
fsnplist.indel[insertion.var & need.alignment]$REF,
|
|
125
|
-
fsnplist.indel[insertion.var & need.alignment]$ALT,
|
|
126
|
-
with.indels = FALSE, max.mismatch = 0
|
|
127
|
-
)
|
|
128
|
-
names(pattern.ins) <- names(fsnplist.indel[insertion.var & need.alignment])
|
|
129
|
-
pattern.ins <- sapply(pattern.ins, function(x) {
|
|
130
|
-
slen <- length(subject(x))
|
|
131
|
-
x <- x[start(x) == 1 | end(x) == slen]
|
|
132
|
-
if (length(x) > 0) {
|
|
133
|
-
x <- x[1]
|
|
134
|
-
x <- gaps(x)
|
|
135
|
-
x <- as(x, "IRanges")
|
|
136
|
-
}
|
|
137
|
-
})
|
|
138
|
-
pattern.ins.valid <- sapply(pattern.ins, function(x) {
|
|
139
|
-
length(x) > 0
|
|
140
|
-
})
|
|
141
|
-
nr.pattern.ins <- lapply(pattern.ins[pattern.ins.valid], function(x) {
|
|
142
|
-
start(x):end(x)
|
|
143
|
-
})
|
|
144
|
-
fsnplist.indel[insertion.var & need.alignment][names(pattern.ins[pattern.ins.valid])]$ALT_loc <- nr.pattern.ins
|
|
145
|
-
if (any((insertion.var & need.alignment)[!pattern.ins.valid])) {
|
|
146
|
-
alignment.ins <- Map(pairwiseAlignment,
|
|
147
|
-
fsnplist.indel[insertion.var & need.alignment][!pattern.ins.valid]$ALT,
|
|
148
|
-
fsnplist.indel[insertion.var & need.alignment][!pattern.ins.valid]$REF,
|
|
149
|
-
type = "global"
|
|
150
|
-
)
|
|
151
|
-
names(alignment.ins) <- names(fsnplist.indel[insertion.var & need.alignment][!pattern.ins.valid])
|
|
152
|
-
alignment.ins <- sapply(sapply(alignment.ins, insertion), unlist)
|
|
153
|
-
alignment.ins.valid <- sapply(alignment.ins, function(x) {
|
|
154
|
-
length(x) > 0
|
|
155
|
-
})
|
|
156
|
-
nr.alignment.ins <- lapply(alignment.ins[alignment.ins.valid], function(x) {
|
|
157
|
-
start(x):end(x)
|
|
158
|
-
})
|
|
159
|
-
fsnplist.indel[insertion.var & need.alignment][names(alignment.ins[alignment.ins.valid])]$ALT_loc <- nr.alignment.ins
|
|
160
|
-
rm(alignment.ins, nr.alignment.ins)
|
|
161
|
-
}
|
|
162
|
-
rm(pattern.ins, nr.pattern.ins, pattern.ins.valid, need.ins)
|
|
163
|
-
}
|
|
164
|
-
rm(ref.len, insertion.var, need.alignment)
|
|
165
|
-
}
|
|
166
|
-
}
|
|
167
|
-
if (!is.null(fsnplist.snv)) {
|
|
168
|
-
snp.sequence.ref.snv <- fsnplist$ref_seq
|
|
169
|
-
snp.sequence.alt.snv <- fsnplist$alt_seq
|
|
170
|
-
fsnplist.snv$ALT_loc <- 1L
|
|
171
|
-
fsnplist.snv$varType <- "SNV"
|
|
172
|
-
}
|
|
173
|
-
if (sum(is.indel) < length(is.indel) & sum(is.indel) > 0L) {
|
|
174
|
-
fsnplist <- c(fsnplist.indel, fsnplist.snv)
|
|
175
|
-
snp.sequence.alt <- strsplit(as.character(c(
|
|
176
|
-
snp.sequence.alt.indel,
|
|
177
|
-
snp.sequence.alt.snv
|
|
178
|
-
)), "")
|
|
179
|
-
snp.sequence.ref <- strsplit(as.character(c(
|
|
180
|
-
snp.sequence.ref.indel,
|
|
181
|
-
snp.sequence.ref.snv
|
|
182
|
-
)), "")
|
|
183
|
-
rm(
|
|
184
|
-
fsnplist.indel, fsnplist.snv,
|
|
185
|
-
snp.sequence.alt.indel, snp.sequence.ref.indel,
|
|
186
|
-
snp.sequence.alt.snv, snp.sequence.ref.snv
|
|
187
|
-
)
|
|
188
|
-
} else if (sum(is.indel) == length(is.indel)) {
|
|
189
|
-
fsnplist <- fsnplist.indel
|
|
190
|
-
snp.sequence.alt <- strsplit(as.character(snp.sequence.alt.indel), "")
|
|
191
|
-
snp.sequence.ref <- strsplit(as.character(snp.sequence.ref.indel), "")
|
|
192
|
-
rm(fsnplist.indel, snp.sequence.alt.indel, snp.sequence.ref.indel)
|
|
193
|
-
} else if (sum(is.indel) == 0L) {
|
|
194
|
-
fsnplist <- fsnplist.snv
|
|
195
|
-
snp.sequence.alt <- strsplit(as.character(snp.sequence.alt.snv), "")
|
|
196
|
-
snp.sequence.ref <- strsplit(as.character(snp.sequence.ref.snv), "")
|
|
197
|
-
rm(fsnplist.snv, snp.sequence.alt.snv, snp.sequence.ref.snv)
|
|
198
|
-
}
|
|
199
|
-
gc()
|
|
200
|
-
return(list(
|
|
201
|
-
fsnplist = fsnplist,
|
|
202
|
-
ref_seq = snp.sequence.ref,
|
|
203
|
-
alt_seq = snp.sequence.alt
|
|
204
|
-
))
|
|
205
|
-
}
|
|
206
|
-
|
|
207
|
-
## An evaluator function for SNP effect
|
|
208
|
-
|
|
209
|
-
varEff <- function(allelR, allelA) {
|
|
210
|
-
score <- allelA - allelR
|
|
211
|
-
if (abs(score) >= 0.7) {
|
|
212
|
-
return(list(score = score, effect = "strong"))
|
|
213
|
-
} else if (abs(score) > 0.4) {
|
|
214
|
-
return(list(score = score, effect = "weak"))
|
|
215
|
-
} else {
|
|
216
|
-
return(list(score = score, effect = "neut"))
|
|
217
|
-
}
|
|
218
|
-
}
|
|
219
|
-
|
|
220
|
-
scoreIndel <- function(pwm,
|
|
221
|
-
ref.seq, alt.seq,
|
|
222
|
-
hit.ref, hit.alt) {
|
|
223
|
-
ref.windows <- scoreSeqWindows(ppm = pwm, seq = ref.seq)
|
|
224
|
-
alt.windows <- scoreSeqWindows(ppm = pwm, seq = alt.seq)
|
|
225
|
-
score <- alt.windows[hit.alt$strand, hit.alt$window] - ref.windows[hit.ref$strand, hit.ref$window]
|
|
226
|
-
if (abs(score) >= 0.7) {
|
|
227
|
-
return(list(score = score, effect = "strong"))
|
|
228
|
-
} else if (abs(score) > 0.4) {
|
|
229
|
-
return(list(score = score, effect = "weak"))
|
|
230
|
-
} else {
|
|
231
|
-
return(list(score = score, effect = "neut"))
|
|
232
|
-
}
|
|
233
|
-
}
|
|
234
|
-
|
|
235
|
-
reverseComplementMotif <- function(pwm) {
|
|
236
|
-
rows <- rownames(pwm)
|
|
237
|
-
cols <- colnames(pwm)
|
|
238
|
-
Ns <- pwm["N", ]
|
|
239
|
-
pwm <- pwm[4:1, length(cols):1]
|
|
240
|
-
pwm <- rbind(pwm, Ns)
|
|
241
|
-
rownames(pwm) <- rows
|
|
242
|
-
colnames(pwm) <- cols
|
|
243
|
-
return(pwm)
|
|
244
|
-
}
|
|
245
|
-
|
|
246
|
-
|
|
247
|
-
scoreSeqWindows <- function(ppm, seq) {
|
|
248
|
-
ppm.width <- ncol(ppm)
|
|
249
|
-
seq.len <- length(seq)
|
|
250
|
-
diag.ind <- rep.int(ppm.width, seq.len - ppm.width)
|
|
251
|
-
ranges <- vapply(
|
|
252
|
-
c(0L, cumsum(diag.ind)),
|
|
253
|
-
function(x,
|
|
254
|
-
range = (1L + 0L:(ppm.width - 1L) * (ppm.width + 1L))) {
|
|
255
|
-
x + range
|
|
256
|
-
},
|
|
257
|
-
integer(ppm.width)
|
|
258
|
-
)
|
|
259
|
-
scores <- t(ppm[seq, ])[ranges]
|
|
260
|
-
scores_rc <- t(reverseComplementMotif(ppm)[seq, ])[ranges]
|
|
261
|
-
scores <- split(scores, ceiling(seq_along(scores) / ppm.width))
|
|
262
|
-
scores_rc <- split(scores_rc, ceiling(seq_along(scores_rc) / ppm.width))
|
|
263
|
-
res <- vapply(
|
|
264
|
-
Map(
|
|
265
|
-
function(x, y) {
|
|
266
|
-
matrix(
|
|
267
|
-
data = c(x, y), nrow = 2,
|
|
268
|
-
byrow = TRUE, dimnames = list(c(1, 2))
|
|
269
|
-
)
|
|
270
|
-
},
|
|
271
|
-
scores, scores_rc
|
|
272
|
-
),
|
|
273
|
-
rowSums,
|
|
274
|
-
numeric(2)
|
|
275
|
-
)
|
|
276
|
-
return(res)
|
|
277
|
-
}
|
|
278
|
-
|
|
279
|
-
maxThresholdWindows <- function(window.frame) {
|
|
280
|
-
start.ind <- as.integer(colnames(window.frame)[1]) - 1L
|
|
281
|
-
max.win <- arrayInd(which.max(window.frame), dim(window.frame))
|
|
282
|
-
return(list(
|
|
283
|
-
window = as.integer(colnames(window.frame)[max.win[, 2] + start.ind]),
|
|
284
|
-
strand = c(1, 2)[max.win[, 1]]
|
|
285
|
-
))
|
|
286
|
-
}
|
|
287
|
-
|
|
288
|
-
|
|
289
|
-
#' @import methods
|
|
290
|
-
#' @import GenomicRanges
|
|
291
|
-
#' @import S4Vectors
|
|
292
|
-
#' @import BiocGenerics
|
|
293
|
-
#' @import IRanges
|
|
294
|
-
#' @importFrom Biostrings getSeq replaceLetterAt reverseComplement complement replaceAt pairwiseAlignment insertion deletion matchPattern
|
|
295
|
-
#' @importFrom TFMPvalue TFMpv2sc
|
|
296
|
-
#' @importFrom stringr str_locate_all str_sub
|
|
297
|
-
scoreSnpList <- function(fsnplist, pwmList, method = "default", bkg = NULL,
|
|
298
|
-
threshold = 1e-3, show.neutral = FALSE, verbose = FALSE,
|
|
299
|
-
pwmList.pc = NULL, pwmRanges = NULL, filterp = TRUE) {
|
|
300
|
-
k <- max(sapply(pwmList, ncol))
|
|
301
|
-
snp.sequence.alt <- fsnplist$alt_seq
|
|
302
|
-
snp.sequence.ref <- fsnplist$ref_seq
|
|
303
|
-
fsnplist <- fsnplist$fsnplist
|
|
304
|
-
|
|
305
|
-
res.el.e <- new.env()
|
|
306
|
-
for (snp.map.i in seq_along(snp.sequence.alt)) {
|
|
307
|
-
snp.ref <- snp.sequence.ref[[snp.map.i]]
|
|
308
|
-
snp.alt <- snp.sequence.alt[[snp.map.i]]
|
|
309
|
-
ref.len <- nchar(fsnplist[snp.map.i]$REF)
|
|
310
|
-
alt.len <- nchar(fsnplist[snp.map.i]$ALT)
|
|
311
|
-
alt.loc <- fsnplist[snp.map.i]$ALT_loc[[1]]
|
|
312
|
-
res.el <- rep(fsnplist[snp.map.i], length(pwmList))
|
|
313
|
-
res.el$motifPos <- as.integer(NA)
|
|
314
|
-
res.el$motifID <- mcols(pwmList)$providerID
|
|
315
|
-
res.el$geneSymbol <- mcols(pwmList)$geneSymbol
|
|
316
|
-
res.el$dataSource <- mcols(pwmList)$dataSource
|
|
317
|
-
res.el$providerName <- mcols(pwmList)$providerName
|
|
318
|
-
res.el$providerId <- mcols(pwmList)$providerId
|
|
319
|
-
res.el$seqMatch <- as.character(NA)
|
|
320
|
-
res.el$pctRef <- as.numeric(NA)
|
|
321
|
-
res.el$pctAlt <- as.numeric(NA)
|
|
322
|
-
res.el$scoreRef <- as.numeric(NA)
|
|
323
|
-
res.el$scoreAlt <- as.numeric(NA)
|
|
324
|
-
if (filterp) {
|
|
325
|
-
res.el$Refpvalue <- as.numeric(NA)
|
|
326
|
-
res.el$Altpvalue <- as.numeric(NA)
|
|
327
|
-
}
|
|
328
|
-
if (ref.len > 1 | alt.len > 1) {
|
|
329
|
-
res.el$altPos <- as.numeric(NA)
|
|
330
|
-
res.el$alleleDiff <- as.numeric(NA)
|
|
331
|
-
res.el$alleleEffectSize <- as.numeric(NA)
|
|
332
|
-
} else {
|
|
333
|
-
res.el$snpPos <- as.integer(NA)
|
|
334
|
-
res.el$alleleRef <- as.numeric(NA)
|
|
335
|
-
res.el$alleleAlt <- as.numeric(NA)
|
|
336
|
-
}
|
|
337
|
-
res.el$effect <- as.character(NA)
|
|
338
|
-
for (pwm.i in seq_along(pwmList)) {
|
|
339
|
-
pwm.basic <- pwmList[[pwm.i]]
|
|
340
|
-
pwm <- pwmList.pc[[pwm.i]]
|
|
341
|
-
len <- ncol(pwm)
|
|
342
|
-
thresh <- threshold[[pwm.i]]
|
|
343
|
-
seq.start <- min(alt.loc)
|
|
344
|
-
seq.len <- length(alt.loc)
|
|
345
|
-
alt.range <- ref.range <- (k - (ncol(pwm) - seq.start)):(k + ncol(pwm) + seq.start + seq.len - 2)
|
|
346
|
-
if (!show.neutral & identical(snp.ref[ref.range], snp.alt[alt.range])) next()
|
|
347
|
-
seq.remove <- ref.len - alt.len
|
|
348
|
-
if (seq.remove < 0) {
|
|
349
|
-
ref.range <- ref.range[1:(length(ref.range) + seq.remove)]
|
|
350
|
-
} else {
|
|
351
|
-
alt.range <- alt.range[1:(length(alt.range) - seq.remove)]
|
|
352
|
-
}
|
|
353
|
-
ref.windows <- scoreSeqWindows(ppm = pwm, seq = snp.ref[ref.range])
|
|
354
|
-
alt.windows <- scoreSeqWindows(ppm = pwm, seq = snp.alt[alt.range])
|
|
355
|
-
pass.effect <- ifelse(filterp,
|
|
356
|
-
any(alt.windows > thresh) | any(ref.windows > thresh),
|
|
357
|
-
any(((alt.windows - pwmRanges[[pwm.i]][1]) / (pwmRanges[[pwm.i]][2] - pwmRanges[[pwm.i]][1]) > thresh)) |
|
|
358
|
-
any((ref.windows - pwmRanges[[pwm.i]][1]) / (pwmRanges[[pwm.i]][2] - pwmRanges[[pwm.i]][1]) > thresh)
|
|
359
|
-
)
|
|
360
|
-
if (pass.effect) {
|
|
361
|
-
hit.alt <- maxThresholdWindows(alt.windows)
|
|
362
|
-
hit.ref <- maxThresholdWindows(ref.windows)
|
|
363
|
-
bigger <- ref.windows[hit.ref$strand, hit.ref$window] >= alt.windows[hit.alt$strand, hit.alt$window]
|
|
364
|
-
if (bigger) {
|
|
365
|
-
hit <- hit.ref
|
|
366
|
-
} else {
|
|
367
|
-
hit <- hit.alt
|
|
368
|
-
}
|
|
369
|
-
} else {
|
|
370
|
-
hit.alt <- list(window = 0L, strand = 0L)
|
|
371
|
-
hit.ref <- list(window = 0L, strand = 0L)
|
|
372
|
-
hit <- NULL
|
|
373
|
-
}
|
|
374
|
-
if (!show.neutral) {
|
|
375
|
-
if (identical(
|
|
376
|
-
alt.windows[hit.alt$strand, hit.alt$window],
|
|
377
|
-
ref.windows[hit.ref$strand, hit.ref$window]
|
|
378
|
-
)) {
|
|
379
|
-
next()
|
|
380
|
-
}
|
|
381
|
-
}
|
|
382
|
-
if (!is.null(hit)) {
|
|
383
|
-
result <- res.el[pwm.i]
|
|
384
|
-
uniquename <- paste(names(result), result$dataSource, result$providerName, result$providerId, sep = "%%")
|
|
385
|
-
if (nchar(result$REF) > 1 | nchar(result$ALT) > 1) {
|
|
386
|
-
allelR <- ref.windows[hit.ref$strand, hit.ref$window]
|
|
387
|
-
allelA <- alt.windows[hit.alt$strand, hit.alt$window]
|
|
388
|
-
scorediff <- varEff(allelR, allelA)
|
|
389
|
-
effect <- scorediff$effect
|
|
390
|
-
score <- scorediff$score
|
|
391
|
-
# ref.pos <- ref.range[(ncol(pwm) - (seq.start - 1)):((ncol(pwm) + ref.len - seq.start))]
|
|
392
|
-
ref.pos <- k:(k + nchar(result$REF) - 1L)
|
|
393
|
-
# alt.pos <- alt.range[(ncol(pwm) - (seq.start - 1)):((ncol(pwm) + alt.len - seq.start))]
|
|
394
|
-
alt.pos <- k:(k + nchar(result$ALT) - 1L)
|
|
395
|
-
if ((effect == "neut" & show.neutral) | effect != "neut") {
|
|
396
|
-
res.el.e[[uniquename]] <- updateResultsIndel(result,
|
|
397
|
-
snp.ref, snp.alt,
|
|
398
|
-
ref.pos, alt.pos,
|
|
399
|
-
hit.ref, hit.alt,
|
|
400
|
-
ref.windows, alt.windows,
|
|
401
|
-
score, effect, len,
|
|
402
|
-
k, pwm,
|
|
403
|
-
calcp = filterp
|
|
404
|
-
)
|
|
405
|
-
}
|
|
406
|
-
} else {
|
|
407
|
-
snp.pos <- k:(k + nchar(result$REF) - 1L)
|
|
408
|
-
allelR <- ref.windows[hit.ref$strand, hit.ref$window]
|
|
409
|
-
allelA <- alt.windows[hit.alt$strand, hit.alt$window]
|
|
410
|
-
scorediff <- varEff(allelR, allelA)
|
|
411
|
-
effect <- scorediff$effect
|
|
412
|
-
score <- scorediff$score
|
|
413
|
-
if ((effect == "neut" & show.neutral) | effect != "neut") {
|
|
414
|
-
res.el.e[[uniquename]] <- updateResultsIndel(result,
|
|
415
|
-
snp.ref, snp.alt,
|
|
416
|
-
snp.pos, snp.pos,
|
|
417
|
-
hit.ref, hit.alt,
|
|
418
|
-
ref.windows, alt.windows,
|
|
419
|
-
score, effect, len,
|
|
420
|
-
k, pwm,
|
|
421
|
-
calcp = filterp
|
|
422
|
-
)
|
|
423
|
-
}
|
|
424
|
-
}
|
|
425
|
-
}
|
|
426
|
-
}
|
|
427
|
-
}
|
|
428
|
-
resultSet <- unlist(GRangesList(as.list.environment(res.el.e)), use.names = FALSE)
|
|
429
|
-
if (length(resultSet) < 1) {
|
|
430
|
-
if (verbose) {
|
|
431
|
-
message(paste(
|
|
432
|
-
"reached end of SNPs list length =", length(fsnplist),
|
|
433
|
-
"with 0 potentially disruptive matches to", length(unique(resultSet$geneSymbol)),
|
|
434
|
-
"of", length(pwmList), "motifs."
|
|
435
|
-
))
|
|
436
|
-
}
|
|
437
|
-
return(NULL)
|
|
438
|
-
} else {
|
|
439
|
-
if ("ALT_loc" %in% names(mcols(resultSet))) mcols(resultSet)$ALT_loc <- NULL
|
|
440
|
-
max.match <- max(vapply(str_locate_all(resultSet$seqMatch, "\\w"), max, integer(1)))
|
|
441
|
-
min.match <- min(vapply(str_locate_all(resultSet$seqMatch, "\\w"), min, integer(1)))
|
|
442
|
-
resultSet$seqMatch <- str_sub(resultSet$seqMatch,
|
|
443
|
-
start = min.match + 1,
|
|
444
|
-
end = max.match + 1
|
|
445
|
-
)
|
|
446
|
-
if (verbose) {
|
|
447
|
-
message(paste(
|
|
448
|
-
"reached end of SNPs list length =", length(fsnplist),
|
|
449
|
-
"with", length(resultSet), "potentially disruptive matches to", length(unique(resultSet$geneSymbol)),
|
|
450
|
-
"of", length(pwmList), "motifs."
|
|
451
|
-
))
|
|
452
|
-
}
|
|
453
|
-
return(resultSet)
|
|
454
|
-
}
|
|
455
|
-
}
|
|
456
|
-
|
|
457
|
-
#' @importFrom matrixStats colRanges
|
|
458
|
-
#' @importFrom stringr str_pad
|
|
459
|
-
#' @importFrom TFMPvalue TFMsc2pv
|
|
460
|
-
updateResultsSnv <- function(result, snp.seq, snp.pos, hit, ref.windows, alt.windows,
|
|
461
|
-
allelR, allelA, effect, len, k, pwm, calcp) {
|
|
462
|
-
strand.opt <- c("+", "-")
|
|
463
|
-
strand(result) <- strand.opt[[hit$strand]]
|
|
464
|
-
hit$window <- as.integer(hit$window)
|
|
465
|
-
mresult <- mcols(result)
|
|
466
|
-
mresult[["snpPos"]] <- start(result)
|
|
467
|
-
mresult[["motifPos"]] <- as.integer(snp.pos)
|
|
468
|
-
matchs <- snp.seq
|
|
469
|
-
seq.pos <- snp.pos + hit$window - 1
|
|
470
|
-
matchs[-(seq.pos)] <- tolower(matchs[-(seq.pos)])
|
|
471
|
-
matchs <- paste(matchs, collapse = "")
|
|
472
|
-
mresult[["seqMatch"]] <- str_pad(matchs, width = k * 2, side = "both")
|
|
473
|
-
start(result) <- start(result) - snp.pos + 1
|
|
474
|
-
end(result) <- end(result) - snp.pos + len
|
|
475
|
-
if (calcp) {
|
|
476
|
-
mresult[["scoreRef"]] <- ref.windows[hit$strand, hit$window]
|
|
477
|
-
mresult[["scoreAlt"]] <- alt.windows[hit$strand, hit$window]
|
|
478
|
-
mresult[["Refpvalue"]] <- NA
|
|
479
|
-
mresult[["Altpvalue"]] <- NA
|
|
480
|
-
pwmrange <- colSums(colRanges(pwm))
|
|
481
|
-
mresult[["pctRef"]] <- (mresult[["scoreRef"]] - pwmrange[[1]]) / (pwmrange[[2]] - pwmrange[[1]])
|
|
482
|
-
mresult[["pctAlt"]] <- (mresult[["scoreAlt"]] - pwmrange[[1]]) / (pwmrange[[2]] - pwmrange[[1]])
|
|
483
|
-
} else {
|
|
484
|
-
mresult[["pctRef"]] <- ref.windows[hit$strand, hit$window]
|
|
485
|
-
mresult[["pctAlt"]] <- alt.windows[hit$strand, hit$window]
|
|
486
|
-
}
|
|
487
|
-
mresult[["alleleRef"]] <- allelR
|
|
488
|
-
mresult[["alleleAlt"]] <- allelA
|
|
489
|
-
mresult[["effect"]] <- effect
|
|
490
|
-
mcols(result) <- mresult
|
|
491
|
-
return(result)
|
|
492
|
-
}
|
|
493
|
-
|
|
494
|
-
|
|
495
|
-
updateResultsIndel <- function(result,
|
|
496
|
-
ref.seq, alt.seq,
|
|
497
|
-
ref.pos, alt.pos,
|
|
498
|
-
hit.ref, hit.alt,
|
|
499
|
-
ref.windows, alt.windows,
|
|
500
|
-
score, effect, len, k, pwm, calcp) {
|
|
501
|
-
strand.opt <- c("+", "-")
|
|
502
|
-
if (score > 0L) {
|
|
503
|
-
best.hit <- hit.alt
|
|
504
|
-
matchs <- alt.seq
|
|
505
|
-
snp.pos <- alt.pos
|
|
506
|
-
} else {
|
|
507
|
-
best.hit <- hit.ref
|
|
508
|
-
matchs <- ref.seq
|
|
509
|
-
snp.pos <- ref.pos
|
|
510
|
-
}
|
|
511
|
-
|
|
512
|
-
strand(result) <- strand.opt[[best.hit$strand]]
|
|
513
|
-
best.hit$window <- as.integer(best.hit$window)
|
|
514
|
-
mresult <- mcols(result)
|
|
515
|
-
alt_loc <- range(mresult$ALT_loc)
|
|
516
|
-
ref_start <- (1 - alt_loc[[1]])
|
|
517
|
-
ref_start <- ifelse(ref_start <= 0, ref_start - 1, ref_start)
|
|
518
|
-
motif.start <- (alt_loc[[1]]) + (-len) + (best.hit$window) + ref_start
|
|
519
|
-
motif.start <- ifelse(motif.start >= 0, motif.start + 1, motif.start)
|
|
520
|
-
if ((mresult$varType == "Insertion" & score < 0) |
|
|
521
|
-
(mresult$varType == "Deletion" & score > 0)) {
|
|
522
|
-
motif.end <- motif.start + len
|
|
523
|
-
} else {
|
|
524
|
-
if (motif.start > 0) {
|
|
525
|
-
motif.end <- len - length(motif.start:length(alt_loc[1]:alt_loc[2]))
|
|
526
|
-
} else {
|
|
527
|
-
motif.end <- motif.start + len - length(alt_loc[1]:alt_loc[2])
|
|
528
|
-
}
|
|
529
|
-
}
|
|
530
|
-
motif.end <- ifelse(motif.end <= 0, motif.end - 1, motif.end)
|
|
531
|
-
mresult$motifPos <- list(c(motif.start, motif.end))
|
|
532
|
-
mresult$altPos <- mresult$ALT_loc
|
|
533
|
-
seq.range <- (k - (len - alt_loc[[1]])):(k + len + alt_loc[[2]] - 2)
|
|
534
|
-
matchs[-(snp.pos)] <- tolower(matchs[-(snp.pos)])
|
|
535
|
-
matchs <- paste(matchs[seq.range], collapse = "")
|
|
536
|
-
mresult[["seqMatch"]] <- str_pad(matchs, width = (k * 2) + alt_loc[[2]], side = "both")
|
|
537
|
-
pwmrange <- colSums(colRanges(pwm[-5, ]))
|
|
538
|
-
mresult[["scoreRef"]] <- ref.windows[hit.ref$strand, hit.ref$window]
|
|
539
|
-
mresult[["scoreAlt"]] <- alt.windows[hit.alt$strand, hit.alt$window]
|
|
540
|
-
mresult[["pctRef"]] <- (mresult[["scoreRef"]] - pwmrange[[1]]) / (pwmrange[[2]] - pwmrange[[1]])
|
|
541
|
-
mresult[["pctAlt"]] <- (mresult[["scoreAlt"]] - pwmrange[[1]]) / (pwmrange[[2]] - pwmrange[[1]])
|
|
542
|
-
if (calcp) {
|
|
543
|
-
mresult[["Refpvalue"]] <- NA
|
|
544
|
-
mresult[["Altpvalue"]] <- NA
|
|
545
|
-
}
|
|
546
|
-
mresult[["alleleDiff"]] <- score
|
|
547
|
-
mresult[["effect"]] <- effect
|
|
548
|
-
mresult[["alleleEffectSize"]] <- score / pwmrange[[2]]
|
|
549
|
-
mcols(result) <- mresult
|
|
550
|
-
return(result)
|
|
551
|
-
}
|
|
552
|
-
|
|
553
|
-
#' @importFrom matrixStats colMaxs colMins
|
|
554
|
-
preparePWM <- function(pwmList,
|
|
555
|
-
filterp,
|
|
556
|
-
bkg,
|
|
557
|
-
scoreThresh,
|
|
558
|
-
method = "default") {
|
|
559
|
-
bkg <- bkg[c("A", "C", "G", "T")]
|
|
560
|
-
|
|
561
|
-
scounts <- as.integer(mcols(pwmList)$sequenceCount)
|
|
562
|
-
scounts[is.na(scounts)] <- 20L
|
|
563
|
-
pwmList.pc <- Map(function(pwm, scount) {
|
|
564
|
-
pwm <- (pwm * scount + 0.25) / (scount + 1)
|
|
565
|
-
}, pwmList, scounts)
|
|
566
|
-
if (method == "ic") {
|
|
567
|
-
pwmOmegas <- lapply(pwmList.pc, function(pwm, b = bkg) {
|
|
568
|
-
omegaic <- colSums(pwm * log2(pwm / b))
|
|
569
|
-
})
|
|
570
|
-
}
|
|
571
|
-
if (method == "default") {
|
|
572
|
-
pwmOmegas <- lapply(pwmList.pc, function(pwm) {
|
|
573
|
-
omegadefault <- colMaxs(pwm) - colMins(pwm)
|
|
574
|
-
})
|
|
575
|
-
}
|
|
576
|
-
if (method == "log") {
|
|
577
|
-
pwmList.pc <- lapply(pwmList.pc, function(pwm, b) {
|
|
578
|
-
pwm <- log(pwm) - log(b)
|
|
579
|
-
}, b = bkg)
|
|
580
|
-
pwmOmegas <- 1
|
|
581
|
-
}
|
|
582
|
-
if (method == "notrans") {
|
|
583
|
-
pwmOmegas <- 1
|
|
584
|
-
}
|
|
585
|
-
pwmList.pc <- Map(function(pwm, omega) {
|
|
586
|
-
if (length(omega) == 1 && omega == 1) {
|
|
587
|
-
return(pwm)
|
|
588
|
-
} else {
|
|
589
|
-
omegamatrix <- matrix(rep(omega, 4), nrow = 4, byrow = TRUE)
|
|
590
|
-
pwm <- pwm * omegamatrix
|
|
591
|
-
}
|
|
592
|
-
}, pwmList.pc, pwmOmegas)
|
|
593
|
-
pwmRanges <- Map(function(pwm, omega) {
|
|
594
|
-
x <- colSums(colRanges(pwm))
|
|
595
|
-
return(x)
|
|
596
|
-
}, pwmList.pc, pwmOmegas)
|
|
597
|
-
if (filterp) {
|
|
598
|
-
pwmList.pc2 <- lapply(pwmList.pc, round, digits = 2)
|
|
599
|
-
pwmThresh <- lapply(pwmList.pc2, TFMpv2sc, pvalue = scoreThresh, bg = bkg, type = "PWM")
|
|
600
|
-
pwmThresh <- Map("+", pwmThresh, -0.02)
|
|
601
|
-
} else {
|
|
602
|
-
pwmThresh <- rep.int(scoreThresh, times = length(pwmRanges))
|
|
603
|
-
}
|
|
604
|
-
pwmList@listData <- lapply(pwmList, function(pwm) {
|
|
605
|
-
pwm <- pwm[c("A", "C", "G", "T"), ]
|
|
606
|
-
pwm <- rbind(pwm, N = 0)
|
|
607
|
-
colnames(pwm) <- as.character(1:ncol(pwm))
|
|
608
|
-
return(pwm)
|
|
609
|
-
})
|
|
610
|
-
pwmList.pc <- lapply(pwmList.pc, function(pwm) {
|
|
611
|
-
pwm <- pwm[c("A", "C", "G", "T"), ]
|
|
612
|
-
pwm <- rbind(pwm, N = 0)
|
|
613
|
-
colnames(pwm) <- as.character(1:ncol(pwm))
|
|
614
|
-
return(pwm)
|
|
615
|
-
})
|
|
616
|
-
return(list(
|
|
617
|
-
pwmList = pwmList,
|
|
618
|
-
pwmListPseudoCount = pwmList.pc,
|
|
619
|
-
pwmRange = pwmRanges,
|
|
620
|
-
pwmThreshold = pwmThresh
|
|
621
|
-
))
|
|
622
|
-
}
|
|
623
|
-
|
|
624
|
-
|
|
625
|
-
#' Predict The Disruptiveness Of Single Nucleotide Polymorphisms On
|
|
626
|
-
#' Transcription Factor Binding Sites.
|
|
627
|
-
#'
|
|
628
|
-
#' @param snpList The output of \code{snps.from.rsid} or \code{snps.from.file}
|
|
629
|
-
#' @param pwmList An object of class \code{MotifList} containing the motifs that
|
|
630
|
-
#' you wish to interrogate
|
|
631
|
-
#' @param threshold Numeric; the maximum p-value for a match to be called or a minimum score threshold
|
|
632
|
-
#' @param method Character; one of \code{default}, \code{log}, \code{ic}, or \code{notrans}; see
|
|
633
|
-
#' details.
|
|
634
|
-
#' @param bkg Numeric Vector; the background probabilites of the nucleotides
|
|
635
|
-
#' used with method=\code{log} method=\code{ic}
|
|
636
|
-
#' @param filterp Logical; filter by p-value instead of by pct score.
|
|
637
|
-
#' @param show.neutral Logical; include neutral changes in the output
|
|
638
|
-
#' @param verbose Logical; if running serially, show verbose messages
|
|
639
|
-
#' @param BPPARAM a BiocParallel object see \code{\link[BiocParallel]{register}}
|
|
640
|
-
#' and see \code{getClass("BiocParallelParam")} for additional parameter
|
|
641
|
-
#' classes. Try \code{BiocParallel::registered()} to see what's availible and
|
|
642
|
-
#' for example \code{BiocParallel::bpparam("SerialParam")} would allow serial
|
|
643
|
-
#' evaluation.
|
|
644
|
-
#' @seealso See \code{\link{snps.from.rsid}} and \code{\link{snps.from.file}} for
|
|
645
|
-
#' information about how to generate the input to this function and
|
|
646
|
-
#' \code{\link{plotMB}} for information on how to visualize it's output
|
|
647
|
-
#' @details \pkg{motifbreakR} works with position probability matrices (PPM). PPM
|
|
648
|
-
#' are derived as the fractional occurrence of nucleotides A,C,G, and T at
|
|
649
|
-
#' each position of a position frequency matrix (PFM). PFM are simply the
|
|
650
|
-
#' tally of each nucleotide at each position across a set of aligned
|
|
651
|
-
#' sequences. With a PPM, one can generate probabilities based on the
|
|
652
|
-
#' genome, or more practically, create any number of position specific
|
|
653
|
-
#' scoring matrices (PSSM) based on the principle that the PPM contains
|
|
654
|
-
#' information about the likelihood of observing a particular nucleotide at
|
|
655
|
-
#' a particular position of a true transcription factor binding site. What
|
|
656
|
-
#' follows is a discussion of the three different algorithms that may be
|
|
657
|
-
#' employed in calls to the \pkg{motifbreakR} function via the \code{method}
|
|
658
|
-
#' argument.
|
|
659
|
-
#'
|
|
660
|
-
#' Suppose we have a frequency matrix \eqn{M} of width \eqn{n} (\emph{i.e.} a
|
|
661
|
-
#' PPM as described above). Furthermore, we have a sequence \eqn{s} also of
|
|
662
|
-
#' length \eqn{n}, such that
|
|
663
|
-
#' \eqn{s_{i} \in \{ A,T,C,G \}, i = 1,\ldots n}{s_i in {A,T,G,C}, i = 1 \ldots n}.
|
|
664
|
-
#' Each column of
|
|
665
|
-
#' \eqn{M} contains the frequencies of each letter in each position.
|
|
666
|
-
#'
|
|
667
|
-
#' Commonly in the literature sequences are scored as the sum of log
|
|
668
|
-
#' probabilities:
|
|
669
|
-
#'
|
|
670
|
-
#' \strong{Equation 1}
|
|
671
|
-
#'
|
|
672
|
-
#' \deqn{F( s,M ) = \sum_{i = 1}^{n}{\log( \frac{M_{s_{i},i}}{b_{s_{i}}} )}}{
|
|
673
|
-
#' F( s,M ) = \sum_(i = 1)^n log ((M_s_i,_i)/b_s_i)}
|
|
674
|
-
#'
|
|
675
|
-
#' where \eqn{b_{s_{i}}}{b_s_i} is the background frequency of letter \eqn{s_{i}}{s_i} in
|
|
676
|
-
#' the genome of interest. This method can be specified by the user as
|
|
677
|
-
#' \code{method='log'}.
|
|
678
|
-
#'
|
|
679
|
-
#' As an alternative to this method, we introduced a scoring method to
|
|
680
|
-
#' directly weight the score by the importance of the position within the
|
|
681
|
-
#' match sequence. This method of weighting is accessed by specifying
|
|
682
|
-
#' \code{method='ic'} (information content). A general representation
|
|
683
|
-
#' of this scoring method is given by:
|
|
684
|
-
#'
|
|
685
|
-
#' \strong{Equation 2}
|
|
686
|
-
#'
|
|
687
|
-
#' \deqn{F( s,M ) = p_{s} \cdot \omega_{M}}{F( s,M ) = p_s . \omega_M}
|
|
688
|
-
#'
|
|
689
|
-
#' where \eqn{p_{s}}{p_s} is the scoring vector derived from sequence \eqn{s} and matrix
|
|
690
|
-
#' \eqn{M}, and \eqn{w_{M}}{w_M} is a weight vector derived from \eqn{M}. First, we
|
|
691
|
-
#' compute the scoring vector of position scores \eqn{p}
|
|
692
|
-
#'
|
|
693
|
-
#' \strong{Equation 3}
|
|
694
|
-
#'
|
|
695
|
-
#' \deqn{p_{s} = ( M_{s_{i},i} ) \textrm{\ \ \ where\ \ \ } \frac{i = 1,\ldots n}{s_{i} \in \{ A,C,G,T \}}}{
|
|
696
|
-
#' p_s = ( M_s_i,_i ) where (i = 1 \ldots n)/(s_i in {A,C,G,T})}
|
|
697
|
-
#'
|
|
698
|
-
#' and second, for each \eqn{M} a constant vector of weights
|
|
699
|
-
#' \eqn{\omega_{M} = ( \omega_{1},\omega_{2},\ldots,\omega_{n} )}{\omega_M = ( \omega_1, \omega_2, \ldots, \omega_n)}.
|
|
700
|
-
#'
|
|
701
|
-
#' There are two methods for producing \eqn{\omega_{M}}{\omega_M}. The first, which we
|
|
702
|
-
#' call weighted sum, is the difference in the probabilities for the two
|
|
703
|
-
#' letters of the polymorphism (or variant), \emph{i.e.}
|
|
704
|
-
#' \eqn{\Delta p_{s_{i}}}{\Delta p_s_i}, or the difference of the maximum and minimum
|
|
705
|
-
#' values for each column of \eqn{M}:
|
|
706
|
-
#'
|
|
707
|
-
#' \strong{Equation 4.1}
|
|
708
|
-
#'
|
|
709
|
-
#' \deqn{\omega_{i} = \max \{ M_{i} \} - \min \{ M_{i} \}\textrm{\ \ \ \ where\ \ \ \ \ \ }i = 1,\ldots n}{
|
|
710
|
-
#' \omega_i = max{M_i} - min{M_i} where i = 1 \ldots n}
|
|
711
|
-
#'
|
|
712
|
-
#' The second variation of this theme is to weight by relative entropy.
|
|
713
|
-
#' Thus the relative entropy weight for each column \eqn{i} of the matrix is
|
|
714
|
-
#' given by:
|
|
715
|
-
#'
|
|
716
|
-
#' \strong{Equation 4.2}
|
|
717
|
-
#'
|
|
718
|
-
#' \deqn{\omega_{i} = \sum_{j \in \{ A,C,G,T \}}^{}{M_{j,i}\log_2( \frac{M_{j,i}}{b_{i}} )}\textrm{\ \ \ \ \ where\ \ \ \ \ }i = 1,\ldots n}{
|
|
719
|
-
#' \omega_i = \sum_{j in {A,C,G,T}} {M_(j,i)} log2(M_(j,i)/b_i) where i = 1 \ldots n}
|
|
720
|
-
#'
|
|
721
|
-
#' where \eqn{b_{i}}{b_i} is again the background frequency of the letter \eqn{i}.
|
|
722
|
-
#'
|
|
723
|
-
#' Thus, there are 3 possible algorithms to apply via the \code{method}
|
|
724
|
-
#' argument. The first is the standard summation of log probabilities
|
|
725
|
-
#' (\code{method='log'}). The second and third are the weighted sum and
|
|
726
|
-
#' information content methods (\code{method='default'} and \code{method='ic'}) specified by
|
|
727
|
-
#' equations 4.1 and 4.2, respectively. \pkg{motifbreakR} assumes a
|
|
728
|
-
#' uniform background nucleotide distribution (\eqn{b}) in equations 1 and
|
|
729
|
-
#' 4.2 unless otherwise specified by the user. Since we are primarily
|
|
730
|
-
#' interested in the difference between alleles, background frequency is
|
|
731
|
-
#' not a major factor, although it can change the results. Additionally,
|
|
732
|
-
#' inclusion of background frequency introduces potential bias when
|
|
733
|
-
#' collections of motifs are employed, since motifs are themselves
|
|
734
|
-
#' unbalanced with respect to nucleotide composition. With these cautions
|
|
735
|
-
#' in mind, users may override the uniform distribution if so desired. For
|
|
736
|
-
#' all three methods, \pkg{motifbreakR} scores and reports the reference
|
|
737
|
-
#' and alternate alleles of the sequence
|
|
738
|
-
#' (\eqn{F( s_{\textsc{ref}},M )}{F( s_ref,M )} and
|
|
739
|
-
#' \eqn{F( s_{\textsc{alt}},M )}{F( s_alt,M )}), and provides the matrix scores
|
|
740
|
-
#' \eqn{p_{s_{\textsc{ref}}}}{p_s_ref} and \eqn{p_{s_{\textsc{alt}}}}{p_s_alt} of the SNP (or
|
|
741
|
-
#' variant). The scores are scaled as a fraction of scoring range 0-1 of
|
|
742
|
-
#' the motif matrix, \eqn{M}. If either of
|
|
743
|
-
#' \eqn{F( s_{\textsc{ref}},M )}{F( s_ref,M )} and
|
|
744
|
-
#' \eqn{F( s_{\textsc{alt}},M )}{F( s_alt,M )} is greater than a user-specified
|
|
745
|
-
#' threshold (default value of 0.85) the SNP is reported. By default
|
|
746
|
-
#' \pkg{motifbreakR} does not display neutral effects,
|
|
747
|
-
#' (\eqn{\Delta p_{i} < 0.4}{\Delta p_i < 0.4}) but this behaviour can be
|
|
748
|
-
#' overridden.
|
|
749
|
-
#'
|
|
750
|
-
#' Additionally, now, with the use of \code{\link{TFMPvalue-package}}, we may filter by p-value of the match.
|
|
751
|
-
#' This is unfortunately a two step process. First, by invoking \code{filterp=TRUE} and setting a threshold at
|
|
752
|
-
#' a desired p-value e.g 1e-4, we perform a rough filter on the results by rounding all values in the PWM to two
|
|
753
|
-
#' decimal place, and calculating a scoring threshold based upon that. The second step is to use the function \code{\link{calculatePvalue}()}
|
|
754
|
-
#' on a selection of results which will change the \code{Refpvalue} and \code{Altpvalue} columns in the output from \code{NA} to the p-value
|
|
755
|
-
#' calculated by \code{\link{TFMsc2pv}}. This can be (although not always) a very memory and time intensive process if the algorithm doesn't converge rapidly.
|
|
756
|
-
#'
|
|
757
|
-
#' @return a GRanges object containing:
|
|
758
|
-
#' \item{REF}{the reference allele for the variant}
|
|
759
|
-
#' \item{ALT}{the alternate allele for the variant}
|
|
760
|
-
#' \item{snpPos}{the coordinates of the variant}
|
|
761
|
-
#' \item{motifPos}{The position of the motif relative the the variant}
|
|
762
|
-
#' \item{geneSymbol}{the geneSymbol corresponding to the TF of the TF binding motif}
|
|
763
|
-
#' \item{dataSource}{the source of the TF binding motif}
|
|
764
|
-
#' \item{providerName, providerId}{the name and id provided by the source}
|
|
765
|
-
#' \item{seqMatch}{the sequence on the 5' -> 3' direction of the "+" strand
|
|
766
|
-
#' that corresponds to DNA at the position that the TF binding motif was found.}
|
|
767
|
-
#' \item{pctRef}{The score as determined by the scoring method, when the sequence contains the reference variant allele, normalized to a scale from 0 - 1. If \code{filterp = FALSE},
|
|
768
|
-
#' this is the value that is thresholded.}
|
|
769
|
-
#' \item{pctAlt}{The score as determined by the scoring method, when the sequence contains the alternate variant allele, normalized to a scale from 0 - 1. If \code{filterp = FALSE},
|
|
770
|
-
#' this is the value that is thresholded.}
|
|
771
|
-
#' \item{scoreRef}{The score as determined by the scoring method, when the sequence contains the reference variant allele}
|
|
772
|
-
#' \item{scoreAlt}{The score as determined by the scoring method, when the sequence contains the alternate variant allele}
|
|
773
|
-
#' \item{Refpvalue}{p-value for the match for the pctRef score, initially set to \code{NA}. see \code{\link{calculatePvalue}} for more information}
|
|
774
|
-
#' \item{Altpvalue}{p-value for the match for the pctAlt score, initially set to \code{NA}. see \code{\link{calculatePvalue}} for more information}
|
|
775
|
-
#' \item{alleleRef}{The proportional frequency of the reference allele at position \code{motifPos} in the motif}
|
|
776
|
-
#' \item{alleleAlt}{The proportional frequency of the alternate allele at position \code{motifPos} in the motif}
|
|
777
|
-
#' \item{altPos}{the position, relative to the reference allele, of the alternate allele}
|
|
778
|
-
#' \item{alleleDiff}{The difference between the score on the reference allele and the score on the alternate allele}
|
|
779
|
-
#' \item{alleleEffectSize}{The ratio of the \code{alleleDiff} and the maximal score of a sequence under the PWM}
|
|
780
|
-
#' \item{effect}{one of weak, strong, or neutral indicating the strength of the effect.}
|
|
781
|
-
#' each SNP in this object may be plotted with \code{\link{plotMB}}
|
|
782
|
-
#' @examples
|
|
783
|
-
#' library(BSgenome.Hsapiens.UCSC.hg19)
|
|
784
|
-
#' # prepare variants
|
|
785
|
-
#' load(system.file("extdata",
|
|
786
|
-
#' "pca.enhancer.snps.rda",
|
|
787
|
-
#' package = "motifbreakR"
|
|
788
|
-
#' )) # loads snps.mb
|
|
789
|
-
#' pca.enhancer.snps <- sample(snps.mb, 20)
|
|
790
|
-
#' # Get motifs to interrogate
|
|
791
|
-
#' data(hocomoco)
|
|
792
|
-
#' motifs <- sample(hocomoco, 50)
|
|
793
|
-
#' # run motifbreakR
|
|
794
|
-
#' results <- motifbreakR(pca.enhancer.snps,
|
|
795
|
-
#' motifs,
|
|
796
|
-
#' threshold = 0.85,
|
|
797
|
-
#' method = "ic",
|
|
798
|
-
#' BPPARAM = BiocParallel::SerialParam()
|
|
799
|
-
#' )
|
|
800
|
-
#' @import BiocParallel
|
|
801
|
-
#' @import parallel
|
|
802
|
-
#' @importFrom parallel clusterEvalQ
|
|
803
|
-
#' @importFrom BiocParallel bplapply
|
|
804
|
-
#' @importFrom stringr str_length str_trim
|
|
805
|
-
#' @export
|
|
806
|
-
motifbreakR <- function(snpList, pwmList, threshold = 0.85, filterp = FALSE,
|
|
807
|
-
method = "default", show.neutral = FALSE, verbose = FALSE,
|
|
808
|
-
bkg = c(A = 0.25, C = 0.25, G = 0.25, T = 0.25),
|
|
809
|
-
BPPARAM = bpparam()) {
|
|
810
|
-
## Cluster / MC setup
|
|
811
|
-
if (.Platform$OS.type == "windows" && inherits(BPPARAM, "MulticoreParam")) {
|
|
812
|
-
warning(paste0(
|
|
813
|
-
"Serial evaluation under effect, to achive parallel evaluation under\n",
|
|
814
|
-
"Windows, please supply an alternative BPPARAM"
|
|
815
|
-
))
|
|
816
|
-
}
|
|
817
|
-
cores <- bpnworkers(BPPARAM)
|
|
818
|
-
num.snps <- length(snpList)
|
|
819
|
-
if (num.snps < cores) {
|
|
820
|
-
cores <- num.snps
|
|
821
|
-
}
|
|
822
|
-
if (!(is(BPPARAM, "MulticoreParam") | is(BPPARAM, "SerialParam"))) {
|
|
823
|
-
bpstart(BPPARAM)
|
|
824
|
-
cl <- bpbackend(BPPARAM)
|
|
825
|
-
clusterEvalQ(cl, library("MotifDb"))
|
|
826
|
-
}
|
|
827
|
-
|
|
828
|
-
pwms <- preparePWM(
|
|
829
|
-
pwmList = pwmList, filterp = filterp,
|
|
830
|
-
scoreThresh = threshold, bkg = bkg,
|
|
831
|
-
method = method
|
|
832
|
-
)
|
|
833
|
-
snpList <- prepareVariants(fsnplist = snpList)
|
|
834
|
-
|
|
835
|
-
snpList_cores <- split(as.list(rep(names(snpList), times = cores)), 1:cores)
|
|
836
|
-
for (splitr in seq_along(snpList)) {
|
|
837
|
-
splitcores <- sapply(suppressWarnings(split(snpList[[splitr]], 1:cores)), list)
|
|
838
|
-
for (splitcore in seq_along(snpList_cores)) {
|
|
839
|
-
snpList_cores[[splitcore]][[splitr]] <- splitcores[[splitcore]]
|
|
840
|
-
names(snpList_cores[[splitcore]])[splitr] <- names(snpList)[splitr]
|
|
841
|
-
}
|
|
842
|
-
}
|
|
843
|
-
snpList <- snpList_cores
|
|
844
|
-
rm(snpList_cores)
|
|
845
|
-
|
|
846
|
-
x <- bplapply(snpList, scoreSnpList,
|
|
847
|
-
pwmList = pwms$pwmList, threshold = pwms$pwmThreshold,
|
|
848
|
-
pwmList.pc = pwms$pwmListPseudoCount, pwmRanges = pwms$pwmRange,
|
|
849
|
-
method = method, bkg = bkg, show.neutral = show.neutral,
|
|
850
|
-
verbose = ifelse(cores == 1, verbose, FALSE),
|
|
851
|
-
filterp = filterp, BPPARAM = BPPARAM
|
|
852
|
-
)
|
|
853
|
-
|
|
854
|
-
## Cluster / MC cleanup
|
|
855
|
-
if (inherits(x, "try-error")) {
|
|
856
|
-
if (is(BPPARAM, "SnowParam")) {
|
|
857
|
-
bpstop(BPPARAM)
|
|
858
|
-
}
|
|
859
|
-
stop(attributes(x)$condition)
|
|
860
|
-
}
|
|
861
|
-
if (is(BPPARAM, "SnowParam")) {
|
|
862
|
-
bpstop(BPPARAM)
|
|
863
|
-
}
|
|
864
|
-
|
|
865
|
-
drops <- sapply(x, is.null)
|
|
866
|
-
x <- x[!drops]
|
|
867
|
-
pwmList <- pwms$pwmList
|
|
868
|
-
pwmList@listData <- lapply(pwms$pwmList, function(pwm) {
|
|
869
|
-
pwm <- pwm[c("A", "C", "G", "T"), ]
|
|
870
|
-
return(pwm)
|
|
871
|
-
})
|
|
872
|
-
pwmList.pc <- lapply(pwms$pwmListPseudoCount, function(pwm) {
|
|
873
|
-
pwm <- pwm[c("A", "C", "G", "T"), ]
|
|
874
|
-
return(pwm)
|
|
875
|
-
})
|
|
876
|
-
|
|
877
|
-
if (length(x) > 1) {
|
|
878
|
-
x <- unlist(GRangesList(unname(x)))
|
|
879
|
-
snpList <- unlist(GRangesList(lapply(snpList, `[[`, "fsnplist")), use.names = FALSE)
|
|
880
|
-
x <- x[order(match(x$SNP_id, snpList$SNP_id)), , drop = FALSE]
|
|
881
|
-
# attributes(x)$genome.package <- genome.package
|
|
882
|
-
attributes(x)$motifs <- pwmList[mcols(pwmList)$providerId %in% unique(x$providerId) &
|
|
883
|
-
mcols(pwmList)$providerName %in% unique(x$providerName), ]
|
|
884
|
-
attributes(x)$scoremotifs <- pwmList.pc[names(attributes(x)$motifs)]
|
|
885
|
-
} else {
|
|
886
|
-
if (length(x) == 1L) {
|
|
887
|
-
x <- x[[1]]
|
|
888
|
-
# attributes(x)$genome.package <- genome.package
|
|
889
|
-
attributes(x)$motifs <- pwmList[mcols(pwmList)$providerId %in% unique(x$providerId) &
|
|
890
|
-
mcols(pwmList)$providerName %in% unique(x$providerName), ]
|
|
891
|
-
attributes(x)$scoremotifs <- pwmList.pc[names(attributes(x)$motifs)]
|
|
892
|
-
} else {
|
|
893
|
-
warning("No SNP/Motif Interactions reached threshold")
|
|
894
|
-
x <- NULL
|
|
895
|
-
}
|
|
896
|
-
}
|
|
897
|
-
if (verbose && cores > 1) {
|
|
898
|
-
if (is.null(x)) {
|
|
899
|
-
message(paste(
|
|
900
|
-
"reached end of SNPs list length =", num.snps, "with 0 potentially disruptive matches to",
|
|
901
|
-
length(unique(x$geneSymbol)), "of", length(pwmList), "motifs."
|
|
902
|
-
))
|
|
903
|
-
} else {
|
|
904
|
-
message(paste(
|
|
905
|
-
"reached end of SNPs list length =", num.snps, "with",
|
|
906
|
-
length(x), "potentially disruptive matches to", length(unique(x$geneSymbol)),
|
|
907
|
-
"of", length(pwmList), "motifs."
|
|
908
|
-
))
|
|
909
|
-
}
|
|
910
|
-
}
|
|
911
|
-
return(x)
|
|
912
|
-
}
|
|
913
|
-
|
|
914
|
-
|
|
915
|
-
#' Calculate the significance of the matches for the reference and alternate alleles for the for their PWM
|
|
916
|
-
#'
|
|
917
|
-
#' @param results The output of \code{motifbreakR} that was run with \code{filterp=TRUE}
|
|
918
|
-
#' @param background Numeric Vector; the background probabilities of the nucleotides
|
|
919
|
-
#' @param granularity Numeric Vector; the granularity to which to round the PWM,
|
|
920
|
-
#' larger values compromise full accuracy for speed of calculation. A value of
|
|
921
|
-
#' \code{NULL} does no rounding.
|
|
922
|
-
#' @param BPPARAM a BiocParallel object see \code{\link[BiocParallel]{register}}
|
|
923
|
-
#' and see \code{getClass("BiocParallelParam")} for additional parameter
|
|
924
|
-
#' classes. Try \code{BiocParallel::registered()} to see what's available and
|
|
925
|
-
#' for example \code{BiocParallel::bpparam("SerialParam")} would allow serial
|
|
926
|
-
#' evaluation.
|
|
927
|
-
#' @return a GRanges object. The same Granges object that was input as \code{results}, but with
|
|
928
|
-
#' \code{Refpvalue} and \code{Altpvalue} columns in the output modified from \code{NA} to the p-value
|
|
929
|
-
#' calculated by \code{\link{TFMsc2pv}}.
|
|
930
|
-
#' @seealso See \code{\link{TFMsc2pv}} from the \pkg{TFMPvalue} package for
|
|
931
|
-
#' information about how the p-values are calculated.
|
|
932
|
-
#' @details This function is intended to be used on a selection of results produced by \code{\link{motifbreakR}}, and
|
|
933
|
-
#' this can be (although not always) a very memory and time intensive process if the algorithm doesn't converge rapidly.
|
|
934
|
-
#' @source H{\'e}l{\`e}ne Touzet and Jean-St{\'e}phane Varr{\'e} (2007) Efficient and accurate P-value computation for Position Weight Matrices.
|
|
935
|
-
#' Algorithms for Molecular Biology, \bold{2: 15}.
|
|
936
|
-
#' @examples
|
|
937
|
-
#' data(example.results)
|
|
938
|
-
#' rs1006140 <- example.results[example.results$SNP_id %in% "rs1006140"]
|
|
939
|
-
#' # low granularity for speed; 1e-6 or 1e-7 recommended for accuracy
|
|
940
|
-
#' rs1006140 <- calculatePvalue(rs1006140, BPPARAM = BiocParallel::SerialParam(), granularity = 1e-4)
|
|
941
|
-
#'
|
|
942
|
-
#' ##' @importFrom qvalue qvalue
|
|
943
|
-
#'
|
|
944
|
-
#' @export
|
|
945
|
-
calculatePvalue <- function(results,
|
|
946
|
-
background = c(A = 0.25, C = 0.25, G = 0.25, T = 0.25),
|
|
947
|
-
granularity = NULL,
|
|
948
|
-
BPPARAM = BiocParallel::SerialParam()) {
|
|
949
|
-
|
|
950
|
-
## Cluster / MC setup
|
|
951
|
-
if (.Platform$OS.type == "windows" && inherits(BPPARAM, "MulticoreParam")) {
|
|
952
|
-
warning(paste0(
|
|
953
|
-
"Serial evaluation under effect, to achive parallel evaluation under\n",
|
|
954
|
-
"Windows, please supply an alternative BPPARAM"
|
|
955
|
-
))
|
|
956
|
-
}
|
|
957
|
-
cores <- bpnworkers(BPPARAM)
|
|
958
|
-
num.res <- length(results)
|
|
959
|
-
if (num.res < cores) {
|
|
960
|
-
cores <- num.res
|
|
961
|
-
}
|
|
962
|
-
if (!(is(BPPARAM, "MulticoreParam") | is(BPPARAM, "SerialParam"))) {
|
|
963
|
-
bpstart(BPPARAM)
|
|
964
|
-
cl <- bpbackend(BPPARAM)
|
|
965
|
-
clusterEvalQ(cl, library("MotifDb"))
|
|
966
|
-
}
|
|
967
|
-
if (!("scoreRef" %in% names(mcols(results)))) {
|
|
968
|
-
stop("incorrect results format; please rerun analysis with filterp=TRUE")
|
|
969
|
-
} else {
|
|
970
|
-
pwmListmeta <- mcols(attributes(results)$motifs, use.names = TRUE)
|
|
971
|
-
pwmList <- attributes(results)$scoremotifs
|
|
972
|
-
if (!is.null(granularity)) {
|
|
973
|
-
pwmList <- lapply(pwmList, function(x, g) {
|
|
974
|
-
x <- floor(x / g) * g
|
|
975
|
-
return(x)
|
|
976
|
-
}, g = granularity)
|
|
977
|
-
}
|
|
978
|
-
results_sp <- split(results, 1:length(results))
|
|
979
|
-
pvalues <- bplapply(results_sp, function(i, pwmList, pwmListmeta, bkg) {
|
|
980
|
-
result <- i
|
|
981
|
-
pwm.id <- result$providerId
|
|
982
|
-
pwm.name.f <- result$providerName
|
|
983
|
-
pwmmeta <- pwmListmeta[pwmListmeta$providerId == pwm.id & pwmListmeta$providerName == pwm.name.f, ]
|
|
984
|
-
pwm <- pwmList[[rownames(pwmmeta)[1]]]
|
|
985
|
-
ref <- TFMsc2pv(pwm, mcols(result)[["scoreRef"]], bg = bkg, type = "PWM")
|
|
986
|
-
alt <- TFMsc2pv(pwm, mcols(result)[["scoreAlt"]], bg = bkg, type = "PWM")
|
|
987
|
-
# gc()
|
|
988
|
-
return(data.frame(ref = ref, alt = alt))
|
|
989
|
-
}, pwmList = pwmList, pwmListmeta = pwmListmeta, bkg = background, BPPARAM = BPPARAM)
|
|
990
|
-
## Cluster / MC cleanup
|
|
991
|
-
if (inherits(pvalues, "try-error")) {
|
|
992
|
-
if (is(BPPARAM, "SnowParam")) {
|
|
993
|
-
bpstop(BPPARAM)
|
|
994
|
-
}
|
|
995
|
-
stop(attributes(pvalues)$condition)
|
|
996
|
-
}
|
|
997
|
-
pvalues.df <- base::do.call("rbind", c(pvalues, make.row.names = FALSE))
|
|
998
|
-
results$Refpvalue <- pvalues.df[, "ref"]
|
|
999
|
-
results$Altpvalue <- pvalues.df[, "alt"]
|
|
1000
|
-
|
|
1001
|
-
if (is(BPPARAM, "SnowParam")) {
|
|
1002
|
-
bpstop(BPPARAM)
|
|
1003
|
-
}
|
|
1004
|
-
|
|
1005
|
-
return(results)
|
|
1006
|
-
}
|
|
1007
|
-
}
|
|
1008
|
-
|
|
1009
|
-
addPWM.stack <- function(identifier, index, GdObject, pwm_stack, ...) {
|
|
1010
|
-
plotMotifLogoStack.3(pwm_stack)
|
|
1011
|
-
}
|
|
1012
|
-
|
|
1013
|
-
selcor <- function(identifier, index, GdObject, ...) {
|
|
1014
|
-
if (identical(index, 1L)) {
|
|
1015
|
-
return(TRUE)
|
|
1016
|
-
} else {
|
|
1017
|
-
return(FALSE)
|
|
1018
|
-
}
|
|
1019
|
-
}
|
|
1020
|
-
|
|
1021
|
-
selall <- function(identifier, GdObject, ...) {
|
|
1022
|
-
return(TRUE)
|
|
1023
|
-
}
|
|
1024
|
-
|
|
1025
|
-
#' @importFrom grid grid.newpage pushViewport viewport popViewport
|
|
1026
|
-
plotMotifLogoStack.3 <- function(pfms, ...) {
|
|
1027
|
-
n <- length(pfms)
|
|
1028
|
-
lapply(pfms, function(.ele) {
|
|
1029
|
-
# if (class(.ele) != "pfm")
|
|
1030
|
-
if (!is(.ele, "pfm")) {
|
|
1031
|
-
stop("pfms must be a list of class pfm")
|
|
1032
|
-
}
|
|
1033
|
-
})
|
|
1034
|
-
assign("tmp_motifStack_symbolsCache", list(), pos = ".GlobalEnv")
|
|
1035
|
-
# grid.newpage()
|
|
1036
|
-
ht <- 1 / n
|
|
1037
|
-
y0 <- 0.5 * ht
|
|
1038
|
-
for (i in rev(seq.int(n))) {
|
|
1039
|
-
pushViewport(viewport(y = y0, height = ht))
|
|
1040
|
-
plotMotifLogo(pfms[[i]],
|
|
1041
|
-
motifName = pfms[[i]]@name, ncex = 1,
|
|
1042
|
-
p = pfms[[i]]@background, colset = pfms[[i]]@color,
|
|
1043
|
-
xlab = NA, newpage = FALSE, margins = c(
|
|
1044
|
-
1.5, 4.1,
|
|
1045
|
-
1.1, 0.1
|
|
1046
|
-
), ...
|
|
1047
|
-
)
|
|
1048
|
-
popViewport()
|
|
1049
|
-
y0 <- y0 + ht
|
|
1050
|
-
}
|
|
1051
|
-
rm(list = "tmp_motifStack_symbolsCache", pos = ".GlobalEnv")
|
|
1052
|
-
return()
|
|
1053
|
-
}
|
|
1054
|
-
|
|
1055
|
-
#' @importFrom stringr str_replace
|
|
1056
|
-
#' @importFrom motifStack addBlank
|
|
1057
|
-
DNAmotifAlignment.2snp <- function(pwms, result) {
|
|
1058
|
-
from <- min(sapply(result$motifPos, `[`, 1))
|
|
1059
|
-
to <- max(sapply(result$motifPos, `[`, 2))
|
|
1060
|
-
# pos <- mcols(result)$motifPos
|
|
1061
|
-
# pos <- pos[as.logical(strand(result) == "+")][1]
|
|
1062
|
-
for (pwm.i in seq_along(pwms)) {
|
|
1063
|
-
# pwm <- pwms[[pwm.i]]@mat
|
|
1064
|
-
## get pwm info from result data
|
|
1065
|
-
pwm.name <- pwms[[pwm.i]]@name
|
|
1066
|
-
pwm.name <- str_replace(pwm.name, pattern = "-:rc$", replacement = "")
|
|
1067
|
-
pwm.name <- str_replace(pwm.name, pattern = "-:r$", replacement = "")
|
|
1068
|
-
pwm.info <- attributes(result)$motifs
|
|
1069
|
-
pwm.id <- mcols(pwm.info[pwm.name, ])$providerId
|
|
1070
|
-
pwm.name <- mcols(pwm.info[pwm.name, ])$providerName
|
|
1071
|
-
mresult <- result[result$providerId == pwm.id & result$providerName == pwm.name, ]
|
|
1072
|
-
mstart <- mresult$motifPos[[1]][1]
|
|
1073
|
-
mend <- mresult$motifPos[[1]][2]
|
|
1074
|
-
if ((mcols(mresult)$varType == "Insertion" & mcols(mresult)$alleleDiff < 0) |
|
|
1075
|
-
(mcols(mresult)$varType == "Deletion" & mcols(mresult)$alleleDiff > 0)) {
|
|
1076
|
-
new.mat <- cbind(
|
|
1077
|
-
pwms[[pwm.i]]@mat[, 1:abs(mresult$motifPos[[1]][1])],
|
|
1078
|
-
matrix(c(0.25, 0.25, 0.25, 0.25), ncol = length(mcols(mresult)$altPos[[1]]), nrow = 4),
|
|
1079
|
-
pwms[[pwm.i]]@mat[, (abs(mresult$motifPos[[1]][1]) + 1):ncol(pwms[[pwm.i]]@mat)]
|
|
1080
|
-
)
|
|
1081
|
-
pwms[[pwm.i]]@mat <- new.mat
|
|
1082
|
-
start.offset <- mstart - from
|
|
1083
|
-
end.offset <- to - mend
|
|
1084
|
-
} else {
|
|
1085
|
-
if (mstart < 0 | from > 0) {
|
|
1086
|
-
start.offset <- mstart - from
|
|
1087
|
-
} else {
|
|
1088
|
-
start.offset <- (mstart - 1) - from
|
|
1089
|
-
}
|
|
1090
|
-
if (mend > 0 | to < 0) {
|
|
1091
|
-
end.offset <- to - mend
|
|
1092
|
-
} else {
|
|
1093
|
-
end.offset <- to - (mend + 1)
|
|
1094
|
-
}
|
|
1095
|
-
}
|
|
1096
|
-
if (start.offset > 0) {
|
|
1097
|
-
pwms[[pwm.i]] <- addBlank(x = pwms[[pwm.i]], n = start.offset, b = FALSE)
|
|
1098
|
-
}
|
|
1099
|
-
if (end.offset > 0) {
|
|
1100
|
-
pwms[[pwm.i]] <- addBlank(x = pwms[[pwm.i]], n = end.offset, b = TRUE)
|
|
1101
|
-
}
|
|
1102
|
-
}
|
|
1103
|
-
return(pwms)
|
|
1104
|
-
}
|
|
1105
|
-
|
|
1106
|
-
|
|
1107
|
-
|
|
1108
|
-
#' Plot a genomic region surrounding a genomic variant, and potentially disrupted
|
|
1109
|
-
#' motifs
|
|
1110
|
-
#'
|
|
1111
|
-
#' @param results The output of \code{motifbreakR}
|
|
1112
|
-
#' @param rsid Character; the identifier of the variant to be visualized
|
|
1113
|
-
#' @param reverseMotif Logical; if the motif is on the "-" strand show the
|
|
1114
|
-
#' the motifs as reversed \code{FALSE} or reverse complement \code{TRUE}
|
|
1115
|
-
#' @param effect Character; show motifs that are strongly effected \code{c("strong")},
|
|
1116
|
-
#' weakly effected \code{c("weak")}, or both \code{c("strong", "weak")}
|
|
1117
|
-
#' @param altAllele Character; The default value of \code{NULL} uses the first (or only)
|
|
1118
|
-
#' alternative allele for the SNP to be plotted.
|
|
1119
|
-
#' @seealso See \code{\link{motifbreakR}} for the function that produces output to be
|
|
1120
|
-
#' visualized here, also \code{\link{snps.from.rsid}} and \code{\link{snps.from.file}}
|
|
1121
|
-
#' for information about how to generate the input to \code{\link{motifbreakR}}
|
|
1122
|
-
#' function.
|
|
1123
|
-
#' @details \code{plotMB} produces output showing the location of the SNP on the
|
|
1124
|
-
#' chromosome, the surrounding sequence of the + strand, the footprint of any
|
|
1125
|
-
#' motif that is disrupted by the SNP or SNV, and the DNA sequence motif(s).
|
|
1126
|
-
#' The \code{altAllele} argument is included for variants like rs1006140 where
|
|
1127
|
-
#' multiple alternate alleles exist, the reference allele is A, and the alternate
|
|
1128
|
-
#' can be G,T, or C. \code{plotMB} only plots one alternate allele at a time.
|
|
1129
|
-
#' @return plots a figure representing the results of \code{motifbreakR} at the
|
|
1130
|
-
#' location of a single SNP, returns invisible \code{NULL}.
|
|
1131
|
-
#' @examples
|
|
1132
|
-
#' data(example.results)
|
|
1133
|
-
#' example.results
|
|
1134
|
-
#' \donttest{
|
|
1135
|
-
#' library(BSgenome.Hsapiens.UCSC.hg19)
|
|
1136
|
-
#' plotMB(results = example.results, rsid = "rs1006140", effect = "strong", altAllele = "C")
|
|
1137
|
-
#' }
|
|
1138
|
-
#' @importFrom motifStack DNAmotifAlignment colorset motifStack plotMotifLogo plotMotifLogoStack
|
|
1139
|
-
#' @importClassesFrom motifStack pfm marker
|
|
1140
|
-
#' @import grDevices
|
|
1141
|
-
#' @importFrom grid gpar
|
|
1142
|
-
#' @importFrom Gviz IdeogramTrack SequenceTrack GenomeAxisTrack HighlightTrack
|
|
1143
|
-
#' AnnotationTrack plotTracks
|
|
1144
|
-
#' @export
|
|
1145
|
-
plotMB <- function(results, rsid, reverseMotif = TRUE, effect = c("strong", "weak"), altAllele = NULL) {
|
|
1146
|
-
motif.starts <- sapply(results$motifPos, `[`, 1)
|
|
1147
|
-
motif.starts <- start(results) + motif.starts
|
|
1148
|
-
motif.starts <- order(motif.starts)
|
|
1149
|
-
results <- results[motif.starts]
|
|
1150
|
-
g <- genome(results)[[1]]
|
|
1151
|
-
result <- results[results$SNP_id %in% rsid]
|
|
1152
|
-
if (is.null(altAllele)) {
|
|
1153
|
-
altAllele <- result$ALT[[1]]
|
|
1154
|
-
}
|
|
1155
|
-
result <- result[result$ALT == altAllele]
|
|
1156
|
-
result <- result[order(sapply(result$motifPos, min), sapply(result$motifPos, max)), ]
|
|
1157
|
-
result <- result[result$effect %in% effect]
|
|
1158
|
-
chromosome <- as.character(seqnames(result))[[1]]
|
|
1159
|
-
genome.package <- attributes(result)$genome.package
|
|
1160
|
-
genome.bsgenome <- eval(parse(text = genome.package))
|
|
1161
|
-
seq.len <- max(length(result$REF[[1]]), length(result$ALT[[1]]))
|
|
1162
|
-
distance.to.edge <- max(abs(c(
|
|
1163
|
-
sapply(result$motifPos, min),
|
|
1164
|
-
sapply(result$motifPos, max)
|
|
1165
|
-
))) + 4
|
|
1166
|
-
from <- start(result)[[1]] - distance.to.edge + 1
|
|
1167
|
-
to <- end(result)[[1]] + distance.to.edge
|
|
1168
|
-
pwmList <- attributes(result)$motifs
|
|
1169
|
-
pwm.names <- result$providerId
|
|
1170
|
-
results_motifs <- paste0(result$providerId, result$providerName)
|
|
1171
|
-
list_motifs <- paste0(mcols(pwmList)$providerId, mcols(pwmList)$providerName)
|
|
1172
|
-
pwms <- pwmList <- pwmList[match(results_motifs, list_motifs)]
|
|
1173
|
-
if (reverseMotif) {
|
|
1174
|
-
for (pwm.i in seq_along(pwms)) {
|
|
1175
|
-
pwm.name <- names(pwms[pwm.i])
|
|
1176
|
-
pwm.id <- mcols(pwms[pwm.name, ])$providerId
|
|
1177
|
-
pwm.name.f <- mcols(pwms[pwm.name, ])$providerName
|
|
1178
|
-
doRev <- as.logical(strand(result[result$providerId == pwm.id & result$providerName == pwm.name.f, ]) == "-")
|
|
1179
|
-
if (doRev) {
|
|
1180
|
-
pwm <- pwms[[pwm.i]]
|
|
1181
|
-
pwm <- pwm[, rev(1:ncol(pwm))]
|
|
1182
|
-
rownames(pwm) <- c("T", "G", "C", "A")
|
|
1183
|
-
pwm <- pwm[c("A", "C", "G", "T"), ]
|
|
1184
|
-
pwms[[pwm.i]] <- pwm
|
|
1185
|
-
names(pwms)[pwm.i] <- paste0(names(pwms)[pwm.i], "-:rc")
|
|
1186
|
-
}
|
|
1187
|
-
}
|
|
1188
|
-
} else {
|
|
1189
|
-
for (pwm.i in seq_along(pwms)) {
|
|
1190
|
-
pwm.name <- names(pwms[pwm.i])
|
|
1191
|
-
pwm.id <- mcols(pwms[pwm.name, ])$providerId
|
|
1192
|
-
pwm.name.f <- mcols(pwms[pwm.name, ])$providerName
|
|
1193
|
-
doRev <- as.logical(strand(result[result$providerId == pwm.id & result$providerName == pwm.name.f, ]) == "-")
|
|
1194
|
-
if (doRev) {
|
|
1195
|
-
pwm <- pwms[[pwm.i]]
|
|
1196
|
-
pwm <- pwm[, rev(1:ncol(pwm))]
|
|
1197
|
-
pwms[[pwm.i]] <- pwm
|
|
1198
|
-
names(pwms)[pwm.i] <- paste0(names(pwms)[pwm.i], "-:r")
|
|
1199
|
-
}
|
|
1200
|
-
}
|
|
1201
|
-
}
|
|
1202
|
-
pwms <- lapply(names(pwms), function(x, pwms = pwms) {
|
|
1203
|
-
new("pfm",
|
|
1204
|
-
mat = pwms[[x]],
|
|
1205
|
-
name = x
|
|
1206
|
-
)
|
|
1207
|
-
}, pwms)
|
|
1208
|
-
pwms <- DNAmotifAlignment.2snp(pwms, result)
|
|
1209
|
-
pwmwide <- max(sapply(pwms, function(x) {
|
|
1210
|
-
ncol(x@mat)
|
|
1211
|
-
}))
|
|
1212
|
-
|
|
1213
|
-
markerStart <- result$motifPos[[1]][1]
|
|
1214
|
-
if (markerStart > 0) {
|
|
1215
|
-
markerEnd <- length(result$altPos[[1]]) + 1
|
|
1216
|
-
markerEnd <- markerEnd - markerStart
|
|
1217
|
-
markerStart <- 1
|
|
1218
|
-
} else {
|
|
1219
|
-
markerStart <- -1 * markerStart
|
|
1220
|
-
markerEnd <- markerStart + length(result$altPos[[1]])
|
|
1221
|
-
if (result$varType[[1]] %in% c("Other", "SNV")) {
|
|
1222
|
-
markerStart <- markerStart + 1
|
|
1223
|
-
}
|
|
1224
|
-
}
|
|
1225
|
-
varType <- result$varType[[1]]
|
|
1226
|
-
varType <- switch(varType,
|
|
1227
|
-
Deletion = "firebrick",
|
|
1228
|
-
Insertion = "springgreen4",
|
|
1229
|
-
Other = "gray13"
|
|
1230
|
-
)
|
|
1231
|
-
markerRect <- new("marker",
|
|
1232
|
-
type = "rect",
|
|
1233
|
-
start = markerStart,
|
|
1234
|
-
stop = markerEnd,
|
|
1235
|
-
gp = gpar(
|
|
1236
|
-
lty = 2,
|
|
1237
|
-
fill = NA,
|
|
1238
|
-
lwd = 3,
|
|
1239
|
-
col = varType
|
|
1240
|
-
)
|
|
1241
|
-
)
|
|
1242
|
-
for (pwm.i in seq_along(pwms)) {
|
|
1243
|
-
pwms[[pwm.i]]@markers <- list(markerRect)
|
|
1244
|
-
}
|
|
1245
|
-
ideoT <- try(IdeogramTrack(genome = g, chromosome = chromosome), silent = TRUE)
|
|
1246
|
-
if (inherits(ideoT, "try-error")) {
|
|
1247
|
-
backup.band <- data.frame(
|
|
1248
|
-
chrom = chromosome, chromStart = 0,
|
|
1249
|
-
chromEnd = length(genome.bsgenome[[chromosome]]),
|
|
1250
|
-
name = chromosome, gieStain = "gneg"
|
|
1251
|
-
)
|
|
1252
|
-
ideoT <- IdeogramTrack(genome = g, chromosome = chromosome, bands = backup.band)
|
|
1253
|
-
}
|
|
1254
|
-
|
|
1255
|
-
### blank alt sequence
|
|
1256
|
-
altseq <- genome.bsgenome[[chromosome]]
|
|
1257
|
-
|
|
1258
|
-
### Replace longer sections
|
|
1259
|
-
at <- IRanges(start = start(result[1]), width = width(result[1]))
|
|
1260
|
-
if (result$varType[[1]] == "Deletion") {
|
|
1261
|
-
reflen <- length(result$REF[[1]])
|
|
1262
|
-
addedN <- DNAString(paste0(rep.int(".", reflen), collapse = ""))
|
|
1263
|
-
addedN <- replaceLetterAt(addedN, at = (1:reflen)[-result$altPos[[1]]], result$ALT[[1]])
|
|
1264
|
-
axisT <- GenomeAxisTrack(exponent = 0)
|
|
1265
|
-
seqT <- SequenceTrack(genome.bsgenome, fontcolor = colorset("DNA", "auto"))
|
|
1266
|
-
altseq <- replaceAt(x = altseq, at = at, addedN)
|
|
1267
|
-
} else if (result$varType[[1]] == "Insertion") {
|
|
1268
|
-
altlen <- length(result$ALT[[1]])
|
|
1269
|
-
addedN <- DNAString(paste0(rep.int(".", altlen), collapse = ""))
|
|
1270
|
-
addedN <- replaceLetterAt(addedN, at = (1:altlen)[-result$altPos[[1]]], result$REF[[1]])
|
|
1271
|
-
refseq <- genome.bsgenome[[chromosome]]
|
|
1272
|
-
refseq <- DNAStringSet(replaceAt(x = refseq, at = at, addedN))
|
|
1273
|
-
altseq <- replaceAt(x = altseq, at = at, result$ALT[[1]])
|
|
1274
|
-
names(refseq) <- chromosome
|
|
1275
|
-
seqT <- SequenceTrack(refseq,
|
|
1276
|
-
fontcolor = c(colorset("DNA", "auto"), N = "#FFFFFF", . = "#FFE3E6"),
|
|
1277
|
-
chromosome = chromosome
|
|
1278
|
-
)
|
|
1279
|
-
} else {
|
|
1280
|
-
axisT <- GenomeAxisTrack(exponent = 0)
|
|
1281
|
-
altseq <- replaceAt(x = altseq, at = at, result$ALT[[1]])
|
|
1282
|
-
seqT <- SequenceTrack(genome.bsgenome, fontcolor = colorset("DNA", "auto"))
|
|
1283
|
-
}
|
|
1284
|
-
altseq <- DNAStringSet(altseq)
|
|
1285
|
-
names(altseq) <- chromosome
|
|
1286
|
-
seqAltT <- SequenceTrack(altseq,
|
|
1287
|
-
fontcolor = c(colorset("DNA", "auto"), N = "#FFFFFF", . = "#FFE3E6"),
|
|
1288
|
-
chromosome = chromosome
|
|
1289
|
-
)
|
|
1290
|
-
|
|
1291
|
-
# altseq <- replaceLetterAt(altseq, at = wherereplace, letter = rep.int("N", sum(wherereplace)))
|
|
1292
|
-
# altseq <- replaceLetterAt(altseq, at = !wherereplace, letter = result$ALT[[1]])
|
|
1293
|
-
histart <- start(result[1]) + min(result[1]$altPos[[1]]) - 2
|
|
1294
|
-
histart <- ifelse(result[1]$varType %in% c("Other", "SNV"), histart + 1, histart)
|
|
1295
|
-
hiend <- start(result[1]) + min(result[1]$altPos[[1]]) - 2 + length(result[1]$altPos[[1]])
|
|
1296
|
-
hiT <- HighlightTrack(
|
|
1297
|
-
trackList = list(seqT, seqAltT),
|
|
1298
|
-
start = histart,
|
|
1299
|
-
end = hiend,
|
|
1300
|
-
chromosome = chromosome
|
|
1301
|
-
)
|
|
1302
|
-
|
|
1303
|
-
selectingfun <- selcor
|
|
1304
|
-
detailfun <- addPWM.stack
|
|
1305
|
-
|
|
1306
|
-
motif_ids <- names(pwmList)
|
|
1307
|
-
names(motif_ids) <- mcols(pwmList)$providerName
|
|
1308
|
-
|
|
1309
|
-
for (mymotif_i in seq_along(result)) {
|
|
1310
|
-
mymotif <- result[mymotif_i]
|
|
1311
|
-
start(mymotif) <- start(mymotif) + min(mymotif$altPos[[1]]) - 1
|
|
1312
|
-
width(mymotif) <- length(mymotif$altPos[[1]])
|
|
1313
|
-
variant.start <- start(mymotif)
|
|
1314
|
-
variant.end <- end(mymotif)
|
|
1315
|
-
if (mymotif$motifPos[[1]][1] < 0) {
|
|
1316
|
-
start(mymotif) <- start(mymotif) + (mymotif$motifPos[[1]][1])
|
|
1317
|
-
} else {
|
|
1318
|
-
start(mymotif) <- start(mymotif) + (mymotif$motifPos[[1]][1] - 1)
|
|
1319
|
-
}
|
|
1320
|
-
if (mymotif$motifPos[[1]][2] < 0) {
|
|
1321
|
-
end(mymotif) <- end(mymotif) + (mymotif$motifPos[[1]][2] + 1)
|
|
1322
|
-
} else {
|
|
1323
|
-
end(mymotif) <- end(mymotif) + (mymotif$motifPos[[1]][2])
|
|
1324
|
-
}
|
|
1325
|
-
if ((result[mymotif_i]$varType == "Deletion" & result[mymotif_i]$alleleDiff > 0) |
|
|
1326
|
-
(result[mymotif_i]$varType == "Insertion" & result[mymotif_i]$alleleDiff < 0)) {
|
|
1327
|
-
mymotif <- c(mymotif, mymotif)
|
|
1328
|
-
end(mymotif)[1] <- variant.start - 1
|
|
1329
|
-
start(mymotif)[2] <- variant.end + 1
|
|
1330
|
-
mymotif[which.min(width(mymotif))]$motifPos <- NA
|
|
1331
|
-
}
|
|
1332
|
-
if (exists("mres")) {
|
|
1333
|
-
mres <- c(mres, mymotif)
|
|
1334
|
-
} else {
|
|
1335
|
-
mres <- mymotif
|
|
1336
|
-
}
|
|
1337
|
-
}
|
|
1338
|
-
result <- mres
|
|
1339
|
-
rm(mres)
|
|
1340
|
-
motif_ids <- motif_ids[result$providerName]
|
|
1341
|
-
presult <- result
|
|
1342
|
-
strand(presult) <- "*"
|
|
1343
|
-
pres_cols <- DataFrame(
|
|
1344
|
-
feature = ifelse(!is.na(result$motifPos),
|
|
1345
|
-
paste(result$geneSymbol, "motif", sep = "_"), ""
|
|
1346
|
-
),
|
|
1347
|
-
group = result$providerName,
|
|
1348
|
-
id = motif_ids
|
|
1349
|
-
)
|
|
1350
|
-
presult <- GRanges(
|
|
1351
|
-
seqnames = seqnames(result[1]),
|
|
1352
|
-
ranges = ranges(result)
|
|
1353
|
-
)
|
|
1354
|
-
mcols(presult) <- pres_cols
|
|
1355
|
-
|
|
1356
|
-
motifT <- AnnotationTrack(presult,
|
|
1357
|
-
fun = detailfun,
|
|
1358
|
-
detailsFunArgs = list(pwm_stack = pwms),
|
|
1359
|
-
name = names(result)[[1]],
|
|
1360
|
-
selectFun = selectingfun,
|
|
1361
|
-
reverseStacking = FALSE,
|
|
1362
|
-
stacking = "squish"
|
|
1363
|
-
)
|
|
1364
|
-
|
|
1365
|
-
if (exists("axisT")) {
|
|
1366
|
-
track_list <- list(ideoT, motifT, hiT, axisT)
|
|
1367
|
-
} else {
|
|
1368
|
-
track_list <- list(ideoT, motifT, hiT)
|
|
1369
|
-
}
|
|
1370
|
-
plotTracks(track_list,
|
|
1371
|
-
from = from, to = to, showBandId = TRUE,
|
|
1372
|
-
cex.main = 0.8, col.main = "darkgrey",
|
|
1373
|
-
add53 = TRUE, labelpos = "below", chromosome = chromosome, # groupAnnotation = "id",
|
|
1374
|
-
fontcolor.item = "black",
|
|
1375
|
-
collapse = FALSE, min.width = 1, featureAnnotation = "feature", cex.feature = 0.8,
|
|
1376
|
-
details.size = 0.85, detailsConnector.pch = NA, detailsConnector.lty = 0,
|
|
1377
|
-
shape = "box", cex.group = 0.8, fonts = c("sans", "Helvetica")
|
|
1378
|
-
)
|
|
1379
|
-
return(invisible(NULL))
|
|
1380
|
-
}
|
|
1381
|
-
|
|
1382
|
-
#' Export motifbreakR results to csv or tsv
|
|
1383
|
-
#'
|
|
1384
|
-
#' @param results The output of \code{motifbreakR}
|
|
1385
|
-
#' @param file Character; the file name of the destination file
|
|
1386
|
-
#' @param format Character; one of tsv (tab separated values) or csv (comma separated values)
|
|
1387
|
-
#' @return \code{exportMBresults} produces an output file containing the output
|
|
1388
|
-
#' of the motifbreakR function.
|
|
1389
|
-
#' @examples
|
|
1390
|
-
#' data(example.results)
|
|
1391
|
-
#' example.results
|
|
1392
|
-
#' \donttest{
|
|
1393
|
-
#' exportMBresults(example.results, file = "output.tsv", format = "tsv")
|
|
1394
|
-
#' }
|
|
1395
|
-
#' @export
|
|
1396
|
-
exportMBresults <- function(results, file, format = c("tsv")) {
|
|
1397
|
-
if (missing(file)) {
|
|
1398
|
-
stop("select output file location")
|
|
1399
|
-
}
|
|
1400
|
-
if (!(format %in% c("csv", "tsv"))) {
|
|
1401
|
-
stop("format must be one of csv or tsv")
|
|
1402
|
-
}
|
|
1403
|
-
sep <- switch(format,
|
|
1404
|
-
csv = ",",
|
|
1405
|
-
tsv = "\t"
|
|
1406
|
-
)
|
|
1407
|
-
names(results) <- NULL
|
|
1408
|
-
results <- as.data.frame(results)
|
|
1409
|
-
results <- results[, !colnames(results) %in% "width"]
|
|
1410
|
-
results$start <- results$start - 1
|
|
1411
|
-
results$motifPos <- vapply(results$motifPos, function(x) {
|
|
1412
|
-
paste0(x[1], ";", x[2])
|
|
1413
|
-
}, FUN.VALUE = character(1))
|
|
1414
|
-
if (format == "tsv") {
|
|
1415
|
-
write.table(x = results, file = file, quote = FALSE, sep = sep, row.names = FALSE, col.names = TRUE)
|
|
1416
|
-
} else {
|
|
1417
|
-
write.csv(x = results, file = file, row.names = FALSE, col.names = TRUE)
|
|
1418
|
-
}
|
|
1419
|
-
}
|
|
1420
|
-
|
|
1421
|
-
#' Find Corresponding TF Binding From The ReMap2022 Project
|
|
1422
|
-
#'
|
|
1423
|
-
#' @param remap_data Character; either the path to a local copy of the non-redundant
|
|
1424
|
-
#' peak set, or one of \code{hg38} for Homo sapiens, \code{mm10} for Mus musculus,
|
|
1425
|
-
#' \code{dm6} for Drosophila melanogaster, or \code{TAIR10} for Arabidopsis thaliana,
|
|
1426
|
-
#' to be queried (slowly) online.
|
|
1427
|
-
#' @details \code{snps.from.file} takes a character vector describing the file path
|
|
1428
|
-
#' to a bed file that contains the necissary information to generate the input for
|
|
1429
|
-
#' \code{motifbreakR} see \url{http://www.genome.ucsc.edu/FAQ/FAQformat.html#format1}
|
|
1430
|
-
#' for a complete description of the BED format. Our convention deviates in that there
|
|
1431
|
-
#' is a required format for the name field. \code{name} is defined as chromosome:start:REF:ALT
|
|
1432
|
-
#' or the rsid from dbSNP (if you've included the optional SNPlocs argument).
|
|
1433
|
-
#' For example if you were to include rs123 in it's alternate
|
|
1434
|
-
#' format it would be entered as chr7:24966446:C:A
|
|
1435
|
-
#' @return a GRanges object containing:
|
|
1436
|
-
#' \item{SNP_id}{The rsid of the snp with the "rs" portion stripped}
|
|
1437
|
-
#' \item{alleles_as_ambig}{THE IUPAC ambiguity code between the reference and
|
|
1438
|
-
#' alternate allele for this SNP}
|
|
1439
|
-
#' \item{REF}{The reference allele for the SNP}
|
|
1440
|
-
#' \item{ALT}{The alternate allele for the SNP}
|
|
1441
|
-
#' @examples
|
|
1442
|
-
#' library(BSgenome.Drerio.UCSC.danRer7)
|
|
1443
|
-
#'
|
|
1444
|
-
#' @importFrom rtracklayer import
|
|
1445
|
-
#' @importFrom Biostrings IUPAC_CODE_MAP uniqueLetters BStringSetList DNA_ALPHABET
|
|
1446
|
-
#' @importFrom VariantAnnotation readVcf ref alt isSNV VcfFile ScanVcfParam
|
|
1447
|
-
#' @importFrom SummarizedExperiment rowRanges
|
|
1448
|
-
#' @importFrom stringr str_sort str_split
|
|
1449
|
-
#' @export
|
|
1450
|
-
|
|
1451
|
-
findSupportingRemapPeaks <- function(results, genome) {
|
|
1452
|
-
genome <- match.arg(genome, c("hg38", "mm10", "dm6", "TAIR10_TF", "TAIR10_HISTONE"))
|
|
1453
|
-
|
|
1454
|
-
remap_links <- list(
|
|
1455
|
-
hg38 = "https://remap.univ-amu.fr/storage/remap2022/hg38/MACS2/remap2022_nr_macs2_hg38_v1_0.bed.gz",
|
|
1456
|
-
mm10 = "https://remap.univ-amu.fr/storage/remap2022/mm10/MACS2/remap2022_nr_macs2_mm10_v1_0.bed.gz",
|
|
1457
|
-
dm6 = "https://remap.univ-amu.fr/storage/remap2022/dm6/MACS2/remap2022_nr_macs2_dm6_v1_0.bed.gz",
|
|
1458
|
-
TAIR10_TF = "https://remap.univ-amu.fr/storage/remap2022/tair10/tf/MACS2/remap2022_nr_macs2_TAIR10_v1_0.bed.gz",
|
|
1459
|
-
TAIR10_HISTONE = "https://remap.univ-amu.fr/storage/remap2022/tair10/histones/MACS2/remap2022_histone_nr_macs2_TAIR10_v1_0.bed.gz"
|
|
1460
|
-
)
|
|
1461
|
-
|
|
1462
|
-
results$matchingBindingEvent <- NA
|
|
1463
|
-
results$matchingCelltype <- NA
|
|
1464
|
-
|
|
1465
|
-
uniqmb <- results
|
|
1466
|
-
mcols(uniqmb) <- NULL
|
|
1467
|
-
names(uniqmb) <- NULL
|
|
1468
|
-
uniqmb <- sort(unique(uniqmb))
|
|
1469
|
-
|
|
1470
|
-
remap_binding <- loadPeakFile(remap_links, genome)
|
|
1471
|
-
remap_binding <- subsetByOverlaps(remap_binding, uniqmb)
|
|
1472
|
-
|
|
1473
|
-
if (length(remap_binding) < 1) {
|
|
1474
|
-
return(results)
|
|
1475
|
-
}
|
|
1476
|
-
|
|
1477
|
-
mb_overlaps <- findOverlaps(results, remap_binding)
|
|
1478
|
-
variant_to_peak <- unique(data.frame(variant = names(results[queryHits(mb_overlaps), ]), tf.binding = remap_binding[subjectHits(mb_overlaps), ]$name))
|
|
1479
|
-
tf_ct <- stringr::str_split(variant_to_peak$tf.binding, ":")
|
|
1480
|
-
variant_to_peak$tf.binding <- vapply(tf_ct, FUN = `[`, character(1), 1)
|
|
1481
|
-
variant_to_peak$tf.celltype <- vapply(tf_ct, FUN = `[`, character(1), 2)
|
|
1482
|
-
variant_to_peak$tf.celltype <- lapply(variant_to_peak$tf.celltype, function(x) {
|
|
1483
|
-
stringr::str_split(x, ",", simplify = T)[1, ]
|
|
1484
|
-
})
|
|
1485
|
-
mcol_variant_to_gene <- data.frame(variant = names(results), tf.gene = results$geneSymbol, motif = results$providerId)
|
|
1486
|
-
|
|
1487
|
-
if ("manuallyCuratedGeneMotifAssociationTable" %in% slotNames(attributes(results)$motifs)) {
|
|
1488
|
-
mb_genelist <- attributes(results)$motifs@manuallyCuratedGeneMotifAssociationTable
|
|
1489
|
-
mb_genelist <- unique(mb_genelist[mb_genelist$motif %in% results$providerId, c("motif", "tf.gene")])
|
|
1490
|
-
mb_genelist_aug <- unique(data.frame(motif = results$providerId, tf.gene = results$geneSymbol))
|
|
1491
|
-
mb_genelist <- unique(rbind(mb_genelist, mb_genelist_aug))
|
|
1492
|
-
variant_to_gene <- data.frame(variant = names(results), motif = results$providerId)
|
|
1493
|
-
variant_to_gene_master <- merge(variant_to_gene, mb_genelist, all.x = TRUE, sort = FALSE)
|
|
1494
|
-
variant_to_gene <- unique(variant_to_gene_master[, c("variant", "tf.gene")])
|
|
1495
|
-
} else {
|
|
1496
|
-
variant_to_gene_master <- mcol_variant_to_gene
|
|
1497
|
-
variant_to_gene <- unique(variant_to_gene_master[, c("variant", "tf.gene")])
|
|
1498
|
-
}
|
|
1499
|
-
|
|
1500
|
-
variant_mb_peak <- merge(variant_to_gene_master, variant_to_peak, all.x = TRUE, sort = FALSE)
|
|
1501
|
-
variant_mb_peak <- unique(variant_mb_peak[variant_mb_peak$tf.gene == variant_mb_peak$tf.binding, ])
|
|
1502
|
-
variant_mb_peak <- variant_mb_peak[!is.na(variant_mb_peak$variant), c("variant", "tf.binding", "tf.celltype", "motif")]
|
|
1503
|
-
|
|
1504
|
-
split_vmbp <- split(variant_mb_peak, as.factor(variant_mb_peak$variant))
|
|
1505
|
-
|
|
1506
|
-
for (variant in seq_along(split_vmbp)) {
|
|
1507
|
-
pvariant <- names(split_vmbp[variant])
|
|
1508
|
-
vgm <- split_vmbp[[variant]]
|
|
1509
|
-
mbv_sel <- which(mcol_variant_to_gene$variant == pvariant)
|
|
1510
|
-
vgm$motif <- factor(vgm$motif, levels = mcol_variant_to_gene[mbv_sel, "motif"])
|
|
1511
|
-
names(vgm$tf.celltype) <- vgm$tf.binding
|
|
1512
|
-
vgmm <- vgm[which(vgm$motif %in% mcol_variant_to_gene$motif), ]
|
|
1513
|
-
vgmm <- split(vgmm, vgmm$motif)
|
|
1514
|
-
vgmg <- vgm[which(vgm$tf.gene %in% mcol_variant_to_gene$tf.gene), ]
|
|
1515
|
-
vgmg <- split(vgmg, vgmg$motif)
|
|
1516
|
-
vgm <- mapply(rbind, vgmm, vgmg, SIMPLIFY = F)
|
|
1517
|
-
|
|
1518
|
-
results[mbv_sel]$matchingBindingEvent <- lapply(vgm, function(x) {
|
|
1519
|
-
unique(x$tf.binding)
|
|
1520
|
-
})[results[mbv_sel]$providerId]
|
|
1521
|
-
results[mbv_sel]$matchingCelltype <- lapply(vgm, function(x) {
|
|
1522
|
-
x$tf.celltype
|
|
1523
|
-
})[results[mbv_sel]$providerId]
|
|
1524
|
-
if (any(lengths(results[mbv_sel]$matchingBindingEvent) == 0)) {
|
|
1525
|
-
results[mbv_sel][lengths(results[mbv_sel]$matchingBindingEvent) == 0]$matchingBindingEvent <- NA
|
|
1526
|
-
results[mbv_sel][lengths(results[mbv_sel]$matchingCelltype) == 0]$matchingCelltype <- NA
|
|
1527
|
-
}
|
|
1528
|
-
}
|
|
1529
|
-
attributes(results)$peaks <- remap_binding
|
|
1530
|
-
return(results)
|
|
1531
|
-
}
|
|
1532
|
-
|
|
1533
|
-
#' @importFrom tools R_user_dir
|
|
1534
|
-
#' @importFrom BiocFileCache BiocFileCache
|
|
1535
|
-
.get_cache <- function() {
|
|
1536
|
-
cache <- R_user_dir("motifbreakR", which = "cache")
|
|
1537
|
-
BiocFileCache(cache = cache, ask = F)
|
|
1538
|
-
}
|
|
1539
|
-
|
|
1540
|
-
cachePeakFile <- function(fileURL, genome) {
|
|
1541
|
-
bfc <- .get_cache()
|
|
1542
|
-
rname <- paste("remap2022", genome, sep = "_")
|
|
1543
|
-
rid <- bfcquery(bfc, genome, "rname")$rid
|
|
1544
|
-
if (!length(rid)) {
|
|
1545
|
-
rid <- names(bfcadd(bfc, rname, fileURL, ext = ".Rdata", download = FALSE))
|
|
1546
|
-
}
|
|
1547
|
-
if (!isFALSE(bfcneedsupdate(bfc, rid))) {
|
|
1548
|
-
message("downloading peak file")
|
|
1549
|
-
bfcdownload(bfc, rid, ask = FALSE, FUN = convertPeakFile)
|
|
1550
|
-
message("peak download complete")
|
|
1551
|
-
}
|
|
1552
|
-
bfcrpath(bfc, rids = rid)
|
|
1553
|
-
}
|
|
1554
|
-
|
|
1555
|
-
#' @importFrom vroom vroom
|
|
1556
|
-
loadPeakFile2 <- function(url_list, genome) {
|
|
1557
|
-
peak_path <- cachePeakFile(url_list[[genome]], genome)
|
|
1558
|
-
remap_peaks <- GRanges(vroom(peak_path,
|
|
1559
|
-
delim = "\t",
|
|
1560
|
-
col_names = c(
|
|
1561
|
-
"chr", "start", "end", "name",
|
|
1562
|
-
"score", "strand", "tstart",
|
|
1563
|
-
"tend", "color"
|
|
1564
|
-
),
|
|
1565
|
-
col_types = c("ciiciciic"),
|
|
1566
|
-
col_select = c(chr, start, end, name),
|
|
1567
|
-
progress = FALSE
|
|
1568
|
-
))
|
|
1569
|
-
return(remap_peaks)
|
|
1570
|
-
}
|
|
1571
|
-
|
|
1572
|
-
loadPeakFile <- function(url_list, genome) {
|
|
1573
|
-
peak_path <- cachePeakFile(url_list[[genome]], genome)
|
|
1574
|
-
remap_peaks <- readRDS(peak_path)
|
|
1575
|
-
return(remap_peaks)
|
|
1576
|
-
}
|
|
1577
|
-
|
|
1578
|
-
convertPeakFile <- function(from, to) {
|
|
1579
|
-
message("processing peak file")
|
|
1580
|
-
remap_peaks <- GRanges(vroom(from,
|
|
1581
|
-
delim = "\t",
|
|
1582
|
-
col_names = c(
|
|
1583
|
-
"chr", "start", "end", "name",
|
|
1584
|
-
"score", "strand", "tstart",
|
|
1585
|
-
"tend", "color"
|
|
1586
|
-
),
|
|
1587
|
-
col_types = c("ciiciciic"),
|
|
1588
|
-
col_select = c(chr, start, end, name),
|
|
1589
|
-
progress = FALSE
|
|
1590
|
-
))
|
|
1591
|
-
saveRDS(remap_peaks, file = to)
|
|
1592
|
-
message("processing peak file")
|
|
1593
|
-
TRUE
|
|
1594
|
-
}
|