biopipen 0.31.5__py3-none-any.whl → 0.31.7__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of biopipen might be problematic. Click here for more details.

@@ -1,1594 +0,0 @@
1
- # Patch motifBreakR to avoid getSeq using the bsgenome but a reference fasta file instead
2
- # This keeps consistent with other non-R processes
3
- library(stringr)
4
- library(matrixStats)
5
- library(TFMPvalue)
6
- library(motifStack)
7
- library(Biostrings)
8
- library(GenomicRanges)
9
- library(MotifDb)
10
-
11
- snpInfo2snpList <- function(snpinfo) {
12
- snpinfo$start <- snpinfo$start + 1
13
- snpgr <- makeGRangesFromDataFrame(snpinfo)
14
- snpgr_meta <- snpinfo[, c("name", "score", "ref", "alt", "ref_seq", "alt_seq")]
15
- colnames(snpgr_meta) <- c("SNP_id", "score", "REF", "ALT", "ref_seq", "alt_seq")
16
- snpgr_meta$ref_seq <- substring(snpgr_meta$ref_seq, 2)
17
- snpgr_meta$alt_seq <- substring(snpgr_meta$alt_seq, 2)
18
- elementMetadata(snpgr) <- snpgr_meta
19
- snpgr
20
- }
21
-
22
- revcom <- function(ltr) {
23
- rclist <- list(A = "T", C = "G", G = "C", T = "A")
24
- rclist[[ltr]]
25
- }
26
-
27
- prepareVariants <- function(fsnplist) {
28
- # The sequences are already fetched in fsnplist at $ref_seq and $alt_seq
29
- ref_len <- nchar(fsnplist$REF)
30
- alt_len <- nchar(fsnplist$ALT)
31
- is.indel <- ref_len > 1L | alt_len > 1L
32
- if (sum(is.indel) < length(is.indel) & sum(is.indel) > 0L) {
33
- fsnplist.indel <- fsnplist[is.indel]
34
- fsnplist.snv <- fsnplist[!is.indel]
35
- } else if (sum(is.indel) == length(is.indel)) {
36
- fsnplist.indel <- fsnplist
37
- fsnplist.snv <- NULL
38
- } else if (sum(is.indel) == 0L) {
39
- fsnplist.indel <- NULL
40
- fsnplist.snv <- fsnplist
41
- }
42
- if (!is.null(fsnplist.indel)) {
43
- snp.sequence.ref.indel <- fsnplist$ref_seq
44
- snp.sequence.alt.indel <- fsnplist$alt_seq
45
-
46
- need.alignment <- !(lengths(fsnplist.indel$REF) == 1 | lengths(fsnplist.indel$ALT) == 1)
47
- insertion.var <- lengths(fsnplist.indel$REF) < lengths(fsnplist.indel$ALT)
48
- fsnplist.indel$ALT_loc <- 1L
49
- fsnplist.indel[insertion.var]$ALT_loc <- Map(seq,
50
- from = nchar(fsnplist.indel[insertion.var]$REF) + 1L,
51
- to = nchar(fsnplist.indel[insertion.var]$ALT)
52
- )
53
- fsnplist.indel[!insertion.var]$ALT_loc <- Map(seq,
54
- from = nchar(fsnplist.indel[!insertion.var]$ALT) + 1L,
55
- to = nchar(fsnplist.indel[!insertion.var]$REF)
56
- )
57
- ref.len <- nchar(fsnplist.indel[nchar(fsnplist.indel$REF) == nchar(fsnplist.indel$ALT)]$REF)
58
- fsnplist.indel[nchar(fsnplist.indel$REF) == nchar(fsnplist.indel$ALT)]$ALT_loc <- lapply(
59
- ref.len,
60
- function(x) {seq(from = 1L, to = x)}
61
- )
62
- fsnplist.indel$varType <- "Other"
63
- if (sum(insertion.var) > 0) {
64
- fsnplist.indel[insertion.var, ]$varType <- "Insertion"
65
- }
66
- if (sum(lengths(fsnplist.indel$REF) > lengths(fsnplist.indel$ALT)) > 0) {
67
- fsnplist.indel[lengths(fsnplist.indel$REF) > lengths(fsnplist.indel$ALT)]$varType <- "Deletion"
68
- }
69
- if (any(need.alignment)) {
70
- need.del <- any(!insertion.var & need.alignment)
71
- need.ins <- any(insertion.var & need.alignment)
72
- if (need.del) {
73
- pattern.del <- Map(matchPattern,
74
- fsnplist.indel[!insertion.var & need.alignment]$ALT,
75
- fsnplist.indel[!insertion.var & need.alignment]$REF,
76
- with.indels = FALSE, max.mismatch = 0
77
- )
78
- names(pattern.del) <- names(fsnplist.indel[!insertion.var & need.alignment])
79
- pattern.del <- sapply(pattern.del, function(x) {
80
- slen <- length(subject(x))
81
- x <- x[start(x) == 1 | end(x) == slen]
82
- if (length(x) > 0) {
83
- x <- x[1]
84
- x <- gaps(x)
85
- x <- as(x, "IRanges")
86
- }
87
- })
88
- pattern.del.valid <- sapply(pattern.del, function(x) {
89
- length(x) > 0
90
- })
91
- nr.pattern.del <- lapply(pattern.del[pattern.del.valid], function(x) {
92
- start(x):end(x)
93
- })
94
- fsnplist.indel[!insertion.var & need.alignment][names(pattern.del[pattern.del.valid])]$ALT_loc <- nr.pattern.del
95
- if (any((!insertion.var & need.alignment)[!pattern.del.valid])) {
96
- alignment.del <- Map(pairwiseAlignment,
97
- fsnplist.indel[!insertion.var & need.alignment][!pattern.del.valid]$ALT,
98
- fsnplist.indel[!insertion.var & need.alignment][!pattern.del.valid]$REF,
99
- type = "global"
100
- )
101
- alt.width <- width(fsnplist.indel[!insertion.var & need.alignment][!pattern.del.valid]$ALT)
102
- ref.width <- width(fsnplist.indel[!insertion.var & need.alignment][!pattern.del.valid]$REF)
103
- names(alignment.del) <- names(fsnplist.indel[!insertion.var & need.alignment][!pattern.del.valid])
104
- alignment.ins <- sapply(sapply(alignment.del, insertion), unlist)
105
- alignment.del <- sapply(sapply(alignment.del, deletion), unlist)
106
- alignment.del.valid <- sapply(alignment.del, function(x) {
107
- length(x) > 0
108
- })
109
- alignment.ins.valid <- sapply(alignment.ins, function(x) {
110
- length(x) > 0
111
- })
112
- full.replace <- (alignment.del.valid & alignment.ins.valid & alt.width == ref.width)
113
- alignment.del.valid <- alignment.del.valid & !full.replace
114
- nr.alignment.del <- lapply(alignment.del[alignment.del.valid], function(x) {
115
- start(x):end(x)
116
- })
117
- fsnplist.indel[!insertion.var & need.alignment][names(alignment.del[alignment.del.valid])]$ALT_loc <- nr.alignment.del
118
- rm(alignment.del, nr.alignment.del)
119
- }
120
- rm(pattern.del, nr.pattern.del, pattern.del.valid, need.del)
121
- }
122
- if (need.ins) {
123
- pattern.ins <- Map(matchPattern,
124
- fsnplist.indel[insertion.var & need.alignment]$REF,
125
- fsnplist.indel[insertion.var & need.alignment]$ALT,
126
- with.indels = FALSE, max.mismatch = 0
127
- )
128
- names(pattern.ins) <- names(fsnplist.indel[insertion.var & need.alignment])
129
- pattern.ins <- sapply(pattern.ins, function(x) {
130
- slen <- length(subject(x))
131
- x <- x[start(x) == 1 | end(x) == slen]
132
- if (length(x) > 0) {
133
- x <- x[1]
134
- x <- gaps(x)
135
- x <- as(x, "IRanges")
136
- }
137
- })
138
- pattern.ins.valid <- sapply(pattern.ins, function(x) {
139
- length(x) > 0
140
- })
141
- nr.pattern.ins <- lapply(pattern.ins[pattern.ins.valid], function(x) {
142
- start(x):end(x)
143
- })
144
- fsnplist.indel[insertion.var & need.alignment][names(pattern.ins[pattern.ins.valid])]$ALT_loc <- nr.pattern.ins
145
- if (any((insertion.var & need.alignment)[!pattern.ins.valid])) {
146
- alignment.ins <- Map(pairwiseAlignment,
147
- fsnplist.indel[insertion.var & need.alignment][!pattern.ins.valid]$ALT,
148
- fsnplist.indel[insertion.var & need.alignment][!pattern.ins.valid]$REF,
149
- type = "global"
150
- )
151
- names(alignment.ins) <- names(fsnplist.indel[insertion.var & need.alignment][!pattern.ins.valid])
152
- alignment.ins <- sapply(sapply(alignment.ins, insertion), unlist)
153
- alignment.ins.valid <- sapply(alignment.ins, function(x) {
154
- length(x) > 0
155
- })
156
- nr.alignment.ins <- lapply(alignment.ins[alignment.ins.valid], function(x) {
157
- start(x):end(x)
158
- })
159
- fsnplist.indel[insertion.var & need.alignment][names(alignment.ins[alignment.ins.valid])]$ALT_loc <- nr.alignment.ins
160
- rm(alignment.ins, nr.alignment.ins)
161
- }
162
- rm(pattern.ins, nr.pattern.ins, pattern.ins.valid, need.ins)
163
- }
164
- rm(ref.len, insertion.var, need.alignment)
165
- }
166
- }
167
- if (!is.null(fsnplist.snv)) {
168
- snp.sequence.ref.snv <- fsnplist$ref_seq
169
- snp.sequence.alt.snv <- fsnplist$alt_seq
170
- fsnplist.snv$ALT_loc <- 1L
171
- fsnplist.snv$varType <- "SNV"
172
- }
173
- if (sum(is.indel) < length(is.indel) & sum(is.indel) > 0L) {
174
- fsnplist <- c(fsnplist.indel, fsnplist.snv)
175
- snp.sequence.alt <- strsplit(as.character(c(
176
- snp.sequence.alt.indel,
177
- snp.sequence.alt.snv
178
- )), "")
179
- snp.sequence.ref <- strsplit(as.character(c(
180
- snp.sequence.ref.indel,
181
- snp.sequence.ref.snv
182
- )), "")
183
- rm(
184
- fsnplist.indel, fsnplist.snv,
185
- snp.sequence.alt.indel, snp.sequence.ref.indel,
186
- snp.sequence.alt.snv, snp.sequence.ref.snv
187
- )
188
- } else if (sum(is.indel) == length(is.indel)) {
189
- fsnplist <- fsnplist.indel
190
- snp.sequence.alt <- strsplit(as.character(snp.sequence.alt.indel), "")
191
- snp.sequence.ref <- strsplit(as.character(snp.sequence.ref.indel), "")
192
- rm(fsnplist.indel, snp.sequence.alt.indel, snp.sequence.ref.indel)
193
- } else if (sum(is.indel) == 0L) {
194
- fsnplist <- fsnplist.snv
195
- snp.sequence.alt <- strsplit(as.character(snp.sequence.alt.snv), "")
196
- snp.sequence.ref <- strsplit(as.character(snp.sequence.ref.snv), "")
197
- rm(fsnplist.snv, snp.sequence.alt.snv, snp.sequence.ref.snv)
198
- }
199
- gc()
200
- return(list(
201
- fsnplist = fsnplist,
202
- ref_seq = snp.sequence.ref,
203
- alt_seq = snp.sequence.alt
204
- ))
205
- }
206
-
207
- ## An evaluator function for SNP effect
208
-
209
- varEff <- function(allelR, allelA) {
210
- score <- allelA - allelR
211
- if (abs(score) >= 0.7) {
212
- return(list(score = score, effect = "strong"))
213
- } else if (abs(score) > 0.4) {
214
- return(list(score = score, effect = "weak"))
215
- } else {
216
- return(list(score = score, effect = "neut"))
217
- }
218
- }
219
-
220
- scoreIndel <- function(pwm,
221
- ref.seq, alt.seq,
222
- hit.ref, hit.alt) {
223
- ref.windows <- scoreSeqWindows(ppm = pwm, seq = ref.seq)
224
- alt.windows <- scoreSeqWindows(ppm = pwm, seq = alt.seq)
225
- score <- alt.windows[hit.alt$strand, hit.alt$window] - ref.windows[hit.ref$strand, hit.ref$window]
226
- if (abs(score) >= 0.7) {
227
- return(list(score = score, effect = "strong"))
228
- } else if (abs(score) > 0.4) {
229
- return(list(score = score, effect = "weak"))
230
- } else {
231
- return(list(score = score, effect = "neut"))
232
- }
233
- }
234
-
235
- reverseComplementMotif <- function(pwm) {
236
- rows <- rownames(pwm)
237
- cols <- colnames(pwm)
238
- Ns <- pwm["N", ]
239
- pwm <- pwm[4:1, length(cols):1]
240
- pwm <- rbind(pwm, Ns)
241
- rownames(pwm) <- rows
242
- colnames(pwm) <- cols
243
- return(pwm)
244
- }
245
-
246
-
247
- scoreSeqWindows <- function(ppm, seq) {
248
- ppm.width <- ncol(ppm)
249
- seq.len <- length(seq)
250
- diag.ind <- rep.int(ppm.width, seq.len - ppm.width)
251
- ranges <- vapply(
252
- c(0L, cumsum(diag.ind)),
253
- function(x,
254
- range = (1L + 0L:(ppm.width - 1L) * (ppm.width + 1L))) {
255
- x + range
256
- },
257
- integer(ppm.width)
258
- )
259
- scores <- t(ppm[seq, ])[ranges]
260
- scores_rc <- t(reverseComplementMotif(ppm)[seq, ])[ranges]
261
- scores <- split(scores, ceiling(seq_along(scores) / ppm.width))
262
- scores_rc <- split(scores_rc, ceiling(seq_along(scores_rc) / ppm.width))
263
- res <- vapply(
264
- Map(
265
- function(x, y) {
266
- matrix(
267
- data = c(x, y), nrow = 2,
268
- byrow = TRUE, dimnames = list(c(1, 2))
269
- )
270
- },
271
- scores, scores_rc
272
- ),
273
- rowSums,
274
- numeric(2)
275
- )
276
- return(res)
277
- }
278
-
279
- maxThresholdWindows <- function(window.frame) {
280
- start.ind <- as.integer(colnames(window.frame)[1]) - 1L
281
- max.win <- arrayInd(which.max(window.frame), dim(window.frame))
282
- return(list(
283
- window = as.integer(colnames(window.frame)[max.win[, 2] + start.ind]),
284
- strand = c(1, 2)[max.win[, 1]]
285
- ))
286
- }
287
-
288
-
289
- #' @import methods
290
- #' @import GenomicRanges
291
- #' @import S4Vectors
292
- #' @import BiocGenerics
293
- #' @import IRanges
294
- #' @importFrom Biostrings getSeq replaceLetterAt reverseComplement complement replaceAt pairwiseAlignment insertion deletion matchPattern
295
- #' @importFrom TFMPvalue TFMpv2sc
296
- #' @importFrom stringr str_locate_all str_sub
297
- scoreSnpList <- function(fsnplist, pwmList, method = "default", bkg = NULL,
298
- threshold = 1e-3, show.neutral = FALSE, verbose = FALSE,
299
- pwmList.pc = NULL, pwmRanges = NULL, filterp = TRUE) {
300
- k <- max(sapply(pwmList, ncol))
301
- snp.sequence.alt <- fsnplist$alt_seq
302
- snp.sequence.ref <- fsnplist$ref_seq
303
- fsnplist <- fsnplist$fsnplist
304
-
305
- res.el.e <- new.env()
306
- for (snp.map.i in seq_along(snp.sequence.alt)) {
307
- snp.ref <- snp.sequence.ref[[snp.map.i]]
308
- snp.alt <- snp.sequence.alt[[snp.map.i]]
309
- ref.len <- nchar(fsnplist[snp.map.i]$REF)
310
- alt.len <- nchar(fsnplist[snp.map.i]$ALT)
311
- alt.loc <- fsnplist[snp.map.i]$ALT_loc[[1]]
312
- res.el <- rep(fsnplist[snp.map.i], length(pwmList))
313
- res.el$motifPos <- as.integer(NA)
314
- res.el$motifID <- mcols(pwmList)$providerID
315
- res.el$geneSymbol <- mcols(pwmList)$geneSymbol
316
- res.el$dataSource <- mcols(pwmList)$dataSource
317
- res.el$providerName <- mcols(pwmList)$providerName
318
- res.el$providerId <- mcols(pwmList)$providerId
319
- res.el$seqMatch <- as.character(NA)
320
- res.el$pctRef <- as.numeric(NA)
321
- res.el$pctAlt <- as.numeric(NA)
322
- res.el$scoreRef <- as.numeric(NA)
323
- res.el$scoreAlt <- as.numeric(NA)
324
- if (filterp) {
325
- res.el$Refpvalue <- as.numeric(NA)
326
- res.el$Altpvalue <- as.numeric(NA)
327
- }
328
- if (ref.len > 1 | alt.len > 1) {
329
- res.el$altPos <- as.numeric(NA)
330
- res.el$alleleDiff <- as.numeric(NA)
331
- res.el$alleleEffectSize <- as.numeric(NA)
332
- } else {
333
- res.el$snpPos <- as.integer(NA)
334
- res.el$alleleRef <- as.numeric(NA)
335
- res.el$alleleAlt <- as.numeric(NA)
336
- }
337
- res.el$effect <- as.character(NA)
338
- for (pwm.i in seq_along(pwmList)) {
339
- pwm.basic <- pwmList[[pwm.i]]
340
- pwm <- pwmList.pc[[pwm.i]]
341
- len <- ncol(pwm)
342
- thresh <- threshold[[pwm.i]]
343
- seq.start <- min(alt.loc)
344
- seq.len <- length(alt.loc)
345
- alt.range <- ref.range <- (k - (ncol(pwm) - seq.start)):(k + ncol(pwm) + seq.start + seq.len - 2)
346
- if (!show.neutral & identical(snp.ref[ref.range], snp.alt[alt.range])) next()
347
- seq.remove <- ref.len - alt.len
348
- if (seq.remove < 0) {
349
- ref.range <- ref.range[1:(length(ref.range) + seq.remove)]
350
- } else {
351
- alt.range <- alt.range[1:(length(alt.range) - seq.remove)]
352
- }
353
- ref.windows <- scoreSeqWindows(ppm = pwm, seq = snp.ref[ref.range])
354
- alt.windows <- scoreSeqWindows(ppm = pwm, seq = snp.alt[alt.range])
355
- pass.effect <- ifelse(filterp,
356
- any(alt.windows > thresh) | any(ref.windows > thresh),
357
- any(((alt.windows - pwmRanges[[pwm.i]][1]) / (pwmRanges[[pwm.i]][2] - pwmRanges[[pwm.i]][1]) > thresh)) |
358
- any((ref.windows - pwmRanges[[pwm.i]][1]) / (pwmRanges[[pwm.i]][2] - pwmRanges[[pwm.i]][1]) > thresh)
359
- )
360
- if (pass.effect) {
361
- hit.alt <- maxThresholdWindows(alt.windows)
362
- hit.ref <- maxThresholdWindows(ref.windows)
363
- bigger <- ref.windows[hit.ref$strand, hit.ref$window] >= alt.windows[hit.alt$strand, hit.alt$window]
364
- if (bigger) {
365
- hit <- hit.ref
366
- } else {
367
- hit <- hit.alt
368
- }
369
- } else {
370
- hit.alt <- list(window = 0L, strand = 0L)
371
- hit.ref <- list(window = 0L, strand = 0L)
372
- hit <- NULL
373
- }
374
- if (!show.neutral) {
375
- if (identical(
376
- alt.windows[hit.alt$strand, hit.alt$window],
377
- ref.windows[hit.ref$strand, hit.ref$window]
378
- )) {
379
- next()
380
- }
381
- }
382
- if (!is.null(hit)) {
383
- result <- res.el[pwm.i]
384
- uniquename <- paste(names(result), result$dataSource, result$providerName, result$providerId, sep = "%%")
385
- if (nchar(result$REF) > 1 | nchar(result$ALT) > 1) {
386
- allelR <- ref.windows[hit.ref$strand, hit.ref$window]
387
- allelA <- alt.windows[hit.alt$strand, hit.alt$window]
388
- scorediff <- varEff(allelR, allelA)
389
- effect <- scorediff$effect
390
- score <- scorediff$score
391
- # ref.pos <- ref.range[(ncol(pwm) - (seq.start - 1)):((ncol(pwm) + ref.len - seq.start))]
392
- ref.pos <- k:(k + nchar(result$REF) - 1L)
393
- # alt.pos <- alt.range[(ncol(pwm) - (seq.start - 1)):((ncol(pwm) + alt.len - seq.start))]
394
- alt.pos <- k:(k + nchar(result$ALT) - 1L)
395
- if ((effect == "neut" & show.neutral) | effect != "neut") {
396
- res.el.e[[uniquename]] <- updateResultsIndel(result,
397
- snp.ref, snp.alt,
398
- ref.pos, alt.pos,
399
- hit.ref, hit.alt,
400
- ref.windows, alt.windows,
401
- score, effect, len,
402
- k, pwm,
403
- calcp = filterp
404
- )
405
- }
406
- } else {
407
- snp.pos <- k:(k + nchar(result$REF) - 1L)
408
- allelR <- ref.windows[hit.ref$strand, hit.ref$window]
409
- allelA <- alt.windows[hit.alt$strand, hit.alt$window]
410
- scorediff <- varEff(allelR, allelA)
411
- effect <- scorediff$effect
412
- score <- scorediff$score
413
- if ((effect == "neut" & show.neutral) | effect != "neut") {
414
- res.el.e[[uniquename]] <- updateResultsIndel(result,
415
- snp.ref, snp.alt,
416
- snp.pos, snp.pos,
417
- hit.ref, hit.alt,
418
- ref.windows, alt.windows,
419
- score, effect, len,
420
- k, pwm,
421
- calcp = filterp
422
- )
423
- }
424
- }
425
- }
426
- }
427
- }
428
- resultSet <- unlist(GRangesList(as.list.environment(res.el.e)), use.names = FALSE)
429
- if (length(resultSet) < 1) {
430
- if (verbose) {
431
- message(paste(
432
- "reached end of SNPs list length =", length(fsnplist),
433
- "with 0 potentially disruptive matches to", length(unique(resultSet$geneSymbol)),
434
- "of", length(pwmList), "motifs."
435
- ))
436
- }
437
- return(NULL)
438
- } else {
439
- if ("ALT_loc" %in% names(mcols(resultSet))) mcols(resultSet)$ALT_loc <- NULL
440
- max.match <- max(vapply(str_locate_all(resultSet$seqMatch, "\\w"), max, integer(1)))
441
- min.match <- min(vapply(str_locate_all(resultSet$seqMatch, "\\w"), min, integer(1)))
442
- resultSet$seqMatch <- str_sub(resultSet$seqMatch,
443
- start = min.match + 1,
444
- end = max.match + 1
445
- )
446
- if (verbose) {
447
- message(paste(
448
- "reached end of SNPs list length =", length(fsnplist),
449
- "with", length(resultSet), "potentially disruptive matches to", length(unique(resultSet$geneSymbol)),
450
- "of", length(pwmList), "motifs."
451
- ))
452
- }
453
- return(resultSet)
454
- }
455
- }
456
-
457
- #' @importFrom matrixStats colRanges
458
- #' @importFrom stringr str_pad
459
- #' @importFrom TFMPvalue TFMsc2pv
460
- updateResultsSnv <- function(result, snp.seq, snp.pos, hit, ref.windows, alt.windows,
461
- allelR, allelA, effect, len, k, pwm, calcp) {
462
- strand.opt <- c("+", "-")
463
- strand(result) <- strand.opt[[hit$strand]]
464
- hit$window <- as.integer(hit$window)
465
- mresult <- mcols(result)
466
- mresult[["snpPos"]] <- start(result)
467
- mresult[["motifPos"]] <- as.integer(snp.pos)
468
- matchs <- snp.seq
469
- seq.pos <- snp.pos + hit$window - 1
470
- matchs[-(seq.pos)] <- tolower(matchs[-(seq.pos)])
471
- matchs <- paste(matchs, collapse = "")
472
- mresult[["seqMatch"]] <- str_pad(matchs, width = k * 2, side = "both")
473
- start(result) <- start(result) - snp.pos + 1
474
- end(result) <- end(result) - snp.pos + len
475
- if (calcp) {
476
- mresult[["scoreRef"]] <- ref.windows[hit$strand, hit$window]
477
- mresult[["scoreAlt"]] <- alt.windows[hit$strand, hit$window]
478
- mresult[["Refpvalue"]] <- NA
479
- mresult[["Altpvalue"]] <- NA
480
- pwmrange <- colSums(colRanges(pwm))
481
- mresult[["pctRef"]] <- (mresult[["scoreRef"]] - pwmrange[[1]]) / (pwmrange[[2]] - pwmrange[[1]])
482
- mresult[["pctAlt"]] <- (mresult[["scoreAlt"]] - pwmrange[[1]]) / (pwmrange[[2]] - pwmrange[[1]])
483
- } else {
484
- mresult[["pctRef"]] <- ref.windows[hit$strand, hit$window]
485
- mresult[["pctAlt"]] <- alt.windows[hit$strand, hit$window]
486
- }
487
- mresult[["alleleRef"]] <- allelR
488
- mresult[["alleleAlt"]] <- allelA
489
- mresult[["effect"]] <- effect
490
- mcols(result) <- mresult
491
- return(result)
492
- }
493
-
494
-
495
- updateResultsIndel <- function(result,
496
- ref.seq, alt.seq,
497
- ref.pos, alt.pos,
498
- hit.ref, hit.alt,
499
- ref.windows, alt.windows,
500
- score, effect, len, k, pwm, calcp) {
501
- strand.opt <- c("+", "-")
502
- if (score > 0L) {
503
- best.hit <- hit.alt
504
- matchs <- alt.seq
505
- snp.pos <- alt.pos
506
- } else {
507
- best.hit <- hit.ref
508
- matchs <- ref.seq
509
- snp.pos <- ref.pos
510
- }
511
-
512
- strand(result) <- strand.opt[[best.hit$strand]]
513
- best.hit$window <- as.integer(best.hit$window)
514
- mresult <- mcols(result)
515
- alt_loc <- range(mresult$ALT_loc)
516
- ref_start <- (1 - alt_loc[[1]])
517
- ref_start <- ifelse(ref_start <= 0, ref_start - 1, ref_start)
518
- motif.start <- (alt_loc[[1]]) + (-len) + (best.hit$window) + ref_start
519
- motif.start <- ifelse(motif.start >= 0, motif.start + 1, motif.start)
520
- if ((mresult$varType == "Insertion" & score < 0) |
521
- (mresult$varType == "Deletion" & score > 0)) {
522
- motif.end <- motif.start + len
523
- } else {
524
- if (motif.start > 0) {
525
- motif.end <- len - length(motif.start:length(alt_loc[1]:alt_loc[2]))
526
- } else {
527
- motif.end <- motif.start + len - length(alt_loc[1]:alt_loc[2])
528
- }
529
- }
530
- motif.end <- ifelse(motif.end <= 0, motif.end - 1, motif.end)
531
- mresult$motifPos <- list(c(motif.start, motif.end))
532
- mresult$altPos <- mresult$ALT_loc
533
- seq.range <- (k - (len - alt_loc[[1]])):(k + len + alt_loc[[2]] - 2)
534
- matchs[-(snp.pos)] <- tolower(matchs[-(snp.pos)])
535
- matchs <- paste(matchs[seq.range], collapse = "")
536
- mresult[["seqMatch"]] <- str_pad(matchs, width = (k * 2) + alt_loc[[2]], side = "both")
537
- pwmrange <- colSums(colRanges(pwm[-5, ]))
538
- mresult[["scoreRef"]] <- ref.windows[hit.ref$strand, hit.ref$window]
539
- mresult[["scoreAlt"]] <- alt.windows[hit.alt$strand, hit.alt$window]
540
- mresult[["pctRef"]] <- (mresult[["scoreRef"]] - pwmrange[[1]]) / (pwmrange[[2]] - pwmrange[[1]])
541
- mresult[["pctAlt"]] <- (mresult[["scoreAlt"]] - pwmrange[[1]]) / (pwmrange[[2]] - pwmrange[[1]])
542
- if (calcp) {
543
- mresult[["Refpvalue"]] <- NA
544
- mresult[["Altpvalue"]] <- NA
545
- }
546
- mresult[["alleleDiff"]] <- score
547
- mresult[["effect"]] <- effect
548
- mresult[["alleleEffectSize"]] <- score / pwmrange[[2]]
549
- mcols(result) <- mresult
550
- return(result)
551
- }
552
-
553
- #' @importFrom matrixStats colMaxs colMins
554
- preparePWM <- function(pwmList,
555
- filterp,
556
- bkg,
557
- scoreThresh,
558
- method = "default") {
559
- bkg <- bkg[c("A", "C", "G", "T")]
560
-
561
- scounts <- as.integer(mcols(pwmList)$sequenceCount)
562
- scounts[is.na(scounts)] <- 20L
563
- pwmList.pc <- Map(function(pwm, scount) {
564
- pwm <- (pwm * scount + 0.25) / (scount + 1)
565
- }, pwmList, scounts)
566
- if (method == "ic") {
567
- pwmOmegas <- lapply(pwmList.pc, function(pwm, b = bkg) {
568
- omegaic <- colSums(pwm * log2(pwm / b))
569
- })
570
- }
571
- if (method == "default") {
572
- pwmOmegas <- lapply(pwmList.pc, function(pwm) {
573
- omegadefault <- colMaxs(pwm) - colMins(pwm)
574
- })
575
- }
576
- if (method == "log") {
577
- pwmList.pc <- lapply(pwmList.pc, function(pwm, b) {
578
- pwm <- log(pwm) - log(b)
579
- }, b = bkg)
580
- pwmOmegas <- 1
581
- }
582
- if (method == "notrans") {
583
- pwmOmegas <- 1
584
- }
585
- pwmList.pc <- Map(function(pwm, omega) {
586
- if (length(omega) == 1 && omega == 1) {
587
- return(pwm)
588
- } else {
589
- omegamatrix <- matrix(rep(omega, 4), nrow = 4, byrow = TRUE)
590
- pwm <- pwm * omegamatrix
591
- }
592
- }, pwmList.pc, pwmOmegas)
593
- pwmRanges <- Map(function(pwm, omega) {
594
- x <- colSums(colRanges(pwm))
595
- return(x)
596
- }, pwmList.pc, pwmOmegas)
597
- if (filterp) {
598
- pwmList.pc2 <- lapply(pwmList.pc, round, digits = 2)
599
- pwmThresh <- lapply(pwmList.pc2, TFMpv2sc, pvalue = scoreThresh, bg = bkg, type = "PWM")
600
- pwmThresh <- Map("+", pwmThresh, -0.02)
601
- } else {
602
- pwmThresh <- rep.int(scoreThresh, times = length(pwmRanges))
603
- }
604
- pwmList@listData <- lapply(pwmList, function(pwm) {
605
- pwm <- pwm[c("A", "C", "G", "T"), ]
606
- pwm <- rbind(pwm, N = 0)
607
- colnames(pwm) <- as.character(1:ncol(pwm))
608
- return(pwm)
609
- })
610
- pwmList.pc <- lapply(pwmList.pc, function(pwm) {
611
- pwm <- pwm[c("A", "C", "G", "T"), ]
612
- pwm <- rbind(pwm, N = 0)
613
- colnames(pwm) <- as.character(1:ncol(pwm))
614
- return(pwm)
615
- })
616
- return(list(
617
- pwmList = pwmList,
618
- pwmListPseudoCount = pwmList.pc,
619
- pwmRange = pwmRanges,
620
- pwmThreshold = pwmThresh
621
- ))
622
- }
623
-
624
-
625
- #' Predict The Disruptiveness Of Single Nucleotide Polymorphisms On
626
- #' Transcription Factor Binding Sites.
627
- #'
628
- #' @param snpList The output of \code{snps.from.rsid} or \code{snps.from.file}
629
- #' @param pwmList An object of class \code{MotifList} containing the motifs that
630
- #' you wish to interrogate
631
- #' @param threshold Numeric; the maximum p-value for a match to be called or a minimum score threshold
632
- #' @param method Character; one of \code{default}, \code{log}, \code{ic}, or \code{notrans}; see
633
- #' details.
634
- #' @param bkg Numeric Vector; the background probabilites of the nucleotides
635
- #' used with method=\code{log} method=\code{ic}
636
- #' @param filterp Logical; filter by p-value instead of by pct score.
637
- #' @param show.neutral Logical; include neutral changes in the output
638
- #' @param verbose Logical; if running serially, show verbose messages
639
- #' @param BPPARAM a BiocParallel object see \code{\link[BiocParallel]{register}}
640
- #' and see \code{getClass("BiocParallelParam")} for additional parameter
641
- #' classes. Try \code{BiocParallel::registered()} to see what's availible and
642
- #' for example \code{BiocParallel::bpparam("SerialParam")} would allow serial
643
- #' evaluation.
644
- #' @seealso See \code{\link{snps.from.rsid}} and \code{\link{snps.from.file}} for
645
- #' information about how to generate the input to this function and
646
- #' \code{\link{plotMB}} for information on how to visualize it's output
647
- #' @details \pkg{motifbreakR} works with position probability matrices (PPM). PPM
648
- #' are derived as the fractional occurrence of nucleotides A,C,G, and T at
649
- #' each position of a position frequency matrix (PFM). PFM are simply the
650
- #' tally of each nucleotide at each position across a set of aligned
651
- #' sequences. With a PPM, one can generate probabilities based on the
652
- #' genome, or more practically, create any number of position specific
653
- #' scoring matrices (PSSM) based on the principle that the PPM contains
654
- #' information about the likelihood of observing a particular nucleotide at
655
- #' a particular position of a true transcription factor binding site. What
656
- #' follows is a discussion of the three different algorithms that may be
657
- #' employed in calls to the \pkg{motifbreakR} function via the \code{method}
658
- #' argument.
659
- #'
660
- #' Suppose we have a frequency matrix \eqn{M} of width \eqn{n} (\emph{i.e.} a
661
- #' PPM as described above). Furthermore, we have a sequence \eqn{s} also of
662
- #' length \eqn{n}, such that
663
- #' \eqn{s_{i} \in \{ A,T,C,G \}, i = 1,\ldots n}{s_i in {A,T,G,C}, i = 1 \ldots n}.
664
- #' Each column of
665
- #' \eqn{M} contains the frequencies of each letter in each position.
666
- #'
667
- #' Commonly in the literature sequences are scored as the sum of log
668
- #' probabilities:
669
- #'
670
- #' \strong{Equation 1}
671
- #'
672
- #' \deqn{F( s,M ) = \sum_{i = 1}^{n}{\log( \frac{M_{s_{i},i}}{b_{s_{i}}} )}}{
673
- #' F( s,M ) = \sum_(i = 1)^n log ((M_s_i,_i)/b_s_i)}
674
- #'
675
- #' where \eqn{b_{s_{i}}}{b_s_i} is the background frequency of letter \eqn{s_{i}}{s_i} in
676
- #' the genome of interest. This method can be specified by the user as
677
- #' \code{method='log'}.
678
- #'
679
- #' As an alternative to this method, we introduced a scoring method to
680
- #' directly weight the score by the importance of the position within the
681
- #' match sequence. This method of weighting is accessed by specifying
682
- #' \code{method='ic'} (information content). A general representation
683
- #' of this scoring method is given by:
684
- #'
685
- #' \strong{Equation 2}
686
- #'
687
- #' \deqn{F( s,M ) = p_{s} \cdot \omega_{M}}{F( s,M ) = p_s . \omega_M}
688
- #'
689
- #' where \eqn{p_{s}}{p_s} is the scoring vector derived from sequence \eqn{s} and matrix
690
- #' \eqn{M}, and \eqn{w_{M}}{w_M} is a weight vector derived from \eqn{M}. First, we
691
- #' compute the scoring vector of position scores \eqn{p}
692
- #'
693
- #' \strong{Equation 3}
694
- #'
695
- #' \deqn{p_{s} = ( M_{s_{i},i} ) \textrm{\ \ \ where\ \ \ } \frac{i = 1,\ldots n}{s_{i} \in \{ A,C,G,T \}}}{
696
- #' p_s = ( M_s_i,_i ) where (i = 1 \ldots n)/(s_i in {A,C,G,T})}
697
- #'
698
- #' and second, for each \eqn{M} a constant vector of weights
699
- #' \eqn{\omega_{M} = ( \omega_{1},\omega_{2},\ldots,\omega_{n} )}{\omega_M = ( \omega_1, \omega_2, \ldots, \omega_n)}.
700
- #'
701
- #' There are two methods for producing \eqn{\omega_{M}}{\omega_M}. The first, which we
702
- #' call weighted sum, is the difference in the probabilities for the two
703
- #' letters of the polymorphism (or variant), \emph{i.e.}
704
- #' \eqn{\Delta p_{s_{i}}}{\Delta p_s_i}, or the difference of the maximum and minimum
705
- #' values for each column of \eqn{M}:
706
- #'
707
- #' \strong{Equation 4.1}
708
- #'
709
- #' \deqn{\omega_{i} = \max \{ M_{i} \} - \min \{ M_{i} \}\textrm{\ \ \ \ where\ \ \ \ \ \ }i = 1,\ldots n}{
710
- #' \omega_i = max{M_i} - min{M_i} where i = 1 \ldots n}
711
- #'
712
- #' The second variation of this theme is to weight by relative entropy.
713
- #' Thus the relative entropy weight for each column \eqn{i} of the matrix is
714
- #' given by:
715
- #'
716
- #' \strong{Equation 4.2}
717
- #'
718
- #' \deqn{\omega_{i} = \sum_{j \in \{ A,C,G,T \}}^{}{M_{j,i}\log_2( \frac{M_{j,i}}{b_{i}} )}\textrm{\ \ \ \ \ where\ \ \ \ \ }i = 1,\ldots n}{
719
- #' \omega_i = \sum_{j in {A,C,G,T}} {M_(j,i)} log2(M_(j,i)/b_i) where i = 1 \ldots n}
720
- #'
721
- #' where \eqn{b_{i}}{b_i} is again the background frequency of the letter \eqn{i}.
722
- #'
723
- #' Thus, there are 3 possible algorithms to apply via the \code{method}
724
- #' argument. The first is the standard summation of log probabilities
725
- #' (\code{method='log'}). The second and third are the weighted sum and
726
- #' information content methods (\code{method='default'} and \code{method='ic'}) specified by
727
- #' equations 4.1 and 4.2, respectively. \pkg{motifbreakR} assumes a
728
- #' uniform background nucleotide distribution (\eqn{b}) in equations 1 and
729
- #' 4.2 unless otherwise specified by the user. Since we are primarily
730
- #' interested in the difference between alleles, background frequency is
731
- #' not a major factor, although it can change the results. Additionally,
732
- #' inclusion of background frequency introduces potential bias when
733
- #' collections of motifs are employed, since motifs are themselves
734
- #' unbalanced with respect to nucleotide composition. With these cautions
735
- #' in mind, users may override the uniform distribution if so desired. For
736
- #' all three methods, \pkg{motifbreakR} scores and reports the reference
737
- #' and alternate alleles of the sequence
738
- #' (\eqn{F( s_{\textsc{ref}},M )}{F( s_ref,M )} and
739
- #' \eqn{F( s_{\textsc{alt}},M )}{F( s_alt,M )}), and provides the matrix scores
740
- #' \eqn{p_{s_{\textsc{ref}}}}{p_s_ref} and \eqn{p_{s_{\textsc{alt}}}}{p_s_alt} of the SNP (or
741
- #' variant). The scores are scaled as a fraction of scoring range 0-1 of
742
- #' the motif matrix, \eqn{M}. If either of
743
- #' \eqn{F( s_{\textsc{ref}},M )}{F( s_ref,M )} and
744
- #' \eqn{F( s_{\textsc{alt}},M )}{F( s_alt,M )} is greater than a user-specified
745
- #' threshold (default value of 0.85) the SNP is reported. By default
746
- #' \pkg{motifbreakR} does not display neutral effects,
747
- #' (\eqn{\Delta p_{i} < 0.4}{\Delta p_i < 0.4}) but this behaviour can be
748
- #' overridden.
749
- #'
750
- #' Additionally, now, with the use of \code{\link{TFMPvalue-package}}, we may filter by p-value of the match.
751
- #' This is unfortunately a two step process. First, by invoking \code{filterp=TRUE} and setting a threshold at
752
- #' a desired p-value e.g 1e-4, we perform a rough filter on the results by rounding all values in the PWM to two
753
- #' decimal place, and calculating a scoring threshold based upon that. The second step is to use the function \code{\link{calculatePvalue}()}
754
- #' on a selection of results which will change the \code{Refpvalue} and \code{Altpvalue} columns in the output from \code{NA} to the p-value
755
- #' calculated by \code{\link{TFMsc2pv}}. This can be (although not always) a very memory and time intensive process if the algorithm doesn't converge rapidly.
756
- #'
757
- #' @return a GRanges object containing:
758
- #' \item{REF}{the reference allele for the variant}
759
- #' \item{ALT}{the alternate allele for the variant}
760
- #' \item{snpPos}{the coordinates of the variant}
761
- #' \item{motifPos}{The position of the motif relative the the variant}
762
- #' \item{geneSymbol}{the geneSymbol corresponding to the TF of the TF binding motif}
763
- #' \item{dataSource}{the source of the TF binding motif}
764
- #' \item{providerName, providerId}{the name and id provided by the source}
765
- #' \item{seqMatch}{the sequence on the 5' -> 3' direction of the "+" strand
766
- #' that corresponds to DNA at the position that the TF binding motif was found.}
767
- #' \item{pctRef}{The score as determined by the scoring method, when the sequence contains the reference variant allele, normalized to a scale from 0 - 1. If \code{filterp = FALSE},
768
- #' this is the value that is thresholded.}
769
- #' \item{pctAlt}{The score as determined by the scoring method, when the sequence contains the alternate variant allele, normalized to a scale from 0 - 1. If \code{filterp = FALSE},
770
- #' this is the value that is thresholded.}
771
- #' \item{scoreRef}{The score as determined by the scoring method, when the sequence contains the reference variant allele}
772
- #' \item{scoreAlt}{The score as determined by the scoring method, when the sequence contains the alternate variant allele}
773
- #' \item{Refpvalue}{p-value for the match for the pctRef score, initially set to \code{NA}. see \code{\link{calculatePvalue}} for more information}
774
- #' \item{Altpvalue}{p-value for the match for the pctAlt score, initially set to \code{NA}. see \code{\link{calculatePvalue}} for more information}
775
- #' \item{alleleRef}{The proportional frequency of the reference allele at position \code{motifPos} in the motif}
776
- #' \item{alleleAlt}{The proportional frequency of the alternate allele at position \code{motifPos} in the motif}
777
- #' \item{altPos}{the position, relative to the reference allele, of the alternate allele}
778
- #' \item{alleleDiff}{The difference between the score on the reference allele and the score on the alternate allele}
779
- #' \item{alleleEffectSize}{The ratio of the \code{alleleDiff} and the maximal score of a sequence under the PWM}
780
- #' \item{effect}{one of weak, strong, or neutral indicating the strength of the effect.}
781
- #' each SNP in this object may be plotted with \code{\link{plotMB}}
782
- #' @examples
783
- #' library(BSgenome.Hsapiens.UCSC.hg19)
784
- #' # prepare variants
785
- #' load(system.file("extdata",
786
- #' "pca.enhancer.snps.rda",
787
- #' package = "motifbreakR"
788
- #' )) # loads snps.mb
789
- #' pca.enhancer.snps <- sample(snps.mb, 20)
790
- #' # Get motifs to interrogate
791
- #' data(hocomoco)
792
- #' motifs <- sample(hocomoco, 50)
793
- #' # run motifbreakR
794
- #' results <- motifbreakR(pca.enhancer.snps,
795
- #' motifs,
796
- #' threshold = 0.85,
797
- #' method = "ic",
798
- #' BPPARAM = BiocParallel::SerialParam()
799
- #' )
800
- #' @import BiocParallel
801
- #' @import parallel
802
- #' @importFrom parallel clusterEvalQ
803
- #' @importFrom BiocParallel bplapply
804
- #' @importFrom stringr str_length str_trim
805
- #' @export
806
- motifbreakR <- function(snpList, pwmList, threshold = 0.85, filterp = FALSE,
807
- method = "default", show.neutral = FALSE, verbose = FALSE,
808
- bkg = c(A = 0.25, C = 0.25, G = 0.25, T = 0.25),
809
- BPPARAM = bpparam()) {
810
- ## Cluster / MC setup
811
- if (.Platform$OS.type == "windows" && inherits(BPPARAM, "MulticoreParam")) {
812
- warning(paste0(
813
- "Serial evaluation under effect, to achive parallel evaluation under\n",
814
- "Windows, please supply an alternative BPPARAM"
815
- ))
816
- }
817
- cores <- bpnworkers(BPPARAM)
818
- num.snps <- length(snpList)
819
- if (num.snps < cores) {
820
- cores <- num.snps
821
- }
822
- if (!(is(BPPARAM, "MulticoreParam") | is(BPPARAM, "SerialParam"))) {
823
- bpstart(BPPARAM)
824
- cl <- bpbackend(BPPARAM)
825
- clusterEvalQ(cl, library("MotifDb"))
826
- }
827
-
828
- pwms <- preparePWM(
829
- pwmList = pwmList, filterp = filterp,
830
- scoreThresh = threshold, bkg = bkg,
831
- method = method
832
- )
833
- snpList <- prepareVariants(fsnplist = snpList)
834
-
835
- snpList_cores <- split(as.list(rep(names(snpList), times = cores)), 1:cores)
836
- for (splitr in seq_along(snpList)) {
837
- splitcores <- sapply(suppressWarnings(split(snpList[[splitr]], 1:cores)), list)
838
- for (splitcore in seq_along(snpList_cores)) {
839
- snpList_cores[[splitcore]][[splitr]] <- splitcores[[splitcore]]
840
- names(snpList_cores[[splitcore]])[splitr] <- names(snpList)[splitr]
841
- }
842
- }
843
- snpList <- snpList_cores
844
- rm(snpList_cores)
845
-
846
- x <- bplapply(snpList, scoreSnpList,
847
- pwmList = pwms$pwmList, threshold = pwms$pwmThreshold,
848
- pwmList.pc = pwms$pwmListPseudoCount, pwmRanges = pwms$pwmRange,
849
- method = method, bkg = bkg, show.neutral = show.neutral,
850
- verbose = ifelse(cores == 1, verbose, FALSE),
851
- filterp = filterp, BPPARAM = BPPARAM
852
- )
853
-
854
- ## Cluster / MC cleanup
855
- if (inherits(x, "try-error")) {
856
- if (is(BPPARAM, "SnowParam")) {
857
- bpstop(BPPARAM)
858
- }
859
- stop(attributes(x)$condition)
860
- }
861
- if (is(BPPARAM, "SnowParam")) {
862
- bpstop(BPPARAM)
863
- }
864
-
865
- drops <- sapply(x, is.null)
866
- x <- x[!drops]
867
- pwmList <- pwms$pwmList
868
- pwmList@listData <- lapply(pwms$pwmList, function(pwm) {
869
- pwm <- pwm[c("A", "C", "G", "T"), ]
870
- return(pwm)
871
- })
872
- pwmList.pc <- lapply(pwms$pwmListPseudoCount, function(pwm) {
873
- pwm <- pwm[c("A", "C", "G", "T"), ]
874
- return(pwm)
875
- })
876
-
877
- if (length(x) > 1) {
878
- x <- unlist(GRangesList(unname(x)))
879
- snpList <- unlist(GRangesList(lapply(snpList, `[[`, "fsnplist")), use.names = FALSE)
880
- x <- x[order(match(x$SNP_id, snpList$SNP_id)), , drop = FALSE]
881
- # attributes(x)$genome.package <- genome.package
882
- attributes(x)$motifs <- pwmList[mcols(pwmList)$providerId %in% unique(x$providerId) &
883
- mcols(pwmList)$providerName %in% unique(x$providerName), ]
884
- attributes(x)$scoremotifs <- pwmList.pc[names(attributes(x)$motifs)]
885
- } else {
886
- if (length(x) == 1L) {
887
- x <- x[[1]]
888
- # attributes(x)$genome.package <- genome.package
889
- attributes(x)$motifs <- pwmList[mcols(pwmList)$providerId %in% unique(x$providerId) &
890
- mcols(pwmList)$providerName %in% unique(x$providerName), ]
891
- attributes(x)$scoremotifs <- pwmList.pc[names(attributes(x)$motifs)]
892
- } else {
893
- warning("No SNP/Motif Interactions reached threshold")
894
- x <- NULL
895
- }
896
- }
897
- if (verbose && cores > 1) {
898
- if (is.null(x)) {
899
- message(paste(
900
- "reached end of SNPs list length =", num.snps, "with 0 potentially disruptive matches to",
901
- length(unique(x$geneSymbol)), "of", length(pwmList), "motifs."
902
- ))
903
- } else {
904
- message(paste(
905
- "reached end of SNPs list length =", num.snps, "with",
906
- length(x), "potentially disruptive matches to", length(unique(x$geneSymbol)),
907
- "of", length(pwmList), "motifs."
908
- ))
909
- }
910
- }
911
- return(x)
912
- }
913
-
914
-
915
- #' Calculate the significance of the matches for the reference and alternate alleles for the for their PWM
916
- #'
917
- #' @param results The output of \code{motifbreakR} that was run with \code{filterp=TRUE}
918
- #' @param background Numeric Vector; the background probabilities of the nucleotides
919
- #' @param granularity Numeric Vector; the granularity to which to round the PWM,
920
- #' larger values compromise full accuracy for speed of calculation. A value of
921
- #' \code{NULL} does no rounding.
922
- #' @param BPPARAM a BiocParallel object see \code{\link[BiocParallel]{register}}
923
- #' and see \code{getClass("BiocParallelParam")} for additional parameter
924
- #' classes. Try \code{BiocParallel::registered()} to see what's available and
925
- #' for example \code{BiocParallel::bpparam("SerialParam")} would allow serial
926
- #' evaluation.
927
- #' @return a GRanges object. The same Granges object that was input as \code{results}, but with
928
- #' \code{Refpvalue} and \code{Altpvalue} columns in the output modified from \code{NA} to the p-value
929
- #' calculated by \code{\link{TFMsc2pv}}.
930
- #' @seealso See \code{\link{TFMsc2pv}} from the \pkg{TFMPvalue} package for
931
- #' information about how the p-values are calculated.
932
- #' @details This function is intended to be used on a selection of results produced by \code{\link{motifbreakR}}, and
933
- #' this can be (although not always) a very memory and time intensive process if the algorithm doesn't converge rapidly.
934
- #' @source H{\'e}l{\`e}ne Touzet and Jean-St{\'e}phane Varr{\'e} (2007) Efficient and accurate P-value computation for Position Weight Matrices.
935
- #' Algorithms for Molecular Biology, \bold{2: 15}.
936
- #' @examples
937
- #' data(example.results)
938
- #' rs1006140 <- example.results[example.results$SNP_id %in% "rs1006140"]
939
- #' # low granularity for speed; 1e-6 or 1e-7 recommended for accuracy
940
- #' rs1006140 <- calculatePvalue(rs1006140, BPPARAM = BiocParallel::SerialParam(), granularity = 1e-4)
941
- #'
942
- #' ##' @importFrom qvalue qvalue
943
- #'
944
- #' @export
945
- calculatePvalue <- function(results,
946
- background = c(A = 0.25, C = 0.25, G = 0.25, T = 0.25),
947
- granularity = NULL,
948
- BPPARAM = BiocParallel::SerialParam()) {
949
-
950
- ## Cluster / MC setup
951
- if (.Platform$OS.type == "windows" && inherits(BPPARAM, "MulticoreParam")) {
952
- warning(paste0(
953
- "Serial evaluation under effect, to achive parallel evaluation under\n",
954
- "Windows, please supply an alternative BPPARAM"
955
- ))
956
- }
957
- cores <- bpnworkers(BPPARAM)
958
- num.res <- length(results)
959
- if (num.res < cores) {
960
- cores <- num.res
961
- }
962
- if (!(is(BPPARAM, "MulticoreParam") | is(BPPARAM, "SerialParam"))) {
963
- bpstart(BPPARAM)
964
- cl <- bpbackend(BPPARAM)
965
- clusterEvalQ(cl, library("MotifDb"))
966
- }
967
- if (!("scoreRef" %in% names(mcols(results)))) {
968
- stop("incorrect results format; please rerun analysis with filterp=TRUE")
969
- } else {
970
- pwmListmeta <- mcols(attributes(results)$motifs, use.names = TRUE)
971
- pwmList <- attributes(results)$scoremotifs
972
- if (!is.null(granularity)) {
973
- pwmList <- lapply(pwmList, function(x, g) {
974
- x <- floor(x / g) * g
975
- return(x)
976
- }, g = granularity)
977
- }
978
- results_sp <- split(results, 1:length(results))
979
- pvalues <- bplapply(results_sp, function(i, pwmList, pwmListmeta, bkg) {
980
- result <- i
981
- pwm.id <- result$providerId
982
- pwm.name.f <- result$providerName
983
- pwmmeta <- pwmListmeta[pwmListmeta$providerId == pwm.id & pwmListmeta$providerName == pwm.name.f, ]
984
- pwm <- pwmList[[rownames(pwmmeta)[1]]]
985
- ref <- TFMsc2pv(pwm, mcols(result)[["scoreRef"]], bg = bkg, type = "PWM")
986
- alt <- TFMsc2pv(pwm, mcols(result)[["scoreAlt"]], bg = bkg, type = "PWM")
987
- # gc()
988
- return(data.frame(ref = ref, alt = alt))
989
- }, pwmList = pwmList, pwmListmeta = pwmListmeta, bkg = background, BPPARAM = BPPARAM)
990
- ## Cluster / MC cleanup
991
- if (inherits(pvalues, "try-error")) {
992
- if (is(BPPARAM, "SnowParam")) {
993
- bpstop(BPPARAM)
994
- }
995
- stop(attributes(pvalues)$condition)
996
- }
997
- pvalues.df <- base::do.call("rbind", c(pvalues, make.row.names = FALSE))
998
- results$Refpvalue <- pvalues.df[, "ref"]
999
- results$Altpvalue <- pvalues.df[, "alt"]
1000
-
1001
- if (is(BPPARAM, "SnowParam")) {
1002
- bpstop(BPPARAM)
1003
- }
1004
-
1005
- return(results)
1006
- }
1007
- }
1008
-
1009
- addPWM.stack <- function(identifier, index, GdObject, pwm_stack, ...) {
1010
- plotMotifLogoStack.3(pwm_stack)
1011
- }
1012
-
1013
- selcor <- function(identifier, index, GdObject, ...) {
1014
- if (identical(index, 1L)) {
1015
- return(TRUE)
1016
- } else {
1017
- return(FALSE)
1018
- }
1019
- }
1020
-
1021
- selall <- function(identifier, GdObject, ...) {
1022
- return(TRUE)
1023
- }
1024
-
1025
- #' @importFrom grid grid.newpage pushViewport viewport popViewport
1026
- plotMotifLogoStack.3 <- function(pfms, ...) {
1027
- n <- length(pfms)
1028
- lapply(pfms, function(.ele) {
1029
- # if (class(.ele) != "pfm")
1030
- if (!is(.ele, "pfm")) {
1031
- stop("pfms must be a list of class pfm")
1032
- }
1033
- })
1034
- assign("tmp_motifStack_symbolsCache", list(), pos = ".GlobalEnv")
1035
- # grid.newpage()
1036
- ht <- 1 / n
1037
- y0 <- 0.5 * ht
1038
- for (i in rev(seq.int(n))) {
1039
- pushViewport(viewport(y = y0, height = ht))
1040
- plotMotifLogo(pfms[[i]],
1041
- motifName = pfms[[i]]@name, ncex = 1,
1042
- p = pfms[[i]]@background, colset = pfms[[i]]@color,
1043
- xlab = NA, newpage = FALSE, margins = c(
1044
- 1.5, 4.1,
1045
- 1.1, 0.1
1046
- ), ...
1047
- )
1048
- popViewport()
1049
- y0 <- y0 + ht
1050
- }
1051
- rm(list = "tmp_motifStack_symbolsCache", pos = ".GlobalEnv")
1052
- return()
1053
- }
1054
-
1055
- #' @importFrom stringr str_replace
1056
- #' @importFrom motifStack addBlank
1057
- DNAmotifAlignment.2snp <- function(pwms, result) {
1058
- from <- min(sapply(result$motifPos, `[`, 1))
1059
- to <- max(sapply(result$motifPos, `[`, 2))
1060
- # pos <- mcols(result)$motifPos
1061
- # pos <- pos[as.logical(strand(result) == "+")][1]
1062
- for (pwm.i in seq_along(pwms)) {
1063
- # pwm <- pwms[[pwm.i]]@mat
1064
- ## get pwm info from result data
1065
- pwm.name <- pwms[[pwm.i]]@name
1066
- pwm.name <- str_replace(pwm.name, pattern = "-:rc$", replacement = "")
1067
- pwm.name <- str_replace(pwm.name, pattern = "-:r$", replacement = "")
1068
- pwm.info <- attributes(result)$motifs
1069
- pwm.id <- mcols(pwm.info[pwm.name, ])$providerId
1070
- pwm.name <- mcols(pwm.info[pwm.name, ])$providerName
1071
- mresult <- result[result$providerId == pwm.id & result$providerName == pwm.name, ]
1072
- mstart <- mresult$motifPos[[1]][1]
1073
- mend <- mresult$motifPos[[1]][2]
1074
- if ((mcols(mresult)$varType == "Insertion" & mcols(mresult)$alleleDiff < 0) |
1075
- (mcols(mresult)$varType == "Deletion" & mcols(mresult)$alleleDiff > 0)) {
1076
- new.mat <- cbind(
1077
- pwms[[pwm.i]]@mat[, 1:abs(mresult$motifPos[[1]][1])],
1078
- matrix(c(0.25, 0.25, 0.25, 0.25), ncol = length(mcols(mresult)$altPos[[1]]), nrow = 4),
1079
- pwms[[pwm.i]]@mat[, (abs(mresult$motifPos[[1]][1]) + 1):ncol(pwms[[pwm.i]]@mat)]
1080
- )
1081
- pwms[[pwm.i]]@mat <- new.mat
1082
- start.offset <- mstart - from
1083
- end.offset <- to - mend
1084
- } else {
1085
- if (mstart < 0 | from > 0) {
1086
- start.offset <- mstart - from
1087
- } else {
1088
- start.offset <- (mstart - 1) - from
1089
- }
1090
- if (mend > 0 | to < 0) {
1091
- end.offset <- to - mend
1092
- } else {
1093
- end.offset <- to - (mend + 1)
1094
- }
1095
- }
1096
- if (start.offset > 0) {
1097
- pwms[[pwm.i]] <- addBlank(x = pwms[[pwm.i]], n = start.offset, b = FALSE)
1098
- }
1099
- if (end.offset > 0) {
1100
- pwms[[pwm.i]] <- addBlank(x = pwms[[pwm.i]], n = end.offset, b = TRUE)
1101
- }
1102
- }
1103
- return(pwms)
1104
- }
1105
-
1106
-
1107
-
1108
- #' Plot a genomic region surrounding a genomic variant, and potentially disrupted
1109
- #' motifs
1110
- #'
1111
- #' @param results The output of \code{motifbreakR}
1112
- #' @param rsid Character; the identifier of the variant to be visualized
1113
- #' @param reverseMotif Logical; if the motif is on the "-" strand show the
1114
- #' the motifs as reversed \code{FALSE} or reverse complement \code{TRUE}
1115
- #' @param effect Character; show motifs that are strongly effected \code{c("strong")},
1116
- #' weakly effected \code{c("weak")}, or both \code{c("strong", "weak")}
1117
- #' @param altAllele Character; The default value of \code{NULL} uses the first (or only)
1118
- #' alternative allele for the SNP to be plotted.
1119
- #' @seealso See \code{\link{motifbreakR}} for the function that produces output to be
1120
- #' visualized here, also \code{\link{snps.from.rsid}} and \code{\link{snps.from.file}}
1121
- #' for information about how to generate the input to \code{\link{motifbreakR}}
1122
- #' function.
1123
- #' @details \code{plotMB} produces output showing the location of the SNP on the
1124
- #' chromosome, the surrounding sequence of the + strand, the footprint of any
1125
- #' motif that is disrupted by the SNP or SNV, and the DNA sequence motif(s).
1126
- #' The \code{altAllele} argument is included for variants like rs1006140 where
1127
- #' multiple alternate alleles exist, the reference allele is A, and the alternate
1128
- #' can be G,T, or C. \code{plotMB} only plots one alternate allele at a time.
1129
- #' @return plots a figure representing the results of \code{motifbreakR} at the
1130
- #' location of a single SNP, returns invisible \code{NULL}.
1131
- #' @examples
1132
- #' data(example.results)
1133
- #' example.results
1134
- #' \donttest{
1135
- #' library(BSgenome.Hsapiens.UCSC.hg19)
1136
- #' plotMB(results = example.results, rsid = "rs1006140", effect = "strong", altAllele = "C")
1137
- #' }
1138
- #' @importFrom motifStack DNAmotifAlignment colorset motifStack plotMotifLogo plotMotifLogoStack
1139
- #' @importClassesFrom motifStack pfm marker
1140
- #' @import grDevices
1141
- #' @importFrom grid gpar
1142
- #' @importFrom Gviz IdeogramTrack SequenceTrack GenomeAxisTrack HighlightTrack
1143
- #' AnnotationTrack plotTracks
1144
- #' @export
1145
- plotMB <- function(results, rsid, reverseMotif = TRUE, effect = c("strong", "weak"), altAllele = NULL) {
1146
- motif.starts <- sapply(results$motifPos, `[`, 1)
1147
- motif.starts <- start(results) + motif.starts
1148
- motif.starts <- order(motif.starts)
1149
- results <- results[motif.starts]
1150
- g <- genome(results)[[1]]
1151
- result <- results[results$SNP_id %in% rsid]
1152
- if (is.null(altAllele)) {
1153
- altAllele <- result$ALT[[1]]
1154
- }
1155
- result <- result[result$ALT == altAllele]
1156
- result <- result[order(sapply(result$motifPos, min), sapply(result$motifPos, max)), ]
1157
- result <- result[result$effect %in% effect]
1158
- chromosome <- as.character(seqnames(result))[[1]]
1159
- genome.package <- attributes(result)$genome.package
1160
- genome.bsgenome <- eval(parse(text = genome.package))
1161
- seq.len <- max(length(result$REF[[1]]), length(result$ALT[[1]]))
1162
- distance.to.edge <- max(abs(c(
1163
- sapply(result$motifPos, min),
1164
- sapply(result$motifPos, max)
1165
- ))) + 4
1166
- from <- start(result)[[1]] - distance.to.edge + 1
1167
- to <- end(result)[[1]] + distance.to.edge
1168
- pwmList <- attributes(result)$motifs
1169
- pwm.names <- result$providerId
1170
- results_motifs <- paste0(result$providerId, result$providerName)
1171
- list_motifs <- paste0(mcols(pwmList)$providerId, mcols(pwmList)$providerName)
1172
- pwms <- pwmList <- pwmList[match(results_motifs, list_motifs)]
1173
- if (reverseMotif) {
1174
- for (pwm.i in seq_along(pwms)) {
1175
- pwm.name <- names(pwms[pwm.i])
1176
- pwm.id <- mcols(pwms[pwm.name, ])$providerId
1177
- pwm.name.f <- mcols(pwms[pwm.name, ])$providerName
1178
- doRev <- as.logical(strand(result[result$providerId == pwm.id & result$providerName == pwm.name.f, ]) == "-")
1179
- if (doRev) {
1180
- pwm <- pwms[[pwm.i]]
1181
- pwm <- pwm[, rev(1:ncol(pwm))]
1182
- rownames(pwm) <- c("T", "G", "C", "A")
1183
- pwm <- pwm[c("A", "C", "G", "T"), ]
1184
- pwms[[pwm.i]] <- pwm
1185
- names(pwms)[pwm.i] <- paste0(names(pwms)[pwm.i], "-:rc")
1186
- }
1187
- }
1188
- } else {
1189
- for (pwm.i in seq_along(pwms)) {
1190
- pwm.name <- names(pwms[pwm.i])
1191
- pwm.id <- mcols(pwms[pwm.name, ])$providerId
1192
- pwm.name.f <- mcols(pwms[pwm.name, ])$providerName
1193
- doRev <- as.logical(strand(result[result$providerId == pwm.id & result$providerName == pwm.name.f, ]) == "-")
1194
- if (doRev) {
1195
- pwm <- pwms[[pwm.i]]
1196
- pwm <- pwm[, rev(1:ncol(pwm))]
1197
- pwms[[pwm.i]] <- pwm
1198
- names(pwms)[pwm.i] <- paste0(names(pwms)[pwm.i], "-:r")
1199
- }
1200
- }
1201
- }
1202
- pwms <- lapply(names(pwms), function(x, pwms = pwms) {
1203
- new("pfm",
1204
- mat = pwms[[x]],
1205
- name = x
1206
- )
1207
- }, pwms)
1208
- pwms <- DNAmotifAlignment.2snp(pwms, result)
1209
- pwmwide <- max(sapply(pwms, function(x) {
1210
- ncol(x@mat)
1211
- }))
1212
-
1213
- markerStart <- result$motifPos[[1]][1]
1214
- if (markerStart > 0) {
1215
- markerEnd <- length(result$altPos[[1]]) + 1
1216
- markerEnd <- markerEnd - markerStart
1217
- markerStart <- 1
1218
- } else {
1219
- markerStart <- -1 * markerStart
1220
- markerEnd <- markerStart + length(result$altPos[[1]])
1221
- if (result$varType[[1]] %in% c("Other", "SNV")) {
1222
- markerStart <- markerStart + 1
1223
- }
1224
- }
1225
- varType <- result$varType[[1]]
1226
- varType <- switch(varType,
1227
- Deletion = "firebrick",
1228
- Insertion = "springgreen4",
1229
- Other = "gray13"
1230
- )
1231
- markerRect <- new("marker",
1232
- type = "rect",
1233
- start = markerStart,
1234
- stop = markerEnd,
1235
- gp = gpar(
1236
- lty = 2,
1237
- fill = NA,
1238
- lwd = 3,
1239
- col = varType
1240
- )
1241
- )
1242
- for (pwm.i in seq_along(pwms)) {
1243
- pwms[[pwm.i]]@markers <- list(markerRect)
1244
- }
1245
- ideoT <- try(IdeogramTrack(genome = g, chromosome = chromosome), silent = TRUE)
1246
- if (inherits(ideoT, "try-error")) {
1247
- backup.band <- data.frame(
1248
- chrom = chromosome, chromStart = 0,
1249
- chromEnd = length(genome.bsgenome[[chromosome]]),
1250
- name = chromosome, gieStain = "gneg"
1251
- )
1252
- ideoT <- IdeogramTrack(genome = g, chromosome = chromosome, bands = backup.band)
1253
- }
1254
-
1255
- ### blank alt sequence
1256
- altseq <- genome.bsgenome[[chromosome]]
1257
-
1258
- ### Replace longer sections
1259
- at <- IRanges(start = start(result[1]), width = width(result[1]))
1260
- if (result$varType[[1]] == "Deletion") {
1261
- reflen <- length(result$REF[[1]])
1262
- addedN <- DNAString(paste0(rep.int(".", reflen), collapse = ""))
1263
- addedN <- replaceLetterAt(addedN, at = (1:reflen)[-result$altPos[[1]]], result$ALT[[1]])
1264
- axisT <- GenomeAxisTrack(exponent = 0)
1265
- seqT <- SequenceTrack(genome.bsgenome, fontcolor = colorset("DNA", "auto"))
1266
- altseq <- replaceAt(x = altseq, at = at, addedN)
1267
- } else if (result$varType[[1]] == "Insertion") {
1268
- altlen <- length(result$ALT[[1]])
1269
- addedN <- DNAString(paste0(rep.int(".", altlen), collapse = ""))
1270
- addedN <- replaceLetterAt(addedN, at = (1:altlen)[-result$altPos[[1]]], result$REF[[1]])
1271
- refseq <- genome.bsgenome[[chromosome]]
1272
- refseq <- DNAStringSet(replaceAt(x = refseq, at = at, addedN))
1273
- altseq <- replaceAt(x = altseq, at = at, result$ALT[[1]])
1274
- names(refseq) <- chromosome
1275
- seqT <- SequenceTrack(refseq,
1276
- fontcolor = c(colorset("DNA", "auto"), N = "#FFFFFF", . = "#FFE3E6"),
1277
- chromosome = chromosome
1278
- )
1279
- } else {
1280
- axisT <- GenomeAxisTrack(exponent = 0)
1281
- altseq <- replaceAt(x = altseq, at = at, result$ALT[[1]])
1282
- seqT <- SequenceTrack(genome.bsgenome, fontcolor = colorset("DNA", "auto"))
1283
- }
1284
- altseq <- DNAStringSet(altseq)
1285
- names(altseq) <- chromosome
1286
- seqAltT <- SequenceTrack(altseq,
1287
- fontcolor = c(colorset("DNA", "auto"), N = "#FFFFFF", . = "#FFE3E6"),
1288
- chromosome = chromosome
1289
- )
1290
-
1291
- # altseq <- replaceLetterAt(altseq, at = wherereplace, letter = rep.int("N", sum(wherereplace)))
1292
- # altseq <- replaceLetterAt(altseq, at = !wherereplace, letter = result$ALT[[1]])
1293
- histart <- start(result[1]) + min(result[1]$altPos[[1]]) - 2
1294
- histart <- ifelse(result[1]$varType %in% c("Other", "SNV"), histart + 1, histart)
1295
- hiend <- start(result[1]) + min(result[1]$altPos[[1]]) - 2 + length(result[1]$altPos[[1]])
1296
- hiT <- HighlightTrack(
1297
- trackList = list(seqT, seqAltT),
1298
- start = histart,
1299
- end = hiend,
1300
- chromosome = chromosome
1301
- )
1302
-
1303
- selectingfun <- selcor
1304
- detailfun <- addPWM.stack
1305
-
1306
- motif_ids <- names(pwmList)
1307
- names(motif_ids) <- mcols(pwmList)$providerName
1308
-
1309
- for (mymotif_i in seq_along(result)) {
1310
- mymotif <- result[mymotif_i]
1311
- start(mymotif) <- start(mymotif) + min(mymotif$altPos[[1]]) - 1
1312
- width(mymotif) <- length(mymotif$altPos[[1]])
1313
- variant.start <- start(mymotif)
1314
- variant.end <- end(mymotif)
1315
- if (mymotif$motifPos[[1]][1] < 0) {
1316
- start(mymotif) <- start(mymotif) + (mymotif$motifPos[[1]][1])
1317
- } else {
1318
- start(mymotif) <- start(mymotif) + (mymotif$motifPos[[1]][1] - 1)
1319
- }
1320
- if (mymotif$motifPos[[1]][2] < 0) {
1321
- end(mymotif) <- end(mymotif) + (mymotif$motifPos[[1]][2] + 1)
1322
- } else {
1323
- end(mymotif) <- end(mymotif) + (mymotif$motifPos[[1]][2])
1324
- }
1325
- if ((result[mymotif_i]$varType == "Deletion" & result[mymotif_i]$alleleDiff > 0) |
1326
- (result[mymotif_i]$varType == "Insertion" & result[mymotif_i]$alleleDiff < 0)) {
1327
- mymotif <- c(mymotif, mymotif)
1328
- end(mymotif)[1] <- variant.start - 1
1329
- start(mymotif)[2] <- variant.end + 1
1330
- mymotif[which.min(width(mymotif))]$motifPos <- NA
1331
- }
1332
- if (exists("mres")) {
1333
- mres <- c(mres, mymotif)
1334
- } else {
1335
- mres <- mymotif
1336
- }
1337
- }
1338
- result <- mres
1339
- rm(mres)
1340
- motif_ids <- motif_ids[result$providerName]
1341
- presult <- result
1342
- strand(presult) <- "*"
1343
- pres_cols <- DataFrame(
1344
- feature = ifelse(!is.na(result$motifPos),
1345
- paste(result$geneSymbol, "motif", sep = "_"), ""
1346
- ),
1347
- group = result$providerName,
1348
- id = motif_ids
1349
- )
1350
- presult <- GRanges(
1351
- seqnames = seqnames(result[1]),
1352
- ranges = ranges(result)
1353
- )
1354
- mcols(presult) <- pres_cols
1355
-
1356
- motifT <- AnnotationTrack(presult,
1357
- fun = detailfun,
1358
- detailsFunArgs = list(pwm_stack = pwms),
1359
- name = names(result)[[1]],
1360
- selectFun = selectingfun,
1361
- reverseStacking = FALSE,
1362
- stacking = "squish"
1363
- )
1364
-
1365
- if (exists("axisT")) {
1366
- track_list <- list(ideoT, motifT, hiT, axisT)
1367
- } else {
1368
- track_list <- list(ideoT, motifT, hiT)
1369
- }
1370
- plotTracks(track_list,
1371
- from = from, to = to, showBandId = TRUE,
1372
- cex.main = 0.8, col.main = "darkgrey",
1373
- add53 = TRUE, labelpos = "below", chromosome = chromosome, # groupAnnotation = "id",
1374
- fontcolor.item = "black",
1375
- collapse = FALSE, min.width = 1, featureAnnotation = "feature", cex.feature = 0.8,
1376
- details.size = 0.85, detailsConnector.pch = NA, detailsConnector.lty = 0,
1377
- shape = "box", cex.group = 0.8, fonts = c("sans", "Helvetica")
1378
- )
1379
- return(invisible(NULL))
1380
- }
1381
-
1382
- #' Export motifbreakR results to csv or tsv
1383
- #'
1384
- #' @param results The output of \code{motifbreakR}
1385
- #' @param file Character; the file name of the destination file
1386
- #' @param format Character; one of tsv (tab separated values) or csv (comma separated values)
1387
- #' @return \code{exportMBresults} produces an output file containing the output
1388
- #' of the motifbreakR function.
1389
- #' @examples
1390
- #' data(example.results)
1391
- #' example.results
1392
- #' \donttest{
1393
- #' exportMBresults(example.results, file = "output.tsv", format = "tsv")
1394
- #' }
1395
- #' @export
1396
- exportMBresults <- function(results, file, format = c("tsv")) {
1397
- if (missing(file)) {
1398
- stop("select output file location")
1399
- }
1400
- if (!(format %in% c("csv", "tsv"))) {
1401
- stop("format must be one of csv or tsv")
1402
- }
1403
- sep <- switch(format,
1404
- csv = ",",
1405
- tsv = "\t"
1406
- )
1407
- names(results) <- NULL
1408
- results <- as.data.frame(results)
1409
- results <- results[, !colnames(results) %in% "width"]
1410
- results$start <- results$start - 1
1411
- results$motifPos <- vapply(results$motifPos, function(x) {
1412
- paste0(x[1], ";", x[2])
1413
- }, FUN.VALUE = character(1))
1414
- if (format == "tsv") {
1415
- write.table(x = results, file = file, quote = FALSE, sep = sep, row.names = FALSE, col.names = TRUE)
1416
- } else {
1417
- write.csv(x = results, file = file, row.names = FALSE, col.names = TRUE)
1418
- }
1419
- }
1420
-
1421
- #' Find Corresponding TF Binding From The ReMap2022 Project
1422
- #'
1423
- #' @param remap_data Character; either the path to a local copy of the non-redundant
1424
- #' peak set, or one of \code{hg38} for Homo sapiens, \code{mm10} for Mus musculus,
1425
- #' \code{dm6} for Drosophila melanogaster, or \code{TAIR10} for Arabidopsis thaliana,
1426
- #' to be queried (slowly) online.
1427
- #' @details \code{snps.from.file} takes a character vector describing the file path
1428
- #' to a bed file that contains the necissary information to generate the input for
1429
- #' \code{motifbreakR} see \url{http://www.genome.ucsc.edu/FAQ/FAQformat.html#format1}
1430
- #' for a complete description of the BED format. Our convention deviates in that there
1431
- #' is a required format for the name field. \code{name} is defined as chromosome:start:REF:ALT
1432
- #' or the rsid from dbSNP (if you've included the optional SNPlocs argument).
1433
- #' For example if you were to include rs123 in it's alternate
1434
- #' format it would be entered as chr7:24966446:C:A
1435
- #' @return a GRanges object containing:
1436
- #' \item{SNP_id}{The rsid of the snp with the "rs" portion stripped}
1437
- #' \item{alleles_as_ambig}{THE IUPAC ambiguity code between the reference and
1438
- #' alternate allele for this SNP}
1439
- #' \item{REF}{The reference allele for the SNP}
1440
- #' \item{ALT}{The alternate allele for the SNP}
1441
- #' @examples
1442
- #' library(BSgenome.Drerio.UCSC.danRer7)
1443
- #'
1444
- #' @importFrom rtracklayer import
1445
- #' @importFrom Biostrings IUPAC_CODE_MAP uniqueLetters BStringSetList DNA_ALPHABET
1446
- #' @importFrom VariantAnnotation readVcf ref alt isSNV VcfFile ScanVcfParam
1447
- #' @importFrom SummarizedExperiment rowRanges
1448
- #' @importFrom stringr str_sort str_split
1449
- #' @export
1450
-
1451
- findSupportingRemapPeaks <- function(results, genome) {
1452
- genome <- match.arg(genome, c("hg38", "mm10", "dm6", "TAIR10_TF", "TAIR10_HISTONE"))
1453
-
1454
- remap_links <- list(
1455
- hg38 = "https://remap.univ-amu.fr/storage/remap2022/hg38/MACS2/remap2022_nr_macs2_hg38_v1_0.bed.gz",
1456
- mm10 = "https://remap.univ-amu.fr/storage/remap2022/mm10/MACS2/remap2022_nr_macs2_mm10_v1_0.bed.gz",
1457
- dm6 = "https://remap.univ-amu.fr/storage/remap2022/dm6/MACS2/remap2022_nr_macs2_dm6_v1_0.bed.gz",
1458
- TAIR10_TF = "https://remap.univ-amu.fr/storage/remap2022/tair10/tf/MACS2/remap2022_nr_macs2_TAIR10_v1_0.bed.gz",
1459
- TAIR10_HISTONE = "https://remap.univ-amu.fr/storage/remap2022/tair10/histones/MACS2/remap2022_histone_nr_macs2_TAIR10_v1_0.bed.gz"
1460
- )
1461
-
1462
- results$matchingBindingEvent <- NA
1463
- results$matchingCelltype <- NA
1464
-
1465
- uniqmb <- results
1466
- mcols(uniqmb) <- NULL
1467
- names(uniqmb) <- NULL
1468
- uniqmb <- sort(unique(uniqmb))
1469
-
1470
- remap_binding <- loadPeakFile(remap_links, genome)
1471
- remap_binding <- subsetByOverlaps(remap_binding, uniqmb)
1472
-
1473
- if (length(remap_binding) < 1) {
1474
- return(results)
1475
- }
1476
-
1477
- mb_overlaps <- findOverlaps(results, remap_binding)
1478
- variant_to_peak <- unique(data.frame(variant = names(results[queryHits(mb_overlaps), ]), tf.binding = remap_binding[subjectHits(mb_overlaps), ]$name))
1479
- tf_ct <- stringr::str_split(variant_to_peak$tf.binding, ":")
1480
- variant_to_peak$tf.binding <- vapply(tf_ct, FUN = `[`, character(1), 1)
1481
- variant_to_peak$tf.celltype <- vapply(tf_ct, FUN = `[`, character(1), 2)
1482
- variant_to_peak$tf.celltype <- lapply(variant_to_peak$tf.celltype, function(x) {
1483
- stringr::str_split(x, ",", simplify = T)[1, ]
1484
- })
1485
- mcol_variant_to_gene <- data.frame(variant = names(results), tf.gene = results$geneSymbol, motif = results$providerId)
1486
-
1487
- if ("manuallyCuratedGeneMotifAssociationTable" %in% slotNames(attributes(results)$motifs)) {
1488
- mb_genelist <- attributes(results)$motifs@manuallyCuratedGeneMotifAssociationTable
1489
- mb_genelist <- unique(mb_genelist[mb_genelist$motif %in% results$providerId, c("motif", "tf.gene")])
1490
- mb_genelist_aug <- unique(data.frame(motif = results$providerId, tf.gene = results$geneSymbol))
1491
- mb_genelist <- unique(rbind(mb_genelist, mb_genelist_aug))
1492
- variant_to_gene <- data.frame(variant = names(results), motif = results$providerId)
1493
- variant_to_gene_master <- merge(variant_to_gene, mb_genelist, all.x = TRUE, sort = FALSE)
1494
- variant_to_gene <- unique(variant_to_gene_master[, c("variant", "tf.gene")])
1495
- } else {
1496
- variant_to_gene_master <- mcol_variant_to_gene
1497
- variant_to_gene <- unique(variant_to_gene_master[, c("variant", "tf.gene")])
1498
- }
1499
-
1500
- variant_mb_peak <- merge(variant_to_gene_master, variant_to_peak, all.x = TRUE, sort = FALSE)
1501
- variant_mb_peak <- unique(variant_mb_peak[variant_mb_peak$tf.gene == variant_mb_peak$tf.binding, ])
1502
- variant_mb_peak <- variant_mb_peak[!is.na(variant_mb_peak$variant), c("variant", "tf.binding", "tf.celltype", "motif")]
1503
-
1504
- split_vmbp <- split(variant_mb_peak, as.factor(variant_mb_peak$variant))
1505
-
1506
- for (variant in seq_along(split_vmbp)) {
1507
- pvariant <- names(split_vmbp[variant])
1508
- vgm <- split_vmbp[[variant]]
1509
- mbv_sel <- which(mcol_variant_to_gene$variant == pvariant)
1510
- vgm$motif <- factor(vgm$motif, levels = mcol_variant_to_gene[mbv_sel, "motif"])
1511
- names(vgm$tf.celltype) <- vgm$tf.binding
1512
- vgmm <- vgm[which(vgm$motif %in% mcol_variant_to_gene$motif), ]
1513
- vgmm <- split(vgmm, vgmm$motif)
1514
- vgmg <- vgm[which(vgm$tf.gene %in% mcol_variant_to_gene$tf.gene), ]
1515
- vgmg <- split(vgmg, vgmg$motif)
1516
- vgm <- mapply(rbind, vgmm, vgmg, SIMPLIFY = F)
1517
-
1518
- results[mbv_sel]$matchingBindingEvent <- lapply(vgm, function(x) {
1519
- unique(x$tf.binding)
1520
- })[results[mbv_sel]$providerId]
1521
- results[mbv_sel]$matchingCelltype <- lapply(vgm, function(x) {
1522
- x$tf.celltype
1523
- })[results[mbv_sel]$providerId]
1524
- if (any(lengths(results[mbv_sel]$matchingBindingEvent) == 0)) {
1525
- results[mbv_sel][lengths(results[mbv_sel]$matchingBindingEvent) == 0]$matchingBindingEvent <- NA
1526
- results[mbv_sel][lengths(results[mbv_sel]$matchingCelltype) == 0]$matchingCelltype <- NA
1527
- }
1528
- }
1529
- attributes(results)$peaks <- remap_binding
1530
- return(results)
1531
- }
1532
-
1533
- #' @importFrom tools R_user_dir
1534
- #' @importFrom BiocFileCache BiocFileCache
1535
- .get_cache <- function() {
1536
- cache <- R_user_dir("motifbreakR", which = "cache")
1537
- BiocFileCache(cache = cache, ask = F)
1538
- }
1539
-
1540
- cachePeakFile <- function(fileURL, genome) {
1541
- bfc <- .get_cache()
1542
- rname <- paste("remap2022", genome, sep = "_")
1543
- rid <- bfcquery(bfc, genome, "rname")$rid
1544
- if (!length(rid)) {
1545
- rid <- names(bfcadd(bfc, rname, fileURL, ext = ".Rdata", download = FALSE))
1546
- }
1547
- if (!isFALSE(bfcneedsupdate(bfc, rid))) {
1548
- message("downloading peak file")
1549
- bfcdownload(bfc, rid, ask = FALSE, FUN = convertPeakFile)
1550
- message("peak download complete")
1551
- }
1552
- bfcrpath(bfc, rids = rid)
1553
- }
1554
-
1555
- #' @importFrom vroom vroom
1556
- loadPeakFile2 <- function(url_list, genome) {
1557
- peak_path <- cachePeakFile(url_list[[genome]], genome)
1558
- remap_peaks <- GRanges(vroom(peak_path,
1559
- delim = "\t",
1560
- col_names = c(
1561
- "chr", "start", "end", "name",
1562
- "score", "strand", "tstart",
1563
- "tend", "color"
1564
- ),
1565
- col_types = c("ciiciciic"),
1566
- col_select = c(chr, start, end, name),
1567
- progress = FALSE
1568
- ))
1569
- return(remap_peaks)
1570
- }
1571
-
1572
- loadPeakFile <- function(url_list, genome) {
1573
- peak_path <- cachePeakFile(url_list[[genome]], genome)
1574
- remap_peaks <- readRDS(peak_path)
1575
- return(remap_peaks)
1576
- }
1577
-
1578
- convertPeakFile <- function(from, to) {
1579
- message("processing peak file")
1580
- remap_peaks <- GRanges(vroom(from,
1581
- delim = "\t",
1582
- col_names = c(
1583
- "chr", "start", "end", "name",
1584
- "score", "strand", "tstart",
1585
- "tend", "color"
1586
- ),
1587
- col_types = c("ciiciciic"),
1588
- col_select = c(chr, start, end, name),
1589
- progress = FALSE
1590
- ))
1591
- saveRDS(remap_peaks, file = to)
1592
- message("processing peak file")
1593
- TRUE
1594
- }