biopipen 0.25.4__py3-none-any.whl → 0.26.1__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of biopipen might be problematic. Click here for more details.

@@ -0,0 +1,177 @@
1
+
2
+ library(ESCO)
3
+ library(rlang)
4
+ library(glue)
5
+
6
+ args <- {{envs.esco_args | r: todot="-"}}
7
+ args <- args %||% list()
8
+
9
+ save <- args$save
10
+ args$save <- NULL
11
+
12
+ if (!is.null(seed)) {
13
+ set.seed(seed)
14
+ args$seed <- seed
15
+ }
16
+ args$nGenes <- ngenes
17
+ args$nCells <- nsamples
18
+ args$dirname <- paste0(outdir, "/")
19
+ args$verbose <- TRUE
20
+ args$numCores <- ncores
21
+ type <- args$type
22
+
23
+ log_info("Running simulation ...")
24
+ sim <- do_call(escoSimulate, args)
25
+ attributes(sim) <- c(attributes(sim), c(simulation_tool = "ESCO"))
26
+ saveRDS(sim, file.path(outdir, "sim.rds"))
27
+
28
+ log_info("Plotting ...")
29
+ if (type == "single") {
30
+ asys <- assays(sim)
31
+ datalist = list(`simulated-truth` = asys$TrueCounts)
32
+ if (!is.null(asys$counts)) {
33
+ datalist$`zero-inflated` = asys$counts
34
+ }
35
+ if (!is.null(asys$observedcounts)) {
36
+ datalist$`down-sampled` = asys$observedcounts
37
+ }
38
+
39
+ log_info("- Plotting the data ...")
40
+ dataplot <- file.path(outdir, "data.png")
41
+ png(dataplot, width=length(datalist) * 600, height=1200, res=30)
42
+ heatdata(datalist, norm = FALSE, size = 2, ncol = 3)
43
+ dev.off()
44
+
45
+ rholist <- metadata(sim)$Params@corr
46
+ if (length(rholist) > 0) {
47
+ log_info("- Plotting the GCN ...")
48
+ corrgenes <- rownames(rholist[[1]])
49
+ gcnlist = lapply(datalist, function(data)gcn(data, genes = corrgenes))
50
+ gcnlist = append(gcnlist, list("given truth" = rholist[[1]]), 1)
51
+
52
+ gcnplot <- file.path(outdir, "gcn.png")
53
+ png(gcnplot, width=length(gcnlist) * 600, height=1200, res=30)
54
+ heatgcn(gcnlist, size = 2, ncol = 4)
55
+ dev.off()
56
+ }
57
+ } else if (type == "groups") {
58
+ asys <- assays(sim)
59
+ # organize the marker gene info
60
+ genegroup = paste0("Group", rowData(sim)$GeneGroup)
61
+ genegroup[which(genegroup=="Group0")] = "None"
62
+ geneinfo = data.frame(genes = rowData(sim)$Gene,
63
+ newcelltype = as.factor(genegroup))
64
+
65
+ # organize the cell info
66
+ cellinfo = data.frame(çells = colData(sim)$Cell,
67
+ newcelltype= as.factor(colData(sim)$Group))
68
+
69
+ # data
70
+ datalist = list(`simulated-truth` = asys$TrueCounts)
71
+ if (!is.null(asys$counts)) {
72
+ datalist$`zero-inflated` = asys$counts
73
+ }
74
+ if (!is.null(asys$observedcounts)) {
75
+ datalist$`down-sampled` = asys$observedcounts
76
+ }
77
+
78
+ log_info("- Plotting the data ...")
79
+ dataplot <- file.path(outdir, "data.png")
80
+ png(dataplot, width=length(datalist) * 600, height=1200, res=30)
81
+ heatdata(datalist, cellinfo = cellinfo, geneinfo = geneinfo, size = 1, ncol = 3)
82
+ dev.off()
83
+
84
+ log_info("- Plotting the GCN for all marker genes (i.e. DE genes) across all cell groups ...")
85
+ degeneinfo = geneinfo[which(geneinfo$newcelltype!="None"),]
86
+ degeneinfo$newcelltype = droplevels(degeneinfo$newcelltype)
87
+ degcnlist = lapply(datalist, function(data)gcn(data, genes = degeneinfo$genes))
88
+ gcnplot <- file.path(outdir, "gcn-allgroups.png")
89
+ png(gcnplot, width=length(degcnlist) * 700, height=1200, res=30)
90
+ heatgcn(degcnlist, geneinfo = degeneinfo, size = 2, ncol = 3)
91
+ dev.off()
92
+
93
+ log_info("- Plotting the GCN for marker genes within one cell group ...")
94
+ rholist = metadata(sim)$Params@corr
95
+ group2_gcnlist = lapply(datalist,
96
+ function(data){
97
+ gcn(data[,which(colData(sim)$Group=="Group2")],
98
+ CPM2 = TRUE,
99
+ genes = rownames(rholist[["Group2"]]))})
100
+ group2_gcnlist = append(group2_gcnlist,
101
+ list("given truth" = rholist[["Group2"]]), 1)
102
+ gcnplot2 <- file.path(outdir, "gcn-onegroup.png")
103
+ png(gcnplot2, width=length(group2_gcnlist) * 700, height=1200, res=30)
104
+ heatgcn(group2_gcnlist, size = 3, ncol = 4)
105
+ dev.off()
106
+ } else if (type == "tree") {
107
+ # get the data
108
+ datatrue = assays(sim)$TrueCounts
109
+
110
+ # get the cellinfo
111
+ cellinfo = data.frame(cell = colData(sim)$Cell,
112
+ newcelltype = as.factor(colData(sim)$Group))
113
+ levels(cellinfo$newcelltype) = tree$tip.label
114
+
115
+ # get the geneinfo
116
+ genegroup = paste0("Group", rowData(sim)$GeneGroup)
117
+ genegroup[which(genegroup=="Group0")] = "None"
118
+ geneinfo = data.frame(genes = rowData(sim)$Gene,
119
+ newcelltype = as.factor(genegroup))
120
+ levels(geneinfo$newcelltype)[1:3] = tree$tip.label
121
+
122
+ # get the DE geneinfo
123
+ groups <- colData(sim)$Group
124
+ group.names <- sort(unique(groups))
125
+ group.facs.gene <- rowData(sim)[, paste0("DEFac", group.names)]
126
+ DEgene.name = as.character(rowData(sim)$Gene[which(group.facs.gene[,1]>1)])
127
+ degeneinfo = geneinfo[match(DEgene.name, geneinfo$genes),]
128
+
129
+ log_info("- Plotting the data ...")
130
+ dataplot <- file.path(outdir, "data.png")
131
+ png(dataplot, width=2000, height=1200, res=30)
132
+ # plot the data
133
+ heatdata(list(datatrue),
134
+ colv = TRUE,
135
+ cellinfo = cellinfo,
136
+ geneinfo = degeneinfo,
137
+ genes = degeneinfo$genes,
138
+ size = 1.5, ncol = 1)
139
+ dev.off()
140
+ } else if (type == "traj") {
141
+ datatrue = assays(sim)$TrueCounts
142
+
143
+ # get the cellinfo
144
+ cellinfo = data.frame(cell = colData(sim)$Cell,
145
+ newcelltype = colData(sim)$Path)
146
+ # get the pesudo time
147
+ celltime = data.frame(path = as.numeric(colData(sim)$Path),
148
+ step = as.numeric(colData(sim)$Step))
149
+ celltime = order(celltime[,1], celltime[,2])
150
+
151
+ # get the geneinfo
152
+ degenes = which(metadata(sim)$Params@paths.DEgenes==1)
153
+
154
+ log_info("- Plotting the trajectory ...")
155
+ trajplot <- file.path(outdir, "traj.png")
156
+ png(trajplot, width=1600, height=1200, res=30)
157
+ # plot the data
158
+ umapplot(t(t(datatrue)/colSums(datatrue)),
159
+ celltype = colData(sim)$Path,
160
+ labels = levels(as.factor(colData(sim)$Path)))
161
+ dev.off()
162
+
163
+ log_info("- Plotting the data ...")
164
+ dataplot <- file.path(outdir, "data.png")
165
+ heatdata(list("simulated truth" = datatrue[degenes,]),
166
+ cellinfo = cellinfo,
167
+ colv = celltime, size = 1, ncol = 1)
168
+ dev.off()
169
+ }
170
+
171
+ simulated <- switch(save,
172
+ `simulated-truth` = assays(sim)$TrueCounts,
173
+ `zero-inflated` = assays(sim)$counts,
174
+ `down-sampled` = assays(sim)$observedcounts,
175
+ { stop(glue("Unknown save option: {save}, expected one of 'simulated-truth', 'zero-inflated', 'down-sampled'")) }
176
+ )
177
+
@@ -0,0 +1,42 @@
1
+
2
+ library(rlang)
3
+ library(RUVcorr)
4
+
5
+ args <- {{envs.ruvcorr_args | r: todot="-"}}
6
+ if (!is.null(seed)) { set.seed(seed) }
7
+
8
+ args$k <- args$k %||% 10
9
+ args$size.alpha <- args$size.alpha %||% 2
10
+ args$corr.strength <- args$corr.strength %||% 3
11
+ args$g <- args$g %||% NULL
12
+ args$Sigma.eps <- args$Sigma.eps %||% 1
13
+ args$nc <- args$nc %||% (ngenes %/% 4)
14
+ args$ne <- args$ne %||% (ngenes %/% 4)
15
+ args$intercept <- args$intercept %||% TRUE
16
+ args$check <- args$check %||% TRUE
17
+ args$n = ngenes
18
+ args$m = nsamples
19
+
20
+ log_info("Running simulation ...")
21
+ sim <- do_call(simulateGEdata, args)
22
+ attributes(sim) <- c(attributes(sim), c(simulation_tool = "RUVcorr"))
23
+ genes <- paste0("Gene", 1:ngenes)
24
+ samples <- paste0("Sample", 1:nsamples)
25
+
26
+ colnames(sim$Truth) <- genes
27
+ rownames(sim$Truth) <- samples
28
+ sim$Truth <- t(sim$Truth)
29
+ colnames(sim$Y) <- genes
30
+ rownames(sim$Y) <- samples
31
+ sim$Y <- t(sim$Y)
32
+ colnames(sim$Noise) <- genes
33
+ rownames(sim$Noise) <- samples
34
+ sim$Noise <- t(sim$Noise)
35
+ colnames(sim$Sigma) <- genes
36
+ rownames(sim$Sigma) <- genes
37
+
38
+ log_info("Saving results ...")
39
+ saveRDS(sim, file.path(outdir, "sim.rds"))
40
+ saveRDS(sim$Truth, file.path(outdir, "Truth.rds"))
41
+
42
+ simulated <- sim$Y
@@ -0,0 +1,23 @@
1
+ source("{{biopipen_dir}}/utils/misc.R")
2
+
3
+ ngenes <- {{in.ngenes | r}}
4
+ nsamples <- {{in.nsamples | r}}
5
+ outfile <- {{out.outfile | r}}
6
+ outdir <- {{out.outdir | r}}
7
+ seed <- {{envs.seed | r}}
8
+ ncores <- {{envs.ncores | r}}
9
+ transpose_output <- {{envs.transpose_output | r}}
10
+ index_start <- {{envs.index_start | r}}
11
+
12
+ {% if envs.tool.lower() == "ruvcorr" %}
13
+ {% include biopipen_dir + "/scripts/rnaseq/Simulation-RUVcorr.R" %}
14
+ {% elif envs.tool.lower() == "esco" %}
15
+ {% include biopipen_dir + "/scripts/rnaseq/Simulation-ESCO.R" %}
16
+ {% else %}
17
+ stop("Unknown tool: {{envs.tool}}, only 'RUVcorr' and 'ESCO' are supported.")
18
+ {% endif %}
19
+
20
+ colnames(simulated) <- paste0("Sample", index_start + 0:(nsamples - 1))
21
+ if (transpose_output) { simulated <- t(simulated) }
22
+
23
+ write.table(simulated, file = outfile, sep = "\t", quote = FALSE, row.names = TRUE, col.names = TRUE)
@@ -1,73 +1,342 @@
1
- infile = {{in.infile | r}}
2
- outfile = {{out.outfile | r}}
3
- infmt = {{envs.infmt | r}}
4
- inunit = {{envs.inunit | r}}
5
- outunit = {{envs.outunit | r}}
6
- refexon = {{envs.refexon | r}}
7
- inlog2p = {{envs.inlog2p | r}}
8
- outlog2p = {{envs.outlog2p | r}}
1
+ source("{{biopipen_dir}}/utils/misc.R")
2
+ library(rlang)
3
+ library(glue)
9
4
 
10
- if (infmt == "rds") {
11
- indata = readRDS(infile)
12
- } else if (endsWith(infile, ".gz")) {
13
- indata = read.table(infile, header=T, row.names=NULL, sep="\t", check.names = F)
14
- genes = make.unique(indata[, 1])
15
- indata = indata[, -1]
16
- rownames(indata) = genes
5
+ infile <- {{in.infile | r}}
6
+ outfile <- {{out.outfile | r}}
7
+ inunit <- {{envs.inunit | r}}
8
+ outunit <- {{envs.outunit | r}}
9
+ refexon <- {{envs.refexon | r}}
10
+ meanfl <- {{envs.meanfl | r}}
11
+ nreads <- {{envs.nreads | r}}
12
+
13
+ log_info("Reading input data ...")
14
+ indata = read.table(infile, header = TRUE, sep = "\t", row.names = 1, check.names = F)
15
+ samples = colnames(indata)
16
+
17
+ # parse the inunit to see if there is any transformation
18
+ parsable <- function(arg) { is.call(arg) || is_symbol(arg) }
19
+
20
+ check_call_args <- function(arg1, arg2) {
21
+ if (parsable(arg1) && parsable(arg2)) {
22
+ stop(glue("Can't parse the call. Multiple names or calls detected: {arg1}, {arg2}\n"))
23
+ }
24
+ if (!parsable(arg1) && !parsable(arg2)) {
25
+ stop(glue("Can't parse the call. Both arguments are constants: {arg1}, {arg2}. Use the result directly\n"))
26
+ }
17
27
  }
18
28
 
29
+ parse_call <- function(call, expr = "indata") {
30
+ if (!is.call(call)) {
31
+ call <- match.arg(
32
+ as_string(call),
33
+ c(
34
+ "count", "counts", "rawcount", "rawcounts",
35
+ "cpm",
36
+ "fpkm", "rpkm",
37
+ "fpkmuq", "rpkmuq",
38
+ "tpm",
39
+ "tmm"
40
+ )
41
+ )
42
+ return(glue("{as_string(call)} = {expr}"))
43
+ }
44
+ cn <- as_string(call_name(call))
45
+ args <- call_args(call)
46
+ if (length(args) == 1) {
47
+ # This should be those supported functions
48
+ cn <- match.arg(cn, c("log", "log2", "log10", "exp", "sqrt"))
49
+ if (cn == "log") return(parse_call(args[[1]], glue("e ^ ({expr})")))
50
+ if (cn == "log2") return(parse_call(args[[1]], glue("2 ^ ({expr})")))
51
+ if (cn == "log10") return(parse_call(args[[1]], glue("10 ^ ({expr})")))
52
+ if (cn == "exp") return(parse_call(args[[1]], glue("log({expr})")))
53
+ if (cn == "sqrt") return(parse_call(args[[1]], glue("({expr}) ^ 2")))
54
+ } else {
55
+ check_call_args(args[[1]], args[[2]])
56
+ if (cn == "+") {
57
+ if (parsable(args[[1]])) return(parse_call(args[[1]], glue("{expr} - {args[[2]]}")))
58
+ return(parse_call(args[[2]], glue("{expr} - {args[[1]]}")))
59
+ }
60
+ if (cn == "-") {
61
+ if (parsable(args[[1]])) return(parse_call(args[[1]], glue("{expr} + {args[[2]]}")))
62
+ return(parse_call(args[[2]], glue("{args[[1]]} - {expr}")))
63
+ }
64
+ if (cn == "*") {
65
+ if (parsable(args[[1]])) return(parse_call(args[[1]], glue("({expr}) / ({args[[2]]})")))
66
+ return(parse_call(args[[2]], glue("({expr}) / ({args[[1]]})")))
67
+ }
68
+ if (cn == "/") {
69
+ if (parsable(args[[1]])) return(parse_call(args[[1]], glue("({expr}) * ({args[[2]]})")))
70
+ return(parse_call(args[[2]], glue("({args[[1]]}) / ({expr})")))
71
+ }
72
+ if (cn == "^") {
73
+ if (parsable(args[[1]])) return(parse_call(args[[1]], glue("{expr} * (1 / ({args[[2]]}))")))
74
+ return(parse_call(args[[2]], glue("log({expr}, {args[[1]]})")))
75
+ }
76
+ stop(paste0("Unknown function to parse: {cn}\n"))
77
+ }
78
+ }
19
79
 
20
- glenFromExon = function(exonfile, x) {
21
- gff = read.table(exonfile, header = F, row.names = NULL)
22
- # V4: start, V5: end, V10: gene name
23
- glen = aggregate(V5-V4+1 ~ V10, gff, sum)
24
- genes = glen[,1]
25
- glen = glen[,-1,drop=F]
26
- rownames(glen) = genes
80
+ glenFromExon = function(exonfile, data) {
81
+ gff = read.table(exonfile, header = F, row.names = NULL)
82
+ # V4: start, V5: end, V10: gene name
83
+ glen = aggregate(V5-V4+1 ~ V10, gff, sum)
84
+ genes = glen[,1]
85
+ glen = glen[,-1,drop=F]
86
+ rownames(glen) = genes
27
87
 
28
- mygenes = rownames(x)
29
- outgenes = intersect(genes, mygenes)
30
- if (length(outgenes) < length(mygenes))
31
- warning('Genes not found in refexon: ', paste(setdiff(mygenes, outgenes)))
88
+ mygenes = rownames(data)
89
+ outgenes = intersect(genes, mygenes)
90
+ if (length(outgenes) < length(mygenes))
91
+ logger('Genes not found in refexon: ', paste(setdiff(mygenes, outgenes), collapse = ','), level = 'WARNING')
32
92
 
33
- glen[outgenes, , drop = FALSE]
93
+ glen[outgenes, , drop = FALSE]
34
94
  }
35
95
 
36
96
  meanflFromFile = function(samples, mflfile) {
37
- if (is.numeric(mflfile)) {
38
- ret = matrix(mflfile, nrow = length(samples), ncol = 1)
39
- rownames(ret) = samples
40
- } else {
41
- ret = read.table(mflfile, header = F, row.names = 1, check.names = F, sep = "\t")
42
- ret = ret[samples,,drop = F]
43
- }
44
- ret
97
+ if (is.numeric(mflfile)) {
98
+ ret = matrix(mflfile, nrow = length(samples), ncol = 1)
99
+ rownames(ret) = samples
100
+ } else {
101
+ ret = read.table(mflfile, header = F, row.names = 1, check.names = F, sep = "\t")
102
+ ret = ret[samples,,drop = F]
103
+ }
104
+ ret
105
+ }
106
+
107
+ nreadsFromFile = function(samples, nreads) {
108
+ if (is.numeric(nreads)) {
109
+ ret = matrix(nreads, nrow = length(samples), ncol = 1)
110
+ rownames(ret) = samples
111
+ } else {
112
+ ret = read.table(nreads, header = F, row.names = 1, check.names = F, sep = "\t")
113
+ ret = ret[samples,,drop = F]
114
+ }
115
+ ret
116
+ }
117
+
118
+ count2cpm <- function(data) {
119
+ edgeR::cpm(data)
120
+ }
121
+
122
+ count2fpkm = function(data) {
123
+ # may lose some genes
124
+ glen = glenFromExon(refexon, data)
125
+ data = data[rownames(glen), , drop = F]
126
+ dge = edgeR::DGEList(counts=data)
127
+
128
+ dge$genes$Length = glen
129
+ edgeR::rpkm(dge)
130
+ }
131
+
132
+ count2fpkmuq = function(data) {
133
+ # may lose some genes
134
+ glen = glenFromExon(refexon, data)
135
+ data = data[rownames(glen), , drop = FALSE]
136
+
137
+ fld = meanflFromFile(samples, meanfl)
138
+ expr = sapply(samples, function(s){
139
+ RC75 = quantile(data[, s], .75)
140
+ exp( log(data[, s]) + log(1e9) - log(glen - fld[s, ] + 1) - log(RC75) )
141
+ })
142
+ rownames(expr) = rownames(data)
143
+ expr
144
+ }
145
+
146
+ count2tpm = function(data) {
147
+ glen = glenFromExon(refexon, data)
148
+ data = data[rownames(glen), , drop = F]
149
+ fld = meanflFromFile(samples, meanfl)
150
+
151
+ # see: https://gist.github.com/slowkow/c6ab0348747f86e2748b
152
+ expr = as.data.frame(sapply(samples, function(s){
153
+ rate = log(data[, s]) - log(glen - fld[s, ] + 1)
154
+ denom = log(sum(exp(rate)))
155
+ exp(rate - denom + log(1e6))
156
+ }))
157
+ colnames(expr) = colnames(data)
158
+ rownames(expr) = rownames(data)
159
+ expr
160
+ }
161
+
162
+ count2tmm = function(data) {
163
+ dge = edgeR::DGEList(counts=data)
164
+ dge = edgeR::calcNormFactors(dge, method = "TMM")
165
+ edgeR::cpm(dge)
166
+ }
167
+
168
+ fpkm2count = function(data) {
169
+ glen = glenFromExon(refexon, data)
170
+ data = data[rownames(glen), , drop = F]
171
+ fld = meanflFromFile(samples, meanfl)
172
+ totalnr = nreadsFromFile(samples, nreads)
173
+
174
+ expr = sapply(samples, function(s){
175
+ N = totalnr[s, ]
176
+ exp( log(data[, s]) + log(N) + log(glen - fld[s, ] + 1) - log(1e9) )
177
+ })
178
+ rownames(expr) = rownames(data)
179
+ expr
180
+ }
181
+
182
+ fpkm2tpm = function(data) {
183
+ expr = sapply(samples, function(s) {
184
+ exp( log(data[, s]) - log(sum(data[, s])) + log(1e6) )
185
+ })
186
+ rownames(expr) = rownames(data)
187
+ expr
188
+ }
189
+
190
+ fpkm2cpm = function(data) {
191
+ glen = glenFromExon(refexon, data)
192
+ data = data[rownames(glen), , drop = F]
193
+ expr = sapply(samples, function(s) {
194
+ exp( log(data[, s]) - log(1e3) - log(glen - fld[s, ] + 1) )
195
+ })
196
+ rownames(expr) = rownames(data)
197
+ expr
198
+ }
199
+
200
+ tpm2count = function(data) {
201
+ totalnr = nreadsFromFile(samples, nreads)
202
+ ngenes = nrow(data)
203
+
204
+ expr = sapply(samples, function(s){
205
+ # counts to tpm:
206
+ # rate <- log(counts) - log(effLen)
207
+ # denom <- log(sum(exp(rate)))
208
+ # tpm = exp(rate - denom + log(1e6))
209
+ # so:
210
+ # log(tpm) = rate - denom + log(1e6)
211
+ # rate = log(tpm) + denom - log(1e6)
212
+ # log(counts) - log(effLen) = log(tpm) + log(sum(exp(rate))) - log(1e6)
213
+ # log(counts) - log(effLen) = log(tpm) + log(sum(exp(log(counts) - log(effLen)))) - log(1e6)
214
+ # log(counts) - log(effLen) = log(tpm) + log(sum(exp(log(counts))/exp(log(effLen)))) - log(1e6)
215
+ # log(counts) - log(effLen) = log(tpm) + log(sum(counts/effLen)) - log(1e6)
216
+ # ?????????????
217
+ # ??? estimated by sum(counts)/sum(effLen) * length(effLen)
218
+ # log(counts) = log(effLen) + log(tpm) + log(sum(counts)) - log(effLen) + log(length(effLen))) - log(1e6)
219
+ # counts = expr( log(tpm) + log(nreads) + log(length(effLen)) - log(1e6) )
220
+ exp( log(data[, s]) + log(totalnr[s, ]) + log(ngenes) - log(1e6) )
221
+ })
222
+ rownames(expr) = rownames(data)
223
+ expr
224
+ }
225
+
226
+ tpm2fpkm = function(data) {
227
+ totalnr = nreadsFromFile(samples, nreads)
228
+ expr = sapply(samples, function(s) {
229
+ exp( log(data[, s]) - log(1e6) + log(totalnr[s, ]) )
230
+ })
231
+ rownames(expr) = rownames(data)
232
+ expr
233
+ }
234
+
235
+ tpm2cpm = function(data) {
236
+ glen = glenFromExon(refexon, data)
237
+ data = data[rownames(glen), , drop = F]
238
+ fld = meanflFromFile(samples, meanfl)
239
+ ngenes = length(outgenes)
240
+
241
+ expr = sapply(samples, function(s) {
242
+ exp( log(data[, s]) + log(glen - fld[s, ] + 1) - log(sum(glen - fld[s, ] + 1)) + log(ngenes) )
243
+ })
244
+ rownames(expr) = rownames(data)
245
+ expr
246
+ }
247
+
248
+ cpm2count = function(data) {
249
+ totalnr = nreadsFromFile(samples, nreads)
250
+
251
+ expr = sapply(samples, function(s) {
252
+ exp( log(data[, s]) + log(totalnr[s, ]) - log(1e6) )
253
+ })
254
+ rownames(expr) = rownames(data)
255
+ expr
45
256
  }
46
257
 
47
- count2tpm = function(x) {
48
- glen = glenFromExon(refexon, x)
49
- x = x[rownames(glen), , drop = F]
50
- fld = meanflFromFile(samples, meanfl)
258
+ cpm2fpkm = function(data) {
259
+ glen = glenFromExon(refexon, data)
260
+ data = data[rownames(glen), , drop = F]
261
+ expr = sapply(samples, function(s) {
262
+ exp( log(data[, s]) + log(1e3) - log(glen - fld[s, ] + 1) )
263
+ })
264
+ rownames(expr) = rownames(data)
265
+ expr
266
+ }
51
267
 
52
- # see: https://gist.github.com/slowkow/c6ab0348747f86e2748b
53
- expr = as.data.frame(sapply(samples, function(s){
54
- rate = log(x[, s]) - log(glen - fld[s, ] + 1)
55
- denom = log(sum(exp(rate)))
56
- exp(rate - denom + log(1e6))
57
- }))
58
- colnames(expr) = colnames(x)
59
- rownames(expr) = rownames(x)
60
- expr
268
+ cpm2tpm = function(data) {
269
+ glen = glenFromExon(refexon, data)
270
+ data = data[rownames(glen), , drop = F]
271
+ ngenes = nrow(glen)
272
+ expr = sapply(samples, function(s) {
273
+ exp( log(data[, s]) - log(glen - fld[s, ] + 1) - log(sum(glen - fld[s, ] + 1)) + log(ngenes) )
274
+ })
275
+ rownames(expr) = rownames(data)
276
+ expr
61
277
  }
62
278
 
63
- if (inunit %in% c('count', 'counts', 'rawcount', 'rawcounts')) {
64
- inunit = "count"
279
+ is.count = function(unit) {unit %in% c('count', 'counts', 'rawcount', 'rawcounts')}
280
+ is.cpm = function(unit) {unit == 'cpm'}
281
+ is.fpkm = function(unit) {unit %in% c('fpkm', 'rpkm')}
282
+ is.fpkmuq = function(unit) {unit %in% c('fpkmuq', 'rpkmuq')}
283
+ is.tpm = function(unit) {unit == 'tpm'}
284
+ is.tmm = function(unit) {unit == 'tmm'}
285
+
286
+ # log2(count + 1) -> count = 2 ^ indata - 1
287
+ parsed_transformation <- parse_call(parse_expr(inunit))
288
+ splits <- strsplit(parsed_transformation, " = ")[[1]]
289
+ if (is.count(splits[[1]])) {
290
+ intype <- "count"
291
+ } else if (is.cpm(splits[[1]])) {
292
+ intype <- "cpm"
293
+ } else if (is.fpkm(splits[[1]])) {
294
+ intype <- "fpkm"
295
+ } else if (is.fpkmuq(splits[[1]])) {
296
+ intype <- "fpkmuq"
297
+ } else if (is.tpm(splits[[1]])) {
298
+ intype <- "tpm"
299
+ } else if (is.tmm(splits[[1]])) {
300
+ intype <- "tmm"
301
+ } else {
302
+ stop(glue("Can't find a supported unit in the inunit: {inunit}\n"))
65
303
  }
304
+ splits[1] <- intype
305
+ eval(parse_expr(paste(splits, collapse = " = ")))
306
+ indata <- get(intype)
66
307
 
308
+ # find out the outtype
309
+ if (grepl('rawcounts|rawcount|counts|count', outunit)) {
310
+ outtype <- 'count'
311
+ outunit <- gsub('rawcounts|rawcount|counts|count', 'count', outunit)
312
+ } else if (grepl('fpkmuq|rpkmuq', outunit)) {
313
+ outtype <- 'fpkmuq'
314
+ outunit <- gsub('fpkmuq|rpkmuq', 'fpkmuq', outunit)
315
+ } else if (grepl('fpkm|rpkm', outunit)) {
316
+ outtype <- 'fpkm'
317
+ outunit <- gsub('fpkm|rpkm', 'fpkm', outunit)
318
+ } else if (grepl('tpm', outunit)) {
319
+ outtype <- 'tpm'
320
+ } else if (grepl('cpm', outunit)) {
321
+ outtype <- 'cpm'
322
+ } else if (grepl('tmm', outunit)) {
323
+ outtype <- 'tmm'
324
+ } else {
325
+ stop(glue("Can't find a supported unit in the outunit: {outunit}\n"))
326
+ }
67
327
 
68
- convert = function(data, inunit, outunit) {
69
- func = get(paste0(inunit, "2", outunit))
70
- func(data)
328
+ log_info("Transforming data by resolving {inunit} ...")
329
+ if (intype == outtype) {
330
+ fun <- identity
331
+ } else {
332
+ fun <- glue("{intype}2{outtype}")
333
+ fun <- tryCatch(
334
+ { get(fun) },
335
+ error = function(e) { stop(glue("Unsupported conversion from {intype} to {outunit}\n")) }
336
+ )
71
337
  }
338
+ assign(outtype, fun(indata))
339
+ out <- eval(parse_expr(outunit))
72
340
 
73
- convert(indata, inunit, outunit)
341
+ log_info("Saving output data ...")
342
+ write.table(out, outfile, quote=FALSE, row.names=TRUE, col.names=TRUE, sep="\t")