@sjcrh/proteinpaint-server 2.122.0 → 2.123.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/dataset/termdb.test.js +1 -1
- package/package.json +4 -6
- package/routes/burden.js +3 -4
- package/routes/correlationVolcano.js +2 -4
- package/routes/samplewsimages.js +9 -0
- package/routes/termdb.DE.js +2 -4
- package/routes/termdb.cluster.js +2 -4
- package/routes/termdb.config.js +2 -3
- package/routes/termdb.rootterm.js +3 -3
- package/routes/termdb.termchildren.js +3 -3
- package/routes/termdb.violin.js +2 -5
- package/src/app.js +563 -496
- package/src/run_R.js +0 -66
- package/utils/binom.R +0 -17
- package/utils/burden-ci95.R +0 -134
- package/utils/burden-main.R +0 -46
- package/utils/corr.R +0 -38
- package/utils/cuminc.R +0 -279
- package/utils/density.R +0 -36
- package/utils/edge.R +0 -283
- package/utils/fdr.R +0 -9
- package/utils/fisher.2x3.R +0 -12
- package/utils/fisher.R +0 -9
- package/utils/getBurden.R +0 -371
- package/utils/getGeneFromMatrix.R +0 -40
- package/utils/hclust.R +0 -110
- package/utils/km.R +0 -13
- package/utils/lowess.R +0 -9
- package/utils/regression.R +0 -154
- package/utils/regression.utils.R +0 -804
- package/utils/survival.R +0 -92
- package/utils/wilcoxon.R +0 -73
package/utils/edge.R
DELETED
|
@@ -1,283 +0,0 @@
|
|
|
1
|
-
# Test syntax: cat ~/sjpp/test.txt | time Rscript edge.R
|
|
2
|
-
|
|
3
|
-
# Load required packages
|
|
4
|
-
suppressWarnings({
|
|
5
|
-
library(jsonlite)
|
|
6
|
-
library(rhdf5)
|
|
7
|
-
library(stringr)
|
|
8
|
-
library(readr)
|
|
9
|
-
suppressPackageStartupMessages(library(edgeR))
|
|
10
|
-
suppressPackageStartupMessages(library(dplyr))
|
|
11
|
-
})
|
|
12
|
-
|
|
13
|
-
# Filter based on CPM
|
|
14
|
-
filter_using_cpm <- function(y, gene_cpm_cutoff, sample_cpm_cutoff, count_cpm_cutoff) {
|
|
15
|
-
selr <- rowSums(cpm(y$counts)>gene_cpm_cutoff)>=sample_cpm_cutoff
|
|
16
|
-
selc <- colSums(cpm(y$counts))>=count_cpm_cutoff
|
|
17
|
-
y <- y[selr, selc]
|
|
18
|
-
}
|
|
19
|
-
|
|
20
|
-
# Read JSON input from stdin
|
|
21
|
-
read_json_time <- system.time({
|
|
22
|
-
con <- file("stdin", "r")
|
|
23
|
-
json <- readLines(con, warn=FALSE)
|
|
24
|
-
close(con)
|
|
25
|
-
input <- fromJSON(json)
|
|
26
|
-
cases <- unlist(strsplit(input$case, ","))
|
|
27
|
-
controls <- unlist(strsplit(input$control, ","))
|
|
28
|
-
combined <- c("geneID", "geneSymbol", cases, controls)
|
|
29
|
-
})
|
|
30
|
-
#cat("Time to read JSON: ", as.difftime(read_json_time, units = "secs")[3], " seconds\n")
|
|
31
|
-
|
|
32
|
-
# Read counts data
|
|
33
|
-
read_counts_time <- system.time({
|
|
34
|
-
if (input$storage_type == "HDF5") {
|
|
35
|
-
geneIDs <- h5read(input$input_file, "gene_ids")
|
|
36
|
-
geneNames <- h5read(input$input_file, "gene_names")
|
|
37
|
-
samples <- h5read(input$input_file, "samples")
|
|
38
|
-
|
|
39
|
-
# Find indices of case and control samples in the HDF5 file
|
|
40
|
-
case_indices <- match(cases, samples)
|
|
41
|
-
control_indices <- match(controls, samples)
|
|
42
|
-
|
|
43
|
-
# Check for missing samples
|
|
44
|
-
if (any(is.na(case_indices))) {
|
|
45
|
-
missing_cases <- cases[is.na(case_indices)]
|
|
46
|
-
stop(paste(missing_cases, "not found"))
|
|
47
|
-
}
|
|
48
|
-
if (any(is.na(control_indices))) {
|
|
49
|
-
missing_controls <- controls[is.na(control_indices)]
|
|
50
|
-
stop(paste(missing_controls, "not found"))
|
|
51
|
-
}
|
|
52
|
-
|
|
53
|
-
samples_indices <- c(case_indices, control_indices)
|
|
54
|
-
read_counts <- as.data.frame(t(h5read(input$input_file, "counts", index = list(samples_indices, 1:length(geneIDs)))))
|
|
55
|
-
colnames(read_counts) <- c(cases, controls)
|
|
56
|
-
} else if (input$storage_type == "text") {
|
|
57
|
-
suppressWarnings({
|
|
58
|
-
suppressMessages({
|
|
59
|
-
read_counts <- read_tsv(input$input_file, col_names = TRUE, col_select = combined)
|
|
60
|
-
})
|
|
61
|
-
})
|
|
62
|
-
geneIDs <- unlist(read_counts[1])
|
|
63
|
-
geneNames <- unlist(read_counts[2])
|
|
64
|
-
read_counts <- select(read_counts, -geneID)
|
|
65
|
-
read_counts <- select(read_counts, -geneSymbol)
|
|
66
|
-
} else {
|
|
67
|
-
stop("Unknown storage type")
|
|
68
|
-
}
|
|
69
|
-
})
|
|
70
|
-
#cat("Time to read counts data: ", as.difftime(read_counts_time, units = "secs")[3], " seconds\n")
|
|
71
|
-
|
|
72
|
-
# Create conditions vector
|
|
73
|
-
conditions <- c(rep("Diseased", length(cases)), rep("Control", length(controls)))
|
|
74
|
-
gene_id_symbols <- paste0(geneIDs, "\t", geneNames)
|
|
75
|
-
|
|
76
|
-
# Create DGEList object
|
|
77
|
-
dge_list_time <- system.time({
|
|
78
|
-
y <- DGEList(counts = read_counts, group = conditions, genes = gene_id_symbols)
|
|
79
|
-
})
|
|
80
|
-
#cat("Time to generate DGEList: ", as.difftime(dge_list_time, units = "secs")[3], " seconds\n")
|
|
81
|
-
|
|
82
|
-
# Filter and normalize counts
|
|
83
|
-
filter_time <- system.time({
|
|
84
|
-
keep <- filterByExpr(y, min.count = input$min_count, min.total.count = input$min_total_count)
|
|
85
|
-
})
|
|
86
|
-
#cat("Time to filter by expression: ", as.difftime(filter_time, unit = "secs")[3], " seconds\n")
|
|
87
|
-
|
|
88
|
-
normalization_time <- system.time({
|
|
89
|
-
y <- y[keep, keep.lib.sizes = FALSE]
|
|
90
|
-
y <- normLibSizes(y) # Using TMM method for normalization
|
|
91
|
-
})
|
|
92
|
-
#cat("Normalization time: ", as.difftime(normalization_time, units = "secs")[3], " seconds\n")
|
|
93
|
-
|
|
94
|
-
# Cutoffs for cpm, will add them as UI options later
|
|
95
|
-
if (length(samples_indices) > 100) {
|
|
96
|
-
gene_cpm_cutoff <- 15
|
|
97
|
-
sample_cpm_cutoff <- 30
|
|
98
|
-
count_cpm_cutoff <- 100000
|
|
99
|
-
} else {
|
|
100
|
-
gene_cpm_cutoff <- 5
|
|
101
|
-
sample_cpm_cutoff <- 15
|
|
102
|
-
count_cpm_cutoff <- 100000
|
|
103
|
-
}
|
|
104
|
-
|
|
105
|
-
filter_using_cpm_time <- system.time({
|
|
106
|
-
y <- filter_using_cpm(y, gene_cpm_cutoff, sample_cpm_cutoff, count_cpm_cutoff) # Filtering counts matrix based on gene_cpm_cutoff, sample_cpm_cutoff and count_cpm_cutoff
|
|
107
|
-
})
|
|
108
|
-
#cat("Filter using cpm time: ", as.difftime(filter_using_cpm_time, units = "secs")[3], " seconds\n")
|
|
109
|
-
|
|
110
|
-
if (dim(y)[1]==0) { # Its possible after filtering there might not be any genes left in the matrix, in such a case the R code must exit gracefully with an error.
|
|
111
|
-
stop("Number of genes after filtering = 0, cannot proceed any further")
|
|
112
|
-
}
|
|
113
|
-
if (dim(y)[2]==0) { # Its possible after filtering there might not be any samples left in the matrix, in such a case the R code must exit gracefully with an error.
|
|
114
|
-
stop("Number of samples after filtering = 0, cannot proceed any further")
|
|
115
|
-
}
|
|
116
|
-
|
|
117
|
-
# Saving MDS plot image
|
|
118
|
-
|
|
119
|
-
if (dim(read_counts)[1] * dim(read_counts)[2] < as.numeric(input$mds_cutoff)) { # If the dimensions of the read counts matrix is below this threshold, only then the mds image will be generated as its very compute intensive
|
|
120
|
-
mds_plot_time <- system.time({
|
|
121
|
-
set.seed(as.integer(Sys.time())) # Set the seed according to current time
|
|
122
|
-
cachedir <- input$cachedir # Importing serverconfig.cachedir
|
|
123
|
-
random_number <- runif(1, min = 0, max = 1) # Generating random number
|
|
124
|
-
mds_image_name <- paste0("edgeR_mds_temp_",random_number,".png") # Generating random image name so that simultaneous server side requests do NOT generate the same edgeR file name
|
|
125
|
-
png(filename = paste0(cachedir,"/",mds_image_name), width = 1000, height = 1000, res = 200) # Opening a png device
|
|
126
|
-
par(oma = c(0, 0, 0, 0)) # Creating a margin
|
|
127
|
-
mds_conditions <- c(rep("T", length(cases)), rep("C", length(controls))) # Case samples are labelled "T" and control samples are labelled "C". Single-letter labelling added because otherwise labels get overwritten on each other.
|
|
128
|
-
plotMDS(y, labels = mds_conditions) # Plot the edgeR MDS plot
|
|
129
|
-
# dev.off() # Gives a null device message which breaks JSON. Commenting it out for now, will investigate it later
|
|
130
|
-
})
|
|
131
|
-
#cat("mds plot time: ", as.difftime(mds_plot_time, units = "secs")[3], " seconds\n")
|
|
132
|
-
}
|
|
133
|
-
|
|
134
|
-
# Differential expression analysis
|
|
135
|
-
if (length(input$conf1) == 0) { # No adjustment of confounding factors
|
|
136
|
-
design <- model.matrix(~conditions) # Based on the protocol defined in section 1.4 of edgeR manual https://bioconductor.org/packages/release/bioc/vignettes/edgeR/inst/doc/edgeRUsersGuide.pdf
|
|
137
|
-
} else { # Adjusting for confounding factors
|
|
138
|
-
# Check the type of confounding variable
|
|
139
|
-
if (input$conf1_mode == "continuous") { # If this is float, the input conf1 vector should be converted into a numeric vector
|
|
140
|
-
conf1 <- as.numeric(input$conf1)
|
|
141
|
-
} else { # When input$conf1_mode == "discrete" keep the vector as string.
|
|
142
|
-
conf1 <- as.factor(input$conf1)
|
|
143
|
-
}
|
|
144
|
-
|
|
145
|
-
if (length(input$conf2) == 0) { # No adjustment of confounding factor 2
|
|
146
|
-
y$samples <- data.frame(y$samples, conditions = conditions, conf1 = conf1)
|
|
147
|
-
model_gen_time <- system.time({
|
|
148
|
-
design <- model.matrix(~ conditions + conf1, data = y$samples)
|
|
149
|
-
})
|
|
150
|
-
#cat("Time for making design matrix: ", as.difftime(model_gen_time, units = "secs")[3], " seconds\n")
|
|
151
|
-
} else {
|
|
152
|
-
# Check the type of confounding variable 2
|
|
153
|
-
if (input$conf2_mode == "continuous") { # If this is float, the input conf2 vector should be converted into a numeric vector
|
|
154
|
-
conf2 <- as.numeric(input$conf2)
|
|
155
|
-
} else { # When input$conf2_mode == "discrete" keep the vector as string.
|
|
156
|
-
conf2 <- as.factor(input$conf2)
|
|
157
|
-
}
|
|
158
|
-
y$samples <- data.frame(y$samples, conditions = conditions, conf1 = conf1, conf2 = conf2)
|
|
159
|
-
model_gen_time <- system.time({
|
|
160
|
-
design <- model.matrix(~ conditions + conf1 + conf2, data = y$samples)
|
|
161
|
-
})
|
|
162
|
-
#cat("Time for making design matrix: ", as.difftime(model_gen_time, units = "secs")[3], " seconds\n")
|
|
163
|
-
|
|
164
|
-
}
|
|
165
|
-
}
|
|
166
|
-
|
|
167
|
-
DE_method <- input$DE_method
|
|
168
|
-
if (DE_method == "edgeR") {
|
|
169
|
-
fit_time <- system.time({
|
|
170
|
-
suppressWarnings({
|
|
171
|
-
suppressMessages({
|
|
172
|
-
fit <- glmQLFit(y,design) # The glmQLFit() replaces glmFit() which implements the quasi-likelihood function. This is better able to account for overdispersion as it employs a more lenient approach where variance is not a fixed function of the mean.
|
|
173
|
-
})
|
|
174
|
-
})
|
|
175
|
-
})
|
|
176
|
-
#cat("QL fit time: ", as.difftime(fit_time, units = "secs")[3], " seconds\n")
|
|
177
|
-
test_time <- system.time({
|
|
178
|
-
suppressWarnings({
|
|
179
|
-
suppressMessages({
|
|
180
|
-
et <- glmQLFTest(fit, coef = "conditionsDiseased")
|
|
181
|
-
})
|
|
182
|
-
})
|
|
183
|
-
})
|
|
184
|
-
#cat("QL test time: ", as.difftime(test_time, units = "secs")[3], " seconds\n")
|
|
185
|
-
|
|
186
|
-
# Saving QL fit image
|
|
187
|
-
ql_plot_time <- system.time({
|
|
188
|
-
set.seed(as.integer(Sys.time())) # Set the seed according to current time
|
|
189
|
-
cachedir <- input$cachedir # Importing serverconfig.cachedir
|
|
190
|
-
random_number <- runif(1, min = 0, max = 1) # Generating random number
|
|
191
|
-
fit_image_name <- paste0("edgeR_ql_temp_",random_number,".png") # Generating random image name so that simultaneous server side requests do NOT generate the same edgeR file name
|
|
192
|
-
png(filename = paste0(cachedir,"/",fit_image_name), width = 1000, height = 1000, res = 200) # Opening a png device
|
|
193
|
-
par(oma = c(0, 0, 0, 0)) # Creating a margin
|
|
194
|
-
plotQLDisp(fit) # Plot the edgeR fit
|
|
195
|
-
# dev.off() # Gives a null device message which breaks JSON. Commenting it out for now, will investigate it later
|
|
196
|
-
})
|
|
197
|
-
#cat("ql plot time: ", as.difftime(ql_plot_time, units = "secs")[3], " seconds\n")
|
|
198
|
-
logfc <- et$table$logFC
|
|
199
|
-
logcpm <- et$table$logCPM
|
|
200
|
-
pvalues <- et$table$PValue
|
|
201
|
-
genes_matrix <- str_split_fixed(unlist(et$genes), "\t", 2)
|
|
202
|
-
geneids <- unlist(genes_matrix[, 1])
|
|
203
|
-
geneNames <- unlist(genes_matrix[, 2])
|
|
204
|
-
adjust_p_values <- p.adjust(pvalues, method = "fdr")
|
|
205
|
-
} else if (DE_method == "limma") {
|
|
206
|
-
# Do voom transformation and fit linear model
|
|
207
|
-
voom_transformation_lmfit_time <- system.time({
|
|
208
|
-
suppressWarnings({
|
|
209
|
-
suppressMessages({
|
|
210
|
-
set.seed(as.integer(Sys.time())) # Set the seed according to current time
|
|
211
|
-
cachedir <- input$cachedir # Importing serverconfig.cachedir
|
|
212
|
-
random_number <- runif(1, min = 0, max = 1) # Generating random number
|
|
213
|
-
fit_image_name <- paste0("limma_voom_temp_",random_number,".png") # Generating random image name so that simultaneous server side requests do NOT generate the same edgeR file name
|
|
214
|
-
png(filename = paste0(cachedir,"/",fit_image_name), width = 1000, height = 1000, res = 200) # Opening a png device
|
|
215
|
-
par(oma = c(0, 0, 0, 0)) # Creating a margin
|
|
216
|
-
suppressWarnings({
|
|
217
|
-
suppressMessages({
|
|
218
|
-
fit <- voomLmFit(y, design, plot = TRUE) # This is base don the recommendation of the edgeR limma/voom authors https://support.bioconductor.org/p/9161585/
|
|
219
|
-
})
|
|
220
|
-
})
|
|
221
|
-
dev.off() # Gives a null device message which breaks JSON. Commenting it out for now, will investigate it later
|
|
222
|
-
})
|
|
223
|
-
})
|
|
224
|
-
})
|
|
225
|
-
#cat("voom transformation + limma fit time: ", as.difftime(voom_transformation_lmfit_time, units = "secs")[3], " seconds\n")
|
|
226
|
-
|
|
227
|
-
# Saving mean-difference plot (aka MA plot)
|
|
228
|
-
#set.seed(as.integer(Sys.time())) # Set the seed according to current time
|
|
229
|
-
#cachedir <- input$cachedir # Importing serverconfig.cachedir
|
|
230
|
-
#random_number <- runif(1, min = 0, max = 1) # Generating random number
|
|
231
|
-
#md_image_name <- paste0("limma_md_temp_",random_number,".png") # Generating random image name so that simultaneous server side requests do NOT generate the same edgeR file name
|
|
232
|
-
#png(filename = paste0(cachedir,"/",md_image_name), width = 1000, height = 1000, res = 200) # Opening a png device
|
|
233
|
-
#par(oma = c(0, 0, 0, 0)) # Creating a margin
|
|
234
|
-
#plotMD(fit) # Plot the limma fit
|
|
235
|
-
## dev.off() # Gives a null device message which breaks JSON. Commenting it out for now, will investigate it later
|
|
236
|
-
|
|
237
|
-
# Empirical Bayes smoothing
|
|
238
|
-
empirical_smoothing_time <- system.time({
|
|
239
|
-
suppressWarnings({
|
|
240
|
-
suppressMessages({
|
|
241
|
-
tmp <- eBayes(fit)
|
|
242
|
-
})
|
|
243
|
-
})
|
|
244
|
-
})
|
|
245
|
-
#cat("Empirical smoothing time: ", as.difftime(empirical_smoothing_time, units = "secs")[3], " seconds\n")
|
|
246
|
-
|
|
247
|
-
# Time for selecting top genes
|
|
248
|
-
top_genes_selection_time <- system.time({
|
|
249
|
-
suppressWarnings({
|
|
250
|
-
suppressMessages({
|
|
251
|
-
top_table <- topTable(tmp, coef = "conditionsDiseased", number = Inf, adjust.method = "fdr") # The coeff needs to be specified in topTable() because it needs to know for which contrast the logFC needs to be calculated https://www.biostars.org/p/160465/
|
|
252
|
-
logfc <- top_table$logFC
|
|
253
|
-
pvalues <- top_table$P.Value
|
|
254
|
-
genes_matrix <- str_split_fixed(unlist(top_table$genes), "\t", 2)
|
|
255
|
-
geneids <- unlist(genes_matrix[, 1])
|
|
256
|
-
geneNames <- unlist(genes_matrix[, 2])
|
|
257
|
-
adjust_p_values <- top_table$adj.P.Val
|
|
258
|
-
})
|
|
259
|
-
})
|
|
260
|
-
})
|
|
261
|
-
#cat("Time for selecting top genes: ", as.difftime(top_genes_selection_time, units = "secs")[3], " seconds\n")
|
|
262
|
-
} else { # Should not happen
|
|
263
|
-
stop(paste0("Unknown method:", DE_method))
|
|
264
|
-
}
|
|
265
|
-
|
|
266
|
-
final_data_generation_time <- system.time({
|
|
267
|
-
output <- data.frame(geneids, geneNames, logfc, pvalues, adjust_p_values)
|
|
268
|
-
names(output)[1] <- "gene_id"
|
|
269
|
-
names(output)[2] <- "gene_name"
|
|
270
|
-
names(output)[3] <- "fold_change"
|
|
271
|
-
names(output)[4] <- "original_p_value"
|
|
272
|
-
names(output)[5] <- "adjusted_p_value"
|
|
273
|
-
})
|
|
274
|
-
final_output <- c()
|
|
275
|
-
final_output$gene_data <- output
|
|
276
|
-
final_output$edgeR_ql_image_name <- fit_image_name
|
|
277
|
-
if (dim(read_counts)[1] * dim(read_counts)[2] < as.numeric(input$mds_cutoff)) { # If the dimensions of the read counts matrix is below this threshold, only then the mds image will be generated as its very compute intensive
|
|
278
|
-
final_output$edgeR_mds_image_name <- mds_image_name
|
|
279
|
-
}
|
|
280
|
-
#cat("Time for generating final dataframe: ", as.difftime(final_data_generation_time, unit = "secs")[3], " seconds\n")
|
|
281
|
-
|
|
282
|
-
# Output results
|
|
283
|
-
toJSON(final_output, digits = NA, na = "string") # Setting digits = NA makes toJSON() use the max precision. na='string' causes any "not a number" to be reported as string. This from ?toJSON() documentation
|
package/utils/fdr.R
DELETED
package/utils/fisher.2x3.R
DELETED
|
@@ -1,12 +0,0 @@
|
|
|
1
|
-
out <- NULL
|
|
2
|
-
con <- file("stdin","r")
|
|
3
|
-
dat <- read.table(con,sep="\t",header=F,quote="")
|
|
4
|
-
|
|
5
|
-
for (i in 1:nrow(dat)) {
|
|
6
|
-
x <- fisher.test( matrix( c( dat[i,2], dat[i,3], dat[i,4], dat[i,5], dat[i,6], dat[i,7] ), nrow=2 ) )
|
|
7
|
-
out <- rbind(out,x$p.value)
|
|
8
|
-
}
|
|
9
|
-
out <- cbind(dat,out)
|
|
10
|
-
|
|
11
|
-
write.table(out,file="",sep="\t",quote=F,row.names=F,col.names=F)
|
|
12
|
-
close(con)
|
package/utils/fisher.R
DELETED
|
@@ -1,9 +0,0 @@
|
|
|
1
|
-
con <- file("stdin","r")
|
|
2
|
-
dat <- read.table(con,sep="\t",header=F,quote="")
|
|
3
|
-
|
|
4
|
-
pvals <- apply(dat[,2:5], 1, function(row) fisher.test(matrix(row, ncol=2))$p.value)
|
|
5
|
-
|
|
6
|
-
out <- cbind(dat,pvals)
|
|
7
|
-
|
|
8
|
-
write.table(out,file="",sep="\t",quote=F,row.names=F,col.names=F)
|
|
9
|
-
close(con)
|