@sjcrh/proteinpaint-server 2.109.1 → 2.111.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/package.json +4 -4
- package/routes/correlationVolcano.js +11 -1
- package/routes/gdc.mafBuild.js +20 -13
- package/routes/termdb.DE.js +42 -12
- package/routes/termdb.boxplot.js +2 -1
- package/routes/termdb.violin.js +47 -8
- package/src/app.js +5013 -4927
- package/utils/density.R +29 -0
- package/utils/edge.R +75 -55
package/utils/density.R
ADDED
|
@@ -0,0 +1,29 @@
|
|
|
1
|
+
library(jsonlite)

# This script reads a JSON string from stdin, computes a kernel density
# estimate for each plot's values, and writes the densities to stdout as JSON.
#
# Input:  a JSON object where each field maps to an array of numbers,
#         e.g. {"plotA": [1.2, 2, 3], "plotB": [4.5, 5, 6]}
# Output: a JSON object with one density per plot, each represented as
#         {x: [x density values], y: [y density values]}
#
# To test from the command line (replace the arrays with your own):
#   echo '{"plotA": [1.2, 2, 3], "plotB": [4.5, 5, 6]}' | Rscript ./density.R

con <- file("stdin", "r")
json <- readLines(con)
close(con)

data <- fromJSON(json)
densities <- list()
for (plot in names(data)) {
  values <- data[[plot]]
  # If the plot has 5 or fewer values, or all values are identical, a kernel
  # density estimate is unreliable (density() errors on zero bandwidth), so
  # return a flat line at y = 0 instead.
  if (length(values) <= 5 || length(unique(values)) == 1) {
    densities[[plot]] <- list(x = values, y = rep(0, length(values)))
    next
  }
  # Restrict the estimate to the observed data range so the curve does not
  # extend past the min/max of the values.
  den <- density(x = values, from = min(values), to = max(values))
  densities[[plot]] <- list(x = den$x, y = den$y) # two parallel number arrays
}
# digits = NA keeps full numeric precision; na = "string" serializes NA values
# as strings so the output stays valid JSON.
toJSON(densities, digits = NA, na = "string") # returns e.g. { "plotA": {"x": [...], "y": [...]} }
|
|
28
|
+
|
|
29
|
+
|
package/utils/edge.R
CHANGED
|
@@ -10,24 +10,11 @@ suppressWarnings({
|
|
|
10
10
|
suppressPackageStartupMessages(library(dplyr))
|
|
11
11
|
})
|
|
12
12
|
|
|
13
|
-
|
|
14
|
-
|
|
15
|
-
|
|
16
|
-
|
|
17
|
-
|
|
18
|
-
# Add the gene_id_symbols as a new column to the dataframe
|
|
19
|
-
read_counts$gene_id_symbols <- gene_id_symbols
|
|
20
|
-
# Sort the dataframe based on the standard deviation column
|
|
21
|
-
read_counts <- read_counts[order(read_counts$Row_SD, decreasing = TRUE), ]
|
|
22
|
-
# Select top 3000 rows
|
|
23
|
-
read_counts <- head(read_counts,num_variable_genes) # Currently hardcoded 3000 genes
|
|
24
|
-
# Get gene id symbols corresponding to the reordered read count matrix
|
|
25
|
-
gene_id_symbols <- read_counts$gene_id_symbols
|
|
26
|
-
# Remove column Row_SD from read_counts dataframe
|
|
27
|
-
read_counts <- read_counts[, !names(read_counts) %in% "Row_SD"]
|
|
28
|
-
# Remove column gene_id_symbols from read_counts dataframe
|
|
29
|
-
read_counts <- read_counts[, !names(read_counts) %in% "gene_id_symbols"]
|
|
30
|
-
return(list(read_counts = read_counts, gene_id_symbols = gene_id_symbols))
|
|
13
|
+
# Filter a DGEList based on counts-per-million (CPM) thresholds.
#
# y                 : edgeR DGEList; its counts matrix is filtered by row/column
# gene_cpm_cutoff   : CPM a gene must exceed in a sample for that sample to count
# sample_cpm_cutoff : minimum number of samples in which a gene must exceed
#                     gene_cpm_cutoff for the gene (row) to be kept
# count_cpm_cutoff  : minimum total CPM a sample (column) must have to be kept
#
# Returns the filtered DGEList.
filter_using_cpm <- function(y, gene_cpm_cutoff, sample_cpm_cutoff, count_cpm_cutoff) {
  cpms <- cpm(y$counts) # compute once; reused for both the row and column filters
  keep_genes <- rowSums(cpms > gene_cpm_cutoff) >= sample_cpm_cutoff
  keep_samples <- colSums(cpms) >= count_cpm_cutoff
  y[keep_genes, keep_samples]
}
|
|
32
19
|
|
|
33
20
|
# Read JSON input from stdin
|
|
@@ -86,21 +73,6 @@ read_counts_time <- system.time({
|
|
|
86
73
|
conditions <- c(rep("Diseased", length(cases)), rep("Control", length(controls)))
|
|
87
74
|
gene_id_symbols <- paste0(geneIDs, "\t", geneSymbols)
|
|
88
75
|
|
|
89
|
-
filter_genes_time <- system.time({
|
|
90
|
-
if (length(input$VarGenes) != 0) { # Filter out variable genes for DE analysis
|
|
91
|
-
filtered_read_counts <- filter_genes_by_global_variance(read_counts, gene_id_symbols, input$VarGenes)
|
|
92
|
-
read_counts <- filtered_read_counts$read_counts
|
|
93
|
-
gene_id_symbols <- filtered_read_counts$gene_id_symbols
|
|
94
|
-
|
|
95
|
-
#### Will implement filtering by per group variance later
|
|
96
|
-
#filtered_read_counts <- filter_genes_by_group_variance(read_counts, gene_id_symbols, num_variable_genes, cases, controls)
|
|
97
|
-
#read_counts <- filtered_read_counts$read_counts
|
|
98
|
-
#gene_id_symbols <- filtered_read_counts$gene_id_symbols
|
|
99
|
-
}
|
|
100
|
-
})
|
|
101
|
-
|
|
102
|
-
#cat("Time to filter genes: ", filter_genes_time[3], " seconds\n")
|
|
103
|
-
|
|
104
76
|
# Create DGEList object
|
|
105
77
|
dge_list_time <- system.time({
|
|
106
78
|
y <- DGEList(counts = read_counts, group = conditions, genes = gene_id_symbols)
|
|
@@ -115,27 +87,59 @@ filter_time <- system.time({
|
|
|
115
87
|
|
|
116
88
|
normalization_time <- system.time({
|
|
117
89
|
y <- y[keep, keep.lib.sizes = FALSE]
|
|
118
|
-
y <-
|
|
90
|
+
y <- normLibSizes(y) # Using TMM method for normalization
|
|
119
91
|
})
|
|
120
92
|
#cat("Normalization time: ", normalization_time[3], " seconds\n")
|
|
121
93
|
|
|
94
|
+
# Cutoffs for cpm, will add them as UI options later
|
|
95
|
+
if (length(samples_indices) > 100) {
|
|
96
|
+
gene_cpm_cutoff <- 15
|
|
97
|
+
sample_cpm_cutoff <- 30
|
|
98
|
+
count_cpm_cutoff <- 100000
|
|
99
|
+
} else {
|
|
100
|
+
gene_cpm_cutoff <- 5
|
|
101
|
+
sample_cpm_cutoff <- 15
|
|
102
|
+
count_cpm_cutoff <- 100000
|
|
103
|
+
}
|
|
104
|
+
|
|
105
|
+
filter_using_cpm_time <- system.time({
|
|
106
|
+
y <- filter_using_cpm(y, gene_cpm_cutoff, sample_cpm_cutoff, count_cpm_cutoff) # Filtering counts matrix based on gene_cpm_cutoff, sample_cpm_cutoff and count_cpm_cutoff
|
|
107
|
+
})
|
|
108
|
+
#cat("Filter using cpm time: ", filter_using_cpm_time[3], " seconds\n")
|
|
109
|
+
|
|
110
|
+
# Saving MDS plot image
|
|
111
|
+
set.seed(as.integer(Sys.time())) # Set the seed according to current time
|
|
112
|
+
cachedir <- input$cachedir # Importing serverconfig.cachedir
|
|
113
|
+
random_number <- runif(1, min = 0, max = 1) # Generating random number
|
|
114
|
+
mds_image_name <- paste0("edgeR_mds_temp_",random_number,".png") # Generating random image name so that simultaneous server side requests do NOT generate the same edgeR file name
|
|
115
|
+
png(filename = paste0(cachedir,"/",mds_image_name), width = 1000, height = 1000, res = 200) # Opening a png device
|
|
116
|
+
par(oma = c(0, 0, 0, 0)) # Creating a margin
|
|
117
|
+
mds_conditions <- c(rep("T", length(cases)), rep("C", length(controls))) # Case samples are labelled "T" and control samples are labelled "C". Single-letter labelling added because otherwise labels get overwritten on each other.
|
|
118
|
+
plotMDS(y, labels = mds_conditions) # Plot the edgeR MDS plot
|
|
119
|
+
# dev.off() # Gives a null device message which breaks JSON. Commenting it out for now, will investigate it later
|
|
120
|
+
|
|
121
|
+
|
|
122
122
|
# Differential expression analysis
|
|
123
123
|
if (length(input$conf1) == 0) { # No adjustment of confounding factors
|
|
124
|
-
|
|
124
|
+
design <- model.matrix(~conditions) # Based on the protocol defined in section 1.4 of edgeR manual https://bioconductor.org/packages/release/bioc/vignettes/edgeR/inst/doc/edgeRUsersGuide.pdf
|
|
125
|
+
fit_time <- system.time({
|
|
125
126
|
suppressWarnings({
|
|
126
127
|
suppressMessages({
|
|
127
|
-
|
|
128
|
+
fit <- glmQLFit(y,design) # The glmQLFit() replaces glmFit() which implements the quasi-likelihood function. This is better able to account for overdispersion as it employs a more lenient approach where variance is not a fixed function of the mean.
|
|
128
129
|
})
|
|
129
130
|
})
|
|
130
131
|
})
|
|
131
|
-
#cat("
|
|
132
|
+
#cat("QL fit time: ", fit_time[3], " seconds\n")
|
|
132
133
|
|
|
133
|
-
|
|
134
|
-
|
|
134
|
+
test_time <- system.time({
|
|
135
|
+
suppressWarnings({
|
|
136
|
+
suppressMessages({
|
|
137
|
+
et <- glmQLFTest(fit)
|
|
138
|
+
})
|
|
139
|
+
})
|
|
135
140
|
})
|
|
136
|
-
#cat("
|
|
141
|
+
#cat("QL test time: ", test_time[3], " seconds\n")
|
|
137
142
|
} else { # Adjusting for confounding factors
|
|
138
|
-
|
|
139
143
|
# Check the type of confounding variable
|
|
140
144
|
if (input$conf1_mode == "continuous") { # If this is float, the input conf1 vector should be converted into a numeric vector
|
|
141
145
|
conf1 <- as.numeric(input$conf1)
|
|
@@ -144,7 +148,7 @@ if (length(input$conf1) == 0) { # No adjustment of confounding factors
|
|
|
144
148
|
}
|
|
145
149
|
|
|
146
150
|
if (length(input$conf2) == 0) { # No adjustment of confounding factor 2
|
|
147
|
-
y$samples <- data.frame(conditions = conditions, conf1 = conf1)
|
|
151
|
+
y$samples <- data.frame(y$samples, conditions = conditions, conf1 = conf1)
|
|
148
152
|
model_gen_time <- system.time({
|
|
149
153
|
design <- model.matrix(~ conditions + conf1, data = y$samples)
|
|
150
154
|
})
|
|
@@ -156,29 +160,41 @@ if (length(input$conf1) == 0) { # No adjustment of confounding factors
|
|
|
156
160
|
} else { # When input$conf2_mode == "discrete" keep the vector as string.
|
|
157
161
|
conf2 <- as.factor(input$conf2)
|
|
158
162
|
}
|
|
159
|
-
y$samples <- data.frame(conditions = conditions, conf1 = conf1, conf2 = conf2)
|
|
163
|
+
y$samples <- data.frame(y$samples, conditions = conditions, conf1 = conf1, conf2 = conf2)
|
|
160
164
|
model_gen_time <- system.time({
|
|
161
165
|
design <- model.matrix(~ conditions + conf1 + conf2, data = y$samples)
|
|
162
166
|
})
|
|
163
167
|
#cat("Time for making design matrix: ", model_gen_time[3], " seconds\n")
|
|
164
168
|
}
|
|
165
169
|
|
|
166
|
-
dispersion_time <- system.time({
|
|
167
|
-
y <- estimateDisp(y, design)
|
|
168
|
-
})
|
|
169
|
-
#cat("Dispersion time: ", dispersion_time[3], " seconds\n")
|
|
170
|
-
|
|
171
170
|
fit_time <- system.time({
|
|
172
|
-
|
|
171
|
+
suppressWarnings({
|
|
172
|
+
suppressMessages({
|
|
173
|
+
fit <- glmQLFit(y,design)
|
|
174
|
+
})
|
|
175
|
+
})
|
|
173
176
|
})
|
|
174
|
-
#cat("
|
|
175
|
-
|
|
176
|
-
|
|
177
|
-
|
|
177
|
+
#cat("QL fit time: ", fit_time[3], " seconds\n")
|
|
178
|
+
test_time <- system.time({
|
|
179
|
+
suppressWarnings({
|
|
180
|
+
suppressMessages({
|
|
181
|
+
et <- glmQLFTest(fit, coef = "conditionsDiseased")
|
|
182
|
+
})
|
|
183
|
+
})
|
|
178
184
|
})
|
|
179
|
-
#cat("
|
|
185
|
+
#cat("QL test time: ", test_time[3], " seconds\n")
|
|
180
186
|
}
|
|
181
187
|
|
|
188
|
+
# Saving QL fit image
|
|
189
|
+
set.seed(as.integer(Sys.time())) # Set the seed according to current time
|
|
190
|
+
cachedir <- input$cachedir # Importing serverconfig.cachedir
|
|
191
|
+
random_number <- runif(1, min = 0, max = 1) # Generating random number
|
|
192
|
+
ql_image_name <- paste0("edgeR_ql_temp_",random_number,".png") # Generating random image name so that simultaneous server side requests do NOT generate the same edgeR file name
|
|
193
|
+
png(filename = paste0(cachedir,"/",ql_image_name), width = 1000, height = 1000, res = 200) # Opening a png device
|
|
194
|
+
par(oma = c(0, 0, 0, 0)) # Creating a margin
|
|
195
|
+
plotQLDisp(fit) # Plot the edgeR fit
|
|
196
|
+
# dev.off() # Gives a null device message which breaks JSON. Commenting it out for now, will investigate it later
|
|
197
|
+
|
|
182
198
|
# Multiple testing correction
|
|
183
199
|
multiple_testing_correction_time <- system.time({
|
|
184
200
|
logfc <- et$table$logFC
|
|
@@ -195,10 +211,14 @@ multiple_testing_correction_time <- system.time({
|
|
|
195
211
|
names(output)[4] <- "original_p_value"
|
|
196
212
|
names(output)[5] <- "adjusted_p_value"
|
|
197
213
|
})
|
|
214
|
+
final_output <- c()
|
|
215
|
+
final_output$gene_data <- output
|
|
216
|
+
final_output$edgeR_ql_image_name <- ql_image_name
|
|
217
|
+
final_output$edgeR_mds_image_name <- mds_image_name
|
|
198
218
|
#cat("Time for multiple testing correction: ", multiple_testing_correction_time[3], " seconds\n")
|
|
199
219
|
|
|
200
220
|
# Output results
|
|
201
|
-
toJSON(
|
|
221
|
+
toJSON(final_output)
|
|
202
222
|
|
|
203
223
|
#-----------------------------------#
|
|
204
224
|
# Will implement this later
|