@sjcrh/proteinpaint-server 2.109.1 → 2.111.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,29 @@
1
+ library(jsonlite)
2
+ # This script reads in a json string from stdin, calculates the densities of each plot and returns the densities as a json string
3
+ # The input json string is a dictionary where each field maps to an array of numbers
4
+ # The output json string is a dictionary with the density for each plot. The density is represented like {x: [x density values], y: [y density values]}
5
+ # In order to test it you can run this from the command line replacing the arrays with your own:
6
+ # echo '{"plotA": [1.2, 2, 3], "plotB": [4.5, 5, 6]}' | Rscript ./density.R
7
+
8
+ con <- file("stdin", "r")
9
+ json <- readLines(con)
10
+ close(con)
11
+ data <- fromJSON(json)
12
+ densities <- list()
13
+ for(plot in names(data)){
14
+ values = data[[plot]]
15
+ # If the plot has less than 5 values or all the values are the same, we will return a flat line
16
+ if(length(values) <= 5 | length(unique(values)) == 1){
17
+ y = rep(0, length(values))
18
+ densities[[plot]] <- list(x=values, y=y)
19
+ next
20
+ }
21
+ den = density(x = values, from=min(values), to=max(values))
22
+ x = den$x
23
+ y = den$y
24
+ result = list(x=x, y=y) #This is an object with two keys x and y that are number arrays
25
+ densities[[plot]] <- result
26
+ }
27
+ toJSON(densities, digits = NA, na = "string") # will return a json like { plotA: {x:[...], y: [...]}}
28
+
29
+
package/utils/edge.R CHANGED
@@ -10,24 +10,11 @@ suppressWarnings({
10
10
  suppressPackageStartupMessages(library(dplyr))
11
11
  })
12
12
 
13
- filter_genes_by_global_variance <- function(read_counts, gene_id_symbols, num_variable_genes) {
14
- # Calculate the standard deviation of each row
15
- row_sd <- apply(read_counts, 1, sd)
16
- # Add the standard deviation as a new column to the dataframe
17
- read_counts$Row_SD <- row_sd
18
- # Add the gene_id_symbols as a new column to the dataframe
19
- read_counts$gene_id_symbols <- gene_id_symbols
20
- # Sort the dataframe based on the standard deviation column
21
- read_counts <- read_counts[order(read_counts$Row_SD, decreasing = TRUE), ]
22
- # Select top 3000 rows
23
- read_counts <- head(read_counts,num_variable_genes) # Currently hardcoded 3000 genes
24
- # Get gene id symbols corresponding to the reordered read count matrix
25
- gene_id_symbols <- read_counts$gene_id_symbols
26
- # Remove column Row_SD from read_counts dataframe
27
- read_counts <- read_counts[, !names(read_counts) %in% "Row_SD"]
28
- # Remove column gene_id_symbols from read_counts dataframe
29
- read_counts <- read_counts[, !names(read_counts) %in% "gene_id_symbols"]
30
- return(list(read_counts = read_counts, gene_id_symbols = gene_id_symbols))
13
+ # Filter based on CPM
14
+ filter_using_cpm <- function(y, gene_cpm_cutoff, sample_cpm_cutoff, count_cpm_cutoff) {
15
+ selr <- rowSums(cpm(y$counts)>gene_cpm_cutoff)>=sample_cpm_cutoff
16
+ selc <- colSums(cpm(y$counts))>=count_cpm_cutoff
17
+ y <- y[selr, selc]
31
18
  }
32
19
 
33
20
  # Read JSON input from stdin
@@ -86,21 +73,6 @@ read_counts_time <- system.time({
86
73
  conditions <- c(rep("Diseased", length(cases)), rep("Control", length(controls)))
87
74
  gene_id_symbols <- paste0(geneIDs, "\t", geneSymbols)
88
75
 
89
- filter_genes_time <- system.time({
90
- if (length(input$VarGenes) != 0) { # Filter out variable genes for DE analysis
91
- filtered_read_counts <- filter_genes_by_global_variance(read_counts, gene_id_symbols, input$VarGenes)
92
- read_counts <- filtered_read_counts$read_counts
93
- gene_id_symbols <- filtered_read_counts$gene_id_symbols
94
-
95
- #### Will implement filtering by per group variance later
96
- #filtered_read_counts <- filter_genes_by_group_variance(read_counts, gene_id_symbols, num_variable_genes, cases, controls)
97
- #read_counts <- filtered_read_counts$read_counts
98
- #gene_id_symbols <- filtered_read_counts$gene_id_symbols
99
- }
100
- })
101
-
102
- #cat("Time to filter genes: ", filter_genes_time[3], " seconds\n")
103
-
104
76
  # Create DGEList object
105
77
  dge_list_time <- system.time({
106
78
  y <- DGEList(counts = read_counts, group = conditions, genes = gene_id_symbols)
@@ -115,27 +87,59 @@ filter_time <- system.time({
115
87
 
116
88
  normalization_time <- system.time({
117
89
  y <- y[keep, keep.lib.sizes = FALSE]
118
- y <- calcNormFactors(y, method = "TMM")
90
+ y <- normLibSizes(y) # Using TMM method for normalization
119
91
  })
120
92
  #cat("Normalization time: ", normalization_time[3], " seconds\n")
121
93
 
94
+ # Cutoffs for cpm, will add them as UI options later
95
+ if (length(samples_indices) > 100) {
96
+ gene_cpm_cutoff <- 15
97
+ sample_cpm_cutoff <- 30
98
+ count_cpm_cutoff <- 100000
99
+ } else {
100
+ gene_cpm_cutoff <- 5
101
+ sample_cpm_cutoff <- 15
102
+ count_cpm_cutoff <- 100000
103
+ }
104
+
105
+ filter_using_cpm_time <- system.time({
106
+ y <- filter_using_cpm(y, gene_cpm_cutoff, sample_cpm_cutoff, count_cpm_cutoff) # Filtering counts matrix based on gene_cpm_cutoff, sample_cpm_cutoff and count_cpm_cutoff
107
+ })
108
+ #cat("Filter using cpm time: ", filter_using_cpm_time[3], " seconds\n")
109
+
110
+ # Saving MDS plot image
111
+ set.seed(as.integer(Sys.time())) # Set the seed according to current time
112
+ cachedir <- input$cachedir # Importing serverconfig.cachedir
113
+ random_number <- runif(1, min = 0, max = 1) # Generating random number
114
+ mds_image_name <- paste0("edgeR_mds_temp_",random_number,".png") # Generating random image name so that simultaneous server side requests do NOT generate the same edgeR file name
115
+ png(filename = paste0(cachedir,"/",mds_image_name), width = 1000, height = 1000, res = 200) # Opening a png device
116
+ par(oma = c(0, 0, 0, 0)) # Creating a margin
117
+ mds_conditions <- c(rep("T", length(cases)), rep("C", length(controls))) # Case samples are labelled "T" and control samples are labelled "C". Single-letter labelling added because otherwise labels get overwritten on each other.
118
+ plotMDS(y, labels = mds_conditions) # Plot the edgeR MDS plot
119
+ # dev.off() # Gives a null device message which breaks JSON. Commenting it out for now, will investigate it later
120
+
121
+
122
122
  # Differential expression analysis
123
123
  if (length(input$conf1) == 0) { # No adjustment of confounding factors
124
- dispersion_time <- system.time({
124
+ design <- model.matrix(~conditions) # Based on the protocol defined in section 1.4 of edgeR manual https://bioconductor.org/packages/release/bioc/vignettes/edgeR/inst/doc/edgeRUsersGuide.pdf
125
+ fit_time <- system.time({
125
126
  suppressWarnings({
126
127
  suppressMessages({
127
- y <- estimateDisp(y)
128
+ fit <- glmQLFit(y,design) # The glmQLFit() replaces glmFit() which implements the quasi-likelihood function. This is better able to account for overdispersion as it employs a more lenient approach where variance is not a fixed function of the mean.
128
129
  })
129
130
  })
130
131
  })
131
- #cat("Dispersion time: ", dispersion_time[3], " seconds\n")
132
+ #cat("QL fit time: ", fit_time[3], " seconds\n")
132
133
 
133
- exact_test_time <- system.time({
134
- et <- exactTest(y)
134
+ test_time <- system.time({
135
+ suppressWarnings({
136
+ suppressMessages({
137
+ et <- glmQLFTest(fit)
138
+ })
139
+ })
135
140
  })
136
- #cat("Exact test time: ", exact_test_time[3], " seconds\n")
141
+ #cat("QL test time: ", test_time[3], " seconds\n")
137
142
  } else { # Adjusting for confounding factors
138
-
139
143
  # Check the type of confounding variable
140
144
  if (input$conf1_mode == "continuous") { # If this is float, the input conf1 vector should be converted into a numeric vector
141
145
  conf1 <- as.numeric(input$conf1)
@@ -144,7 +148,7 @@ if (length(input$conf1) == 0) { # No adjustment of confounding factors
144
148
  }
145
149
 
146
150
  if (length(input$conf2) == 0) { # No adjustment of confounding factor 2
147
- y$samples <- data.frame(conditions = conditions, conf1 = conf1)
151
+ y$samples <- data.frame(y$samples, conditions = conditions, conf1 = conf1)
148
152
  model_gen_time <- system.time({
149
153
  design <- model.matrix(~ conditions + conf1, data = y$samples)
150
154
  })
@@ -156,29 +160,41 @@ if (length(input$conf1) == 0) { # No adjustment of confounding factors
156
160
  } else { # When input$conf2_mode == "discrete" keep the vector as string.
157
161
  conf2 <- as.factor(input$conf2)
158
162
  }
159
- y$samples <- data.frame(conditions = conditions, conf1 = conf1, conf2 = conf2)
163
+ y$samples <- data.frame(y$samples, conditions = conditions, conf1 = conf1, conf2 = conf2)
160
164
  model_gen_time <- system.time({
161
165
  design <- model.matrix(~ conditions + conf1 + conf2, data = y$samples)
162
166
  })
163
167
  #cat("Time for making design matrix: ", model_gen_time[3], " seconds\n")
164
168
  }
165
169
 
166
- dispersion_time <- system.time({
167
- y <- estimateDisp(y, design)
168
- })
169
- #cat("Dispersion time: ", dispersion_time[3], " seconds\n")
170
-
171
170
  fit_time <- system.time({
172
- fit <- glmFit(y, design)
171
+ suppressWarnings({
172
+ suppressMessages({
173
+ fit <- glmQLFit(y,design)
174
+ })
175
+ })
173
176
  })
174
- #cat("Fit time: ", fit_time[3], " seconds\n")
175
-
176
- test_statistics_time <- system.time({
177
- et <- glmLRT(fit, coef = "conditionsDiseased")
177
+ #cat("QL fit time: ", fit_time[3], " seconds\n")
178
+ test_time <- system.time({
179
+ suppressWarnings({
180
+ suppressMessages({
181
+ et <- glmQLFTest(fit, coef = "conditionsDiseased")
182
+ })
183
+ })
178
184
  })
179
- #cat("Test statistics time: ", test_statistics_time[3], " seconds\n")
185
+ #cat("QL test time: ", test_time[3], " seconds\n")
180
186
  }
181
187
 
188
+ # Saving QL fit image
189
+ set.seed(as.integer(Sys.time())) # Set the seed according to current time
190
+ cachedir <- input$cachedir # Importing serverconfig.cachedir
191
+ random_number <- runif(1, min = 0, max = 1) # Generating random number
192
+ ql_image_name <- paste0("edgeR_ql_temp_",random_number,".png") # Generating random image name so that simultaneous server side requests do NOT generate the same edgeR file name
193
+ png(filename = paste0(cachedir,"/",ql_image_name), width = 1000, height = 1000, res = 200) # Opening a png device
194
+ par(oma = c(0, 0, 0, 0)) # Creating a margin
195
+ plotQLDisp(fit) # Plot the edgeR fit
196
+ # dev.off() # Gives a null device message which breaks JSON. Commenting it out for now, will investigate it later
197
+
182
198
  # Multiple testing correction
183
199
  multiple_testing_correction_time <- system.time({
184
200
  logfc <- et$table$logFC
@@ -195,10 +211,14 @@ multiple_testing_correction_time <- system.time({
195
211
  names(output)[4] <- "original_p_value"
196
212
  names(output)[5] <- "adjusted_p_value"
197
213
  })
214
+ final_output <- c()
215
+ final_output$gene_data <- output
216
+ final_output$edgeR_ql_image_name <- ql_image_name
217
+ final_output$edgeR_mds_image_name <- mds_image_name
198
218
  #cat("Time for multiple testing correction: ", multiple_testing_correction_time[3], " seconds\n")
199
219
 
200
220
  # Output results
201
- toJSON(output)
221
+ toJSON(final_output)
202
222
 
203
223
  #-----------------------------------#
204
224
  # Will implement this later