@sjcrh/proteinpaint-server 2.122.0 → 2.124.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
package/utils/edge.R DELETED
@@ -1,283 +0,0 @@
1
- # Test syntax: cat ~/sjpp/test.txt | time Rscript edge.R
2
-
3
- # Load required packages
4
- suppressWarnings({
5
- library(jsonlite)
6
- library(rhdf5)
7
- library(stringr)
8
- library(readr)
9
- suppressPackageStartupMessages(library(edgeR))
10
- suppressPackageStartupMessages(library(dplyr))
11
- })
12
-
13
- # Filter based on CPM
14
- filter_using_cpm <- function(y, gene_cpm_cutoff, sample_cpm_cutoff, count_cpm_cutoff) {
15
- selr <- rowSums(cpm(y$counts)>gene_cpm_cutoff)>=sample_cpm_cutoff
16
- selc <- colSums(cpm(y$counts))>=count_cpm_cutoff
17
- y <- y[selr, selc]
18
- }
19
-
20
- # Read JSON input from stdin
21
- read_json_time <- system.time({
22
- con <- file("stdin", "r")
23
- json <- readLines(con, warn=FALSE)
24
- close(con)
25
- input <- fromJSON(json)
26
- cases <- unlist(strsplit(input$case, ","))
27
- controls <- unlist(strsplit(input$control, ","))
28
- combined <- c("geneID", "geneSymbol", cases, controls)
29
- })
30
- #cat("Time to read JSON: ", as.difftime(read_json_time, units = "secs")[3], " seconds\n")
31
-
32
- # Read counts data
33
- read_counts_time <- system.time({
34
- if (input$storage_type == "HDF5") {
35
- geneIDs <- h5read(input$input_file, "gene_ids")
36
- geneNames <- h5read(input$input_file, "gene_names")
37
- samples <- h5read(input$input_file, "samples")
38
-
39
- # Find indices of case and control samples in the HDF5 file
40
- case_indices <- match(cases, samples)
41
- control_indices <- match(controls, samples)
42
-
43
- # Check for missing samples
44
- if (any(is.na(case_indices))) {
45
- missing_cases <- cases[is.na(case_indices)]
46
- stop(paste(missing_cases, "not found"))
47
- }
48
- if (any(is.na(control_indices))) {
49
- missing_controls <- controls[is.na(control_indices)]
50
- stop(paste(missing_controls, "not found"))
51
- }
52
-
53
- samples_indices <- c(case_indices, control_indices)
54
- read_counts <- as.data.frame(t(h5read(input$input_file, "counts", index = list(samples_indices, 1:length(geneIDs)))))
55
- colnames(read_counts) <- c(cases, controls)
56
- } else if (input$storage_type == "text") {
57
- suppressWarnings({
58
- suppressMessages({
59
- read_counts <- read_tsv(input$input_file, col_names = TRUE, col_select = combined)
60
- })
61
- })
62
- geneIDs <- unlist(read_counts[1])
63
- geneNames <- unlist(read_counts[2])
64
- read_counts <- select(read_counts, -geneID)
65
- read_counts <- select(read_counts, -geneSymbol)
66
- } else {
67
- stop("Unknown storage type")
68
- }
69
- })
70
- #cat("Time to read counts data: ", as.difftime(read_counts_time, units = "secs")[3], " seconds\n")
71
-
72
- # Create conditions vector
73
- conditions <- c(rep("Diseased", length(cases)), rep("Control", length(controls)))
74
- gene_id_symbols <- paste0(geneIDs, "\t", geneNames)
75
-
76
- # Create DGEList object
77
- dge_list_time <- system.time({
78
- y <- DGEList(counts = read_counts, group = conditions, genes = gene_id_symbols)
79
- })
80
- #cat("Time to generate DGEList: ", as.difftime(dge_list_time, units = "secs")[3], " seconds\n")
81
-
82
- # Filter and normalize counts
83
- filter_time <- system.time({
84
- keep <- filterByExpr(y, min.count = input$min_count, min.total.count = input$min_total_count)
85
- })
86
- #cat("Time to filter by expression: ", as.difftime(filter_time, unit = "secs")[3], " seconds\n")
87
-
88
- normalization_time <- system.time({
89
- y <- y[keep, keep.lib.sizes = FALSE]
90
- y <- normLibSizes(y) # Using TMM method for normalization
91
- })
92
- #cat("Normalization time: ", as.difftime(normalization_time, units = "secs")[3], " seconds\n")
93
-
94
- # Cutoffs for cpm, will add them as UI options later
95
- if (length(samples_indices) > 100) {
96
- gene_cpm_cutoff <- 15
97
- sample_cpm_cutoff <- 30
98
- count_cpm_cutoff <- 100000
99
- } else {
100
- gene_cpm_cutoff <- 5
101
- sample_cpm_cutoff <- 15
102
- count_cpm_cutoff <- 100000
103
- }
104
-
105
- filter_using_cpm_time <- system.time({
106
- y <- filter_using_cpm(y, gene_cpm_cutoff, sample_cpm_cutoff, count_cpm_cutoff) # Filtering counts matrix based on gene_cpm_cutoff, sample_cpm_cutoff and count_cpm_cutoff
107
- })
108
- #cat("Filter using cpm time: ", as.difftime(filter_using_cpm_time, units = "secs")[3], " seconds\n")
109
-
110
- if (dim(y)[1]==0) { # Its possible after filtering there might not be any genes left in the matrix, in such a case the R code must exit gracefully with an error.
111
- stop("Number of genes after filtering = 0, cannot proceed any further")
112
- }
113
- if (dim(y)[2]==0) { # Its possible after filtering there might not be any samples left in the matrix, in such a case the R code must exit gracefully with an error.
114
- stop("Number of samples after filtering = 0, cannot proceed any further")
115
- }
116
-
117
- # Saving MDS plot image
118
-
119
- if (dim(read_counts)[1] * dim(read_counts)[2] < as.numeric(input$mds_cutoff)) { # If the dimensions of the read counts matrix is below this threshold, only then the mds image will be generated as its very compute intensive
120
- mds_plot_time <- system.time({
121
- set.seed(as.integer(Sys.time())) # Set the seed according to current time
122
- cachedir <- input$cachedir # Importing serverconfig.cachedir
123
- random_number <- runif(1, min = 0, max = 1) # Generating random number
124
- mds_image_name <- paste0("edgeR_mds_temp_",random_number,".png") # Generating random image name so that simultaneous server side requests do NOT generate the same edgeR file name
125
- png(filename = paste0(cachedir,"/",mds_image_name), width = 1000, height = 1000, res = 200) # Opening a png device
126
- par(oma = c(0, 0, 0, 0)) # Creating a margin
127
- mds_conditions <- c(rep("T", length(cases)), rep("C", length(controls))) # Case samples are labelled "T" and control samples are labelled "C". Single-letter labelling added because otherwise labels get overwritten on each other.
128
- plotMDS(y, labels = mds_conditions) # Plot the edgeR MDS plot
129
- # dev.off() # Gives a null device message which breaks JSON. Commenting it out for now, will investigate it later
130
- })
131
- #cat("mds plot time: ", as.difftime(mds_plot_time, units = "secs")[3], " seconds\n")
132
- }
133
-
134
- # Differential expression analysis
135
- if (length(input$conf1) == 0) { # No adjustment of confounding factors
136
- design <- model.matrix(~conditions) # Based on the protocol defined in section 1.4 of edgeR manual https://bioconductor.org/packages/release/bioc/vignettes/edgeR/inst/doc/edgeRUsersGuide.pdf
137
- } else { # Adjusting for confounding factors
138
- # Check the type of confounding variable
139
- if (input$conf1_mode == "continuous") { # If this is float, the input conf1 vector should be converted into a numeric vector
140
- conf1 <- as.numeric(input$conf1)
141
- } else { # When input$conf1_mode == "discrete" keep the vector as string.
142
- conf1 <- as.factor(input$conf1)
143
- }
144
-
145
- if (length(input$conf2) == 0) { # No adjustment of confounding factor 2
146
- y$samples <- data.frame(y$samples, conditions = conditions, conf1 = conf1)
147
- model_gen_time <- system.time({
148
- design <- model.matrix(~ conditions + conf1, data = y$samples)
149
- })
150
- #cat("Time for making design matrix: ", as.difftime(model_gen_time, units = "secs")[3], " seconds\n")
151
- } else {
152
- # Check the type of confounding variable 2
153
- if (input$conf2_mode == "continuous") { # If this is float, the input conf2 vector should be converted into a numeric vector
154
- conf2 <- as.numeric(input$conf2)
155
- } else { # When input$conf2_mode == "discrete" keep the vector as string.
156
- conf2 <- as.factor(input$conf2)
157
- }
158
- y$samples <- data.frame(y$samples, conditions = conditions, conf1 = conf1, conf2 = conf2)
159
- model_gen_time <- system.time({
160
- design <- model.matrix(~ conditions + conf1 + conf2, data = y$samples)
161
- })
162
- #cat("Time for making design matrix: ", as.difftime(model_gen_time, units = "secs")[3], " seconds\n")
163
-
164
- }
165
- }
166
-
167
- DE_method <- input$DE_method
168
- if (DE_method == "edgeR") {
169
- fit_time <- system.time({
170
- suppressWarnings({
171
- suppressMessages({
172
- fit <- glmQLFit(y,design) # The glmQLFit() replaces glmFit() which implements the quasi-likelihood function. This is better able to account for overdispersion as it employs a more lenient approach where variance is not a fixed function of the mean.
173
- })
174
- })
175
- })
176
- #cat("QL fit time: ", as.difftime(fit_time, units = "secs")[3], " seconds\n")
177
- test_time <- system.time({
178
- suppressWarnings({
179
- suppressMessages({
180
- et <- glmQLFTest(fit, coef = "conditionsDiseased")
181
- })
182
- })
183
- })
184
- #cat("QL test time: ", as.difftime(test_time, units = "secs")[3], " seconds\n")
185
-
186
- # Saving QL fit image
187
- ql_plot_time <- system.time({
188
- set.seed(as.integer(Sys.time())) # Set the seed according to current time
189
- cachedir <- input$cachedir # Importing serverconfig.cachedir
190
- random_number <- runif(1, min = 0, max = 1) # Generating random number
191
- fit_image_name <- paste0("edgeR_ql_temp_",random_number,".png") # Generating random image name so that simultaneous server side requests do NOT generate the same edgeR file name
192
- png(filename = paste0(cachedir,"/",fit_image_name), width = 1000, height = 1000, res = 200) # Opening a png device
193
- par(oma = c(0, 0, 0, 0)) # Creating a margin
194
- plotQLDisp(fit) # Plot the edgeR fit
195
- # dev.off() # Gives a null device message which breaks JSON. Commenting it out for now, will investigate it later
196
- })
197
- #cat("ql plot time: ", as.difftime(ql_plot_time, units = "secs")[3], " seconds\n")
198
- logfc <- et$table$logFC
199
- logcpm <- et$table$logCPM
200
- pvalues <- et$table$PValue
201
- genes_matrix <- str_split_fixed(unlist(et$genes), "\t", 2)
202
- geneids <- unlist(genes_matrix[, 1])
203
- geneNames <- unlist(genes_matrix[, 2])
204
- adjust_p_values <- p.adjust(pvalues, method = "fdr")
205
- } else if (DE_method == "limma") {
206
- # Do voom transformation and fit linear model
207
- voom_transformation_lmfit_time <- system.time({
208
- suppressWarnings({
209
- suppressMessages({
210
- set.seed(as.integer(Sys.time())) # Set the seed according to current time
211
- cachedir <- input$cachedir # Importing serverconfig.cachedir
212
- random_number <- runif(1, min = 0, max = 1) # Generating random number
213
- fit_image_name <- paste0("limma_voom_temp_",random_number,".png") # Generating random image name so that simultaneous server side requests do NOT generate the same edgeR file name
214
- png(filename = paste0(cachedir,"/",fit_image_name), width = 1000, height = 1000, res = 200) # Opening a png device
215
- par(oma = c(0, 0, 0, 0)) # Creating a margin
216
- suppressWarnings({
217
- suppressMessages({
218
- fit <- voomLmFit(y, design, plot = TRUE) # This is base don the recommendation of the edgeR limma/voom authors https://support.bioconductor.org/p/9161585/
219
- })
220
- })
221
- dev.off() # Gives a null device message which breaks JSON. Commenting it out for now, will investigate it later
222
- })
223
- })
224
- })
225
- #cat("voom transformation + limma fit time: ", as.difftime(voom_transformation_lmfit_time, units = "secs")[3], " seconds\n")
226
-
227
- # Saving mean-difference plot (aka MA plot)
228
- #set.seed(as.integer(Sys.time())) # Set the seed according to current time
229
- #cachedir <- input$cachedir # Importing serverconfig.cachedir
230
- #random_number <- runif(1, min = 0, max = 1) # Generating random number
231
- #md_image_name <- paste0("limma_md_temp_",random_number,".png") # Generating random image name so that simultaneous server side requests do NOT generate the same edgeR file name
232
- #png(filename = paste0(cachedir,"/",md_image_name), width = 1000, height = 1000, res = 200) # Opening a png device
233
- #par(oma = c(0, 0, 0, 0)) # Creating a margin
234
- #plotMD(fit) # Plot the limma fit
235
- ## dev.off() # Gives a null device message which breaks JSON. Commenting it out for now, will investigate it later
236
-
237
- # Empirical Bayes smoothing
238
- empirical_smoothing_time <- system.time({
239
- suppressWarnings({
240
- suppressMessages({
241
- tmp <- eBayes(fit)
242
- })
243
- })
244
- })
245
- #cat("Empirical smoothing time: ", as.difftime(empirical_smoothing_time, units = "secs")[3], " seconds\n")
246
-
247
- # Time for selecting top genes
248
- top_genes_selection_time <- system.time({
249
- suppressWarnings({
250
- suppressMessages({
251
- top_table <- topTable(tmp, coef = "conditionsDiseased", number = Inf, adjust.method = "fdr") # The coeff needs to be specified in topTable() because it needs to know for which contrast the logFC needs to be calculated https://www.biostars.org/p/160465/
252
- logfc <- top_table$logFC
253
- pvalues <- top_table$P.Value
254
- genes_matrix <- str_split_fixed(unlist(top_table$genes), "\t", 2)
255
- geneids <- unlist(genes_matrix[, 1])
256
- geneNames <- unlist(genes_matrix[, 2])
257
- adjust_p_values <- top_table$adj.P.Val
258
- })
259
- })
260
- })
261
- #cat("Time for selecting top genes: ", as.difftime(top_genes_selection_time, units = "secs")[3], " seconds\n")
262
- } else { # Should not happen
263
- stop(paste0("Unknown method:", DE_method))
264
- }
265
-
266
- final_data_generation_time <- system.time({
267
- output <- data.frame(geneids, geneNames, logfc, pvalues, adjust_p_values)
268
- names(output)[1] <- "gene_id"
269
- names(output)[2] <- "gene_name"
270
- names(output)[3] <- "fold_change"
271
- names(output)[4] <- "original_p_value"
272
- names(output)[5] <- "adjusted_p_value"
273
- })
274
- final_output <- c()
275
- final_output$gene_data <- output
276
- final_output$edgeR_ql_image_name <- fit_image_name
277
- if (dim(read_counts)[1] * dim(read_counts)[2] < as.numeric(input$mds_cutoff)) { # If the dimensions of the read counts matrix is below this threshold, only then the mds image will be generated as its very compute intensive
278
- final_output$edgeR_mds_image_name <- mds_image_name
279
- }
280
- #cat("Time for generating final dataframe: ", as.difftime(final_data_generation_time, unit = "secs")[3], " seconds\n")
281
-
282
- # Output results
283
- toJSON(final_output, digits = NA, na = "string") # Setting digits = NA makes toJSON() use the max precision. na='string' causes any "not a number" to be reported as string. This from ?toJSON() documentation
package/utils/fdr.R DELETED
@@ -1,9 +0,0 @@
1
- argv <- commandArgs(TRUE)
2
-
3
- infile <- argv[1]
4
- outfile <- argv[2]
5
-
6
- dat <- read.table(infile,sep="\t",header=F,quote="")
7
- out <- p.adjust( dat, method="BH")
8
-
9
- write.table(out,file=outfile,sep="\t",quote=F,row.names=F,col.names=F)
@@ -1,12 +0,0 @@
1
- out <- NULL
2
- con <- file("stdin","r")
3
- dat <- read.table(con,sep="\t",header=F,quote="")
4
-
5
- for (i in 1:nrow(dat)) {
6
- x <- fisher.test( matrix( c( dat[i,2], dat[i,3], dat[i,4], dat[i,5], dat[i,6], dat[i,7] ), nrow=2 ) )
7
- out <- rbind(out,x$p.value)
8
- }
9
- out <- cbind(dat,out)
10
-
11
- write.table(out,file="",sep="\t",quote=F,row.names=F,col.names=F)
12
- close(con)
package/utils/fisher.R DELETED
@@ -1,9 +0,0 @@
1
- con <- file("stdin","r")
2
- dat <- read.table(con,sep="\t",header=F,quote="")
3
-
4
- pvals <- apply(dat[,2:5], 1, function(row) fisher.test(matrix(row, ncol=2))$p.value)
5
-
6
- out <- cbind(dat,pvals)
7
-
8
- write.table(out,file="",sep="\t",quote=F,row.names=F,col.names=F)
9
- close(con)