@sjcrh/proteinpaint-server 2.104.0 → 2.105.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/package.json +2 -2
- package/routes/gdc.maf.js +23 -16
- package/routes/gdc.mafBuild.js +14 -22
- package/routes/termdb.DE.js +3 -0
- package/routes/termdb.config.js +0 -1
- package/src/app.js +95 -90
- package/utils/edge.R +52 -9
package/utils/edge.R
CHANGED
|
@@ -1,3 +1,5 @@
|
|
|
1
|
+
# Test syntax: cat ~/sjpp/test.txt | time Rscript edge.R
|
|
2
|
+
|
|
1
3
|
# Load required packages
|
|
2
4
|
suppressWarnings({
|
|
3
5
|
library(jsonlite)
|
|
@@ -8,13 +10,39 @@ suppressWarnings({
|
|
|
8
10
|
suppressPackageStartupMessages(library(dplyr))
|
|
9
11
|
})
|
|
10
12
|
|
|
13
|
+
# Keep only the most variable genes (rows) of a read-count table.
#
# Variability is measured as the standard deviation of each row across all
# sample columns, ignoring any case/control grouping.
#
# Arguments:
#   read_counts        Table with one row per gene and one column per sample
#                      (data frame or matrix).
#   gene_id_symbols    Vector of gene labels, one per row of `read_counts`.
#   num_variable_genes Number of top-ranked rows to keep; passed to head(),
#                      so a value larger than the row count keeps every row.
#
# Returns:
#   A list with `read_counts` (the retained rows, ordered from most to least
#   variable, same columns as the input) and `gene_id_symbols` (the labels of
#   those rows in the same order).
filter_genes_by_global_variance <- function(read_counts, gene_id_symbols, num_variable_genes) {
  if (length(gene_id_symbols) != nrow(read_counts)) {
    stop("gene_id_symbols must contain exactly one entry per row of read_counts")
  }

  # Standard deviation of every row across all samples
  row_sd <- apply(read_counts, 1, sd)

  # Row indices ranked from most to least variable, truncated to the
  # requested number of genes
  keep <- head(order(row_sd, decreasing = TRUE), num_variable_genes)

  # Subset by index instead of attaching helper columns to the table: this
  # cannot overwrite a sample column that happens to be called "Row_SD" or
  # "gene_id_symbols", and drop = FALSE keeps the table two-dimensional even
  # when there is only a single sample column.
  list(
    read_counts = read_counts[keep, , drop = FALSE],
    gene_id_symbols = gene_id_symbols[keep]
  )
}
|
|
32
|
+
|
|
33
|
+
# Will implement this later
|
|
34
|
+
# Placeholder for filtering genes by per-group variance (not implemented yet).
#
# At present the function only splits the count table into its case and
# control sample columns; no ranking or filtering happens, and
# `gene_id_symbols` and `num_variable_genes` are not used. Because the last
# statement is an assignment, the control subset is returned invisibly.
filter_genes_by_group_variance <- function(read_counts, gene_id_symbols, num_variable_genes, cases, controls) {
  # Columns belonging to the case samples
  case_counts <- read_counts[, cases]
  # Columns belonging to the control samples
  control_counts <- read_counts[, controls]
}
|
|
39
|
+
|
|
11
40
|
# Read JSON input from stdin
|
|
12
41
|
read_json_time <- system.time({
|
|
13
42
|
con <- file("stdin", "r")
|
|
14
43
|
json <- readLines(con, warn=FALSE)
|
|
15
44
|
close(con)
|
|
16
45
|
input <- fromJSON(json)
|
|
17
|
-
|
|
18
46
|
cases <- unlist(strsplit(input$case, ","))
|
|
19
47
|
controls <- unlist(strsplit(input$control, ","))
|
|
20
48
|
combined <- c("geneID", "geneSymbol", cases, controls)
|
|
@@ -27,11 +55,11 @@ read_counts_time <- system.time({
|
|
|
27
55
|
geneIDs <- h5read(input$input_file, "gene_names")
|
|
28
56
|
geneSymbols <- h5read(input$input_file, "gene_symbols")
|
|
29
57
|
samples <- h5read(input$input_file, "samples")
|
|
30
|
-
|
|
58
|
+
|
|
31
59
|
# Find indices of case and control samples in the HDF5 file
|
|
32
60
|
case_indices <- match(cases, samples)
|
|
33
61
|
control_indices <- match(controls, samples)
|
|
34
|
-
|
|
62
|
+
|
|
35
63
|
# Check for missing samples
|
|
36
64
|
if (any(is.na(case_indices))) {
|
|
37
65
|
missing_cases <- cases[is.na(case_indices)]
|
|
@@ -41,9 +69,9 @@ read_counts_time <- system.time({
|
|
|
41
69
|
missing_controls <- controls[is.na(control_indices)]
|
|
42
70
|
stop(paste(missing_controls, "not found"))
|
|
43
71
|
}
|
|
44
|
-
|
|
72
|
+
|
|
45
73
|
samples_indices <- c(case_indices, control_indices)
|
|
46
|
-
read_counts <- t(h5read(input$input_file, "counts", index = list(samples_indices, 1:length(geneIDs))))
|
|
74
|
+
read_counts <- as.data.frame(t(h5read(input$input_file, "counts", index = list(samples_indices, 1:length(geneIDs)))))
|
|
47
75
|
colnames(read_counts) <- c(cases, controls)
|
|
48
76
|
} else if (input$storage_type == "text") {
|
|
49
77
|
suppressWarnings({
|
|
@@ -65,6 +93,21 @@ read_counts_time <- system.time({
|
|
|
65
93
|
conditions <- c(rep("Diseased", length(cases)), rep("Control", length(controls)))
|
|
66
94
|
gene_id_symbols <- paste0(geneIDs, "\t", geneSymbols)
|
|
67
95
|
|
|
96
|
+
# Optionally restrict the DE analysis to the most variable genes, and time it.
# Filtering only runs when the JSON input carries a non-empty `VarGenes`
# entry; otherwise `read_counts` and `gene_id_symbols` are left as they are.
filter_genes_time <- system.time({
  if (length(input$VarGenes) > 0) {
    # Rank genes by their variation across all samples and keep the top ones
    filtered_read_counts <- filter_genes_by_global_variance(
      read_counts,
      gene_id_symbols,
      input$VarGenes
    )
    read_counts <- filtered_read_counts$read_counts
    gene_id_symbols <- filtered_read_counts$gene_id_symbols

    # Filtering by per-group variance is planned but not wired in yet:
    #filtered_read_counts <- filter_genes_by_group_variance(read_counts, gene_id_symbols, num_variable_genes, cases, controls)
    #read_counts <- filtered_read_counts$read_counts
    #gene_id_symbols <- filtered_read_counts$gene_id_symbols
  }
})
|
|
108
|
+
|
|
109
|
+
#cat("Time to filter genes: ", filter_genes_time[3], " seconds\n")
|
|
110
|
+
|
|
68
111
|
# Create DGEList object
|
|
69
112
|
dge_list_time <- system.time({
|
|
70
113
|
y <- DGEList(counts = read_counts, group = conditions, genes = gene_id_symbols)
|
|
@@ -93,7 +136,7 @@ if (length(input$conf1) == 0) { # No adjustment of confounding factors
|
|
|
93
136
|
})
|
|
94
137
|
})
|
|
95
138
|
#cat("Dispersion time: ", dispersion_time[3], " seconds\n")
|
|
96
|
-
|
|
139
|
+
|
|
97
140
|
exact_test_time <- system.time({
|
|
98
141
|
et <- exactTest(y)
|
|
99
142
|
})
|
|
@@ -104,17 +147,17 @@ if (length(input$conf1) == 0) { # No adjustment of confounding factors
|
|
|
104
147
|
design <- model.matrix(~ conf1 + conditions, data = y$samples)
|
|
105
148
|
})
|
|
106
149
|
#cat("Time for making design matrix: ", model_gen_time[3], " seconds\n")
|
|
107
|
-
|
|
150
|
+
|
|
108
151
|
dispersion_time <- system.time({
|
|
109
152
|
y <- estimateDisp(y, design)
|
|
110
153
|
})
|
|
111
154
|
#cat("Dispersion time: ", dispersion_time[3], " seconds\n")
|
|
112
|
-
|
|
155
|
+
|
|
113
156
|
fit_time <- system.time({
|
|
114
157
|
fit <- glmFit(y, design)
|
|
115
158
|
})
|
|
116
159
|
#cat("Fit time: ", fit_time[3], " seconds\n")
|
|
117
|
-
|
|
160
|
+
|
|
118
161
|
test_statistics_time <- system.time({
|
|
119
162
|
et <- glmLRT(fit, coef = 2)
|
|
120
163
|
})
|