@sjcrh/proteinpaint-server 2.103.0 → 2.105.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
package/utils/edge.R CHANGED
@@ -1,3 +1,5 @@
1
+ # Test syntax: cat ~/sjpp/test.txt | time Rscript edge.R
2
+
1
3
  # Load required packages
2
4
  suppressWarnings({
3
5
  library(jsonlite)
@@ -8,13 +10,39 @@ suppressWarnings({
8
10
  suppressPackageStartupMessages(library(dplyr))
9
11
  })
10
12
 
13
# Keep only the most variable genes across all samples.
#
# Every gene (row of `read_counts`) is ranked by the standard deviation of its
# read counts across all sample columns, and the `num_variable_genes`
# highest-ranked rows are retained.
#
# Arguments:
#   read_counts         Data frame or matrix of read counts with one row per
#                       gene and one column per sample.
#   gene_id_symbols     Vector of gene labels, one per row of `read_counts`
#                       and in the same row order.
#   num_variable_genes  Number of top-ranked genes to keep (passed to head()).
#
# Returns a list with two elements:
#   read_counts      The retained rows, ordered by decreasing standard
#                    deviation. Keeps the container type of the input, even
#                    when only one row or one column is left.
#   gene_id_symbols  Labels of the retained rows, in the same order.
filter_genes_by_global_variance <- function(read_counts, gene_id_symbols, num_variable_genes) {
  # A shorter label vector would otherwise be recycled silently and attach
  # the wrong label to some genes.
  if (length(gene_id_symbols) != nrow(read_counts)) {
    stop("gene_id_symbols must have one entry per row of read_counts", call. = FALSE)
  }

  # Standard deviation of each gene (row) across all samples
  row_sd <- apply(read_counts, 1, sd)

  # Row indices of the most variable genes, highest standard deviation first.
  # Ranking by index avoids adding helper columns to the sample table, so it
  # works for matrices as well and cannot clash with a sample's column name.
  top_rows <- head(order(row_sd, decreasing = TRUE), num_variable_genes)

  list(
    # drop = FALSE keeps a table even if a single row or column remains
    read_counts = read_counts[top_rows, , drop = FALSE],
    gene_id_symbols = gene_id_symbols[top_rows]
  )
}
32
+
33
+ # Will implement this later
34
# Placeholder for filtering genes by per-group variance (not implemented yet).
#
# At present the function only splits the sample columns of `read_counts`
# into the case and control groups. `gene_id_symbols` and
# `num_variable_genes` are accepted but not used. The control-group subset is
# returned invisibly; no filtering is performed.
filter_genes_by_group_variance <- function(read_counts, gene_id_symbols, num_variable_genes, cases, controls) {
  # Split the read counts into the two comparison groups (cases first)
  group_counts <- list(
    case = read_counts[, cases],
    control = read_counts[, controls]
  )
  invisible(group_counts[["control"]])
}
39
+
11
40
  # Read JSON input from stdin
12
41
  read_json_time <- system.time({
13
42
  con <- file("stdin", "r")
14
43
  json <- readLines(con, warn=FALSE)
15
44
  close(con)
16
45
  input <- fromJSON(json)
17
-
18
46
  cases <- unlist(strsplit(input$case, ","))
19
47
  controls <- unlist(strsplit(input$control, ","))
20
48
  combined <- c("geneID", "geneSymbol", cases, controls)
@@ -27,11 +55,11 @@ read_counts_time <- system.time({
27
55
  geneIDs <- h5read(input$input_file, "gene_names")
28
56
  geneSymbols <- h5read(input$input_file, "gene_symbols")
29
57
  samples <- h5read(input$input_file, "samples")
30
-
58
+
31
59
  # Find indices of case and control samples in the HDF5 file
32
60
  case_indices <- match(cases, samples)
33
61
  control_indices <- match(controls, samples)
34
-
62
+
35
63
  # Check for missing samples
36
64
  if (any(is.na(case_indices))) {
37
65
  missing_cases <- cases[is.na(case_indices)]
@@ -41,9 +69,9 @@ read_counts_time <- system.time({
41
69
  missing_controls <- controls[is.na(control_indices)]
42
70
  stop(paste(missing_controls, "not found"))
43
71
  }
44
-
72
+
45
73
  samples_indices <- c(case_indices, control_indices)
46
- read_counts <- t(h5read(input$input_file, "counts", index = list(samples_indices, 1:length(geneIDs))))
74
+ read_counts <- as.data.frame(t(h5read(input$input_file, "counts", index = list(samples_indices, 1:length(geneIDs)))))
47
75
  colnames(read_counts) <- c(cases, controls)
48
76
  } else if (input$storage_type == "text") {
49
77
  suppressWarnings({
@@ -65,6 +93,21 @@ read_counts_time <- system.time({
65
93
  conditions <- c(rep("Diseased", length(cases)), rep("Control", length(controls)))
66
94
  gene_id_symbols <- paste0(geneIDs, "\t", geneSymbols)
67
95
 
96
# Optionally restrict the DE analysis to the most variable genes and time
# the step. Filtering only runs when the input JSON supplies `VarGenes`, the
# number of genes to keep.
filter_genes_time <- system.time({
  if (length(input$VarGenes) > 0) {
    filtered_read_counts <- filter_genes_by_global_variance(
      read_counts = read_counts,
      gene_id_symbols = gene_id_symbols,
      num_variable_genes = input$VarGenes
    )
    gene_id_symbols <- filtered_read_counts[["gene_id_symbols"]]
    read_counts <- filtered_read_counts[["read_counts"]]

    #### Will implement filtering by per group variance later
    #filtered_read_counts <- filter_genes_by_group_variance(read_counts, gene_id_symbols, num_variable_genes, cases, controls)
    #read_counts <- filtered_read_counts$read_counts
    #gene_id_symbols <- filtered_read_counts$gene_id_symbols
  }
})
108
+
109
+ #cat("Time to filter genes: ", filter_genes_time[3], " seconds\n")
110
+
68
111
  # Create DGEList object
69
112
  dge_list_time <- system.time({
70
113
  y <- DGEList(counts = read_counts, group = conditions, genes = gene_id_symbols)
@@ -93,7 +136,7 @@ if (length(input$conf1) == 0) { # No adjustment of confounding factors
93
136
  })
94
137
  })
95
138
  #cat("Dispersion time: ", dispersion_time[3], " seconds\n")
96
-
139
+
97
140
  exact_test_time <- system.time({
98
141
  et <- exactTest(y)
99
142
  })
@@ -104,17 +147,17 @@ if (length(input$conf1) == 0) { # No adjustment of confounding factors
104
147
  design <- model.matrix(~ conf1 + conditions, data = y$samples)
105
148
  })
106
149
  #cat("Time for making design matrix: ", model_gen_time[3], " seconds\n")
107
-
150
+
108
151
  dispersion_time <- system.time({
109
152
  y <- estimateDisp(y, design)
110
153
  })
111
154
  #cat("Dispersion time: ", dispersion_time[3], " seconds\n")
112
-
155
+
113
156
  fit_time <- system.time({
114
157
  fit <- glmFit(y, design)
115
158
  })
116
159
  #cat("Fit time: ", fit_time[3], " seconds\n")
117
-
160
+
118
161
  test_statistics_time <- system.time({
119
162
  et <- glmLRT(fit, coef = 2)
120
163
  })