biopipen 0.33.1__py3-none-any.whl → 0.34.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of biopipen might be problematic. Click here for more details.

Files changed (149) hide show
  1. biopipen/__init__.py +1 -1
  2. biopipen/core/filters.py +10 -183
  3. biopipen/core/proc.py +5 -3
  4. biopipen/core/testing.py +8 -1
  5. biopipen/ns/bam.py +40 -4
  6. biopipen/ns/cnv.py +1 -1
  7. biopipen/ns/cnvkit.py +1 -1
  8. biopipen/ns/delim.py +1 -1
  9. biopipen/ns/gsea.py +63 -37
  10. biopipen/ns/misc.py +38 -0
  11. biopipen/ns/plot.py +8 -0
  12. biopipen/ns/scrna.py +290 -288
  13. biopipen/ns/scrna_metabolic_landscape.py +207 -366
  14. biopipen/ns/tcr.py +165 -97
  15. biopipen/reports/bam/CNVpytor.svelte +4 -9
  16. biopipen/reports/cnvkit/CNVkitDiagram.svelte +1 -1
  17. biopipen/reports/cnvkit/CNVkitHeatmap.svelte +1 -1
  18. biopipen/reports/cnvkit/CNVkitScatter.svelte +1 -1
  19. biopipen/reports/{delim/SampleInfo.svelte → common.svelte} +2 -3
  20. biopipen/reports/scrna/DimPlots.svelte +1 -1
  21. biopipen/reports/scrna_metabolic_landscape/MetabolicFeatures.svelte +51 -22
  22. biopipen/reports/scrna_metabolic_landscape/MetabolicPathwayActivity.svelte +46 -42
  23. biopipen/reports/scrna_metabolic_landscape/MetabolicPathwayHeterogeneity.svelte +63 -6
  24. biopipen/reports/snp/PlinkCallRate.svelte +2 -2
  25. biopipen/reports/snp/PlinkFreq.svelte +1 -1
  26. biopipen/reports/snp/PlinkHWE.svelte +1 -1
  27. biopipen/reports/snp/PlinkHet.svelte +1 -1
  28. biopipen/reports/snp/PlinkIBD.svelte +1 -1
  29. biopipen/reports/tcr/CDR3AAPhyschem.svelte +1 -1
  30. biopipen/scripts/bam/CNAClinic.R +41 -6
  31. biopipen/scripts/bam/CNVpytor.py +2 -1
  32. biopipen/scripts/bam/ControlFREEC.py +2 -3
  33. biopipen/scripts/bam/SamtoolsView.py +33 -0
  34. biopipen/scripts/cnv/AneuploidyScore.R +25 -13
  35. biopipen/scripts/cnv/AneuploidyScoreSummary.R +218 -163
  36. biopipen/scripts/cnv/TMADScore.R +4 -4
  37. biopipen/scripts/cnv/TMADScoreSummary.R +51 -84
  38. biopipen/scripts/cnvkit/CNVkitGuessBaits.py +3 -3
  39. biopipen/scripts/cnvkit/CNVkitHeatmap.py +3 -3
  40. biopipen/scripts/cnvkit/CNVkitReference.py +3 -3
  41. biopipen/scripts/delim/RowsBinder.R +1 -1
  42. biopipen/scripts/delim/SampleInfo.R +4 -1
  43. biopipen/scripts/gene/GeneNameConversion.R +14 -12
  44. biopipen/scripts/gsea/Enrichr.R +2 -2
  45. biopipen/scripts/gsea/FGSEA.R +184 -50
  46. biopipen/scripts/gsea/PreRank.R +3 -3
  47. biopipen/scripts/misc/Plot.R +80 -0
  48. biopipen/scripts/plot/VennDiagram.R +2 -2
  49. biopipen/scripts/protein/ProdigySummary.R +34 -27
  50. biopipen/scripts/regulatory/MotifAffinityTest.R +11 -9
  51. biopipen/scripts/regulatory/MotifAffinityTest_AtSNP.R +5 -5
  52. biopipen/scripts/regulatory/MotifAffinityTest_MotifBreakR.R +4 -4
  53. biopipen/scripts/regulatory/VariantMotifPlot.R +10 -8
  54. biopipen/scripts/regulatory/motifs-common.R +10 -9
  55. biopipen/scripts/rnaseq/Simulation-ESCO.R +14 -11
  56. biopipen/scripts/rnaseq/Simulation-RUVcorr.R +7 -4
  57. biopipen/scripts/rnaseq/Simulation.R +0 -2
  58. biopipen/scripts/rnaseq/UnitConversion.R +6 -5
  59. biopipen/scripts/scrna/AnnData2Seurat.R +25 -73
  60. biopipen/scripts/scrna/CellCellCommunication.py +1 -1
  61. biopipen/scripts/scrna/CellCellCommunicationPlots.R +51 -168
  62. biopipen/scripts/scrna/CellTypeAnnotation-celltypist.R +99 -150
  63. biopipen/scripts/scrna/CellTypeAnnotation-direct.R +11 -9
  64. biopipen/scripts/scrna/CellTypeAnnotation-hitype.R +12 -9
  65. biopipen/scripts/scrna/CellTypeAnnotation-sccatch.R +14 -11
  66. biopipen/scripts/scrna/CellTypeAnnotation-sctype.R +19 -16
  67. biopipen/scripts/scrna/CellTypeAnnotation.R +10 -2
  68. biopipen/scripts/scrna/CellsDistribution.R +1 -1
  69. biopipen/scripts/scrna/ExprImputation-alra.R +87 -11
  70. biopipen/scripts/scrna/ExprImputation-rmagic.R +247 -21
  71. biopipen/scripts/scrna/ExprImputation-scimpute.R +8 -5
  72. biopipen/scripts/scrna/MarkersFinder.R +348 -217
  73. biopipen/scripts/scrna/MetaMarkers.R +3 -3
  74. biopipen/scripts/scrna/ModuleScoreCalculator.R +14 -13
  75. biopipen/scripts/scrna/RadarPlots.R +1 -1
  76. biopipen/scripts/scrna/ScFGSEA.R +157 -75
  77. biopipen/scripts/scrna/ScSimulation.R +11 -10
  78. biopipen/scripts/scrna/ScVelo.py +605 -0
  79. biopipen/scripts/scrna/Seurat2AnnData.R +2 -3
  80. biopipen/scripts/scrna/SeuratClusterStats-clustree.R +1 -1
  81. biopipen/scripts/scrna/SeuratClusterStats-features.R +39 -30
  82. biopipen/scripts/scrna/SeuratClusterStats-ngenes.R +56 -65
  83. biopipen/scripts/scrna/SeuratClusterStats-stats.R +4 -4
  84. biopipen/scripts/scrna/SeuratClusterStats.R +9 -6
  85. biopipen/scripts/scrna/SeuratClustering.R +31 -48
  86. biopipen/scripts/scrna/SeuratLoading.R +2 -2
  87. biopipen/scripts/scrna/SeuratMap2Ref.R +66 -367
  88. biopipen/scripts/scrna/SeuratMetadataMutater.R +5 -7
  89. biopipen/scripts/scrna/SeuratPreparing.R +76 -24
  90. biopipen/scripts/scrna/SeuratSubClustering.R +46 -185
  91. biopipen/scripts/scrna/{SlingShot.R → Slingshot.R} +12 -16
  92. biopipen/scripts/scrna/Subset10X.R +2 -2
  93. biopipen/scripts/scrna/TopExpressingGenes.R +141 -184
  94. biopipen/scripts/scrna/celltypist-wrapper.py +6 -4
  95. biopipen/scripts/scrna/seurat_anndata_conversion.py +81 -0
  96. biopipen/scripts/scrna_metabolic_landscape/MetabolicFeatures.R +429 -123
  97. biopipen/scripts/scrna_metabolic_landscape/MetabolicPathwayActivity.R +346 -245
  98. biopipen/scripts/scrna_metabolic_landscape/MetabolicPathwayHeterogeneity.R +182 -173
  99. biopipen/scripts/snp/MatrixEQTL.R +39 -20
  100. biopipen/scripts/snp/PlinkCallRate.R +43 -34
  101. biopipen/scripts/snp/PlinkFreq.R +34 -41
  102. biopipen/scripts/snp/PlinkHWE.R +23 -18
  103. biopipen/scripts/snp/PlinkHet.R +26 -22
  104. biopipen/scripts/snp/PlinkIBD.R +30 -34
  105. biopipen/scripts/stats/ChowTest.R +9 -8
  106. biopipen/scripts/stats/DiffCoexpr.R +13 -11
  107. biopipen/scripts/stats/LiquidAssoc.R +7 -8
  108. biopipen/scripts/stats/Mediation.R +8 -8
  109. biopipen/scripts/stats/MetaPvalue.R +11 -13
  110. biopipen/scripts/stats/MetaPvalue1.R +6 -5
  111. biopipen/scripts/tcr/CDR3AAPhyschem.R +105 -164
  112. biopipen/scripts/tcr/ClonalStats.R +5 -4
  113. biopipen/scripts/tcr/CloneResidency.R +3 -3
  114. biopipen/scripts/tcr/CloneSizeQQPlot.R +2 -2
  115. biopipen/scripts/tcr/Immunarch2VDJtools.R +2 -2
  116. biopipen/scripts/tcr/ImmunarchFilter.R +3 -3
  117. biopipen/scripts/tcr/ImmunarchLoading.R +5 -5
  118. biopipen/scripts/tcr/ScRepCombiningExpression.R +39 -0
  119. biopipen/scripts/tcr/ScRepLoading.R +114 -92
  120. biopipen/scripts/tcr/TCRClusterStats.R +2 -2
  121. biopipen/scripts/tcr/TCRClustering.R +86 -97
  122. biopipen/scripts/tcr/TESSA.R +65 -115
  123. biopipen/scripts/tcr/VJUsage.R +5 -5
  124. biopipen/scripts/vcf/TruvariBenchSummary.R +15 -11
  125. biopipen/utils/common_docstrs.py +66 -63
  126. biopipen/utils/reporter.py +177 -0
  127. {biopipen-0.33.1.dist-info → biopipen-0.34.0.dist-info}/METADATA +2 -1
  128. {biopipen-0.33.1.dist-info → biopipen-0.34.0.dist-info}/RECORD +130 -144
  129. {biopipen-0.33.1.dist-info → biopipen-0.34.0.dist-info}/WHEEL +1 -1
  130. biopipen/reports/scrna/CellCellCommunicationPlots.svelte +0 -14
  131. biopipen/reports/scrna/SeuratClusterStats.svelte +0 -16
  132. biopipen/reports/scrna/SeuratMap2Ref.svelte +0 -37
  133. biopipen/reports/scrna/SeuratPreparing.svelte +0 -15
  134. biopipen/reports/scrna_metabolic_landscape/MetabolicFeaturesIntraSubset.svelte +0 -28
  135. biopipen/reports/utils/gsea.liq +0 -110
  136. biopipen/scripts/scrna/CellTypeAnnotation-common.R +0 -10
  137. biopipen/scripts/scrna/SeuratClustering-common.R +0 -213
  138. biopipen/scripts/scrna_metabolic_landscape/MetabolicFeaturesIntraSubset.R +0 -193
  139. biopipen/utils/caching.R +0 -44
  140. biopipen/utils/gene.R +0 -95
  141. biopipen/utils/gsea.R +0 -329
  142. biopipen/utils/io.R +0 -20
  143. biopipen/utils/misc.R +0 -602
  144. biopipen/utils/mutate_helpers.R +0 -581
  145. biopipen/utils/plot.R +0 -209
  146. biopipen/utils/repr.R +0 -146
  147. biopipen/utils/rnaseq.R +0 -48
  148. biopipen/utils/single_cell.R +0 -207
  149. {biopipen-0.33.1.dist-info → biopipen-0.34.0.dist-info}/entry_points.txt +0 -0
@@ -1,53 +1,57 @@
1
- {{ biopipen_dir | joinpaths: "utils", "misc.R" | source_r }}
2
- {{ biopipen_dir | joinpaths: "utils", "gsea.R" | source_r }}
3
- {{ biopipen_dir | joinpaths: "utils", "plot.R" | source_r }}
4
-
5
- library(scater)
6
- library(reshape2)
7
- library(RColorBrewer)
1
+ library(rlang)
8
2
  library(parallel)
9
- library(ggprism)
3
+ library(matrixStats)
4
+ library(enrichit)
10
5
  library(Seurat)
11
- library(ComplexHeatmap)
6
+ library(biopipen.utils)
7
+ library(plotthis)
8
+ library(tidyseurat)
12
9
 
13
10
  sobjfile <- {{ in.sobjfile | r }}
14
11
  outdir <- {{ out.outdir | r }}
15
- gmtfile <- {{ envs.gmtfile | r }}
16
12
  ntimes <- {{ envs.ntimes | r }}
17
13
  ncores <- {{ envs.ncores | r }}
18
- heatmap_devpars <- {{ envs.heatmap_devpars | r }}
19
- violin_devpars <- {{ envs.violin_devpars | r }}
20
- grouping <- {{ envs.grouping | r }}
21
- grouping_prefix <- {{ envs.grouping_prefix | r }}
22
- subsetting_cols <- {{ envs.subsetting | r }}
23
- subsetting_prefix <- {{ envs.subsetting_prefix | r }}
24
-
25
- if (!is.null(grouping_prefix) && nchar(grouping_prefix) > 0) {
26
- grouping_prefix = paste0(grouping_prefix, "_")
27
- }
28
-
29
- if (!is.null(subsetting_prefix) && nchar(subsetting_prefix) > 0) {
30
- subsetting_prefix = paste0(subsetting_prefix, "_")
31
- }
14
+ gmtfile <- {{ envs.gmtfile | r }}
15
+ subset_by <- {{ envs.subset_by | r }}
16
+ group_by <- {{ envs.group_by | r }}
17
+ plots <- {{ envs.plots | r }}
18
+ cases <- {{ envs.cases | r }}
32
19
 
33
20
  set.seed(8525)
34
21
 
35
- ## gmt_pathways is copied from fgsea package.
36
- gmt_pathways <- function(gmt_file) {
37
- pathway_lines <- strsplit(readLines(gmt_file), "\t")
38
- pathways <- lapply(pathway_lines, tail, -2)
39
- names(pathways) <- sapply(pathway_lines, head, 1)
40
- pathways
41
- }
22
+ log <- get_logger()
23
+ reporter <- get_reporter()
24
+
25
+ log$info("Loading Seurat object ...")
26
+ sobj <- read_obj(sobjfile)
27
+ assay <- DefaultAssay(sobj)
28
+
29
+ defaults <- list(
30
+ ntimes = ntimes,
31
+ subset_by = subset_by,
32
+ group_by = group_by,
33
+ plots = plots
34
+ )
35
+ log$info("Expanding cases ...")
36
+ default_case <- subset_by %||% "DEFAULT"
37
+ cases <- expand_cases(
38
+ cases,
39
+ defaults,
40
+ function(name, case) {
41
+ if (is.null(case$group_by)) {
42
+ stop("'group_by' is required in case: ", name)
43
+ }
44
+ stats::setNames(list(case), name)
45
+ },
46
+ default_case = default_case)
42
47
 
43
- gmtfile <- localizeGmtfile(gmtfile)
44
- pathways <- gmt_pathways(gmtfile)
48
+ log$info("Loading metabolic pathways ...")
49
+ pathways <- ParseGMT(gmtfile)
45
50
  pathway_names <- names(pathways)
46
51
  metabolics <- unique(as.vector(unname(unlist(pathways))))
47
- sobj <- readRDS(sobjfile)
48
52
 
49
53
  ## calculate how many pathways of one gene involved.
50
- num_of_pathways <- function(gmtfile, overlapgenes) {
54
+ num_of_pathways <- function(overlapgenes) {
51
55
  filter_pathways <- list()
52
56
  for (p in pathway_names) {
53
57
  genes <- pathways[[p]]
@@ -70,83 +74,94 @@ num_of_pathways <- function(gmtfile, overlapgenes) {
70
74
  gene_times
71
75
  }
72
76
 
73
- do_one_subset <- function(s, subset_col, subset_prefix) {
74
- log_info(" Processing subset: {s} ...")
75
- if (is.null(s)) {
76
- subset_dir <- file.path(outdir, "ALL")
77
- dir.create(subset_dir, showWarnings = FALSE)
78
- subset_obj <- sobj
79
- } else {
80
- subset_dir <- file.path(outdir, paste0(subset_prefix, s))
81
- dir.create(subset_dir, showWarnings = FALSE)
82
-
83
- subset_code = paste0(
84
- "subset(sobj, subset = ", subset_col, " == '", s, "')"
77
+ do_subset <- function(
78
+ object,
79
+ caseinfo,
80
+ subset_by,
81
+ subset_val,
82
+ ntimes,
83
+ group_by,
84
+ plots
85
+ ) {
86
+ if (!is.null(subset_val)) {
87
+ log$info("- Handling subset: {subset_by} = {subset_val} ...")
88
+ object <- tryCatch(
89
+ filter(object, !!sym(subset_by) == subset_val & !is.na(!!sym(group_by))),
90
+ error = function(e) NULL
85
91
  )
86
- subset_obj = eval(parse(text = subset_code))
92
+
93
+ if (is.null(object) || ncol(object) < 5) {
94
+ msg <- paste0(" ! skipped. Subset has less than 5 cells: ", subset_by, " = ", subset_val)
95
+ log$warn(msg)
96
+ reporter$add(list(kind = "error", content = msg), h1 = caseinfo$name)
97
+ return(NULL)
98
+ }
87
99
  }
88
100
 
89
- all_cell_types <- subset_obj@meta.data[[grouping]]
90
- cell_types <- unique(all_cell_types)
101
+ all_groups <- object@meta.data[[group_by]]
102
+ if (!is.factor(all_groups)) {
103
+ all_groups <- factor(all_groups)
104
+ }
105
+ # order by levels(all_groups)
106
+ groups <- intersect(levels(all_groups), unique(all_groups))
91
107
 
92
- gene_pathway_number <- num_of_pathways(
93
- gmtfile,
94
- intersect(rownames(subset_obj), metabolics)
95
- )
108
+ gene_pathway_number <- num_of_pathways(intersect(rownames(object), metabolics))
96
109
 
97
110
  ## Calculate the pathway activities
98
111
  # mean ratio of genes in each pathway for each cell type
99
112
  mean_expression_shuffle <- matrix(
100
113
  NA,
101
114
  nrow = length(pathway_names),
102
- ncol = length(cell_types),
103
- dimnames = list(pathway_names, cell_types)
115
+ ncol = length(groups),
116
+ dimnames = list(pathway_names, groups)
104
117
  )
105
118
  mean_expression_noshuffle <- matrix(
106
119
  NA,
107
120
  nrow = length(pathway_names),
108
- ncol = length(cell_types),
109
- dimnames = list(pathway_names, cell_types)
121
+ ncol = length(groups),
122
+ dimnames = list(pathway_names, groups)
110
123
  )
111
124
  ### calculate the pvalues using shuffle method
112
125
  pvalues_mat <- matrix(
113
126
  NA,
114
127
  nrow = length(pathway_names),
115
- ncol = length(cell_types),
116
- dimnames = (list(pathway_names, cell_types))
128
+ ncol = length(groups),
129
+ dimnames = (list(pathway_names, groups))
117
130
  )
118
131
 
119
132
  for (pi in seq_along(pathway_names)) {
120
133
  p <- pathway_names[pi]
121
- log_info(" * Pathway ({pi}): {p} ...")
134
+ log$info(" Pathway ({pi}/{length(pathway_names)}): {p} ...")
122
135
  genes <- pathways[[p]]
123
- genes_comm <- intersect(genes, rownames(subset_obj))
124
- genes_expressed <- names(rowSums(subset_obj)[rowSums(subset_obj) > 0])
125
- genes_comm <- intersect(genes_comm, genes_expressed)
136
+ genes_comm <- intersect(genes, rownames(object))
137
+ # genes_expressed <- names(rowSums(object)[rowSums(object) > 0])
138
+ # genes_comm <- intersect(genes_comm, genes_expressed)
126
139
  if (length(genes_comm) < 5) next
127
140
 
128
141
  # Errored if default assay is SCT
129
142
  # Issue with Seurat?
130
- # pathway_metabolic_obj <- subset(subset_obj, features = genes_comm)
131
- assay <- DefaultAssay(subset_obj)
132
- mean_exp_eachCellType <- AverageExpression(subset_obj, features = genes_comm, assays = assay)[[assay]]
143
+ # pathway_metabolic_obj <- subset(object, features = genes_comm)
144
+ # assay <- DefaultAssay(object)
145
+ ## AggregateExpression raises Warning: The counts layer for the integrated assay is empty. Skipping assay.
146
+ mean_exp_eachCellType <- suppressMessages(AverageExpression(object, features = genes_comm, assays = assay, group.by = group_by))[[assay]]
133
147
 
134
148
  # remove genes which are zeros in any celltype to avoid extreme ratio value
135
- keep <- rownames(mean_exp_eachCellType)[rowAlls(mean_exp_eachCellType > 0.001, useNames = F)]
149
+ keep <- rownames(mean_exp_eachCellType)[rowAlls(as.matrix(mean_exp_eachCellType) > 0.001, useNames = F)]
136
150
  if (length(keep) < 3) next
137
151
 
138
152
  # using the loweset value to replace zeros for avoiding extreme ratio value
139
- # pathway_metabolic_obj <- subset(subset_obj, features = keep)
140
- assay_data = GetAssayData(subset_obj, assay = assay, layer = "data")[keep, , drop = F]
153
+ # pathway_metabolic_obj <- subset(object, features = keep)
154
+ assay_data = GetAssayData(object, assay = assay, layer = "data")[keep, , drop = F]
141
155
  assay_data <- t(apply(assay_data, 1, function(x) {
142
156
  x[x <= 0] <- min(x[x > 0])
143
157
  x
144
158
  }))
145
- pathway_metabolic_obj <- CreateSeuratObject(CreateAssayObject(data = assay_data), assay = assay)
146
- Idents(pathway_metabolic_obj) <- Idents(subset_obj)
159
+ pathway_metabolic_obj <- suppressWarnings(CreateSeuratObject(CreateAssayObject(data = assay_data), assay = assay))
160
+ pathway_metabolic_obj[[group_by]] <- object[[group_by]]
161
+ Idents(pathway_metabolic_obj) <- Idents(object)
147
162
  pathway_number_weight <- 1 / gene_pathway_number[keep, ]
148
163
  #
149
- mean_exp_eachCellType <- t(AverageExpression(pathway_metabolic_obj, assays = assay)[[assay]])
164
+ mean_exp_eachCellType <- t(suppressMessages(AverageExpression(pathway_metabolic_obj, assays = assay, group.by = group_by)[[assay]]))
150
165
  ratio_exp_eachCellType <- t(mean_exp_eachCellType) / colMeans(mean_exp_eachCellType)
151
166
  # exclude the extreme ratios
152
167
  col_quantile <- apply(ratio_exp_eachCellType, 2, function(x) quantile(x, na.rm = T))
@@ -161,21 +176,21 @@ do_one_subset <- function(s, subset_col, subset_prefix) {
161
176
  if (sum(!outliers) < 3) next
162
177
 
163
178
  keep <- names(outliers)[!outliers]
164
- pathway_metabolic_obj <- subset(pathway_metabolic_obj, features = keep)
179
+ pathway_metabolic_obj <- suppressWarnings(subset(pathway_metabolic_obj, features = keep))
165
180
  pathway_number_weight <- 1 / gene_pathway_number[keep, ]
166
- mean_exp_eachCellType <- t(AverageExpression(pathway_metabolic_obj, assays = assay)[[assay]])
181
+ mean_exp_eachCellType <- t(suppressMessages(AverageExpression(pathway_metabolic_obj, assays = assay, group.by = group_by)[[assay]]))
167
182
  ratio_exp_eachCellType <- t(mean_exp_eachCellType) / colMeans(mean_exp_eachCellType)
168
183
  mean_exp_pathway <- apply(ratio_exp_eachCellType, 2, function(x) weighted.mean(x, pathway_number_weight / sum(pathway_number_weight)))
169
- mean_expression_shuffle[p, ] <- mean_exp_pathway[cell_types]
170
- mean_expression_noshuffle[p, ] <- mean_exp_pathway[cell_types]
184
+ mean_expression_shuffle[p, ] <- mean_exp_pathway[groups]
185
+ mean_expression_noshuffle[p, ] <- mean_exp_pathway[groups]
171
186
  pathway_metabolic_data <- GetAssayData(pathway_metabolic_obj)
172
187
 
173
188
  ## shuffle 5000 times:
174
189
  ## define the functions
175
190
  group_mean <- function(x) {
176
191
  sapply(
177
- cell_types,
178
- function(y) rowMeans(pathway_metabolic_data[, shuffle_cell_types_list[[x]] == y, drop = F])
192
+ groups,
193
+ function(y) rowMeans(pathway_metabolic_data[, shuffle_groups_list[[x]] == y, drop = F])
179
194
  )
180
195
  }
181
196
  column_weigth_mean <- function(x) {
@@ -184,9 +199,9 @@ do_one_subset <- function(s, subset_col, subset_prefix) {
184
199
  #####
185
200
  times <- 1:ntimes
186
201
  weight_values <- pathway_number_weight / sum(pathway_number_weight)
187
- shuffle_cell_types_list <- mclapply(times, function(x) sample(all_cell_types), mc.cores = ncores)
188
- # shuffle_cell_types_list <- lapply(times, function(x) sample(all_cell_types))
189
- names(shuffle_cell_types_list) <- times
202
+ shuffle_groups_list <- mclapply(times, function(x) sample(all_groups), mc.cores = ncores)
203
+ # shuffle_groups_list <- lapply(times, function(x) sample(all_groups))
204
+ names(shuffle_groups_list) <- times
190
205
  mean_exp_eachCellType_list <- mclapply(times, function(x) group_mean(x), mc.cores = ncores)
191
206
  # mean_exp_eachCellType_list <- lapply(times, function(x) group_mean(x))
192
207
  ratio_exp_eachCellType_list <- mclapply(times, function(x) mean_exp_eachCellType_list[[x]] / rowMeans(mean_exp_eachCellType_list[[x]]), mc.cores = ncores)
@@ -194,10 +209,10 @@ do_one_subset <- function(s, subset_col, subset_prefix) {
194
209
  mean_exp_pathway_list <- mclapply(times, function(x) column_weigth_mean(x), mc.cores = ncores)
195
210
  # mean_exp_pathway_list <- lapply(times, function(x) column_weigth_mean(x))
196
211
 
197
- shuffle_results <- matrix(unlist(mean_exp_pathway_list), ncol = length(cell_types), byrow = T)
212
+ shuffle_results <- matrix(unlist(mean_exp_pathway_list), ncol = length(groups), byrow = T)
198
213
  rownames(shuffle_results) <- times
199
- colnames(shuffle_results) <- cell_types
200
- for (c in cell_types) {
214
+ colnames(shuffle_results) <- groups
215
+ for (c in groups) {
201
216
  if (is.na(mean_expression_shuffle[p, c])) next
202
217
  if (mean_expression_shuffle[p, c] > 1) {
203
218
  pval <- sum(shuffle_results[, c] > mean_expression_shuffle[p, c]) / ntimes
@@ -208,8 +223,15 @@ do_one_subset <- function(s, subset_col, subset_prefix) {
208
223
  pvalues_mat[p, c] <- pval
209
224
  }
210
225
  }
211
- all_NA <- rowAlls(is.na(mean_expression_shuffle), useNames = F)
212
- mean_expression_shuffle <- mean_expression_shuffle[!all_NA, , drop = F]
226
+ all_NA <- rowAlls(is.na(as.matrix(mean_expression_shuffle)), useNames = F)
227
+ if (all(all_NA)) {
228
+ log$warn(" ! All pathways are NA after shuffling.")
229
+ # keep at least 3 pathways for plotting
230
+ mean_expression_shuffle <- mean_expression_shuffle[1:3, , drop = F]
231
+ mean_expression_shuffle[is.na(mean_expression_shuffle)] <- 1
232
+ } else {
233
+ mean_expression_shuffle <- mean_expression_shuffle[!all_NA, , drop = F]
234
+ }
213
235
  # heatmap
214
236
  dat <- mean_expression_shuffle
215
237
 
@@ -217,181 +239,260 @@ do_one_subset <- function(s, subset_col, subset_prefix) {
217
239
  sort_column <- c()
218
240
 
219
241
  for (i in colnames(dat)) {
220
- select_row <- which(rowMaxs(dat, na.rm = T, useNames = F) == dat[, i])
221
- tmp <- rownames(dat)[select_row][order(dat[select_row, i], decreasing = T)]
222
- sort_row <- c(sort_row, tmp)
242
+ select_row <- which(rowMaxs(dat, na.rm = TRUE, useNames = FALSE) == dat[, i])
243
+ tmp <- rownames(dat)[select_row][order(dat[select_row, i], decreasing = TRUE)]
244
+ sort_row <- unique(c(sort_row, tmp))
223
245
  }
224
- sort_column <- apply(dat[sort_row, , drop = F], 2, function(x) order(x)[nrow(dat)])
246
+ sort_column <- apply(dat[sort_row, , drop = FALSE], 2, function(x) order(x)[nrow(dat)])
225
247
  sort_column <- names(sort_column)
226
248
  dat[is.na(dat)] <- 1
227
-
228
- heatmapfile <- file.path(subset_dir, "KEGGpathway_activity_heatmap.png")
229
- hmdata <- dat[sort_row, sort_column, drop = F]
230
- cnames <- sapply(colnames(hmdata), function(x) {paste0(grouping_prefix, x)})
231
- colnames(hmdata) <- cnames
232
- hmdata = hmdata[, sort(cnames), drop=FALSE]
233
- hm_devpars = heatmap_devpars
234
- if (is.null(hm_devpars$res)) {
235
- hm_devpars$res = 100
236
- }
237
- if (is.null(hm_devpars$width)) {
238
- hm_devpars$width = 300 + max(nchar(rownames(hmdata))) * 8 + ncol(hmdata) * 15
239
- }
240
- if (is.null(hm_devpars$height)) {
241
- hm_devpars$height = 400 + max(nchar(colnames(hmdata))) * 8 + nrow(hmdata) * 20
249
+ dat <- dat[sort_row, sort_column, drop = FALSE]
250
+
251
+ if (!is.null(subset_by)) {
252
+ prefix <- file.path(caseinfo$prefix, paste0(slugify(subset_by), "_", slugify(subset_val), "."))
253
+ h2 <- paste0(subset_by, ": ", subset_val)
254
+ } else if (length(cases) > 1) {
255
+ prefix <- paste0(caseinfo$prefix, "/No_Subsetting/")
256
+ dir.create(prefix, showWarnings = FALSE, recursive = TRUE)
257
+ h2 <- "No Subsetting"
258
+ } else {
259
+ prefix <- paste0(caseinfo$prefix, "/")
260
+ h2 <- "#"
242
261
  }
243
- plotHeatmap(
244
- hmdata,
245
- args = list(
246
- name = "Pathway activity",
247
- rect_gp = gpar(col = "white", lwd = 0.5),
248
- row_names_side = "left",
249
- row_dend_side = "right",
250
- row_names_max_width = max_text_width(
251
- rownames(hmdata),
252
- gp = gpar(fontsize = 12)
253
- ),
254
- row_dend_width = unit(30, "mm"),
255
- cluster_columns = FALSE
256
- ),
257
- devpars = hm_devpars,
258
- outfile = heatmapfile
262
+
263
+ write.table(
264
+ mean_expression_noshuffle,
265
+ file = paste0(prefix, "pathway_activity_noshuffle.txt"),
266
+ row.names = TRUE,
267
+ col.names = TRUE,
268
+ quote = FALSE,
269
+ sep = "\t"
270
+ )
271
+ write.table(
272
+ mean_expression_shuffle,
273
+ file = paste0(prefix, "pathway_activity_shuffle.txt"),
274
+ row.names = TRUE,
275
+ col.names = TRUE,
276
+ quote = FALSE,
277
+ sep = "\t"
278
+ )
279
+ write.table(pvalues_mat,
280
+ file = paste0(prefix, "pathway_activity_shuffle_pvalue.txt"),
281
+ row.names = TRUE,
282
+ col.names = TRUE,
283
+ quote = FALSE,
284
+ sep = "\t"
259
285
  )
260
286
 
287
+ for (plotname in names(plots)) {
288
+ plotargs <- plots[[plotname]]
289
+ plotargs$devpars <- plotargs$devpars %||% list()
290
+ plotargs <- extract_vars(plotargs, "devpars", "plot_type")
291
+ devpars <- devpars %||% list()
292
+ devpars$res <- devpars$res %||% 100
293
+ if (plot_type == "merged_heatmap") { next }
294
+ log$info(" Plotting: {plotname} ...")
295
+ if (plot_type %in% c("violin", "box", "boxplot")) {
296
+ plotfn <- if (plot_type == "violin") plotthis::ViolinPlot else plotthis::BoxPlot
297
+ # boxplot show the distribution of pathway activity
298
+ scRNA_dat <- as.data.frame(mean_expression_noshuffle)
299
+ scRNA_dat$X <- NULL
300
+
301
+ # scRNA_df <- reshape2::melt(as.matrix(scRNA_dat))
302
+ # scRNA_df <- scRNA_df[!is.na(scRNA_df$value), ]
303
+ # colnames(scRNA_df)[ncol(scRNA_df) - 1] <- "variable"
304
+ scRNA_dat$Pathways <- rownames(scRNA_dat)
305
+ scRNA_dat <- tidyr::pivot_longer(
306
+ scRNA_dat,
307
+ cols = -c(Pathways),
308
+ names_to = group_by,
309
+ values_to = "Pathway Activity"
310
+ )
261
311
 
262
- write.table(mean_expression_noshuffle, file = file.path(subset_dir, "KEGGpathway_activity_noshuffle.txt"), row.names = T, col.names = T, quote = F, sep = "\t")
263
- write.table(mean_expression_shuffle, file = file.path(subset_dir, "KEGGpathway_activity_shuffle.txt"), row.names = T, col.names = T, quote = F, sep = "\t")
264
- write.table(pvalues_mat, file = file.path(subset_dir, "KEGGpathway_activity_shuffle_pvalue.txt"), row.names = T, col.names = T, quote = F, sep = "\t")
312
+ plotargs$data <- scRNA_dat
313
+ plotargs$x <- group_by
314
+ plotargs$y <- "Pathway Activity"
315
+ plotargs$keep_empty <- TRUE
316
+
317
+ p <- do_call(plotfn, plotargs)
318
+ devpars$width <- devpars$width %||% (attr(p, "width") * devpars$res) %||% 1000
319
+ devpars$height <- devpars$height %||% (attr(p, "height") * devpars$res) %||% 1000
320
+ } else { # heatmap
321
+ minval <- min(dat)
322
+ maxval <- max(dat)
323
+ dis <- max(1 - minval, maxval - 1)
324
+ minval <- 1 - dis
325
+ maxval <- 1 + dis
326
+ dat <- as.data.frame(t(dat)) # rows: groups, columns: pathways
327
+ dat[[group_by]] <- rownames(dat)
328
+ plotargs$data <- dat
329
+ plotargs$columns_by <- group_by
330
+ plotargs$in_form <- "wide-rows"
331
+ plotargs$name <- plotargs$name %||% "Pathway Activity"
332
+ plotargs$rows_name <- plotargs$rows_name %||% "Pathways"
333
+ plotargs$show_row_names <- plotargs$show_row_names %||% TRUE
334
+ plotargs$lower_cutoff <- plotargs$lower_cutoff %||% minval
335
+ plotargs$upper_cutoff <- plotargs$upper_cutoff %||% maxval
336
+ plotargs$row_name_annotation <- plotargs$row_name_annotation %||% FALSE
337
+ plotargs$row_names_side <- plotargs$row_names_side %||% "left"
338
+ plotargs$show_column_names <- plotargs$show_column_names %||% TRUE
339
+
340
+ p <- do_call(plotthis::Heatmap, plotargs)
341
+ devpars$width <- devpars$width %||% (attr(p, "width") * devpars$res) %||% 1000
342
+ devpars$height <- devpars$height %||% (attr(p, "height") * devpars$res) %||% 1000
343
+ }
265
344
 
266
- # boxplot show the distribution of pathway activity
267
- scRNA_dat <- as.data.frame(mean_expression_noshuffle)
268
- scRNA_dat$X <- NULL
345
+ plotprefix <- paste0(prefix, slugify(plotname))
346
+ png(paste0(plotprefix, ".png"), res = devpars$res, width = devpars$width, height = devpars$height)
347
+ print(p)
348
+ dev.off()
269
349
 
270
- scRNA_df <- melt(as.matrix(scRNA_dat))
271
- scRNA_df <- scRNA_df[!is.na(scRNA_df$value), ]
272
- colnames(scRNA_df)[ncol(scRNA_df) - 1] <- "variable"
273
- scRNA_df$variable <- sapply(scRNA_df$variable, function(x) {paste0(grouping_prefix, x)})
274
- violinfile <- file.path(subset_dir, "pathway_activity_violinplot.png")
275
- vio_devpars = violin_devpars
276
- if (is.null(vio_devpars$res)) {
277
- vio_devpars$res = 100
278
- }
279
- if (is.null(vio_devpars$width)) {
280
- vio_devpars$width = 100 + ncol(scRNA_df) * 100
281
- }
282
- if (is.null(hm_devpars$height)) {
283
- vio_devpars$height = 1000
350
+ descr <- plotargs$descr %||% paste0(
351
+ plotname, " a ", plotargs$plot_type, " plot of pathway activity for ", group_by, ". "
352
+ )
353
+
354
+ reporter$add(
355
+ list(name = plotname, contents = list(
356
+ list(kind = "descr", content = descr),
357
+ reporter$image(plotprefix, c(), FALSE))
358
+ ),
359
+ h1 = caseinfo$name,
360
+ h2 = h2,
361
+ ui = "tabs"
362
+ )
284
363
  }
285
- plotViolin(
286
- scRNA_df,
287
- args = list(
288
- mapping = aes(x = variable, y = value, fill = variable),
289
- trim = F,
290
- linewidth = 0.2,
291
- show.legend = F,
292
- width = 1.2
293
- ),
294
- ggs = c(
295
- "scale_y_continuous(limits = c(0, 3), breaks = 0:3, labels = 0:3)",
296
- 'labs(y = "Metabolic Pathway Activity", x=NULL)',
297
- 'stat_summary(
298
- aes(x = variable, y = value),
299
- fun = median,
300
- geom = "point",
301
- size = 1,
302
- color = "black"
303
- )',
304
- "scale_fill_biopipen()",
305
- "theme_prism(axis_text_angle = 90)"
306
- ),
307
- devpars = vio_devpars,
308
- outfile = violinfile
309
- )
310
364
 
311
- list(hmdata=as.data.frame(hmdata), hm_devpars=hm_devpars)
365
+ return(dat)
312
366
  }
313
367
 
314
- do_one_subset_col <- function(subset_col, subset_prefix) {
315
- log_info("- Handling subset column: {subset_col} ...")
316
- if (is.null(subset_col)) {
317
- do_one_subset(NULL, subset_col = NULL, subset_prefix = NULL)
368
+
369
+ do_case <- function(casename) {
370
+ log$info("Processing case: {casename} ...")
371
+ case <- cases[[casename]]
372
+ if (is.null(case) || length(case) == 0) {
373
+ log$warn(" Case skipped.")
374
+ return(NULL)
375
+ }
376
+ caseinfo <- case_info(casename, outdir, create = TRUE)
377
+
378
+ if (is.null(case$subset_by)) {
379
+ result <- do_subset(
380
+ sobj,
381
+ caseinfo = caseinfo,
382
+ subset_by = NULL,
383
+ subset_val = NULL,
384
+ ntimes = case$ntimes,
385
+ group_by = case$group_by,
386
+ plots = case$plots
387
+ )
318
388
  } else {
319
- subsets <- na.omit(unique(sobj@meta.data[[subset_col]]))
320
-
321
- # if (ncores == 1) {
322
- x = lapply(subsets, do_one_subset, subset_col = subset_col, subset_prefix = subset_prefix)
323
- # } else {
324
- # x <- mclapply(subsets, do_one_subset, subset_col = subset_col, subset_prefix = subset_prefix, mc.cores = ncores)
325
- # if (any(unlist(lapply(x, class)) == "try-error")) {
326
- # stop("mclapply error")
327
- # }
328
- # }
329
- # x is a list of hmdata
330
- # merge all hmdata
331
- if (length(x) > 1) {
332
- pws = c()
333
- for (i in 1:length(x)) {
334
- pws <- unique(c(pws, rownames(x[[i]]$hmdata)))
335
- }
336
- for (i in 1:length(x)) {
337
- x[[i]]$hmdata[setdiff(pws, rownames(x[[i]]$hmdata)), ] <- NA
338
- colnames(x[[i]]$hmdata) <- paste0(subsets[i], "_", colnames(x[[i]]$hmdata))
339
- }
340
- hm_devpars = x[[1]]$hm_devpars
341
- hm_devpars$height = hm_devpars$height * length(pws) / nrow(x[[1]]$hmdata)
342
- hmdata <- x[[1]]$hmdata[pws, ]
343
- for (i in 2:length(x)) {
344
- hmdata <- cbind(hmdata, x[[i]]$hmdata[pws, ])
345
- if (hm_devpars$res != x[[i]]$hm_devpars$res) {
346
- stop("hm_devpars$res not equal for group heatmaps")
347
- }
348
- hm_devpars$width = sum(hm_devpars$width, x[[i]]$hm_devpars$width / 2)
349
- hm_devpars$height = max(hm_devpars$height, x[[i]]$hm_devpars$height * length(pws) / nrow(x[[i]]$hmdata))
350
- }
351
- # In case of NA values
352
- hmdata[is.na(hmdata)] = 0
353
- # Plot heatmap of the merged hmdata
354
- subset_heatmap_file <- file.path(outdir, paste0(subset_col, ".group-unclustered.png"))
355
- plotHeatmap(
356
- hmdata,
357
- args = list(
358
- name = "Pathway activity",
359
- rect_gp = gpar(col = "white", lwd = 0.5),
360
- row_names_side = "left",
361
- row_dend_side = "right",
362
- row_names_max_width = max_text_width(pws, gp = gpar(fontsize = 12)),
363
- row_dend_reorder = TRUE,
364
- row_dend_width = unit(30, "mm"),
365
- column_split = unlist(lapply(1:length(subsets), function(i) {rep(subsets[i], ncol(x[[i]]$hmdata))})),
366
- cluster_columns = FALSE
367
- ),
368
- devpars = hm_devpars,
369
- outfile = subset_heatmap_file
370
- )
371
- subset_heatmap_file <- file.path(outdir, paste0(subset_col, ".group-clustered.png"))
372
- plotHeatmap(
373
- hmdata,
374
- args = list(
375
- name = "Pathway activity",
376
- rect_gp = gpar(col = "white", lwd = 0.5),
377
- row_names_side = "left",
378
- row_dend_side = "right",
379
- row_names_max_width = max_text_width(pws, gp = gpar(fontsize = 12)),
380
- row_dend_reorder = TRUE,
381
- row_dend_width = unit(30, "mm"),
382
- cluster_columns = TRUE
383
- ),
384
- devpars = hm_devpars,
385
- outfile = subset_heatmap_file
389
+ sobj_avail <- filter(sobj, !is.na(!!sym(case$subset_by)))
390
+ if (ncol(sobj_avail) < 5) {
391
+ stop("Not enough cells (< 5) for subset: ", case$subset_by)
392
+ }
393
+
394
+ subsets <- unique(sobj@meta.data[[case$subset_by]])
395
+ result <- NULL
396
+ for (ss in subsets) {
397
+ tmp <- do_subset(
398
+ sobj_avail,
399
+ caseinfo = caseinfo,
400
+ subset_by = case$subset_by,
401
+ subset_val = ss,
402
+ ntimes = case$ntimes,
403
+ group_by = case$group_by,
404
+ plots = case$plots
386
405
  )
406
+ if (is.null(tmp)) { next }
407
+ tmp[[case$group_by]] <- rownames(tmp)
408
+ tmp[[case$subset_by]] <- ss
409
+ rownames(tmp) <- NULL
410
+ if (is.null(result)) {
411
+ result <- tmp
412
+ } else {
413
+ all_columns <- union(colnames(result), colnames(tmp))
414
+ result[, setdiff(all_columns, colnames(result))] <- 1
415
+ tmp[, setdiff(all_columns, colnames(tmp))] <- 1
416
+ result <- rbind(result, tmp)
417
+ }
387
418
  }
419
+ uniq_subsets <- unique(result[[case$subset_by]])
420
+ result[[case$subset_by]] <- factor(
421
+ result[[case$subset_by]],
422
+ levels = if (is.factor(sobj@meta.data[[case$subset_by]])) {
423
+ intersect(levels(sobj@meta.data[[case$subset_by]]), uniq_subsets)
424
+ } else {
425
+ uniq_subsets
426
+ }
427
+ )
388
428
  }
389
- }
429
+ uniq_groups <- unique(result[[case$group_by]])
430
+ result[[case$group_by]] <- factor(
431
+ result[[case$group_by]],
432
+ levels = if (is.factor(sobj@meta.data[[case$group_by]])) {
433
+ intersect(levels(sobj@meta.data[[case$group_by]]), uniq_groups)
434
+ } else {
435
+ uniq_groups
436
+ }
437
+ )
390
438
 
391
- if (is.null(subsetting_cols)) {
392
- do_one_subset_col(NULL)
393
- } else {
394
- for (i in seq_along(subsetting_cols)) {
395
- do_one_subset_col(subsetting_cols[i], subsetting_prefix[i])
439
+ for (plotname in names(case$plots)) {
440
+ plotargs <- case$plots[[plotname]]
441
+ if (is.null(plotargs$plot_type)) {
442
+ stop("'plot_type' is required in plot args: ", plotname, " in case: ", casename)
443
+ }
444
+ plotargs$devpars <- plotargs$devpars %||% list()
445
+ plotargs <- extract_vars(plotargs, "devpars", "plot_type")
446
+ if (plot_type != "merged_heatmap") {
447
+ next
448
+ }
449
+ log$info(" Plotting: {plotname} ...")
450
+
451
+ plotargs$data <- result
452
+ plotargs$name <- plotargs$name %||% "Pathway Activity"
453
+ plotargs$in_form <- "wide-rows"
454
+ plotargs$columns_by <- case$group_by
455
+ plotargs$show_row_names <- plotargs$show_row_names %||% TRUE
456
+ minval <- min(as.matrix(result[, setdiff(colnames(result), c(case$group_by, case$subset_by))]))
457
+ maxval <- max(as.matrix(result[, setdiff(colnames(result), c(case$group_by, case$subset_by))]))
458
+ dis <- max(1 - minval, maxval - 1)
459
+ minval <- 1 - dis
460
+ maxval <- 1 + dis
461
+ plotargs$lower_cutoff <- plotargs$lower_cutoff %||% minval
462
+ plotargs$upper_cutoff <- plotargs$upper_cutoff %||% maxval
463
+ plotargs$row_name_annotation <- plotargs$row_name_annotation %||% FALSE
464
+ plotargs$row_names_side <- plotargs$row_names_side %||% "left"
465
+ plotargs$show_column_names <- plotargs$show_column_names %||% TRUE
466
+
467
+ if (!is.null(case$subset_by)) {
468
+ plotargs$columns_split_by <- case$subset_by
469
+ }
470
+ p <- do_call(plotthis::Heatmap, plotargs)
471
+
472
+ devpars <- devpars %||% list()
473
+ devpars$res <- devpars$res %||% 100
474
+ devpars$width <- devpars$width %||% (attr(p, "width") * devpars$res) %||% 1000
475
+ devpars$height <- devpars$height %||% (attr(p, "height") * devpars$res) %||% 1000
476
+
477
+ prefix <- file.path(caseinfo$prefix, paste0(slugify(plotname), ".merged_heatmap"))
478
+ png(paste0(prefix, ".png"), res = devpars$res, width = devpars$width, height = devpars$height)
479
+ print(p)
480
+ dev.off()
481
+
482
+ descr <- plotargs$descr %||% "Merged Heatmaps for Pathway Activity of all subsets."
483
+
484
+ reporter$add(
485
+ list(name = plotname, contents = list(
486
+ list(kind = "descr", content = descr),
487
+ reporter$image(prefix, c(), FALSE)
488
+ )),
489
+ h1 = casename,
490
+ h2 = "Merged Heatmaps",
491
+ ui = "tabs"
492
+ )
396
493
  }
397
494
  }
495
+
496
+ sapply(names(cases), do_case)
497
+
498
+ reporter$save(dirname(outdir))