biopipen 0.29.2__py3-none-any.whl → 0.30.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of biopipen might be problematic. Click here for more details.

Files changed (105)
  1. biopipen/__init__.py +1 -1
  2. biopipen/core/config.toml +2 -0
  3. biopipen/core/filters.py +21 -0
  4. biopipen/ns/plot.py +55 -0
  5. biopipen/ns/scrna.py +49 -13
  6. biopipen/ns/web.py +87 -5
  7. biopipen/scripts/bam/CNAClinic.R +2 -1
  8. biopipen/scripts/cellranger/CellRangerCount.py +3 -3
  9. biopipen/scripts/cellranger/CellRangerSummary.R +2 -1
  10. biopipen/scripts/cnv/AneuploidyScore.R +1 -1
  11. biopipen/scripts/cnv/AneuploidyScoreSummary.R +2 -2
  12. biopipen/scripts/delim/RowsBinder.R +1 -1
  13. biopipen/scripts/delim/SampleInfo.R +3 -2
  14. biopipen/scripts/gene/GeneNameConversion.R +2 -2
  15. biopipen/scripts/gsea/Enrichr.R +3 -3
  16. biopipen/scripts/gsea/FGSEA.R +2 -2
  17. biopipen/scripts/gsea/GSEA.R +2 -2
  18. biopipen/scripts/gsea/PreRank.R +2 -2
  19. biopipen/scripts/plot/Heatmap.R +3 -3
  20. biopipen/scripts/plot/Manhattan.R +2 -1
  21. biopipen/scripts/plot/QQPlot.R +1 -1
  22. biopipen/scripts/plot/ROC.R +1 -1
  23. biopipen/scripts/plot/Scatter.R +112 -0
  24. biopipen/scripts/plot/VennDiagram.R +3 -3
  25. biopipen/scripts/regulatory/MotifAffinityTest.R +3 -7
  26. biopipen/scripts/rnaseq/Simulation.R +1 -1
  27. biopipen/scripts/rnaseq/UnitConversion.R +2 -1
  28. biopipen/scripts/scrna/AnnData2Seurat.R +1 -1
  29. biopipen/scripts/scrna/CellTypeAnnotation-celltypist.R +24 -8
  30. biopipen/scripts/scrna/CellTypeAnnotation-common.R +10 -0
  31. biopipen/scripts/scrna/CellTypeAnnotation-direct.R +9 -1
  32. biopipen/scripts/scrna/CellTypeAnnotation-hitype.R +12 -8
  33. biopipen/scripts/scrna/CellTypeAnnotation-sccatch.R +15 -2
  34. biopipen/scripts/scrna/CellTypeAnnotation-sctype.R +38 -15
  35. biopipen/scripts/scrna/CellTypeAnnotation.R +3 -0
  36. biopipen/scripts/scrna/CellsDistribution.R +3 -2
  37. biopipen/scripts/scrna/DimPlots.R +1 -1
  38. biopipen/scripts/scrna/ExprImputation-alra.R +1 -1
  39. biopipen/scripts/scrna/MarkersFinder.R +5 -5
  40. biopipen/scripts/scrna/MetaMarkers.R +4 -4
  41. biopipen/scripts/scrna/ModuleScoreCalculator.R +2 -1
  42. biopipen/scripts/scrna/RadarPlots.R +1 -1
  43. biopipen/scripts/scrna/ScFGSEA.R +4 -3
  44. biopipen/scripts/scrna/Seurat2AnnData.R +1 -1
  45. biopipen/scripts/scrna/SeuratClusterStats-clustree.R +73 -0
  46. biopipen/scripts/scrna/SeuratClusterStats-dimplots.R +4 -3
  47. biopipen/scripts/scrna/SeuratClusterStats-features.R +6 -5
  48. biopipen/scripts/scrna/SeuratClusterStats-hists.R +6 -5
  49. biopipen/scripts/scrna/SeuratClusterStats-ngenes.R +4 -3
  50. biopipen/scripts/scrna/SeuratClusterStats-stats.R +4 -3
  51. biopipen/scripts/scrna/SeuratClusterStats.R +24 -8
  52. biopipen/scripts/scrna/SeuratClustering-common.R +213 -0
  53. biopipen/scripts/scrna/SeuratClustering.R +10 -170
  54. biopipen/scripts/scrna/SeuratMap2Ref.R +65 -31
  55. biopipen/scripts/scrna/SeuratMetadataMutater.R +2 -2
  56. biopipen/scripts/scrna/SeuratPreparing-common.R +452 -0
  57. biopipen/scripts/scrna/SeuratPreparing-doublet_detection.R +201 -0
  58. biopipen/scripts/scrna/SeuratPreparing.R +22 -562
  59. biopipen/scripts/scrna/SeuratSubClustering.R +24 -39
  60. biopipen/scripts/scrna/TopExpressingGenes.R +1 -1
  61. biopipen/scripts/scrna_metabolic_landscape/MetabolicFeatures.R +2 -2
  62. biopipen/scripts/scrna_metabolic_landscape/MetabolicFeaturesIntraSubset.R +2 -2
  63. biopipen/scripts/scrna_metabolic_landscape/MetabolicPathwayActivity.R +3 -3
  64. biopipen/scripts/scrna_metabolic_landscape/MetabolicPathwayHeterogeneity.R +3 -3
  65. biopipen/scripts/snp/MatrixEQTL.R +1 -1
  66. biopipen/scripts/snp/PlinkCallRate.R +2 -2
  67. biopipen/scripts/snp/PlinkFreq.R +2 -2
  68. biopipen/scripts/snp/PlinkHWE.R +2 -2
  69. biopipen/scripts/snp/PlinkHet.R +2 -2
  70. biopipen/scripts/snp/PlinkIBD.R +2 -2
  71. biopipen/scripts/stats/ChowTest.R +1 -1
  72. biopipen/scripts/stats/DiffCoexpr.R +1 -1
  73. biopipen/scripts/stats/LiquidAssoc.R +1 -1
  74. biopipen/scripts/stats/Mediation.R +11 -9
  75. biopipen/scripts/stats/MetaPvalue.R +4 -1
  76. biopipen/scripts/stats/MetaPvalue1.R +4 -1
  77. biopipen/scripts/tcr/Attach2Seurat.R +1 -1
  78. biopipen/scripts/tcr/CDR3AAPhyschem.R +1 -1
  79. biopipen/scripts/tcr/CloneResidency.R +2 -2
  80. biopipen/scripts/tcr/CloneSizeQQPlot.R +2 -2
  81. biopipen/scripts/tcr/Immunarch-basic.R +0 -4
  82. biopipen/scripts/tcr/Immunarch-clonality.R +0 -4
  83. biopipen/scripts/tcr/Immunarch-diversity.R +2 -24
  84. biopipen/scripts/tcr/Immunarch-geneusage.R +0 -2
  85. biopipen/scripts/tcr/Immunarch-kmer.R +0 -2
  86. biopipen/scripts/tcr/Immunarch-overlap.R +0 -2
  87. biopipen/scripts/tcr/Immunarch-spectratyping.R +0 -2
  88. biopipen/scripts/tcr/Immunarch-tracking.R +0 -2
  89. biopipen/scripts/tcr/Immunarch-vjjunc.R +0 -2
  90. biopipen/scripts/tcr/Immunarch.R +43 -11
  91. biopipen/scripts/tcr/ImmunarchFilter.R +1 -1
  92. biopipen/scripts/tcr/ImmunarchLoading.R +2 -2
  93. biopipen/scripts/tcr/SampleDiversity.R +1 -1
  94. biopipen/scripts/tcr/TCRClusterStats.R +2 -2
  95. biopipen/scripts/tcr/TCRClustering.R +2 -2
  96. biopipen/scripts/tcr/TESSA.R +2 -2
  97. biopipen/scripts/vcf/TruvariBenchSummary.R +2 -2
  98. biopipen/scripts/vcf/TruvariConsistency.R +1 -1
  99. biopipen/scripts/web/GCloudStorageDownloadBucket.py +82 -0
  100. biopipen/scripts/web/GCloudStorageDownloadFile.py +23 -0
  101. biopipen/scripts/web/gcloud_common.py +49 -0
  102. {biopipen-0.29.2.dist-info → biopipen-0.30.0.dist-info}/METADATA +1 -1
  103. {biopipen-0.29.2.dist-info → biopipen-0.30.0.dist-info}/RECORD +105 -96
  104. {biopipen-0.29.2.dist-info → biopipen-0.30.0.dist-info}/WHEEL +0 -0
  105. {biopipen-0.29.2.dist-info → biopipen-0.30.0.dist-info}/entry_points.txt +0 -0
@@ -1,4 +1,4 @@
1
- source("{{biopipen_dir}}/utils/misc.R")
1
+ {{ biopipen_dir | joinpaths: "utils", "misc.R" | source_r }}
2
2
 
3
3
  library(parallel)
4
4
  library(Seurat)
@@ -17,6 +17,7 @@ refnorm = {{envs.refnorm | r}}
17
17
  ncores = {{envs.ncores | r}}
18
18
  split_by = {{envs.split_by | r}}
19
19
  mutaters = {{envs.mutaters | r}}
20
+ skip_if_normalized = {{envs.skip_if_normalized | r}}
20
21
  sctransform_args = {{envs.SCTransform | r: todot="-"}}
21
22
  normalizedata_args = {{envs.NormalizeData | r: todot="-"}}
22
23
  findtransferanchors_args = {{envs.FindTransferAnchors | r: todot="-"}}
@@ -40,7 +41,7 @@ mapquery_args$refdata[[use]] = use
40
41
 
41
42
  outdir = dirname(outfile)
42
43
  if (is.null(split_by)) {
43
- options(future.globals.maxSize = 80000 * 1024^2)
44
+ options(future.globals.maxSize = 8 * 1024 ^ 4)
44
45
  future::plan(strategy = "multicore", workers = ncores)
45
46
  }
46
47
 
@@ -98,6 +99,7 @@ if (refnorm == "SCTransform") {
98
99
  # Load Seurat object
99
100
  log_info("- Loading Seurat object")
100
101
  sobj = readRDS(sobjfile)
102
+ defassay <- DefaultAssay(sobj)
101
103
 
102
104
  if (!is.null(mutaters) && length(mutaters) > 0) {
103
105
  log_info("- Applying mutaters")
@@ -126,43 +128,61 @@ if (!is.null(split_by)) {
126
128
  # Normalize data
127
129
  log_info("- Normalizing data")
128
130
  if (refnorm == "SCTransform") {
129
- log_info(" Using SCTransform normalization")
130
- sctransform_args$residual.features = rownames(x = reference)
131
- if (is.null(split_by)) {
132
- sctransform_args$object = sobj
133
- query = do_call(SCTransform, sctransform_args)
131
+ if (defassay == "SCT" && skip_if_normalized) {
132
+ log_warn(" Skipping normalization as the object is already SCTransform'ed")
133
+ query = sobj
134
134
  } else {
135
- query = mclapply(
136
- X = sobj,
137
- FUN = function(x) {
138
- sctransform_args$object = x
139
- do_call(SCTransform, sctransform_args)
140
- },
141
- mc.cores = ncores
142
- )
143
- if (any(unlist(lapply(query, class)) == "try-error")) {
144
- stop(paste0("\nmclapply (SCTransform) error:", query))
135
+ log_info(" Using SCTransform normalization")
136
+ sctransform_args$residual.features = rownames(x = reference)
137
+ if (is.null(split_by)) {
138
+ sctransform_args$object = sobj
139
+ query = do_call(SCTransform, sctransform_args)
140
+ sctransform_args$object <- NULL
141
+ rm(sctransform_args)
142
+ gc()
143
+ } else {
144
+ query = mclapply(
145
+ X = sobj,
146
+ FUN = function(x) {
147
+ sctransform_args$object = x
148
+ do_call(SCTransform, sctransform_args)
149
+ },
150
+ mc.cores = ncores
151
+ )
152
+ if (any(unlist(lapply(query, class)) == "try-error")) {
153
+ stop(paste0("\nmclapply (SCTransform) error:", query))
154
+ }
145
155
  }
146
156
  }
147
157
  } else {
148
- log_info(" Using NormalizeData normalization")
149
- if (is.null(split_by)) {
150
- normalizedata_args$object = sobj
151
- query = do_call(NormalizeData, normalizedata_args)
158
+ if (defassay == "RNA" && skip_if_normalized) {
159
+ log_warn(" Skipping normalization as the object is already LogNormalize'd")
160
+ query = sobj
152
161
  } else {
153
- query = mclapply(
154
- X = sobj,
155
- FUN = function(x) {
156
- normalizedata_args$object = x
157
- do_call(NormalizeData, normalizedata_args)
158
- },
159
- mc.cores = ncores
160
- )
161
- if (any(unlist(lapply(query, class)) == "try-error")) {
162
- stop(paste0("\nmclapply (NormalizeData) error:", query))
162
+ log_info(" Using NormalizeData normalization")
163
+ if (is.null(split_by)) {
164
+ normalizedata_args$object = sobj
165
+ query = do_call(NormalizeData, normalizedata_args)
166
+ } else {
167
+ query = mclapply(
168
+ X = sobj,
169
+ FUN = function(x) {
170
+ normalizedata_args$object = x
171
+ do_call(NormalizeData, normalizedata_args)
172
+ },
173
+ mc.cores = ncores
174
+ )
175
+ if (any(unlist(lapply(query, class)) == "try-error")) {
176
+ stop(paste0("\nmclapply (NormalizeData) error:", query))
177
+ }
163
178
  }
179
+ normalizedata_args$object <- NULL
180
+ rm(normalizedata_args)
181
+ gc()
164
182
  }
165
183
  }
184
+ rm(sobj)
185
+ gc()
166
186
 
167
187
  # Find anchors between query and reference
168
188
  log_info("- Finding anchors")
@@ -170,6 +190,10 @@ findtransferanchors_args$reference = reference
170
190
  if (is.null(split_by)) {
171
191
  findtransferanchors_args$query = query
172
192
  anchors = do_call(FindTransferAnchors, findtransferanchors_args)
193
+ findtransferanchors_args$reference = NULL
194
+ findtransferanchors_args$query = NULL
195
+ rm(findtransferanchors_args)
196
+ gc()
173
197
  } else {
174
198
  anchors = mclapply(
175
199
  X = query,
@@ -191,6 +215,10 @@ if (is.null(split_by)) {
191
215
  mapquery_args$query = query
192
216
  mapquery_args$anchorset = anchors
193
217
  query = do_call(MapQuery, mapquery_args)
218
+ mapquery_args$reference = NULL
219
+ mapquery_args$query = NULL
220
+ mapquery_args$anchorset = NULL
221
+ gc()
194
222
  } else {
195
223
  query = mclapply(
196
224
  X = seq_along(query),
@@ -221,6 +249,9 @@ if (is.null(split_by)) {
221
249
  if (e$message == "subscript out of bounds") stop(mappingscore_sob_msg)
222
250
  stop(e)
223
251
  })
252
+ mappingscore_args$anchors = NULL
253
+ rm(mappingscore_args)
254
+ gc()
224
255
  } else {
225
256
  mappingscore = mclapply(
226
257
  X = seq_along(query),
@@ -266,6 +297,9 @@ if (is.null(split_by)) {
266
297
 
267
298
  # Combine the results
268
299
  log_info("- Merging the results")
300
+ gc()
301
+ # Memory efficient way to merge the results
302
+ # query = Reduce(function(x, y) merge(x, y, merge.dr = "ref.umap"), query)
269
303
  query = merge(query[[1]], query[2:length(query)], merge.dr = "ref.umap")
270
304
  }
271
305
 
@@ -1,5 +1,5 @@
1
- source("{{biopipen_dir}}/utils/misc.R")
2
- source("{{biopipen_dir}}/utils/mutate_helpers.R")
1
+ {{ biopipen_dir | joinpaths: "utils", "misc.R" | source_r }}
2
+ {{ biopipen_dir | joinpaths: "utils", "mutate_helpers.R" | source_r }}
3
3
 
4
4
  library(rlang)
5
5
  library(tibble)
@@ -0,0 +1,452 @@
1
+
2
# Render a named list as one "name = value; name = value" string.
#
# Args:
#   x: A list (or vector) whose names(x) are iterated; each value is looked
#      up with x[[name]].
# Returns:
#   A length-one character vector with the "name = value" pieces joined by "; ".
stringify_list <- function(x) {
    pieces <- sapply(
        names(x),
        function(key) {
            paste(key, x[[key]], sep = " = ")
        }
    )
    paste(pieces, collapse = "; ")
}
5
+
6
# Format an argument list for logging.
#
# Captures the lines printed by str(args) and joins them into a single
# string separated by ", ".
#
# Args:
#   args: Any R object; typically a list of function arguments.
# Returns:
#   A length-one character vector.
format_args <- function(args) {
    structure_lines <- capture.output(str(args))
    paste(structure_lines, collapse = ", ")
}
9
+
10
# Fallback loader for a 10X directory whose files do not have the plain
# names that Read10X() expects (e.g. they are prefixed with a sample name).
#
# Symlinks the first file matching each expected suffix into
# <joboutdir>/renamed/<sample>/ under the canonical names
# (barcodes.tsv.gz, features.tsv.gz, matrix.mtx.gz) and reads that directory.
#
# Args:
#   e:      The condition raised by the first Read10X() attempt; its message
#           is included when a required file cannot be found.
#   sample: Sample name, used for the temporary directory and in messages.
#   path:   Directory containing the (prefixed) 10X files.
# Returns:
#   The result of Read10X() on the directory of symlinks.
# Raises:
#   An error naming the missing file kind when no file matches a pattern.
rename_files = function(e, sample, path) {
    tmpdatadir = file.path(joboutdir, "renamed", sample)
    # Always start from a clean directory so stale links are never reused
    if (dir.exists(tmpdatadir)) {
        unlink(tmpdatadir, recursive = TRUE)
    }
    dir.create(tmpdatadir, recursive = TRUE, showWarnings = FALSE)

    original_error = if (inherits(e, "condition")) conditionMessage(e) else as.character(e)

    # Link the first match to its canonical name; fail early when nothing
    # matches instead of passing NA on to normalizePath()/file.symlink().
    link_first_match = function(matches, target, what) {
        src = matches[1]
        if (length(matches) == 0 || is.na(src)) {
            stop(paste0(
                "Cannot find the ", what, " file for sample '", sample,
                "' in: ", path,
                " (original Read10X error: ", original_error, ")"
            ))
        }
        file.symlink(normalizePath(src), file.path(tmpdatadir, target))
    }

    link_first_match(
        Sys.glob(file.path(path, "*barcodes.tsv.gz")),
        "barcodes.tsv.gz",
        "barcodes"
    )
    # glob() (not Sys.glob()) is used here as in the original, for the
    # brace alternation in the pattern.
    link_first_match(
        glob(file.path(path, "*{genes,features}.tsv.gz")),
        "features.tsv.gz",
        "genes/features"
    )
    link_first_match(
        Sys.glob(file.path(path, "*matrix.mtx.gz")),
        "matrix.mtx.gz",
        "matrix"
    )

    Read10X(data.dir = tmpdatadir)
}
33
+
34
+
35
# Annotate a Seurat object with QC metrics, record the per-cell QC outcome,
# and drop the cells that fail the QC expression.
#
# Reads `envs$cell_qc` and `feats` from the enclosing environment, and
# appends this object's QC columns to the outer `cell_qc_df` via `<<-`.
#
# Args:
#   sobj:       The Seurat object to filter.
#   per_sample: Only affects the prefix used for the log messages.
# Returns:
#   The Seurat object restricted to cells passing QC, with the temporary
#   `.QC` metadata column removed.
perform_cell_qc <- function(sobj, per_sample = FALSE) {
    log_prefix <- ifelse(per_sample, " ", "- ")

    log_info("{log_prefix}Adding metadata for QC ...")
    sobj$percent.mt <- PercentageFeatureSet(sobj, pattern = "^MT-")
    sobj$percent.ribo <- PercentageFeatureSet(sobj, pattern = "^RP[SL]")
    sobj$percent.hb <- PercentageFeatureSet(sobj, pattern = "^HB[^(P)]")
    sobj$percent.plat <- PercentageFeatureSet(sobj, pattern = "PECAM1|PF4")

    # Without criteria, the expression "TRUE" keeps every cell
    cell_qc <- envs$cell_qc
    if (is.null(cell_qc) || length(cell_qc) == 0) {
        log_warn("{log_prefix}No cell QC criteria is provided. All cells will be kept.")
        cell_qc <- "TRUE"
    }

    qc_expr <- rlang::parse_expr(cell_qc)
    sobj@meta.data <- mutate(sobj@meta.data, .QC = !!qc_expr)

    # Keep the QC verdicts (before filtering) for the report
    qc_columns <- sobj@meta.data[, c("Sample", ".QC", feats), drop = FALSE]
    if (is.null(cell_qc_df)) {
        cell_qc_df <<- qc_columns
    } else {
        cell_qc_df <<- rbind(cell_qc_df, qc_columns)
    }

    log_info("{log_prefix}Filtering cells using QC criteria ...")
    sobj <- subset(sobj, subset = .QC)
    sobj$.QC <- NULL

    sobj
}
65
+
66
# Plot the QC features and summarise cell counts before and after cell QC.
#
# Uses `cell_qc_df`, `feats`, `samples` and `plotsdir` from the enclosing
# environment. For every feature a violin plot (by Sample) is written, and
# for every feature except nCount_RNA a scatter plot against nCount_RNA;
# each image is registered with add_report().
#
# Args:
#   ngenes: Value placed in the `nGenes` column of the returned summary.
# Returns:
#   The row-bound summaries (when, nCells, nGenes) of `cell_qc_df` for
#   "Before_Cell_QC" (all rows) and "After_Cell_QC" (rows with .QC TRUE).
report_cell_qc = function(ngenes) {
    # Sentences shared by the two section descriptions
    qc_color_note <- c(
        "The cells that fail the QC criteria are colored in red, and",
        "the cells that pass the QC criteria are colored in black.",
        "The cells that fail the QC criteria are filtered out in the returned Seurat object."
    )
    # A fresh scale per plot, identical for violin and scatter plots
    qc_color_scale <- function() {
        scale_color_manual(
            values = c("#181818", pal_biopipen()(1)),
            breaks = c(TRUE, FALSE)
        )
    }

    # ---- Violin plots ----
    log_info("- Plotting violin plots ...")
    violin_descr <- paste(
        c(
            "The violin plots for each feature. The cells are grouped by sample.",
            qc_color_note
        ),
        collapse = " "
    )
    add_report(
        list(kind = "descr", content = violin_descr),
        h1 = "Violin Plots"
    )
    for (feat in feats) {
        log_info(" For feature: {feat}")
        vln_p <- ggplot(cell_qc_df, aes(x = Sample, y = !!sym(feat), color = .QC)) +
            geom_violin(fill = "white", width = 0.5) +
            geom_jitter(width = 0.2, height = 0, alpha = 0.5) +
            qc_color_scale() +
            labs(x = "Sample", y = feat) +
            theme_minimal()

        vlnplot = file.path(plotsdir, paste0(slugify(feat), ".vln.png"))
        # Widen the canvas with the number of samples
        png(vlnplot, width = 800 + length(samples) * 15, height = 600, res = 100)
        print(vln_p)
        dev.off()

        add_report(
            list(
                src = vlnplot,
                name = feat,
                descr = paste0("Distribution of ", feat, " for each sample.")
            ),
            h1 = "Violin Plots",
            ui = "table_of_images"
        )
    }

    # ---- Scatter plots against nCount_RNA ----
    log_info("- Plotting scatter plots ...")
    scatter_descr <- paste(
        c(
            "The scatter plots for each feature against nCount_RNA. ",
            qc_color_note
        ),
        collapse = " "
    )
    add_report(
        list(kind = "descr", content = scatter_descr),
        h1 = "Scatter Plots"
    )
    for (feat in setdiff(feats, "nCount_RNA")) {
        log_info(" For feature: {feat}, against nCount_RNA")
        scat_p <- ggplot(cell_qc_df, aes(x = nCount_RNA, y = !!sym(feat), color = .QC)) +
            geom_point() +
            qc_color_scale() +
            labs(x = "nCount_RNA", y = feat) +
            theme_minimal()

        scatfile = file.path(plotsdir, paste0(slugify(feat), "-nCount_RNA.scatter.png"))
        png(scatfile, width = 800, height = 600, res = 100)
        print(scat_p)
        dev.off()

        add_report(
            list(
                src = scatfile,
                name = paste0(feat, " vs nCount_RNA"),
                descr = paste0("Scatter plot for ", feat, " against nCount_RNA")
            ),
            h1 = "Scatter Plots",
            ui = "table_of_images"
        )
    }

    # ---- Dimension summary derived from cell_qc_df ----
    # No grouping is applied, so the counts cover all samples together.
    before_qc <- cell_qc_df %>%
        summarise(
            when = "Before_Cell_QC",
            nCells = dplyr::n(),
            nGenes = ngenes
        ) %>%
        ungroup()
    after_qc <- cell_qc_df %>%
        filter(.QC) %>%
        summarise(
            when = "After_Cell_QC",
            nCells = dplyr::n(),
            nGenes = ngenes
        ) %>%
        ungroup()

    rbind(before_qc, after_qc)
}
170
+
171
# Load one sample into a Seurat object and attach its metadata.
#
# Looks the sample up in the outer `metadata` table, reads its RNAData
# (a 10X directory or an h5 file), creates a Seurat object with cell names
# prefixed by the sample name, and copies all metadata columns except
# RNAData/TCRData onto the object. Depending on `envs`, per-sample cell QC
# and a plain RNA normalization are applied as well.
#
# Args:
#   sample: The sample name, matched against metadata$Sample.
# Returns:
#   A Seurat object, or NULL (with a warning) when the sample has no
#   usable RNAData path.
# Raises:
#   An error when more than one metadata row matches the sample.
load_sample = function(sample) {
    log_info("- Loading sample: {sample} ...")
    mdata = as.data.frame(metadata)[metadata$Sample == sample, , drop=TRUE]
    path = as.character(mdata$RNAData)

    # Guard the scalar checks below: `||` needs length-one operands, so a
    # duplicated sample must be reported explicitly and a sample that is
    # absent from the metadata is treated as having no path.
    if (length(path) > 1) {
        stop(paste0("Multiple metadata rows found for sample: ", sample))
    }
    if (length(path) == 0 || is.na(path) || nchar(path) == 0 || path == "NA") {
        warning(paste0("No path found for sample: ", sample))
        return (NULL)
    }

    if (dir.exists(path)) {
        # Read10X requires
        # - barcodes.tsv.gz
        # - genes.tsv.gz
        # - matrix.mtx.gz
        # But sometimes, they are prefixed with sample name
        # e.g.GSM4143656_SAM24345863-ln1.barcodes.tsv.gz
        # so fall back to rename_files() when the direct read fails.
        exprs = tryCatch(
            { Read10X(data.dir = path) },
            error = function(e) rename_files(e, sample, path)
        )
    } else {
        exprs = Read10X_h5(path)
    }
    # Keep only the gene expression matrix when several are returned
    if ("Gene Expression" %in% names(exprs)) {
        exprs = exprs[["Gene Expression"]]
    }

    obj <- CreateSeuratObject(exprs, project=sample)
    obj = RenameCells(obj, add.cell.id = sample)

    # Attach meta data
    for (mname in names(mdata)) {
        if (mname %in% c("RNAData", "TCRData")) { next }
        mdt = mdata[[mname]]
        # Store factor values as their labels
        if (is.factor(mdt)) { mdt = levels(mdt)[mdt] }
        obj[[mname]] = mdt
    }

    if (isTRUE(envs$cell_qc_per_sample)) {
        log_info("- Perform cell QC for sample: {sample} ...")
        obj = perform_cell_qc(obj, TRUE)
    }

    if (isTRUE(envs$use_sct)) {
        # so that we have data and scale.data layers on RNA assay
        # useful for visualization in case some genes are not in
        # the SCT assay
        obj = NormalizeData(obj, verbose = FALSE)
        obj = FindVariableFeatures(obj, verbose = FALSE)
        obj = ScaleData(obj, verbose = FALSE)
    }
    obj
}
226
+
227
# Filter genes according to envs$gene_qc, with caching.
#
# The cache signature is built from envs$cell_qc, envs$gene_qc,
# envs$cell_qc_per_sample and envs$use_sct; a cache hit is returned
# directly, otherwise the filtered object is stored under "GeneQC".
#
# Args:
#   sobj: The Seurat object to filter.
# Returns:
#   The cached or freshly gene-filtered Seurat object.
run_gene_qc <- function(sobj) {
    cache_signature <- list(
        cell_qc = envs$cell_qc,
        gene_qc = envs$gene_qc,
        cell_qc_per_sample = envs$cell_qc_per_sample,
        use_sct = envs$use_sct
    )
    cached <- get_cached(cache_signature, "GeneQC", cache_dir)
    if (!is.null(cached$data)) {
        log_info("Loading gene-QC'ed object from cache ...")
        return(cached$data)
    }

    log_info("Filtering genes ...")
    genes <- rownames(sobj)

    # Criterion 1: row sums of the object must reach min_cells
    min_cells <- envs$gene_qc$min_cells
    use_min_cells <- !is.null(min_cells) && min_cells > 0
    if (use_min_cells) {
        genes = genes[Matrix::rowSums(sobj) >= min_cells]
    }

    # Criterion 2: drop genes matching any exclusion pattern;
    # a single string may hold several comma-separated patterns
    excludes <- envs$gene_qc$excludes
    use_excludes <- !is.null(excludes)
    if (use_excludes) {
        if (length(excludes) == 1) {
            excludes <- trimws(unlist(strsplit(excludes, ",")))
        }
        for (ex in excludes) {
            genes <- genes[!grepl(ex, genes)]
        }
    }

    # Only subset when at least one criterion was applied
    if (use_min_cells || use_excludes) {
        sobj = subset(sobj, features = genes)
    }

    cached$data <- sobj
    save_to_cache(cached, "GeneQC", cache_dir)
    sobj
}
267
+
268
# Load all samples, merge them, and apply cell QC, with caching.
#
# On a cache hit both the Seurat object and the outer `cell_qc_df` are
# restored. Otherwise every entry of `samples` is loaded with load_sample(),
# the loaded objects are merged, and (unless QC already ran per sample)
# perform_cell_qc() is applied to the merged object.
#
# Args:
#   sobj: Not read; it is replaced by the cached or newly built object.
# Returns:
#   The cell-QC'ed Seurat object.
# Raises:
#   An error when none of the samples could be loaded.
run_cell_qc <- function(sobj) {
    cached <- get_cached(
        list(
            cell_qc = envs$cell_qc,
            cell_qc_per_sample = envs$cell_qc_per_sample,
            use_sct = envs$use_sct
        ),
        "CellQC",
        cache_dir
    )
    if (!is.null(cached$data)) {
        log_info("Loading cell-QC'ed object from cache ...")
        sobj <- cached$data$sobj
        cell_qc_df <<- cached$data$cell_qc_df
    } else {
        # Load data
        log_info("Reading samples individually ...")
        obj_list = lapply(samples, load_sample)
        # load_sample() returns NULL (with a warning) for samples without a
        # usable path; those must not reach merge().
        obj_list = Filter(Negate(is.null), obj_list)
        if (length(obj_list) == 0) {
            stop("No samples could be loaded; check the RNAData paths in the metadata.")
        }

        log_info("Merging samples ...")
        sobj = Reduce(merge, obj_list)
        rm(obj_list)
        gc()

        # Mirror the isTRUE() test used in load_sample(): whenever QC was not
        # done per sample (including an unset option), do it on the merged object.
        if (!isTRUE(envs$cell_qc_per_sample)) {
            log_info("Performing cell QC ...")
            sobj = perform_cell_qc(sobj)
        }

        cached$data <- list(sobj = sobj, cell_qc_df = cell_qc_df)
        save_to_cache(cached, "CellQC", cache_dir)
    }
    sobj
}
298
+
299
# Normalize/scale the object and run PCA, with caching.
#
# With envs$use_sct the object goes through SCTransform; otherwise through
# NormalizeData, FindVariableFeatures and ScaleData. RunPCA follows in both
# cases. Arguments for each step come from the matching entry of `envs`.
# The cache signature is `envs` without the entries that do not affect this
# step (ncores, doublet detection settings, IntegrateLayers).
#
# Args:
#   sobj: The Seurat object to transform.
# Returns:
#   The cached or freshly transformed Seurat object.
run_transformation <- function(sobj) {
    envs_cache <- envs
    envs_cache$ncores <- NULL
    envs_cache$doublet_detector <- NULL
    envs_cache$DoubletFinder <- NULL
    envs_cache$scDblFinder <- NULL
    envs_cache$IntegrateLayers <- NULL
    cached <- get_cached(envs_cache, "Transformed", cache_dir)
    if (!is.null(cached$data)) {
        log_info("Loading transformed object from cache ...")
        sobj <- cached$data
    } else {
        log_info("Performing transformation/scaling ...")
        # Not joined yet
        if (envs$use_sct) {
            log_info("- Running SCTransform ...")
            SCTransformArgs <- envs$SCTransform
            # log to stdout but don't populate it to running log
            print(paste0(" SCTransform: ", format_args(SCTransformArgs)))
            log_debug(" SCTransform: {format_args(SCTransformArgs)}")
            SCTransformArgs$object <- sobj
            sobj <- do_call(SCTransform, SCTransformArgs)
            # Default is to use the SCT assay

            # Cleanup memory: drop the copy of the object held by the args
            SCTransformArgs$object <- NULL
            rm(SCTransformArgs)
            gc()
        } else {
            log_info("- Running NormalizeData ...")
            NormalizeDataArgs <- envs$NormalizeData
            print(paste0(" NormalizeData: ", format_args(NormalizeDataArgs)))
            log_debug(" NormalizeData: {format_args(NormalizeDataArgs)}")
            NormalizeDataArgs$object <- sobj
            sobj <- do_call(NormalizeData, NormalizeDataArgs)

            # Cleanup memory
            NormalizeDataArgs$object <- NULL
            rm(NormalizeDataArgs)
            gc()

            log_info("- Running FindVariableFeatures ...")
            FindVariableFeaturesArgs <- envs$FindVariableFeatures
            print(paste0(" FindVariableFeatures: ", format_args(FindVariableFeaturesArgs)))
            log_debug(" FindVariableFeatures: {format_args(FindVariableFeaturesArgs)}")
            FindVariableFeaturesArgs$object <- sobj
            sobj <- do_call(FindVariableFeatures, FindVariableFeaturesArgs)

            # Cleanup memory
            FindVariableFeaturesArgs$object <- NULL
            rm(FindVariableFeaturesArgs)
            gc()

            log_info("- Running ScaleData ...")
            ScaleDataArgs <- envs$ScaleData
            print(paste0(" ScaleData: ", format_args(ScaleDataArgs)))
            log_debug(" ScaleData: {format_args(ScaleDataArgs)}")
            ScaleDataArgs$object <- sobj
            sobj <- do_call(ScaleData, ScaleDataArgs)

            # Cleanup memory
            ScaleDataArgs$object <- NULL
            rm(ScaleDataArgs)
            gc()
        }

        log_info("- Running RunPCA ...")
        RunPCAArgs <- envs$RunPCA
        # Cap the number of PCs at (cells - 1) for the default of 50 as well
        # as for a user-supplied value, so small datasets do not request
        # more components than can be computed.
        requested_npcs <- if (is.null(RunPCAArgs$npcs)) { 50 } else { RunPCAArgs$npcs }
        RunPCAArgs$npcs <- min(requested_npcs, ncol(sobj) - 1)
        print(paste0(" RunPCA: ", format_args(RunPCAArgs)))
        log_debug(" RunPCA: {format_args(RunPCAArgs)}")
        RunPCAArgs$object <- sobj
        sobj <- do_call(RunPCA, RunPCAArgs)

        # Cleanup memory
        RunPCAArgs$object <- NULL
        rm(RunPCAArgs)
        gc()

        cached$data <- sobj
        save_to_cache(cached, "Transformed", cache_dir)
    }

    sobj
}
385
+
386
# Integrate the layers of the object (unless disabled) and join layers for
# non-SCT workflows, with caching.
#
# Integration arguments come from envs$IntegrateLayers. `method` accepts the
# short names CCA/cca, RPCA/rpca, Harmony/harmony, FastMNN/fastmnn and
# scVI/scvi, which are mapped to the corresponding *Integration function.
# A character `reference` is translated to positions in `samples`.
# The name of the new reduction is stored in sobj@misc$integrated_new_reduction.
#
# Args:
#   sobj: The transformed Seurat object.
# Returns:
#   The cached or freshly integrated/layer-joined Seurat object.
# Raises:
#   An error for an unknown integration method or unknown reference samples.
run_integration <- function(sobj) {

    envs_cache <- envs
    envs_cache$ncores <- NULL
    envs_cache$doublet_detector <- NULL
    envs_cache$DoubletFinder <- NULL
    envs_cache$scDblFinder <- NULL
    cached <- get_cached(envs_cache, "Integrated", cache_dir)

    if (!is.null(cached$data)) {
        log_info("Loading integrated/layer-joined object from cache ...")
        sobj <- cached$data
    } else {

        if (!envs$no_integration) {
            log_info("- Running IntegrateLayers (method = {envs$IntegrateLayers$method}) ...")
            IntegrateLayersArgs <- envs$IntegrateLayers
            method <- IntegrateLayersArgs$method

            if (!is.null(IntegrateLayersArgs$reference) && is.character(IntegrateLayersArgs$reference)) {
                log_info(" Using reference samples: {paste(IntegrateLayersArgs$reference, collapse = ', ')}")
                reference_idx <- match(IntegrateLayersArgs$reference, samples)
                # match() yields NA for names that are not in `samples`;
                # report them instead of passing NA indices on.
                if (anyNA(reference_idx)) {
                    stop(paste0(
                        "Unknown reference sample(s): ",
                        paste(IntegrateLayersArgs$reference[is.na(reference_idx)], collapse = ", ")
                    ))
                }
                IntegrateLayersArgs$reference <- reference_idx
                log_info(" Transferred to indices: {paste(IntegrateLayersArgs$reference, collapse = ', ')}")
            }

            # Accepted short names and the integration function each maps to
            integration_methods <- c(
                CCA = "CCAIntegration", cca = "CCAIntegration",
                RPCA = "RPCAIntegration", rpca = "RPCAIntegration",
                Harmony = "HarmonyIntegration", harmony = "HarmonyIntegration",
                FastMNN = "FastMNNIntegration", fastmnn = "FastMNNIntegration",
                scVI = "scVIIntegration", scvi = "scVIIntegration"
            )
            if (length(method) != 1 || !(method %in% names(integration_methods))) {
                stop(paste0("Unknown integration method: ", method))
            }
            method <- integration_methods[[method]]

            if (envs$use_sct && is.null(IntegrateLayersArgs$normalization.method)) {
                IntegrateLayersArgs$normalization.method <- "SCT"
            }
            # Resolve the function by name; `method` is restricted to the
            # table above, so no code is parsed from user input.
            IntegrateLayersArgs$method <- get(method, mode = "function")

            # Default name of the reduction created by each method
            new_reductions <- list(
                "CCAIntegration" = "integrated.cca",
                "RPCAIntegration" = "integrated.rpca",
                "HarmonyIntegration" = "harmony",
                "FastMNNIntegration" = "integration.mnn",
                "scVIIntegration" = "integrated.scvi"
            )
            if (is.null(IntegrateLayersArgs$new.reduction)) {
                IntegrateLayersArgs$new.reduction <- new_reductions[[method]]
            }
            print(paste0(" IntegrateLayers: ", format_args(IntegrateLayersArgs)))
            log_debug(" IntegrateLayers: {format_args(IntegrateLayersArgs)}")
            IntegrateLayersArgs$object <- sobj
            sobj <- do_call(IntegrateLayers, IntegrateLayersArgs)
            # Save it for dimension reduction plots
            sobj@misc$integrated_new_reduction <- IntegrateLayersArgs$new.reduction

            # Cleanup memory
            IntegrateLayersArgs$object <- NULL
            rm(IntegrateLayersArgs)
            gc()
        }

        if (!envs$use_sct) {
            log_info("- Joining layers ...")
            sobj <- JoinLayers(sobj)
        }

        cached$data <- sobj
        save_to_cache(cached, "Integrated", cache_dir)
    }

    sobj
}