biopipen 0.29.2__py3-none-any.whl → 0.30.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of biopipen might be problematic. Click here for more details.

Files changed (105)
  1. biopipen/__init__.py +1 -1
  2. biopipen/core/config.toml +2 -0
  3. biopipen/core/filters.py +21 -0
  4. biopipen/ns/plot.py +55 -0
  5. biopipen/ns/scrna.py +49 -13
  6. biopipen/ns/web.py +87 -5
  7. biopipen/scripts/bam/CNAClinic.R +2 -1
  8. biopipen/scripts/cellranger/CellRangerCount.py +3 -3
  9. biopipen/scripts/cellranger/CellRangerSummary.R +2 -1
  10. biopipen/scripts/cnv/AneuploidyScore.R +1 -1
  11. biopipen/scripts/cnv/AneuploidyScoreSummary.R +2 -2
  12. biopipen/scripts/delim/RowsBinder.R +1 -1
  13. biopipen/scripts/delim/SampleInfo.R +3 -2
  14. biopipen/scripts/gene/GeneNameConversion.R +2 -2
  15. biopipen/scripts/gsea/Enrichr.R +3 -3
  16. biopipen/scripts/gsea/FGSEA.R +2 -2
  17. biopipen/scripts/gsea/GSEA.R +2 -2
  18. biopipen/scripts/gsea/PreRank.R +2 -2
  19. biopipen/scripts/plot/Heatmap.R +3 -3
  20. biopipen/scripts/plot/Manhattan.R +2 -1
  21. biopipen/scripts/plot/QQPlot.R +1 -1
  22. biopipen/scripts/plot/ROC.R +1 -1
  23. biopipen/scripts/plot/Scatter.R +112 -0
  24. biopipen/scripts/plot/VennDiagram.R +3 -3
  25. biopipen/scripts/regulatory/MotifAffinityTest.R +3 -7
  26. biopipen/scripts/rnaseq/Simulation.R +1 -1
  27. biopipen/scripts/rnaseq/UnitConversion.R +2 -1
  28. biopipen/scripts/scrna/AnnData2Seurat.R +1 -1
  29. biopipen/scripts/scrna/CellTypeAnnotation-celltypist.R +24 -8
  30. biopipen/scripts/scrna/CellTypeAnnotation-common.R +10 -0
  31. biopipen/scripts/scrna/CellTypeAnnotation-direct.R +9 -1
  32. biopipen/scripts/scrna/CellTypeAnnotation-hitype.R +12 -8
  33. biopipen/scripts/scrna/CellTypeAnnotation-sccatch.R +15 -2
  34. biopipen/scripts/scrna/CellTypeAnnotation-sctype.R +38 -15
  35. biopipen/scripts/scrna/CellTypeAnnotation.R +3 -0
  36. biopipen/scripts/scrna/CellsDistribution.R +3 -2
  37. biopipen/scripts/scrna/DimPlots.R +1 -1
  38. biopipen/scripts/scrna/ExprImputation-alra.R +1 -1
  39. biopipen/scripts/scrna/MarkersFinder.R +5 -5
  40. biopipen/scripts/scrna/MetaMarkers.R +4 -4
  41. biopipen/scripts/scrna/ModuleScoreCalculator.R +2 -1
  42. biopipen/scripts/scrna/RadarPlots.R +1 -1
  43. biopipen/scripts/scrna/ScFGSEA.R +4 -3
  44. biopipen/scripts/scrna/Seurat2AnnData.R +1 -1
  45. biopipen/scripts/scrna/SeuratClusterStats-clustree.R +73 -0
  46. biopipen/scripts/scrna/SeuratClusterStats-dimplots.R +4 -3
  47. biopipen/scripts/scrna/SeuratClusterStats-features.R +6 -5
  48. biopipen/scripts/scrna/SeuratClusterStats-hists.R +6 -5
  49. biopipen/scripts/scrna/SeuratClusterStats-ngenes.R +4 -3
  50. biopipen/scripts/scrna/SeuratClusterStats-stats.R +4 -3
  51. biopipen/scripts/scrna/SeuratClusterStats.R +24 -8
  52. biopipen/scripts/scrna/SeuratClustering-common.R +213 -0
  53. biopipen/scripts/scrna/SeuratClustering.R +10 -170
  54. biopipen/scripts/scrna/SeuratMap2Ref.R +65 -31
  55. biopipen/scripts/scrna/SeuratMetadataMutater.R +2 -2
  56. biopipen/scripts/scrna/SeuratPreparing-common.R +452 -0
  57. biopipen/scripts/scrna/SeuratPreparing-doublet_detection.R +201 -0
  58. biopipen/scripts/scrna/SeuratPreparing.R +22 -562
  59. biopipen/scripts/scrna/SeuratSubClustering.R +24 -39
  60. biopipen/scripts/scrna/TopExpressingGenes.R +1 -1
  61. biopipen/scripts/scrna_metabolic_landscape/MetabolicFeatures.R +2 -2
  62. biopipen/scripts/scrna_metabolic_landscape/MetabolicFeaturesIntraSubset.R +2 -2
  63. biopipen/scripts/scrna_metabolic_landscape/MetabolicPathwayActivity.R +3 -3
  64. biopipen/scripts/scrna_metabolic_landscape/MetabolicPathwayHeterogeneity.R +3 -3
  65. biopipen/scripts/snp/MatrixEQTL.R +1 -1
  66. biopipen/scripts/snp/PlinkCallRate.R +2 -2
  67. biopipen/scripts/snp/PlinkFreq.R +2 -2
  68. biopipen/scripts/snp/PlinkHWE.R +2 -2
  69. biopipen/scripts/snp/PlinkHet.R +2 -2
  70. biopipen/scripts/snp/PlinkIBD.R +2 -2
  71. biopipen/scripts/stats/ChowTest.R +1 -1
  72. biopipen/scripts/stats/DiffCoexpr.R +1 -1
  73. biopipen/scripts/stats/LiquidAssoc.R +1 -1
  74. biopipen/scripts/stats/Mediation.R +11 -9
  75. biopipen/scripts/stats/MetaPvalue.R +4 -1
  76. biopipen/scripts/stats/MetaPvalue1.R +4 -1
  77. biopipen/scripts/tcr/Attach2Seurat.R +1 -1
  78. biopipen/scripts/tcr/CDR3AAPhyschem.R +1 -1
  79. biopipen/scripts/tcr/CloneResidency.R +2 -2
  80. biopipen/scripts/tcr/CloneSizeQQPlot.R +2 -2
  81. biopipen/scripts/tcr/Immunarch-basic.R +0 -4
  82. biopipen/scripts/tcr/Immunarch-clonality.R +0 -4
  83. biopipen/scripts/tcr/Immunarch-diversity.R +2 -24
  84. biopipen/scripts/tcr/Immunarch-geneusage.R +0 -2
  85. biopipen/scripts/tcr/Immunarch-kmer.R +0 -2
  86. biopipen/scripts/tcr/Immunarch-overlap.R +0 -2
  87. biopipen/scripts/tcr/Immunarch-spectratyping.R +0 -2
  88. biopipen/scripts/tcr/Immunarch-tracking.R +0 -2
  89. biopipen/scripts/tcr/Immunarch-vjjunc.R +0 -2
  90. biopipen/scripts/tcr/Immunarch.R +43 -11
  91. biopipen/scripts/tcr/ImmunarchFilter.R +1 -1
  92. biopipen/scripts/tcr/ImmunarchLoading.R +2 -2
  93. biopipen/scripts/tcr/SampleDiversity.R +1 -1
  94. biopipen/scripts/tcr/TCRClusterStats.R +2 -2
  95. biopipen/scripts/tcr/TCRClustering.R +2 -2
  96. biopipen/scripts/tcr/TESSA.R +2 -2
  97. biopipen/scripts/vcf/TruvariBenchSummary.R +2 -2
  98. biopipen/scripts/vcf/TruvariConsistency.R +1 -1
  99. biopipen/scripts/web/GCloudStorageDownloadBucket.py +82 -0
  100. biopipen/scripts/web/GCloudStorageDownloadFile.py +23 -0
  101. biopipen/scripts/web/gcloud_common.py +49 -0
  102. {biopipen-0.29.2.dist-info → biopipen-0.30.0.dist-info}/METADATA +1 -1
  103. {biopipen-0.29.2.dist-info → biopipen-0.30.0.dist-info}/RECORD +105 -96
  104. {biopipen-0.29.2.dist-info → biopipen-0.30.0.dist-info}/WHEEL +0 -0
  105. {biopipen-0.29.2.dist-info → biopipen-0.30.0.dist-info}/entry_points.txt +0 -0
@@ -1,5 +1,5 @@
1
- source("{{biopipen_dir}}/utils/misc.R")
2
- source("{{biopipen_dir}}/utils/caching.R")
1
+ {{ biopipen_dir | joinpaths: "utils", "misc.R" | source_r }}
2
+ {{ biopipen_dir | joinpaths: "utils", "caching.R" | source_r }}
3
3
 
4
4
  library(Seurat)
5
5
  library(future)
@@ -26,9 +26,7 @@ options(future.rng.onMisuse="ignore")
26
26
  options(Seurat.object.assay.version = "v5")
27
27
  plan(strategy = "multicore", workers = envs$ncores)
28
28
 
29
- .stringify_list <- function(x) {
30
- paste(sapply(names(x), function(n) paste(n, x[[n]], sep = " = ") ), collapse = "; ")
31
- }
29
+ {{ biopipen_dir | joinpaths: "scripts", "scrna", "SeuratPreparing-common.R" | source_r }}
32
30
 
33
31
  add_report(
34
32
  list(
@@ -36,7 +34,7 @@ add_report(
36
34
  name = "Filters applied",
37
35
  content = paste0(
38
36
  "<p>Cell filters: ", html_escape(envs$cell_qc), "</p>",
39
- "<p>Gene filters: ", html_escape(.stringify_list(envs$gene_qc)), "</p>"
37
+ "<p>Gene filters: ", html_escape(stringify_list(envs$gene_qc)), "</p>"
40
38
  )
41
39
  ),
42
40
  h1 = "Filters and QC"
@@ -82,302 +80,16 @@ feats = c(
82
80
  "percent.mt", "percent.ribo", "percent.hb", "percent.plat"
83
81
  )
84
82
 
85
- rename_files = function(e, sample, path) {
86
- tmpdatadir = file.path(joboutdir, "renamed", sample)
87
- if (dir.exists(tmpdatadir)) {
88
- unlink(tmpdatadir, recursive = TRUE)
89
- }
90
- dir.create(tmpdatadir, recursive = TRUE, showWarnings = FALSE)
91
- barcodefile = Sys.glob(file.path(path, "*barcodes.tsv.gz"))[1]
92
- file.symlink(
93
- normalizePath(barcodefile),
94
- file.path(tmpdatadir, "barcodes.tsv.gz")
95
- )
96
- genefile = glob(file.path(path, "*{genes,features}.tsv.gz"))[1]
97
- file.symlink(
98
- normalizePath(genefile),
99
- file.path(tmpdatadir, "features.tsv.gz")
100
- )
101
- matrixfile = Sys.glob(file.path(path, "*matrix.mtx.gz"))[1]
102
- file.symlink(
103
- normalizePath(matrixfile),
104
- file.path(tmpdatadir, "matrix.mtx.gz")
105
- )
106
- Read10X(data.dir = tmpdatadir)
107
- }
108
-
109
-
110
- perform_cell_qc <- function(sobj, per_sample = FALSE) {
111
- log_prefix <- ifelse(per_sample, " ", "- ")
112
- log_info("{log_prefix}Adding metadata for QC ...")
113
- sobj$percent.mt <- PercentageFeatureSet(sobj, pattern = "^MT-")
114
- sobj$percent.ribo <- PercentageFeatureSet(sobj, pattern = "^RP[SL]")
115
- sobj$percent.hb <- PercentageFeatureSet(sobj, pattern = "^HB[^(P)]")
116
- sobj$percent.plat <- PercentageFeatureSet(sobj, pattern = "PECAM1|PF4")
117
-
118
- if (is.null(envs$cell_qc) || length(envs$cell_qc) == 0) {
119
- log_warn("{log_prefix}No cell QC criteria is provided. All cells will be kept.")
120
- cell_qc <- "TRUE"
121
- } else {
122
- cell_qc <- envs$cell_qc
123
- }
124
-
125
- sobj@meta.data <- sobj@meta.data %>% mutate(.QC = !!rlang::parse_expr(cell_qc))
126
-
127
- if (is.null(cell_qc_df)) {
128
- cell_qc_df <<- sobj@meta.data[, c("Sample", ".QC", feats), drop = FALSE]
129
- } else {
130
- cell_qc_df <<- rbind(cell_qc_df, sobj@meta.data[, c("Sample", ".QC", feats), drop = FALSE])
131
- }
132
-
133
- # Do the filtering
134
- log_info("{log_prefix}Filtering cells using QC criteria ...")
135
- sobj <- subset(sobj, subset = .QC)
136
- sobj$.QC <- NULL
137
-
138
- return(sobj)
139
- }
140
-
141
- report_cell_qc = function(ngenes) {
142
- # uses cell_qc_df
143
-
144
- # Violin plots
145
- log_info("- Plotting violin plots ...")
146
- add_report(
147
- list(
148
- kind = "descr",
149
- content = paste(
150
- "The violin plots for each feature. The cells are grouped by sample.",
151
- "The cells that fail the QC criteria are colored in red, and",
152
- "the cells that pass the QC criteria are colored in black.",
153
- "The cells that fail the QC criteria are filtered out in the returned Seurat object."
154
- )
155
- ),
156
- h1 = "Violin Plots"
157
- )
158
- for (feat in feats) {
159
- log_info(" For feature: {feat}")
160
- vln_p <- ggplot(cell_qc_df, aes(x = Sample, y = !!sym(feat), color = .QC)) +
161
- geom_violin(fill = "white", width = 0.5) +
162
- geom_jitter(width = 0.2, height = 0, alpha = 0.5) +
163
- scale_color_manual(values = c("#181818", pal_biopipen()(1)), breaks = c(TRUE, FALSE)) +
164
- labs(x = "Sample", y = feat) +
165
- theme_minimal()
166
-
167
- vlnplot = file.path(plotsdir, paste0(slugify(feat), ".vln.png"))
168
- png(
169
- vlnplot,
170
- width = 800 + length(samples) * 15, height = 600, res = 100
171
- )
172
- print(vln_p)
173
- dev.off()
174
-
175
- add_report(
176
- list(
177
- src = vlnplot,
178
- name = feat,
179
- descr = paste0("Distribution of ", feat, " for each sample.")
180
- ),
181
- h1 = "Violin Plots",
182
- ui = "table_of_images"
183
- )
184
- }
185
-
186
- # Scatter plots against nCount_RNA
187
- log_info("- Plotting scatter plots ...")
188
- add_report(
189
- list(
190
- kind = "descr",
191
- content = paste(
192
- "The scatter plots for each feature against nCount_RNA. ",
193
- "The cells that fail the QC criteria are colored in red, and",
194
- "the cells that pass the QC criteria are colored in black.",
195
- "The cells that fail the QC criteria are filtered out in the returned Seurat object."
196
- )
197
- ),
198
- h1 = "Scatter Plots"
199
- )
200
- for (feat in setdiff(feats, "nCount_RNA")) {
201
- log_info(" For feature: {feat}, against nCount_RNA")
202
- scat_p <- ggplot(cell_qc_df, aes(x = nCount_RNA, y = !!sym(feat), color = .QC)) +
203
- geom_point() +
204
- scale_color_manual(values = c("#181818", pal_biopipen()(1)), breaks = c(TRUE, FALSE)) +
205
- labs(x = "nCount_RNA", y = feat) +
206
- theme_minimal()
207
-
208
- scatfile = file.path(plotsdir, paste0(slugify(feat), "-nCount_RNA.scatter.png"))
209
- png(scatfile, width = 800, height = 600, res = 100)
210
- print(scat_p)
211
- dev.off()
212
-
213
- add_report(
214
- list(
215
- src = scatfile,
216
- name = paste0(feat, " vs nCount_RNA"),
217
- descr = paste0("Scatter plot for ", feat, " against nCount_RNA")
218
- ),
219
- h1 = "Scatter Plots",
220
- ui = "table_of_images"
221
- )
222
- }
223
-
224
- # return the dim_df calculated from the cell_qc_df
225
- rbind(
226
- cell_qc_df %>%
227
- # group_by(Sample) %>%
228
- summarise(
229
- when = "Before_Cell_QC",
230
- nCells = dplyr::n(),
231
- nGenes = ngenes
232
- ) %>%
233
- ungroup(),
234
- cell_qc_df %>%
235
- filter(.QC) %>%
236
- # group_by(Sample) %>%
237
- summarise(
238
- when = "After_Cell_QC",
239
- nCells = dplyr::n(),
240
- nGenes = ngenes
241
- ) %>%
242
- ungroup()
243
- )
244
- }
245
-
246
- load_sample = function(sample) {
247
- log_info("- Loading sample: {sample} ...")
248
- mdata = as.data.frame(metadata)[metadata$Sample == sample, , drop=TRUE]
249
- path = as.character(mdata$RNAData)
250
- if (is.na(path) || !is.character(path) || nchar(path) == 0 || path == "NA") {
251
- warning(paste0("No path found for sample: ", sample))
252
- return (NULL)
253
- }
254
-
255
- # obj_list = list()
256
- if (dir.exists(path)) {
257
- exprs = tryCatch(
258
- # Read10X requires
259
- # - barcodes.tsv.gz
260
- # - genes.tsv.gz
261
- # - matrix.mtx.gz
262
- # But sometimes, they are prefixed with sample name
263
- # e.g.GSM4143656_SAM24345863-ln1.barcodes.tsv.gz
264
- { Read10X(data.dir = path) },
265
- error = function(e) rename_files(e, sample, path)
266
- )
267
- } else {
268
- exprs = Read10X_h5(path)
269
- }
270
- if ("Gene Expression" %in% names(exprs)) {
271
- exprs = exprs[["Gene Expression"]]
272
- }
273
- obj <- CreateSeuratObject(exprs, project=sample)
274
- # filter the cells that don't have any gene expressions
275
- # cell_exprs = colSums(obj@assays$RNA)
276
- # obj = subset(obj, cells = names(cell_exprs[cell_exprs > 0]))
277
- obj = RenameCells(obj, add.cell.id = sample)
278
- # Attach meta data
279
- for (mname in names(mdata)) {
280
- if (mname %in% c("RNAData", "TCRData")) { next }
281
- mdt = mdata[[mname]]
282
- if (is.factor(mdt)) { mdt = levels(mdt)[mdt] }
283
- obj[[mname]] = mdt
284
- }
285
-
286
- if (isTRUE(envs$cell_qc_per_sample)) {
287
- log_info("- Perform cell QC for sample: {sample} ...")
288
- obj = perform_cell_qc(obj, TRUE)
289
- }
290
-
291
- if (isTRUE(envs$use_sct)) {
292
- # so that we have data and scale.data layers on RNA assay
293
- # useful for visualization in case some genes are not in
294
- # the SCT assay
295
- obj = NormalizeData(obj, verbose = FALSE)
296
- obj = FindVariableFeatures(obj, verbose = FALSE)
297
- obj = ScaleData(obj, verbose = FALSE)
298
- }
299
- obj
300
- }
301
-
302
- cached <- get_cached(
303
- list(cell_qc = envs$cell_qc, cell_qc_per_sample = envs$cell_qc_per_sample, use_sct = envs$use_sct),
304
- "CellQC",
305
- cache_dir
306
- )
307
- if (!is.null(cached$data)) {
308
- log_info("Loading cell-QC'ed object from cache ...")
309
- sobj <- cached$data$sobj
310
- cell_qc_df <- cached$data$cell_qc_df
311
- cached$data$sobj <- NULL
312
- cached$data$cell_qc_df <- NULL
313
- cached$data <- NULL
314
- rm(cached)
315
- gc()
316
- } else {
317
- # Load data
318
- log_info("Reading samples individually ...")
319
- obj_list = lapply(samples, load_sample)
320
-
321
- log_info("Merging samples ...")
322
- sobj = Reduce(merge, obj_list)
323
- rm(obj_list)
324
- gc()
325
-
326
- if (!envs$cell_qc_per_sample) {
327
- log_info("Performing cell QC ...")
328
- sobj = perform_cell_qc(sobj)
329
- }
330
-
331
- cached$data = list(sobj = sobj, cell_qc_df = cell_qc_df)
332
- save_to_cache(cached, "CellQC", cache_dir)
333
- }
83
+ sobj <- run_cell_qc(sobj)
334
84
 
335
85
  # plot and report the QC
336
86
  log_info("Plotting and reporting QC ...")
337
87
  dim_df = report_cell_qc(nrow(sobj))
338
88
 
339
89
  if (is.list(envs$gene_qc)) {
340
- cached <- get_cached(
341
- list(
342
- cell_qc = envs$cell_qc,
343
- gene_qc = envs$gene_qc,
344
- cell_qc_per_sample = envs$cell_qc_per_sample,
345
- use_sct = envs$use_sct
346
- ),
347
- "GeneQC",
348
- cache_dir
349
- )
350
- if (!is.null(cached$data)) {
351
- log_info("Loading gene-QC'ed object from cache ...")
352
- sobj <- cached$data
353
- cached$data <- NULL
354
- rm(cached)
355
- gc()
356
- } else {
357
- log_info("Filtering genes ...")
358
- genes <- rownames(sobj)
359
- filtered <- FALSE
360
- if (!is.null(envs$gene_qc$min_cells) && envs$gene_qc$min_cells > 0) {
361
- genes = genes[Matrix::rowSums(sobj) >= envs$gene_qc$min_cells]
362
- filtered <- TRUE
363
- }
364
- excludes <- envs$gene_qc$excludes
365
- if (!is.null(excludes)) {
366
- if (length(excludes) == 1) {
367
- excludes <- trimws(unlist(strsplit(excludes, ",")))
368
- }
369
- for (ex in excludes) {
370
- genes <- genes[!grepl(ex, genes)]
371
- }
372
- filtered <- TRUE
373
- }
374
- if (filtered) {
375
- sobj = subset(sobj, features = genes)
376
- }
377
- cached$data <- sobj
378
- save_to_cache(cached, "GeneQC", cache_dir)
379
- }
90
+ sobj <- run_gene_qc(sobj)
380
91
  }
92
+
381
93
  dim_df = rbind(
382
94
  dim_df,
383
95
  data.frame(
@@ -405,277 +117,25 @@ add_report(
405
117
  h1 = "Filters and QC"
406
118
  )
407
119
 
408
- .formatArgs <- function(args) {
409
- paste(capture.output(str(args)), collapse = ", ")
410
- }
411
-
412
- envs_cache <- envs
413
- envs_cache$ncores <- NULL
414
- envs_cache$DoubletFinder <- NULL
415
- envs_cache$IntegrateLayers <- NULL
416
- cached <- get_cached(envs_cache, "Transformed", cache_dir)
417
- if (!is.null(cached$data)) {
418
- log_info("Loading transformed object from cache ...")
419
- sobj <- cached$data
420
- cached$data <- NULL
421
- rm(cached)
422
- gc()
423
- } else {
424
- log_info("Performing transformation/scaling ...")
425
- # Not joined yet
426
- # sobj[["RNA"]] <- split(sobj[["RNA"]], f = sobj$Sample)
427
- if (envs$use_sct) {
428
- log_info("- Running SCTransform ...")
429
- SCTransformArgs <- envs$SCTransform
430
- # log to stdout but don't populate it to running log
431
- print(paste0(" SCTransform: ", .formatArgs(SCTransformArgs)))
432
- log_debug(" SCTransform: {.formatArgs(SCTransformArgs)}")
433
- SCTransformArgs$object <- sobj
434
- sobj <- do_call(SCTransform, SCTransformArgs)
435
- # Default is to use the SCT assay
436
-
437
- # Cleanup memory
438
- SCTransformArgs$object <- NULL
439
- rm(SCTransformArgs)
440
- gc()
441
- } else {
442
- log_info("- Running NormalizeData ...")
443
- NormalizeDataArgs <- envs$NormalizeData
444
- print(paste0(" NormalizeData: ", .formatArgs(NormalizeDataArgs)))
445
- log_debug(" NormalizeData: {.formatArgs(NormalizeDataArgs)}")
446
- NormalizeDataArgs$object <- sobj
447
- sobj <- do_call(NormalizeData, NormalizeDataArgs)
448
-
449
- # Cleanup memory
450
- NormalizeDataArgs$object <- NULL
451
- rm(NormalizeDataArgs)
452
- gc()
453
-
454
- log_info("- Running FindVariableFeatures ...")
455
- FindVariableFeaturesArgs <- envs$FindVariableFeatures
456
- print(paste0(" FindVariableFeatures: ", .formatArgs(FindVariableFeaturesArgs)))
457
- log_debug(" FindVariableFeatures: {.formatArgs(FindVariableFeaturesArgs)}")
458
- FindVariableFeaturesArgs$object <- sobj
459
- sobj <- do_call(FindVariableFeatures, FindVariableFeaturesArgs)
460
-
461
- # Cleanup memory
462
- FindVariableFeaturesArgs$object <- NULL
463
- rm(FindVariableFeaturesArgs)
464
- gc()
465
-
466
- log_info("- Running ScaleData ...")
467
- ScaleDataArgs <- envs$ScaleData
468
- print(paste0(" ScaleData: ", .formatArgs(ScaleDataArgs)))
469
- log_debug(" ScaleData: {.formatArgs(ScaleDataArgs)}")
470
- ScaleDataArgs$object <- sobj
471
- sobj <- do_call(ScaleData, ScaleDataArgs)
472
-
473
- # Cleanup memory
474
- ScaleDataArgs$object <- NULL
475
- rm(ScaleDataArgs)
476
- gc()
477
- }
478
-
479
- log_info("- Running RunPCA ...")
480
- RunPCAArgs <- envs$RunPCA
481
- RunPCAArgs$npcs <- if (is.null(RunPCAArgs$npcs)) { 50 } else { min(RunPCAArgs$npcs, ncol(sobj) - 1) }
482
- print(paste0(" RunPCA: ", .formatArgs(RunPCAArgs)))
483
- log_debug(" RunPCA: {.formatArgs(RunPCAArgs)}")
484
- RunPCAArgs$object <- sobj
485
- sobj <- do_call(RunPCA, RunPCAArgs)
486
-
487
- # Cleanup memory
488
- RunPCAArgs$object <- NULL
489
- rm(RunPCAArgs)
490
- gc()
491
-
492
- cached$data <- sobj
493
- save_to_cache(cached, "Transformed", cache_dir)
494
- }
495
-
496
- envs_cache <- envs
497
- envs_cache$ncores <- NULL
498
- envs_cache$DoubletFinder <- NULL
499
- cached <- get_cached(envs_cache, "Integrated", cache_dir)
500
-
501
- if (!is.null(cached$data)) {
502
- log_info("Loading integrated/layer-joined object from cache ...")
503
- sobj <- cached$data
504
- cached$data <- NULL
505
- rm(cached)
506
- gc()
507
-
508
- } else {
509
-
510
- if (!envs$no_integration) {
511
- log_info("- Running IntegrateLayers (method = {envs$IntegrateLayers$method}) ...")
512
- IntegrateLayersArgs <- envs$IntegrateLayers
513
- method <- IntegrateLayersArgs$method
514
- if (!is.null(IntegrateLayersArgs$reference) && is.character(IntegrateLayersArgs$reference)) {
515
- log_info(" Using reference samples: {paste(IntegrateLayersArgs$reference, collapse = ', ')}")
516
- IntegrateLayersArgs$reference <- match(IntegrateLayersArgs$reference, samples)
517
- log_info(" Transferred to indices: {paste(IntegrateLayersArgs$reference, collapse = ', ')}")
518
- }
519
- if (method %in% c("CCA", "cca")) { method <- "CCAIntegration" } else
520
- if (method %in% c("RPCA", "rpca")) { method <- "RPCAIntegration" } else
521
- if (method %in% c("Harmony", "harmony")) { method <- "HarmonyIntegration" } else
522
- if (method %in% c("FastMNN", "fastmnn")) { method <- "FastMNNIntegration" } else
523
- if (method %in% c("scVI", "scvi")) { method <- "scVIIntegration" } else
524
- { stop(paste0("Unknown integration method: ", method)) }
525
- if (envs$use_sct && is.null(IntegrateLayersArgs$normalization.method)) {
526
- IntegrateLayersArgs$normalization.method <- "SCT"
527
- }
528
- IntegrateLayersArgs$method <- eval(parse(text = method))
529
- new_reductions <- list(
530
- "CCAIntegration" = "integrated.cca",
531
- "RPCAIntegration" = "integrated.rpca",
532
- "HarmonyIntegration" = "harmony",
533
- "FastMNNIntegration" = "integration.mnn",
534
- "scVIIntegration" = "integrated.scvi"
535
- )
536
- if (is.null(IntegrateLayersArgs$new.reduction)) {
537
- IntegrateLayersArgs$new.reduction <- new_reductions[[method]]
538
- }
539
- print(paste0(" IntegrateLayers: ", .formatArgs(IntegrateLayersArgs)))
540
- log_debug(" IntegrateLayers: {.formatArgs(IntegrateLayersArgs)}")
541
- IntegrateLayersArgs$object <- sobj
542
- sobj <- do_call(IntegrateLayers, IntegrateLayersArgs)
543
- # Save it for dimension reduction plots
544
- sobj@misc$integrated_new_reduction <- IntegrateLayersArgs$new.reduction
545
-
546
- # Cleanup memory
547
- IntegrateLayersArgs$object <- NULL
548
- rm(IntegrateLayersArgs)
549
- gc()
550
- }
551
-
552
- if (!envs$use_sct) {
553
- log_info("- Joining layers ...")
554
- sobj <- JoinLayers(sobj)
555
- }
556
-
557
- cached$data <- sobj
558
- save_to_cache(cached, "Integrated", cache_dir)
559
- }
560
-
120
+ sobj <- run_transformation(sobj)
121
+ sobj <- run_integration(sobj)
561
122
 
562
123
  # This is the last step, doesn't need to be cached
563
- if (!is.null(envs$DoubletFinder) && is.list(envs$DoubletFinder) && envs$DoubletFinder$PCs > 0) {
564
- library(DoubletFinder)
565
-
566
- log_info("Running DoubletFinder ...")
567
- log_info("- Preparing Seurat object ...")
568
-
569
- if (is.null(envs$DoubletFinder$ncores)) {
570
- envs$DoubletFinder$ncores <- envs$ncores
571
- }
572
-
573
- # More controls from envs?
574
- sobj <- FindNeighbors(sobj, dims = 1:envs$DoubletFinder$PCs)
575
- sobj <- FindClusters(sobj)
576
-
577
- log_info("- pK Indentification ...")
578
- sweep.res.list <- paramSweep(
579
- sobj,
580
- PCs = 1:envs$DoubletFinder$PCs,
581
- sct = envs$use_sct,
582
- num.cores = envs$DoubletFinder$ncores
583
- )
584
- sweep.stats <- summarizeSweep(sweep.res.list, GT = FALSE)
585
- bcmvn <- find.pK(sweep.stats)
586
-
587
- bcmvn$Selected <- bcmvn$pK == bcmvn$pK[which.max(bcmvn$BCmetric)[1]]
588
- plot <- ggplot(bcmvn, aes(x = pK, y = BCmetric, color = Selected)) +
589
- geom_point() +
590
- # rotate x axis labels
591
- theme(axis.text.x = element_text(angle = 90, hjust = 1))
592
- ggsave(plot, filename = file.path(plotsdir, "pK_BCmetric.png"))
593
-
594
- pK <- bcmvn$pK[which.max(bcmvn$BCmetric)[1]]
595
- pK <- as.numeric(as.character(pK))
596
- pN <- envs$DoubletFinder$pN
597
- log_info("- Homotypic Doublet Proportion Estimate ...")
598
- homotypic.prop <- modelHomotypic(Idents(sobj))
599
- nExp_poi <- round(nrow(sobj@meta.data) * envs$DoubletFinder$doublets)
600
- nExp_poi.adj <- round(nExp_poi * (1 - homotypic.prop))
601
-
602
- log_info("- Running DoubletFinder ...")
603
- sobj <- doubletFinder(
604
- sobj,
605
- PCs = 1:envs$DoubletFinder$PCs,
606
- pN = pN,
607
- pK = pK,
608
- nExp = nExp_poi.adj,
609
- reuse.pANN = FALSE,
610
- sct = envs$use_sct
611
- )
612
- pANN_col <- paste0("pANN_", pN, "_", pK)
613
- pANN_col <- colnames(sobj@meta.data)[grepl(pANN_col, colnames(sobj@meta.data))]
614
- DF_col <- paste0("DF.classifications_", pN, "_", pK)
615
- DF_col <- colnames(sobj@meta.data)[grepl(DF_col, colnames(sobj@meta.data))]
616
- doublets <- as.data.frame(
617
- cbind(
618
- colnames(sobj),
619
- sobj@meta.data[, pANN_col],
620
- sobj@meta.data[, DF_col]
621
- )
622
- )
623
- colnames(doublets) <- c("Barcode","DoubletFinder_score","DoubletFinder_DropletType")
624
- write.table(
625
- doublets,
626
- file.path(joboutdir, "DoubletFinder_doublets_singlets.txt"),
627
- row.names = FALSE,
628
- quote = FALSE,
629
- sep = "\t"
630
- )
631
-
632
- summary <- as.data.frame(table(doublets$DoubletFinder_DropletType))
633
- colnames(summary) <- c("Classification", "Droplet_N")
634
- write.table(
635
- summary,
636
- file.path(joboutdir, "DoubletFinder_summary.txt"),
637
- row.names = FALSE,
638
- quote = FALSE,
639
- sep = "\t"
640
- )
641
-
642
- # Do a dimplot
643
- log_info("- Plotting dimension reduction ...")
644
- dimp <- DimPlot(
645
- sobj, group.by = DF_col, order = "Doublet",
646
- cols = c("#333333", "#FF3333"), pt.size = 0.8, alpha = 0.5)
647
- ggsave(dimp, filename = file.path(plotsdir, "DoubletFinder_dimplot.png"))
648
-
649
- log_info("- Filtering doublets ...")
650
- sobj <- subset(sobj, cells = doublets$Barcode[doublets$DoubletFinder_DropletType == "Singlet"])
651
-
652
- add_report(
653
- list(
654
- kind = "descr",
655
- content = "The table contains the number of cells classified as singlets and doublets."
656
- ),
657
- list(
658
- kind = "table",
659
- data = list(path = file.path(joboutdir, "DoubletFinder_summary.txt"))
660
- ),
661
- h1 = "DoubletFinder Results",
662
- h2 = "The DoubletFinder Summary"
663
- )
664
- add_report(
665
- list(
666
- name = "pK vs BCmetric",
667
- src = file.path(plotsdir, "pK_BCmetric.png")
668
- ),
669
- list(
670
- name = "Dimension Reduction Plot",
671
- src = file.path(plotsdir, "DoubletFinder_dimplot.png")
672
- ),
673
- ui = "table_of_images",
674
- h1 = "DoubletFinder Results",
675
- h2 = "Plots"
676
- )
124
+ if (!is.null(envs$doublet_detector) && envs$doublet_detector != "none") {
125
+ {{* biopipen_dir | joinpaths: "scripts", "scrna", "SeuratPreparing-doublet_detection.R" | source_r }}
126
+
127
+ detector <- tolower(envs$doublet_detector)
128
+ if (detector == "doubletfinder") detector <- "DoubletFinder"
129
+ if (detector == "scdblfinder") detector <- "scDblFinder"
130
+ dd <- run_dd(detector)
131
+ save_dd(dd, detector)
132
+ sobj <- add_dd_to_seurat(sobj, dd)
133
+ plot_dd(sobj, dd, detector)
134
+ sobj <- filter_dd(sobj, dd, detector)
135
+ report_dd(detector)
677
136
  }
678
137
 
138
+
679
139
  log_info("Saving QC'ed seurat object ...")
680
140
  saveRDS(sobj, rdsfile)
681
141