biopipen-0.28.1-py3-none-any.whl → biopipen-0.29.1-py3-none-any.whl

This diff shows the contents of publicly available package versions that have been released to one of the supported registries. It is provided for informational purposes only and reflects the changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of biopipen might be problematic. Click here for more details.

Files changed (85)
  1. biopipen/__init__.py +1 -1
  2. biopipen/core/config.toml +8 -0
  3. biopipen/ns/bam.py +0 -2
  4. biopipen/ns/bed.py +35 -0
  5. biopipen/ns/cellranger_pipeline.py +5 -5
  6. biopipen/ns/cnv.py +18 -2
  7. biopipen/ns/cnvkit_pipeline.py +16 -11
  8. biopipen/ns/gene.py +68 -23
  9. biopipen/ns/misc.py +2 -15
  10. biopipen/ns/plot.py +204 -0
  11. biopipen/ns/regulatory.py +214 -0
  12. biopipen/ns/scrna.py +31 -5
  13. biopipen/ns/snp.py +516 -8
  14. biopipen/ns/stats.py +167 -3
  15. biopipen/ns/vcf.py +196 -0
  16. biopipen/reports/snp/PlinkCallRate.svelte +24 -0
  17. biopipen/reports/snp/PlinkFreq.svelte +18 -0
  18. biopipen/reports/snp/PlinkHWE.svelte +18 -0
  19. biopipen/reports/snp/PlinkHet.svelte +18 -0
  20. biopipen/reports/snp/PlinkIBD.svelte +18 -0
  21. biopipen/scripts/bam/CNVpytor.py +144 -46
  22. biopipen/scripts/bed/BedtoolsIntersect.py +54 -0
  23. biopipen/scripts/bed/BedtoolsMerge.py +1 -1
  24. biopipen/scripts/cnv/AneuploidyScore.R +30 -7
  25. biopipen/scripts/cnv/AneuploidyScoreSummary.R +5 -2
  26. biopipen/scripts/cnv/TMADScore.R +21 -5
  27. biopipen/scripts/cnv/TMADScoreSummary.R +6 -2
  28. biopipen/scripts/cnvkit/CNVkitAccess.py +2 -1
  29. biopipen/scripts/cnvkit/CNVkitAutobin.py +3 -2
  30. biopipen/scripts/cnvkit/CNVkitBatch.py +1 -1
  31. biopipen/scripts/cnvkit/CNVkitCoverage.py +2 -1
  32. biopipen/scripts/cnvkit/CNVkitGuessBaits.py +1 -1
  33. biopipen/scripts/cnvkit/CNVkitHeatmap.py +1 -1
  34. biopipen/scripts/cnvkit/CNVkitReference.py +2 -1
  35. biopipen/scripts/delim/SampleInfo.R +10 -5
  36. biopipen/scripts/gene/GeneNameConversion.R +65 -0
  37. biopipen/scripts/gene/GenePromoters.R +61 -0
  38. biopipen/scripts/misc/Shell.sh +15 -0
  39. biopipen/scripts/plot/Manhattan.R +146 -0
  40. biopipen/scripts/plot/QQPlot.R +146 -0
  41. biopipen/scripts/regulatory/MotifAffinityTest.R +226 -0
  42. biopipen/scripts/regulatory/MotifAffinityTest_AtSNP.R +126 -0
  43. biopipen/scripts/regulatory/MotifAffinityTest_MotifBreakR.R +96 -0
  44. biopipen/scripts/regulatory/MotifScan.py +159 -0
  45. biopipen/scripts/regulatory/atSNP.R +33 -0
  46. biopipen/scripts/regulatory/motifBreakR.R +1594 -0
  47. biopipen/scripts/scrna/MarkersFinder.R +69 -67
  48. biopipen/scripts/scrna/SeuratClustering.R +71 -29
  49. biopipen/scripts/scrna/SeuratMap2Ref.R +20 -0
  50. biopipen/scripts/scrna/SeuratPreparing.R +252 -122
  51. biopipen/scripts/scrna/SeuratSubClustering.R +76 -27
  52. biopipen/scripts/snp/MatrixEQTL.R +85 -44
  53. biopipen/scripts/snp/Plink2GTMat.py +133 -0
  54. biopipen/scripts/snp/PlinkCallRate.R +190 -0
  55. biopipen/scripts/snp/PlinkFilter.py +100 -0
  56. biopipen/scripts/snp/PlinkFreq.R +298 -0
  57. biopipen/scripts/snp/PlinkFromVcf.py +78 -0
  58. biopipen/scripts/snp/PlinkHWE.R +80 -0
  59. biopipen/scripts/snp/PlinkHet.R +92 -0
  60. biopipen/scripts/snp/PlinkIBD.R +200 -0
  61. biopipen/scripts/snp/PlinkUpdateName.py +124 -0
  62. biopipen/scripts/stats/Mediation.R +94 -0
  63. biopipen/scripts/stats/MetaPvalue.R +2 -1
  64. biopipen/scripts/stats/MetaPvalue1.R +70 -0
  65. biopipen/scripts/tcr/TCRClusterStats.R +12 -7
  66. biopipen/scripts/vcf/BcftoolsAnnotate.py +91 -0
  67. biopipen/scripts/vcf/BcftoolsFilter.py +90 -0
  68. biopipen/scripts/vcf/BcftoolsSort.py +113 -0
  69. biopipen/scripts/vcf/BcftoolsView.py +73 -0
  70. biopipen/scripts/vcf/VcfFix_utils.py +1 -1
  71. biopipen/scripts/vcf/bcftools_utils.py +52 -0
  72. biopipen/utils/gene.R +83 -37
  73. biopipen/utils/gene.py +108 -60
  74. biopipen/utils/misc.R +56 -0
  75. biopipen/utils/misc.py +5 -2
  76. biopipen/utils/reference.py +54 -10
  77. {biopipen-0.28.1.dist-info → biopipen-0.29.1.dist-info}/METADATA +2 -2
  78. {biopipen-0.28.1.dist-info → biopipen-0.29.1.dist-info}/RECORD +80 -51
  79. {biopipen-0.28.1.dist-info → biopipen-0.29.1.dist-info}/entry_points.txt +1 -1
  80. biopipen/ns/bcftools.py +0 -111
  81. biopipen/scripts/bcftools/BcftoolsAnnotate.py +0 -42
  82. biopipen/scripts/bcftools/BcftoolsFilter.py +0 -79
  83. biopipen/scripts/bcftools/BcftoolsSort.py +0 -19
  84. biopipen/scripts/gene/GeneNameConversion.py +0 -66
  85. {biopipen-0.28.1.dist-info → biopipen-0.29.1.dist-info}/WHEEL +0 -0
@@ -1,19 +1,27 @@
1
1
  source("{{biopipen_dir}}/utils/misc.R")
2
+ source("{{biopipen_dir}}/utils/caching.R")
2
3
 
3
4
  library(Seurat)
4
5
  library(future)
5
6
  library(bracer)
6
7
  library(ggplot2)
7
8
  library(dplyr)
8
- library(tidyseurat)
9
+ # library(tidyseurat)
9
10
 
10
- metafile = {{in.metafile | quote}}
11
- rdsfile = {{out.rdsfile | quote}}
12
- joboutdir = {{job.outdir | quote}}
13
- envs = {{envs | r: todot = "-", skip = 1}}
11
+ metafile <- {{in.metafile | quote}}
12
+ rdsfile <- {{out.rdsfile | quote}}
13
+ joboutdir <- {{job.outdir | quote}}
14
+ envs <- {{envs | r: todot = "-", skip = 1}}
15
+
16
+ if (isTRUE(envs$cache)) { envs$cache <- joboutdir }
17
+ if (length(envs$cache) > 1) {
18
+ log_warn("Multiple cache directories (envs.cache) detected, using the first one.")
19
+ envs$cache <- envs$cache[1]
20
+ }
14
21
 
15
22
  set.seed(8525)
16
- options(future.globals.maxSize = 80000 * 1024^2)
23
+ # 8TB
24
+ options(future.globals.maxSize = 8 * 1024 ^ 4)
17
25
  options(future.rng.onMisuse="ignore")
18
26
  options(Seurat.object.assay.version = "v5")
19
27
  plan(strategy = "multicore", workers = envs$ncores)
@@ -34,7 +42,7 @@ add_report(
34
42
  h1 = "Filters and QC"
35
43
  )
36
44
 
37
- metadata = read.table(
45
+ metadata <- read.table(
38
46
  metafile,
39
47
  header = TRUE,
40
48
  row.names = NULL,
@@ -42,6 +50,16 @@ metadata = read.table(
42
50
  check.names = FALSE
43
51
  )
44
52
 
53
+ cache_sig <- capture.output(str(metadata))
54
+ dig_sig <- digest::digest(cache_sig, algo = "md5")
55
+ dig_sig <- substr(dig_sig, 1, 8)
56
+ cache_dir <- NULL
57
+ if (is.character(envs$cache)) {
58
+ cache_dir <- file.path(envs$cache, paste0(dig_sig, ".seuratpreparing_cache"))
59
+ dir.create(cache_dir, recursive = TRUE, showWarnings = FALSE)
60
+ writeLines(cache_sig, file.path(cache_dir, "signature.txt"))
61
+ }
62
+
45
63
  meta_cols = colnames(metadata)
46
64
  if (!"Sample" %in% meta_cols) {
47
65
  stop("Error: Column `Sample` is not found in metafile.")
@@ -90,21 +108,21 @@ rename_files = function(e, sample, path) {
90
108
 
91
109
 
92
110
  perform_cell_qc <- function(sobj, per_sample = FALSE) {
93
- log_prefix = ifelse(per_sample, " ", "- ")
111
+ log_prefix <- ifelse(per_sample, " ", "- ")
94
112
  log_info("{log_prefix}Adding metadata for QC ...")
95
- sobj$percent.mt = PercentageFeatureSet(sobj, pattern = "^MT-")
96
- sobj$percent.ribo = PercentageFeatureSet(sobj, pattern = "^RP[SL]")
97
- sobj$percent.hb = PercentageFeatureSet(sobj, pattern = "^HB[^(P)]")
98
- sobj$percent.plat = PercentageFeatureSet(sobj, pattern = "PECAM1|PF4")
113
+ sobj$percent.mt <- PercentageFeatureSet(sobj, pattern = "^MT-")
114
+ sobj$percent.ribo <- PercentageFeatureSet(sobj, pattern = "^RP[SL]")
115
+ sobj$percent.hb <- PercentageFeatureSet(sobj, pattern = "^HB[^(P)]")
116
+ sobj$percent.plat <- PercentageFeatureSet(sobj, pattern = "PECAM1|PF4")
99
117
 
100
118
  if (is.null(envs$cell_qc) || length(envs$cell_qc) == 0) {
101
119
  log_warn("{log_prefix}No cell QC criteria is provided. All cells will be kept.")
102
- cell_qc = "TRUE"
120
+ cell_qc <- "TRUE"
103
121
  } else {
104
- cell_qc = envs$cell_qc
122
+ cell_qc <- envs$cell_qc
105
123
  }
106
124
 
107
- sobj = sobj %>% mutate(.QC = !!rlang::parse_expr(cell_qc))
125
+ sobj@meta.data <- sobj@meta.data %>% mutate(.QC = !!rlang::parse_expr(cell_qc))
108
126
 
109
127
  if (is.null(cell_qc_df)) {
110
128
  cell_qc_df <<- sobj@meta.data[, c("Sample", ".QC", feats), drop = FALSE]
@@ -114,8 +132,8 @@ perform_cell_qc <- function(sobj, per_sample = FALSE) {
114
132
 
115
133
  # Do the filtering
116
134
  log_info("{log_prefix}Filtering cells using QC criteria ...")
117
- sobj = sobj %>% filter(.QC)
118
- sobj$.QC = NULL
135
+ sobj <- subset(sobj, subset = .QC)
136
+ sobj$.QC <- NULL
119
137
 
120
138
  return(sobj)
121
139
  }
@@ -281,42 +299,83 @@ load_sample = function(sample) {
281
299
  obj
282
300
  }
283
301
 
284
- # Load data
285
- log_info("Reading samples individually ...")
286
- obj_list = lapply(samples, load_sample)
287
-
288
- log_info("Merging samples ...")
289
- sobj = Reduce(merge, obj_list)
302
+ cached <- get_cached(
303
+ list(cell_qc = envs$cell_qc, cell_qc_per_sample = envs$cell_qc_per_sample, use_sct = envs$use_sct),
304
+ "CellQC",
305
+ cache_dir
306
+ )
307
+ if (!is.null(cached$data)) {
308
+ log_info("Loading cell-QC'ed object from cache ...")
309
+ sobj <- cached$data$sobj
310
+ cell_qc_df <- cached$data$cell_qc_df
311
+ cached$data$sobj <- NULL
312
+ cached$data$cell_qc_df <- NULL
313
+ cached$data <- NULL
314
+ rm(cached)
315
+ gc()
316
+ } else {
317
+ # Load data
318
+ log_info("Reading samples individually ...")
319
+ obj_list = lapply(samples, load_sample)
320
+
321
+ log_info("Merging samples ...")
322
+ sobj = Reduce(merge, obj_list)
323
+ rm(obj_list)
324
+ gc()
325
+
326
+ if (!envs$cell_qc_per_sample) {
327
+ log_info("Performing cell QC ...")
328
+ sobj = perform_cell_qc(sobj)
329
+ }
290
330
 
291
- if (!envs$cell_qc_per_sample) {
292
- log_info("Performing cell QC ...")
293
- sobj = perform_cell_qc(sobj)
331
+ cached$data = list(sobj = sobj, cell_qc_df = cell_qc_df)
332
+ save_to_cache(cached, "CellQC", cache_dir)
294
333
  }
295
334
 
296
335
  # plot and report the QC
297
336
  log_info("Plotting and reporting QC ...")
298
337
  dim_df = report_cell_qc(nrow(sobj))
299
338
 
300
- log_info("Filtering genes ...")
301
339
  if (is.list(envs$gene_qc)) {
302
- genes <- rownames(sobj)
303
- filtered <- FALSE
304
- if (!is.null(envs$gene_qc$min_cells) && envs$gene_qc$min_cells > 0) {
305
- genes = genes[Matrix::rowSums(sobj) >= envs$gene_qc$min_cells]
306
- filtered <- TRUE
307
- }
308
- excludes <- envs$gene_qc$excludes
309
- if (!is.null(excludes)) {
310
- if (length(excludes) == 1) {
311
- excludes <- trimws(unlist(strsplit(excludes, ",")))
340
+ cached <- get_cached(
341
+ list(
342
+ cell_qc = envs$cell_qc,
343
+ gene_qc = envs$gene_qc,
344
+ cell_qc_per_sample = envs$cell_qc_per_sample,
345
+ use_sct = envs$use_sct
346
+ ),
347
+ "GeneQC",
348
+ cache_dir
349
+ )
350
+ if (!is.null(cached$data)) {
351
+ log_info("Loading gene-QC'ed object from cache ...")
352
+ sobj <- cached$data
353
+ cached$data <- NULL
354
+ rm(cached)
355
+ gc()
356
+ } else {
357
+ log_info("Filtering genes ...")
358
+ genes <- rownames(sobj)
359
+ filtered <- FALSE
360
+ if (!is.null(envs$gene_qc$min_cells) && envs$gene_qc$min_cells > 0) {
361
+ genes = genes[Matrix::rowSums(sobj) >= envs$gene_qc$min_cells]
362
+ filtered <- TRUE
312
363
  }
313
- for (ex in excludes) {
314
- genes <- genes[!grepl(ex, genes)]
364
+ excludes <- envs$gene_qc$excludes
365
+ if (!is.null(excludes)) {
366
+ if (length(excludes) == 1) {
367
+ excludes <- trimws(unlist(strsplit(excludes, ",")))
368
+ }
369
+ for (ex in excludes) {
370
+ genes <- genes[!grepl(ex, genes)]
371
+ }
372
+ filtered <- TRUE
315
373
  }
316
- filtered <- TRUE
317
- }
318
- if (filtered) {
319
- sobj = subset(sobj, features = genes)
374
+ if (filtered) {
375
+ sobj = subset(sobj, features = genes)
376
+ }
377
+ cached$data <- sobj
378
+ save_to_cache(cached, "GeneQC", cache_dir)
320
379
  }
321
380
  }
322
381
  dim_df = rbind(
@@ -350,96 +409,167 @@ add_report(
350
409
  paste(capture.output(str(args)), collapse = ", ")
351
410
  }
352
411
 
353
- log_info("Performing transformation/scaling ...")
354
- # Not joined yet
355
- # sobj[["RNA"]] <- split(sobj[["RNA"]], f = sobj$Sample)
356
- if (envs$use_sct) {
357
- log_info("- Running SCTransform ...")
358
- SCTransformArgs <- envs$SCTransform
359
- # log to stdout but don't populate it to running log
360
- print(paste0(" SCTransform: ", .formatArgs(SCTransformArgs)))
361
- log_debug(" SCTransform: {.formatArgs(SCTransformArgs)}")
362
- SCTransformArgs$object <- sobj
363
- sobj <- do_call(SCTransform, SCTransformArgs)
364
- # Default is to use the SCT assay
412
+ envs_cache <- envs
413
+ envs_cache$ncores <- NULL
414
+ envs_cache$DoubletFinder <- NULL
415
+ envs_cache$IntegrateLayers <- NULL
416
+ cached <- get_cached(envs_cache, "Transformed", cache_dir)
417
+ if (!is.null(cached$data)) {
418
+ log_info("Loading transformed object from cache ...")
419
+ sobj <- cached$data
420
+ cached$data <- NULL
421
+ rm(cached)
422
+ gc()
365
423
  } else {
366
- log_info("- Running NormalizeData ...")
367
- NormalizeDataArgs <- envs$NormalizeData
368
- print(paste0(" NormalizeData: ", .formatArgs(NormalizeDataArgs)))
369
- log_debug(" NormalizeData: {.formatArgs(NormalizeDataArgs)}")
370
- NormalizeDataArgs$object <- sobj
371
- sobj <- do_call(NormalizeData, NormalizeDataArgs)
372
-
373
- log_info("- Running FindVariableFeatures ...")
374
- FindVariableFeaturesArgs <- envs$FindVariableFeatures
375
- print(paste0(" FindVariableFeatures: ", .formatArgs(FindVariableFeaturesArgs)))
376
- log_debug(" FindVariableFeatures: {.formatArgs(FindVariableFeaturesArgs)}")
377
- FindVariableFeaturesArgs$object <- sobj
378
- sobj <- do_call(FindVariableFeatures, FindVariableFeaturesArgs)
379
-
380
- log_info("- Running ScaleData ...")
381
- ScaleDataArgs <- envs$ScaleData
382
- print(paste0(" ScaleData: ", .formatArgs(ScaleDataArgs)))
383
- log_debug(" ScaleData: {.formatArgs(ScaleDataArgs)}")
384
- ScaleDataArgs$object <- sobj
385
- sobj <- do_call(ScaleData, ScaleDataArgs)
424
+ log_info("Performing transformation/scaling ...")
425
+ # Not joined yet
426
+ # sobj[["RNA"]] <- split(sobj[["RNA"]], f = sobj$Sample)
427
+ if (envs$use_sct) {
428
+ log_info("- Running SCTransform ...")
429
+ SCTransformArgs <- envs$SCTransform
430
+ # log to stdout but don't populate it to running log
431
+ print(paste0(" SCTransform: ", .formatArgs(SCTransformArgs)))
432
+ log_debug(" SCTransform: {.formatArgs(SCTransformArgs)}")
433
+ SCTransformArgs$object <- sobj
434
+ sobj <- do_call(SCTransform, SCTransformArgs)
435
+ # Default is to use the SCT assay
436
+
437
+ # Cleanup memory
438
+ SCTransformArgs$object <- NULL
439
+ rm(SCTransformArgs)
440
+ gc()
441
+ } else {
442
+ log_info("- Running NormalizeData ...")
443
+ NormalizeDataArgs <- envs$NormalizeData
444
+ print(paste0(" NormalizeData: ", .formatArgs(NormalizeDataArgs)))
445
+ log_debug(" NormalizeData: {.formatArgs(NormalizeDataArgs)}")
446
+ NormalizeDataArgs$object <- sobj
447
+ sobj <- do_call(NormalizeData, NormalizeDataArgs)
448
+
449
+ # Cleanup memory
450
+ NormalizeDataArgs$object <- NULL
451
+ rm(NormalizeDataArgs)
452
+ gc()
453
+
454
+ log_info("- Running FindVariableFeatures ...")
455
+ FindVariableFeaturesArgs <- envs$FindVariableFeatures
456
+ print(paste0(" FindVariableFeatures: ", .formatArgs(FindVariableFeaturesArgs)))
457
+ log_debug(" FindVariableFeatures: {.formatArgs(FindVariableFeaturesArgs)}")
458
+ FindVariableFeaturesArgs$object <- sobj
459
+ sobj <- do_call(FindVariableFeatures, FindVariableFeaturesArgs)
460
+
461
+ # Cleanup memory
462
+ FindVariableFeaturesArgs$object <- NULL
463
+ rm(FindVariableFeaturesArgs)
464
+ gc()
465
+
466
+ log_info("- Running ScaleData ...")
467
+ ScaleDataArgs <- envs$ScaleData
468
+ print(paste0(" ScaleData: ", .formatArgs(ScaleDataArgs)))
469
+ log_debug(" ScaleData: {.formatArgs(ScaleDataArgs)}")
470
+ ScaleDataArgs$object <- sobj
471
+ sobj <- do_call(ScaleData, ScaleDataArgs)
472
+
473
+ # Cleanup memory
474
+ ScaleDataArgs$object <- NULL
475
+ rm(ScaleDataArgs)
476
+ gc()
477
+ }
478
+
479
+ log_info("- Running RunPCA ...")
480
+ RunPCAArgs <- envs$RunPCA
481
+ RunPCAArgs$npcs <- if (is.null(RunPCAArgs$npcs)) { 50 } else { min(RunPCAArgs$npcs, ncol(sobj) - 1) }
482
+ print(paste0(" RunPCA: ", .formatArgs(RunPCAArgs)))
483
+ log_debug(" RunPCA: {.formatArgs(RunPCAArgs)}")
484
+ RunPCAArgs$object <- sobj
485
+ sobj <- do_call(RunPCA, RunPCAArgs)
486
+
487
+ # Cleanup memory
488
+ RunPCAArgs$object <- NULL
489
+ rm(RunPCAArgs)
490
+ gc()
491
+
492
+ cached$data <- sobj
493
+ save_to_cache(cached, "Transformed", cache_dir)
386
494
  }
387
495
 
388
- log_info("- Running RunPCA ...")
389
- RunPCAArgs <- envs$RunPCA
390
- RunPCAArgs$npcs <- if (is.null(RunPCAArgs$npcs)) { 50 } else { min(RunPCAArgs$npcs, ncol(sobj) - 1) }
391
- print(paste0(" RunPCA: ", .formatArgs(RunPCAArgs)))
392
- log_debug(" RunPCA: {.formatArgs(RunPCAArgs)}")
393
- RunPCAArgs$object <- sobj
394
- sobj <- do_call(RunPCA, RunPCAArgs)
395
-
396
- if (!envs$no_integration) {
397
- log_info("- Running IntegrateLayers (method = {envs$IntegrateLayers$method}) ...")
398
- IntegrateLayersArgs <- envs$IntegrateLayers
399
- method <- IntegrateLayersArgs$method
400
- if (!is.null(IntegrateLayersArgs$reference) && is.character(IntegrateLayersArgs$reference)) {
401
- log_info(" Using reference samples: {paste(IntegrateLayersArgs$reference, collapse = ', ')}")
402
- IntegrateLayersArgs$reference <- match(IntegrateLayersArgs$reference, samples)
403
- log_info(" Transferred to indices: {paste(IntegrateLayersArgs$reference, collapse = ', ')}")
404
- }
405
- if (method %in% c("CCA", "cca")) { method <- "CCAIntegration" } else
406
- if (method %in% c("RPCA", "rpca")) { method <- "RPCAIntegration" } else
407
- if (method %in% c("Harmony", "harmony")) { method <- "HarmonyIntegration" } else
408
- if (method %in% c("FastMNN", "fastmnn")) { method <- "FastMNNIntegration" } else
409
- if (method %in% c("scVI", "scvi")) { method <- "scVIIntegration" } else
410
- { stop(paste0("Unknown integration method: ", method)) }
411
- if (envs$use_sct && is.null(IntegrateLayersArgs$normalization.method)) {
412
- IntegrateLayersArgs$normalization.method <- "SCT"
496
+ envs_cache <- envs
497
+ envs_cache$ncores <- NULL
498
+ envs_cache$DoubletFinder <- NULL
499
+ cached <- get_cached(envs_cache, "Integrated", cache_dir)
500
+
501
+ if (!is.null(cached$data)) {
502
+ log_info("Loading integrated/layer-joined object from cache ...")
503
+ sobj <- cached$data
504
+ cached$data <- NULL
505
+ rm(cached)
506
+ gc()
507
+
508
+ } else {
509
+
510
+ if (!envs$no_integration) {
511
+ log_info("- Running IntegrateLayers (method = {envs$IntegrateLayers$method}) ...")
512
+ IntegrateLayersArgs <- envs$IntegrateLayers
513
+ method <- IntegrateLayersArgs$method
514
+ if (!is.null(IntegrateLayersArgs$reference) && is.character(IntegrateLayersArgs$reference)) {
515
+ log_info(" Using reference samples: {paste(IntegrateLayersArgs$reference, collapse = ', ')}")
516
+ IntegrateLayersArgs$reference <- match(IntegrateLayersArgs$reference, samples)
517
+ log_info(" Transferred to indices: {paste(IntegrateLayersArgs$reference, collapse = ', ')}")
518
+ }
519
+ if (method %in% c("CCA", "cca")) { method <- "CCAIntegration" } else
520
+ if (method %in% c("RPCA", "rpca")) { method <- "RPCAIntegration" } else
521
+ if (method %in% c("Harmony", "harmony")) { method <- "HarmonyIntegration" } else
522
+ if (method %in% c("FastMNN", "fastmnn")) { method <- "FastMNNIntegration" } else
523
+ if (method %in% c("scVI", "scvi")) { method <- "scVIIntegration" } else
524
+ { stop(paste0("Unknown integration method: ", method)) }
525
+ if (envs$use_sct && is.null(IntegrateLayersArgs$normalization.method)) {
526
+ IntegrateLayersArgs$normalization.method <- "SCT"
527
+ }
528
+ IntegrateLayersArgs$method <- eval(parse(text = method))
529
+ new_reductions <- list(
530
+ "CCAIntegration" = "integrated.cca",
531
+ "RPCAIntegration" = "integrated.rpca",
532
+ "HarmonyIntegration" = "harmony",
533
+ "FastMNNIntegration" = "integration.mnn",
534
+ "scVIIntegration" = "integrated.scvi"
535
+ )
536
+ if (is.null(IntegrateLayersArgs$new.reduction)) {
537
+ IntegrateLayersArgs$new.reduction <- new_reductions[[method]]
538
+ }
539
+ print(paste0(" IntegrateLayers: ", .formatArgs(IntegrateLayersArgs)))
540
+ log_debug(" IntegrateLayers: {.formatArgs(IntegrateLayersArgs)}")
541
+ IntegrateLayersArgs$object <- sobj
542
+ sobj <- do_call(IntegrateLayers, IntegrateLayersArgs)
543
+ # Save it for dimension reduction plots
544
+ sobj@misc$integrated_new_reduction <- IntegrateLayersArgs$new.reduction
545
+
546
+ # Cleanup memory
547
+ IntegrateLayersArgs$object <- NULL
548
+ rm(IntegrateLayersArgs)
549
+ gc()
413
550
  }
414
- IntegrateLayersArgs$method <- eval(parse(text = method))
415
- new_reductions <- list(
416
- "CCAIntegration" = "integrated.cca",
417
- "RPCAIntegration" = "integrated.rpca",
418
- "HarmonyIntegration" = "harmony",
419
- "FastMNNIntegration" = "integration.mnn",
420
- "scVIIntegration" = "integrated.scvi"
421
- )
422
- if (is.null(IntegrateLayersArgs$new.reduction)) {
423
- IntegrateLayersArgs$new.reduction <- new_reductions[[method]]
551
+
552
+ if (!envs$use_sct) {
553
+ log_info("- Joining layers ...")
554
+ sobj <- JoinLayers(sobj)
424
555
  }
425
- print(paste0(" IntegrateLayers: ", .formatArgs(IntegrateLayersArgs)))
426
- log_debug(" IntegrateLayers: {.formatArgs(IntegrateLayersArgs)}")
427
- IntegrateLayersArgs$object <- sobj
428
- sobj <- do_call(IntegrateLayers, IntegrateLayersArgs)
429
- # Save it for dimension reduction plots
430
- sobj@misc$integrated_new_reduction <- IntegrateLayersArgs$new.reduction
431
- }
432
556
 
433
- if (!envs$use_sct) {
434
- log_info("- Joining layers ...")
435
- sobj <- JoinLayers(sobj)
557
+ cached$data <- sobj
558
+ save_to_cache(cached, "Integrated", cache_dir)
436
559
  }
437
560
 
561
+
562
+ # This is the last step, doesn't need to be cached
438
563
  if (!is.null(envs$DoubletFinder) && is.list(envs$DoubletFinder) && envs$DoubletFinder$PCs > 0) {
439
564
  library(DoubletFinder)
440
565
 
441
566
  log_info("Running DoubletFinder ...")
442
567
  log_info("- Preparing Seurat object ...")
568
+
569
+ if (is.null(envs$DoubletFinder$ncores)) {
570
+ envs$DoubletFinder$ncores <- envs$ncores
571
+ }
572
+
443
573
  # More controls from envs?
444
574
  sobj <- FindNeighbors(sobj, dims = 1:envs$DoubletFinder$PCs)
445
575
  sobj <- FindClusters(sobj)
@@ -449,7 +579,7 @@ if (!is.null(envs$DoubletFinder) && is.list(envs$DoubletFinder) && envs$DoubletF
449
579
  sobj,
450
580
  PCs = 1:envs$DoubletFinder$PCs,
451
581
  sct = envs$use_sct,
452
- num.cores = envs$ncores
582
+ num.cores = envs$DoubletFinder$ncores
453
583
  )
454
584
  sweep.stats <- summarizeSweep(sweep.res.list, GT = FALSE)
455
585
  bcmvn <- find.pK(sweep.stats)
@@ -546,7 +676,7 @@ if (!is.null(envs$DoubletFinder) && is.list(envs$DoubletFinder) && envs$DoubletF
546
676
  )
547
677
  }
548
678
 
549
- log_info("Saving filtered seurat object ...")
679
+ log_info("Saving QC'ed seurat object ...")
550
680
  saveRDS(sobj, rdsfile)
551
681
 
552
682
  save_report(joboutdir)
@@ -8,6 +8,7 @@ library(tidyr)
8
8
  library(dplyr)
9
9
  library(tidyseurat)
10
10
  library(digest)
11
+ library(clustree)
11
12
 
12
13
  set.seed(8525)
13
14
 
@@ -28,6 +29,40 @@ plan(strategy = "multicore", workers = envs$ncores)
28
29
  args
29
30
  }
30
31
 
32
+ .expand_resolution <- function(resolution) {
33
+ expanded_res <- c()
34
+ for (res in resolution) {
35
+ if (is.numeric(res)) {
36
+ expanded_res <- c(expanded_res, res)
37
+ } else {
38
+ # is.character
39
+ parts <- trimws(unlist(strsplit(res, ",")))
40
+ for (part in parts) {
41
+ if (grepl(":", part)) {
42
+ parts <- trimws(unlist(strsplit(part, ":")))
43
+ if (length(parts) == 2) { parts <- c(parts, 0.1) }
44
+ if (length(parts) != 3) {
45
+ stop("Invalid resolution format: {part}. Expected 2 or 3 parts separated by ':' for a range.")
46
+ }
47
+ parts <- as.numeric(parts)
48
+ expanded_res <- c(expanded_res, seq(parts[1], parts[2], by = parts[3]))
49
+ } else {
50
+ expanded_res <- c(expanded_res, as.numeric(part))
51
+ }
52
+ }
53
+ }
54
+ }
55
+ # keep the last resolution at last
56
+ rev(unique(rev(expanded_res)))
57
+ }
58
+
59
+ # recode clusters from 0, 1, 2, ... to s1, s2, s3, ...
60
+ .recode_clusters <- function(clusters) {
61
+ recode <- function(x) paste0("s", as.integer(as.character(x)) + 1)
62
+ clusters <- factor(recode(clusters), levels = recode(levels(clusters)))
63
+ clusters
64
+ }
65
+
31
66
  envs$RunUMAP <- .expand_dims(envs$RunUMAP)
32
67
  envs$FindNeighbors <- .expand_dims(envs$FindNeighbors)
33
68
 
@@ -63,7 +98,8 @@ for (key in names(envs$cases)) {
63
98
  subset = envs$subset,
64
99
  RunUMAP = envs$RunUMAP,
65
100
  FindNeighbors = envs$FindNeighbors,
66
- FindClusters = envs$FindClusters
101
+ FindClusters = envs$FindClusters,
102
+ clustree_devpars = envs$clustree_devpars
67
103
  ),
68
104
  case
69
105
  )
@@ -132,36 +168,49 @@ for (key in names(envs$cases)) {
132
168
  }
133
169
 
134
170
  case$FindClusters$random.seed <- case$FindClusters$random.seed %||% 8525
135
- resolution <- case$FindClusters$resolution %||% 0.8
136
- if (is.character(resolution)) {
137
- if (grepl(",", resolution)) {
138
- resolution <- as.numeric(trimws(unlist(strsplit(resolution, ","))))
139
- } else {
140
- resolution <- as.numeric(resolution)
171
+ resolution <- case$FindClusters$resolution <- .expand_resolution(case$FindClusters$resolution %||% 0.8)
172
+ cached <- get_cached(case$FindClusters, "FindClusters", cache_dir)
173
+ if (is.null(cached$data)) {
174
+ log_info("- Running FindClusters at resolution: {paste(resolution, collapse = ',')} ...")
175
+ case$FindClusters$object <- sobj
176
+ # avoid overwriting the previous clustering results (as they have the same graph name
177
+ sobj1 <- do_call(FindClusters, case$FindClusters)
178
+ graph_name <- case$FindClusters$graph.name %||% paste0(DefaultAssay(sobj), "_snn_res.")
179
+ for (res in resolution) {
180
+ cluster_name <- paste0(graph_name, res)
181
+ new_cluster_name <- paste0(key, ".", res)
182
+ sobj1@meta.data[[new_cluster_name]] <- .recode_clusters(sobj1@meta.data[[cluster_name]])
141
183
  }
184
+ sobj1@meta.data[[key]] <- .recode_clusters(sobj1@meta.data$seurat_clusters)
185
+ keys <- sapply(resolution, function(res) paste0(key, ".", res))
186
+ keys <- c(keys, key)
187
+ cached$data <- sobj1@meta.data[, keys, drop = FALSE]
188
+ save_to_cache(cached, "FindClusters", cache_dir)
189
+ rm(sobj1)
190
+ } else {
191
+ log_info("- Using cached FindClusters at resolution: {paste(resolution, collapse = ',')} ...")
142
192
  }
143
- for (res in resolution) {
144
- case$FindClusters$resolution <- res
145
- cached <- get_cached(case$FindClusters, paste0("FindClusters_", res), cache_dir)
146
- res_key <- paste0("seurat_clusters_", res)
147
- if (is.null(cached$data)) {
148
- log_info("- Running FindClusters at resolution: {res} ...")
149
- case$FindClusters$object <- sobj
150
- sobj1 <- do_call(FindClusters, case$FindClusters)
151
- levels(sobj1$seurat_clusters) <- paste0("s", as.numeric(levels(sobj1$seurat_clusters)) + 1)
152
- sobj1[[res_key]] <- sobj1$seurat_clusters
153
- cached$data <- sobj1@meta.data[, res_key, drop = FALSE]
154
- save_to_cache(cached, paste0("FindClusters_", res), cache_dir)
155
- } else {
156
- log_info("- Using cached FindClusters at resolution: {res} ...")
157
- }
158
- ident_table <- table(cached$data[[res_key]])
159
- log_info(" Found {length(ident_table)} clusters")
160
- print(ident_table)
161
- cat("\n")
193
+
194
+ ident_table <- table(cached$data[[key]])
195
+ log_info(" Found {length(ident_table)} clusters")
196
+ print(ident_table)
197
+ cat("\n")
198
+
199
+ if (length(resolution) > 1) {
200
+ log_info("- Plotting clustree ...")
201
+ png(
202
+ file.path(joboutdir, paste0(key, ".clustree.png")),
203
+ res = case$clustree_devpars$res,
204
+ width = case$clustree_devpars$width,
205
+ height = case$clustree_devpars$height
206
+ )
207
+ p <- clustree(cached$data, prefix = paste0(key, "."))
208
+ print(p)
209
+ dev.off()
162
210
  }
211
+
163
212
  log_info("- Updating meta.data with subclusters...")
164
- srtobj <- AddMetaData(srtobj, metadata = cached$data, col.name = key)
213
+ srtobj <- AddMetaData(srtobj, metadata = cached$data)
165
214
  srtobj[[paste0("sub_umap_", key)]] <- reduc
166
215
  }
167
216