biopipen 0.23.8__py3-none-any.whl → 0.24.1__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of biopipen might be problematic. Click here for more details.

biopipen/__init__.py CHANGED
@@ -1 +1 @@
1
- __version__ = "0.23.8"
1
+ __version__ = "0.24.1"
biopipen/core/proc.py CHANGED
@@ -25,3 +25,10 @@ class Proc(PipenProc):
25
25
  "filters": {**FILTERS, **filtermanager.filters},
26
26
  "search_paths": SEARCH_PATHS + [str(REPORT_DIR)],
27
27
  }
28
+
29
+ plugin_opts = {
30
+ "poplog_pattern": (
31
+ r"^(?P<level>INFO|WARN|WARNING|CRITICAL|ERROR|DEBUG?)\s*"
32
+ r"\[\d+-\d+-\d+ \d+:\d+:\d+\] (?P<message>.*)$"
33
+ )
34
+ }
biopipen/ns/cellranger.py CHANGED
@@ -35,7 +35,7 @@ class CellRangerCount(Proc):
35
35
  {%- set fastqs = fastqs[0] | glob: "*.fastq.gz" -%}
36
36
  {%- endif -%}
37
37
  {%- set sample = commonprefix(*fastqs) |
38
- regex_replace: "_L\\d+_$", "" |
38
+ regex_replace: "_L\\d+_?$", "" |
39
39
  regex_replace: "_S\\d+$", "" -%}
40
40
  {{- sample -}}
41
41
  """
@@ -84,7 +84,7 @@ class CellRangerVdj(Proc):
84
84
  {%- set fastqs = fastqs[0] | glob: "*.fastq.gz" -%}
85
85
  {%- endif -%}
86
86
  {%- set sample = commonprefix(*fastqs) |
87
- regex_replace: "_L\\d+_$", "" |
87
+ regex_replace: "_L\\d+_?$", "" |
88
88
  regex_replace: "_S\\d+$", "" -%}
89
89
  {{- sample -}}
90
90
  """
biopipen/ns/scrna.py CHANGED
@@ -278,18 +278,14 @@ class SeuratClustering(Proc):
278
278
  The results will be saved in `seurat_clusters_<resolution>`.
279
279
  The final resolution will be used to define the clusters at `seurat_clusters`.
280
280
  - <more>: See <https://satijalab.org/seurat/reference/findclusters>
281
- cache (type=auto): Whether to cache the seurat object with cluster information.
281
+ cache (type=auto): Whether to cache the information at different steps.
282
282
  If `True`, the seurat object will be cached in the job output directory, which will be not cleaned up when job is rerunning.
283
- The cached seurat object will be saved as `<signature>.cached.RDS` file, where `<signature>` is the signature determined by
283
+ The cached seurat object will be saved as `<signature>.<kind>.RDS` file, where `<signature>` is the signature determined by
284
284
  the input and envs of the process.
285
- See -
286
- * <https://github.com/satijalab/seurat/issues/7849>
287
- * <https://github.com/satijalab/seurat/issues/5358> and
288
- * <https://github.com/satijalab/seurat/issues/6748> for more details.
285
+ See <https://github.com/satijalab/seurat/issues/7849>, <https://github.com/satijalab/seurat/issues/5358> and
286
+ <https://github.com/satijalab/seurat/issues/6748> for more details also about reproducibility issues.
289
287
  To not use the cached seurat object, you can either set `cache` to `False` or delete the cached file at
290
- `<signature>.cached.RDS` in the cache directory.
291
- If `True`, the cache directory is `.pipen/<Pipeline>/SeuratClustering/0/output/`
292
- You can also specify customized directory to save the cached seurat object by setting `cache` to the directory path.
288
+ `<signature>.RDS` in the cache directory.
293
289
 
294
290
  Requires:
295
291
  r-seurat:
@@ -309,7 +305,7 @@ class SeuratClustering(Proc):
309
305
  "RunUMAP": {"dims": 30},
310
306
  "FindNeighbors": {},
311
307
  "FindClusters": {"resolution": 0.8},
312
- "cache": False,
308
+ "cache": config.path.tmpdir,
313
309
  }
314
310
  script = "file://../scripts/scrna/SeuratClustering.R"
315
311
 
@@ -361,18 +357,14 @@ class SeuratSubClustering(Proc):
361
357
  The results will be saved in `<casename>_<resolution>`.
362
358
  The final resolution will be used to define the clusters at `<casename>`.
363
359
  - <more>: See <https://satijalab.org/seurat/reference/findclusters>
364
- cache (type=auto): Whether to cache the seurat object with cluster information.
360
+ cache (type=auto): Whether to cache the information at different steps.
365
361
  If `True`, the seurat object will be cached in the job output directory, which will be not cleaned up when job is rerunning.
366
- The cached seurat object will be saved as `<signature>.cached.RDS` file, where `<signature>` is the signature determined by
362
+ The cached seurat object will be saved as `<signature>.<kind>.RDS` file, where `<signature>` is the signature determined by
367
363
  the input and envs of the process.
368
- See -
369
- * <https://github.com/satijalab/seurat/issues/7849>
370
- * <https://github.com/satijalab/seurat/issues/5358> and
371
- * <https://github.com/satijalab/seurat/issues/6748> for more details.
364
+ See <https://github.com/satijalab/seurat/issues/7849>, <https://github.com/satijalab/seurat/issues/5358> and
365
+ <https://github.com/satijalab/seurat/issues/6748> for more details also about reproducibility issues.
372
366
  To not use the cached seurat object, you can either set `cache` to `False` or delete the cached file at
373
- `<signature>.cached.RDS` in the cache directory.
374
- If `True`, the cache directory is `.pipen/<Pipeline>/SeuratClustering/0/output/`
375
- You can also specify customized directory to save the cached seurat object by setting `cache` to the directory path.
367
+ `<signature>.RDS` in the cache directory.
376
368
  cases (type=json): The cases to perform subclustering.
377
369
  Keys are the names of the cases and values are the dicts inherited from `envs` except `mutaters` and `cache`.
378
370
  If empty, a case with name `subcluster` will be created with default parameters.
@@ -387,7 +379,7 @@ class SeuratSubClustering(Proc):
387
379
  "RunUMAP": {"dims": 30},
388
380
  "FindNeighbors": {},
389
381
  "FindClusters": {"resolution": 0.8},
390
- "cache": False,
382
+ "cache": config.path.tmpdir,
391
383
  "cases": {"subcluster": {}},
392
384
  }
393
385
  script = "file://../scripts/scrna/SeuratSubClustering.R"
@@ -2002,4 +1994,5 @@ class MetaMarkers(Proc):
2002
1994
  plugin_opts = {
2003
1995
  "report": "file://../reports/scrna/MetaMarkers.svelte",
2004
1996
  "report_paging": 8,
1997
+ "poplog_max": 15,
2005
1998
  }
biopipen/ns/tcr.py CHANGED
@@ -563,12 +563,13 @@ class Immunarch(Proc):
563
563
  A Gini coefficient of one (or 100 percents) expresses maximal inequality among values (for example where only one person has all the income).
564
564
  - d50: The D50 index.
565
565
  It is the number of types that are needed to cover 50%% of the total abundance.
566
- - dxx: The Dxx index.
567
- It is the number of types that are needed to cover xx%% of the total abundance.
568
- The percentage should be specified in the `args` argument using `perc` key.
569
566
  - raref: Species richness from the results of sampling through extrapolation.
570
567
  - by: The variables (column names) to group samples.
571
568
  Multiple columns should be separated by `,`.
569
+ - plot_type (choice): The type of the plot, works when `by` is specified.
570
+ Not working for `raref`.
571
+ - box: Boxplot
572
+ - bar: Barplot with error bars
572
573
  - subset: Subset the data before calculating the clonotype volumes.
573
574
  The whole data will be expanded to cell level, and then subsetted.
574
575
  Clone sizes will be re-calculated based on the subsetted data.
@@ -789,9 +790,9 @@ class Immunarch(Proc):
789
790
  },
790
791
  # Diversity
791
792
  "divs": {
792
- "filter": None,
793
793
  "method": "gini",
794
794
  "by": None,
795
+ "plot_type": "bar",
795
796
  "args": {},
796
797
  "order": [],
797
798
  "test": {
@@ -805,8 +806,8 @@ class Immunarch(Proc):
805
806
  "align_y": False,
806
807
  "log": False,
807
808
  "devpars": {
808
- "width": 1000,
809
- "height": 1000,
809
+ "width": 800,
810
+ "height": 800,
810
811
  "res": 100,
811
812
  },
812
813
  "subset": None,
@@ -851,6 +852,7 @@ class Immunarch(Proc):
851
852
  plugin_opts = {
852
853
  "report": "file://../reports/tcr/Immunarch.svelte",
853
854
  "report_paging": 3,
855
+ "poplog_max": 999,
854
856
  }
855
857
 
856
858
 
@@ -1,4 +1,5 @@
1
1
  source("{{biopipen_dir}}/utils/misc.R")
2
+ source("{{biopipen_dir}}/utils/caching.R")
2
3
 
3
4
  library(Seurat)
4
5
  library(future)
@@ -35,80 +36,100 @@ envs$FindNeighbors <- .expand_dims(envs$FindNeighbors)
35
36
  log_info("Reading Seurat object ...")
36
37
  sobj <- readRDS(srtfile)
37
38
 
38
- if (isTRUE(envs$cache)) {
39
- envs$cache <- joboutdir
39
+ if (isTRUE(envs$cache)) { envs$cache <- joboutdir }
40
+ if (length(envs$cache) > 1) {
41
+ log_warn("Multiple cache directories (envs.cache) detected, using the first one.")
42
+ envs$cache <- envs$cache[1]
40
43
  }
41
-
42
- if (is.character(envs$cache) && nchar(envs$cache) > 0) {
43
- log_info("Obtainning the signature ...")
44
- envs2 <- envs
45
- envs2$ncores <- NULL
46
- sig <- c(
47
- capture.output(str(sobj)),
48
- "\n\n-------------------\n\n",
49
- capture.output(str(envs2)),
50
- "\n"
51
- )
52
- digested_sig <- digest::digest(sig, algo = "md5")
53
- cached_file <- file.path(envs$cache, paste0(digested_sig, ".cached.RDS"))
54
- if (file.exists(cached_file)) {
55
- log_info("Using cached results {cached_file}")
56
- # copy cached file to rdsfile
57
- file.copy(cached_file, rdsfile, copy.date = TRUE)
58
- quit()
59
- } else {
60
- log_info("Cached results not found, logging the current and cached signatures.")
61
- log_info("- Current signature: {digested_sig}")
62
- # print(sig)
63
- # sigfiles <- Sys.glob(file.path(envs$cache, "*.signature.txt"))
64
- # for (sigfile in sigfiles) {
65
- # log_info("- Found cached signature file: {sigfile}")
66
- # cached_sig <- readLines(sigfile)
67
- # log_info("- Cached signature:")
68
- # print(cached_sig)
69
- # }
70
- writeLines(sig, file.path(envs$cache, paste0(digested_sig, ".signature.txt")))
71
- }
44
+ sobj_sig <- capture.output(str(sobj))
45
+ dig_sig <- digest::digest(sobj_sig, algo = "md5")
46
+ dig_sig <- substr(dig_sig, 1, 8)
47
+ cache_dir <- NULL
48
+ if (is.character(envs$cache)) {
49
+ cache_dir <- file.path(envs$cache, paste0(dig_sig, ".seurat_cache"))
50
+ dir.create(cache_dir, recursive = TRUE, showWarnings = FALSE)
51
+ writeLines(sobj_sig, file.path(cache_dir, "signature.txt"))
72
52
  }
73
53
 
74
54
  if (length(envs$ScaleData) > 0) {
75
55
  if (DefaultAssay(sobj) == "SCT") {
76
56
  stop("SCT assay detected, but ScaleData is specified. Use SCTransform instead.")
77
57
  }
78
- log_info("Running ScaleData ...")
79
- envs$ScaleData$object <- sobj
80
- sobj <- do_call(ScaleData, envs$ScaleData)
58
+ cached <- get_cached(envs$ScaleData, "ScaleData", cache_dir)
59
+ if (is.null(cached$data)) {
60
+ log_info("Running ScaleData ...")
61
+ envs$ScaleData$object <- sobj
62
+ sobj <- do_call(ScaleData, envs$ScaleData)
63
+ cached$data <- list(assay = sobj@assays$RNA, commands = sobj@commands)
64
+ save_to_cache(cached, "ScaleData", cache_dir)
65
+ } else {
66
+ log_info("Loading cached ScaleData ...")
67
+ sobj@assays$RNA <- cached$data$assay
68
+ sobj@commands <- cached$data$commands
69
+ DefaultAssay(sobj) <- "RNA"
70
+ }
81
71
  } else if (length(envs$SCTransform) > 0) {
82
72
  if (DefaultAssay(sobj) != "SCT") {
83
73
  stop("SCT assay not detected, but SCTransform is specified. Use ScaleData instead.")
84
74
  }
85
- log_info("Running SCTransform ...")
86
- envs$SCTransform$object <- sobj
87
- sobj <- do_call(SCTransform, envs$SCTransform)
75
+ cached <- get_cached(envs$SCTransform, "SCTransform", cache_dir)
76
+ assay <- envs$SCTransform$new.assay.name %||% "SCT"  # assay name used by both the save and the load path below
77
+ if (is.null(cached$data)) {
78
+ log_info("Running SCTransform ...")
79
+ envs$SCTransform$object <- sobj
80
+ sobj <- do_call(SCTransform, envs$SCTransform)
81
+ cached$data <- list(assay = sobj@assays[[assay]], commands = sobj@commands)  # cache the same assay that is restored on a cache hit
82
+ save_to_cache(cached, "SCTransform", cache_dir)
83
+ } else {
84
+ log_info("Loading cached SCTransform ...")
85
+ sobj@assays[[assay]] <- cached$data$assay
86
+ sobj@commands <- cached$data$commands
87
+ DefaultAssay(sobj) <- assay
88
+ }
88
89
  }
89
90
 
90
- log_info("Running RunUMAP ...")
91
- umap_args <- list_setdefault(
92
- envs$RunUMAP,
93
- object = sobj,
94
- dims = 1:30,
95
- reduction = sobj@misc$integrated_new_reduction %||% "pca"
96
- )
97
- umap_args$dims <- 1:min(max(umap_args$dims), ncol(sobj) - 1)
98
- sobj <- do_call(RunUMAP, umap_args)
99
-
100
- log_info("Running FindNeighbors ...")
101
- envs$FindNeighbors$object <- sobj
102
- if (is.null(envs$FindNeighbors$reduction)) {
103
- envs$FindNeighbors$reduction <- sobj@misc$integrated_new_reduction %||% "pca"
91
+ cached <- get_cached(envs$RunUMAP, "RunUMAP", cache_dir)
92
+ reduc_name <- envs$RunUMAP$reduction.name %||% "umap"
93
+ if (is.null(cached$data)) {
94
+ log_info("Running RunUMAP ...")
95
+ umap_args <- list_setdefault(
96
+ envs$RunUMAP,
97
+ object = sobj,
98
+ dims = 1:30,
99
+ reduction = sobj@misc$integrated_new_reduction %||% "pca"
100
+ )
101
+ ncells <- ncol(sobj)
102
+ umap_args$dims <- 1:min(max(umap_args$dims), ncells - 1)
103
+ umap_method <- envs$RunUMAP$umap.method %||% "uwot"
104
+ if (umap_method == "uwot" && is.null(envs$RunUMAP$n.neighbors)) {
105
+ # https://github.com/satijalab/seurat/issues/4312
106
+ umap_args$n.neighbors <- min(ncells - 1, 30)
107
+ }
108
+ sobj <- do_call(RunUMAP, umap_args)
109
+ cached$data <- list(reduc = sobj@reductions[[reduc_name]], commands = sobj@commands)
110
+ save_to_cache(cached, "RunUMAP", cache_dir)
111
+ } else {
112
+ log_info("Loading cached RunUMAP ...")
113
+ sobj@reductions[[reduc_name]] <- cached$data$reduc
114
+ sobj@commands <- cached$data$commands
104
115
  }
105
- sobj <- do_call(FindNeighbors, envs$FindNeighbors)
106
116
 
107
- log_info("Running FindClusters ...")
108
- if (is.null(envs$FindClusters$random.seed)) {
109
- envs$FindClusters$random.seed <- 8525
117
+ cached <- get_cached(envs$FindNeighbors, "FindNeighbors", cache_dir)
118
+ if (is.null(cached$data)) {
119
+ log_info("Running FindNeighbors ...")
120
+ envs$FindNeighbors$object <- sobj
121
+ envs$FindNeighbors$reduction <- envs$FindNeighbors$reduction %||% sobj@misc$integrated_new_reduction %||% "pca"  # only fill in a default; keep a user-supplied reduction
122
+ sobj <- do_call(FindNeighbors, envs$FindNeighbors)
123
+ cached$data <- list(graphs = sobj@graphs, commands = sobj@commands)
124
+ save_to_cache(cached, "FindNeighbors", cache_dir)
125
+ } else {
126
+ log_info("Loading cached FindNeighbors ...")
127
+ sobj@graphs <- cached$data$graphs
128
+ sobj@commands <- cached$data$commands
110
129
  }
111
- resolution <- envs$FindClusters$resolution
130
+
131
+ envs$FindClusters$random.seed <- envs$FindClusters$random.seed %||% 8525
132
+ resolution <- envs$FindClusters$resolution %||% 0.8
112
133
  if (is.character(resolution)) {
113
134
  if (grepl(",", resolution)) {
114
135
  resolution <- as.numeric(trimws(unlist(strsplit(resolution, ","))))
@@ -116,42 +137,38 @@ if (is.character(resolution)) {
116
137
  resolution <- as.numeric(resolution)
117
138
  }
118
139
  }
119
- if (is.null(resolution) || length(resolution) == 1) {
120
- envs$FindClusters$resolution <- resolution
121
- envs$FindClusters$object <- sobj
122
- sobj <- do_call(FindClusters, envs$FindClusters)
123
- levels(sobj$seurat_clusters) <- paste0("c", as.numeric(levels(sobj$seurat_clusters)) + 1)
124
- Idents(sobj) <- "seurat_clusters"
125
- ident_table <- table(sobj$seurat_clusters)
126
- log_info("- Found {length(ident_table)} clusters:")
127
- print(ident_table)
128
- } else {
129
- log_info("- Multiple resolutions detected ...")
130
- res_key <- NULL
131
- for (res in resolution) {
132
- findclusters_args <- envs$FindClusters
133
- findclusters_args$resolution <- res
134
- findclusters_args$object <- sobj
135
- sobj <- do_call(FindClusters, findclusters_args)
140
+
141
+ for (res in resolution) {
142
+ envs$FindClusters$resolution <- res
143
+ cached <- get_cached(envs$FindClusters, paste0("FindClusters_", res), cache_dir)
144
+ res_key <- paste0("seurat_clusters_", res)
145
+ if (is.null(cached$data)) {
146
+ log_info("Running FindClusters at resolution: {res} ...")
147
+ envs$FindClusters$object <- sobj
148
+ sobj <- do_call(FindClusters, envs$FindClusters)
136
149
  levels(sobj$seurat_clusters) <- paste0("c", as.numeric(levels(sobj$seurat_clusters)) + 1)
137
- res_key <- paste0("seurat_clusters_", res)
138
150
  sobj[[res_key]] <- sobj$seurat_clusters
139
- ident_table <- table(sobj[[res_key]])
140
- log_info("- Found {length(ident_table)} at resolution: {res}:")
141
- print(ident_table)
151
+ Idents(sobj) <- "seurat_clusters"
152
+ cached$data <- list(clusters = sobj$seurat_clusters, commands = sobj@commands)
153
+ save_to_cache(cached, paste0("FindClusters_", res), cache_dir)
154
+ } else {
155
+ log_info("Loading cached FindClusters at resolution: {res} ...")
156
+ sobj@commands <- cached$data$commands
157
+ sobj[[res_key]] <- cached$data$clusters
158
+ sobj$seurat_clusters <- cached$data$clusters
159
+ Idents(sobj) <- "seurat_clusters"
142
160
  }
161
+ ident_table <- table(Idents(sobj))
162
+ log_info("- Found {length(ident_table)} clusters")
163
+ print(ident_table)
164
+ cat("\n")
143
165
  }
144
166
 
145
167
  if (DefaultAssay(sobj) == "SCT") {
146
- # https://github.com/satijalab/seurat/issues/6968
168
+ # https://github.com/satijalab/seurat/issues/6968
147
169
  log_info("Running PrepSCTFindMarkers ...")
148
170
  sobj <- PrepSCTFindMarkers(sobj)
149
171
  }
150
172
 
151
173
  log_info("Saving results ...")
152
174
  saveRDS(sobj, file = rdsfile)
153
-
154
- if (is.character(envs$cache) && nchar(envs$cache) > 0) {
155
- log_info("Caching results ...")
156
- file.copy(rdsfile, cached_file, overwrite = TRUE)
157
- }
@@ -301,26 +301,31 @@ log_info("Performing transformation/scaling ...")
301
301
  if (envs$use_sct) {
302
302
  log_info("- Running SCTransform ...")
303
303
  SCTransformArgs <- envs$SCTransform
304
- log_info(" SCTransform: {.formatArgs(SCTransformArgs)}")
304
+ # log to stdout but don't populate it to running log
305
+ print(paste0(" SCTransform: ", .formatArgs(SCTransformArgs)))  # print() does not interpolate {}; build the string explicitly
306
+ log_debug(" SCTransform: {.formatArgs(SCTransformArgs)}")
305
307
  SCTransformArgs$object <- sobj
306
308
  sobj <- do_call(SCTransform, SCTransformArgs)
307
309
  # Default is to use the SCT assay
308
310
  } else {
309
311
  log_info("- Running NormalizeData ...")
310
312
  NormalizeDataArgs <- envs$NormalizeData
311
- log_info(" NormalizeData: {.formatArgs(NormalizeDataArgs)}")
313
+ print(paste0(" NormalizeData: ", .formatArgs(NormalizeDataArgs)))  # print() does not interpolate {}; build the string explicitly
314
+ log_debug(" NormalizeData: {.formatArgs(NormalizeDataArgs)}")
312
315
  NormalizeDataArgs$object <- sobj
313
316
  sobj <- do_call(NormalizeData, NormalizeDataArgs)
314
317
 
315
318
  log_info("- Running FindVariableFeatures ...")
316
319
  FindVariableFeaturesArgs <- envs$FindVariableFeatures
317
- log_info(" FindVariableFeatures: {.formatArgs(FindVariableFeaturesArgs)}")
320
+ print(paste0(" FindVariableFeatures: ", .formatArgs(FindVariableFeaturesArgs)))  # print() does not interpolate {}; build the string explicitly
321
+ log_debug(" FindVariableFeatures: {.formatArgs(FindVariableFeaturesArgs)}")
318
322
  FindVariableFeaturesArgs$object <- sobj
319
323
  sobj <- do_call(FindVariableFeatures, FindVariableFeaturesArgs)
320
324
 
321
325
  log_info("- Running ScaleData ...")
322
326
  ScaleDataArgs <- envs$ScaleData
323
- log_info(" ScaleData: {.formatArgs(ScaleDataArgs)}")
327
+ print(paste0(" ScaleData: ", .formatArgs(ScaleDataArgs)))  # print() does not interpolate {}; build the string explicitly
328
+ log_debug(" ScaleData: {.formatArgs(ScaleDataArgs)}")
324
329
  ScaleDataArgs$object <- sobj
325
330
  sobj <- do_call(ScaleData, ScaleDataArgs)
326
331
  }
@@ -328,7 +333,8 @@ if (envs$use_sct) {
328
333
  log_info("- Running RunPCA ...")
329
334
  RunPCAArgs <- envs$RunPCA
330
335
  RunPCAArgs$npcs <- if (is.null(RunPCAArgs$npcs)) { 50 } else { min(RunPCAArgs$npcs, ncol(sobj) - 1) }
331
- log_info(" RunPCA: {.formatArgs(RunPCAArgs)}")
336
+ print(paste0(" RunPCA: ", .formatArgs(RunPCAArgs)))  # print() does not interpolate {}; build the string explicitly
337
+ log_debug(" RunPCA: {.formatArgs(RunPCAArgs)}")
332
338
  RunPCAArgs$object <- sobj
333
339
  sobj <- do_call(RunPCA, RunPCAArgs)
334
340
 
@@ -361,7 +367,8 @@ if (!envs$no_integration) {
361
367
  if (is.null(IntegrateLayersArgs$new.reduction)) {
362
368
  IntegrateLayersArgs$new.reduction <- new_reductions[[method]]
363
369
  }
364
- log_info(" IntegrateLayers: {.formatArgs(IntegrateLayersArgs)}")
370
+ print(paste0(" IntegrateLayers: ", .formatArgs(IntegrateLayersArgs)))  # print() does not interpolate {}; build the string explicitly
371
+ log_debug(" IntegrateLayers: {.formatArgs(IntegrateLayersArgs)}")
365
372
  IntegrateLayersArgs$object <- sobj
366
373
  sobj <- do_call(IntegrateLayers, IntegrateLayersArgs)
367
374
  # Save it for dimension reduction plots
@@ -1,4 +1,5 @@
1
1
  source("{{biopipen_dir}}/utils/misc.R")
2
+ source("{{biopipen_dir}}/utils/caching.R")
2
3
 
3
4
  library(Seurat)
4
5
  library(future)
@@ -33,40 +34,10 @@ envs$FindNeighbors <- .expand_dims(envs$FindNeighbors)
33
34
  log_info("Reading Seurat object ...")
34
35
  srtobj <- readRDS(srtfile)
35
36
 
36
- if (isTRUE(envs$cache)) {
37
- envs$cache <- joboutdir
38
- }
39
-
40
- if (is.character(envs$cache) && nchar(envs$cache) > 0) {
41
- log_info("Obtainning the signature ...")
42
- envs2 <- envs
43
- envs2$ncores <- NULL
44
- sig <- c(
45
- capture.output(str(srtobj)),
46
- "\n\n-------------------\n\n",
47
- capture.output(str(envs2)),
48
- "\n"
49
- )
50
- digested_sig <- digest::digest(sig, algo = "md5")
51
- cached_file <- file.path(envs$cache, paste0(digested_sig, ".cached.RDS"))
52
- if (file.exists(cached_file)) {
53
- log_info("Using cached results {cached_file}")
54
- # copy cached file to rdsfile
55
- file.copy(cached_file, rdsfile, copy.date = TRUE)
56
- quit()
57
- } else {
58
- log_info("Cached results not found.")
59
- log_info("- Current signature: {digested_sig}")
60
- # print(sig)
61
- # sigfiles <- Sys.glob(file.path(envs$cache, "*.signature.txt"))
62
- # for (sigfile in sigfiles) {
63
- # log_info("- Found cached signature file: {sigfile}")
64
- # cached_sig <- readLines(sigfile)
65
- # log_info("- Cached signature:")
66
- # print(cached_sig)
67
- # }
68
- writeLines(sig, file.path(envs$cache, paste0(digested_sig, ".signature.txt")))
69
- }
37
+ if (isTRUE(envs$cache)) { envs$cache <- joboutdir }
38
+ if (length(envs$cache) > 1) {
39
+ log_warn("Multiple cache directories (envs.cache) detected, using the first one.")
40
+ envs$cache <- envs$cache[1]
70
41
  }
71
42
 
72
43
  if (!is.null(envs$mutaters) && length(envs$mutaters) > 0) {
@@ -102,30 +73,66 @@ for (key in names(envs$cases)) {
102
73
  }
103
74
 
104
75
  log_info("- Subsetting ...")
105
- sobj <- srtobj %>% filter(!!parse_expr(case$subset))
106
-
107
- log_info("- Running RunUMAP ...")
108
- umap_args <- list_setdefault(
109
- case$RunUMAP,
110
- object = sobj,
111
- dims = 1:30,
112
- reduction = sobj@misc$integrated_new_reduction %||% "pca"
113
- )
114
- umap_args$dims <- 1:min(max(umap_args$dims), ncol(sobj) - 1)
115
- sobj <- do_call(RunUMAP, umap_args)
116
-
117
- log_info("- Running FindNeighbors ...")
118
- case$FindNeighbors$object <- sobj
119
- if (is.null(case$FindNeighbors$reduction)) {
120
- case$FindNeighbors$reduction <- sobj@misc$integrated_new_reduction %||% "pca"
76
+ sobj <- tryCatch({
77
+ srtobj %>% filter(!!parse_expr(case$subset))
78
+ }, error = function(e) {
79
+ stop(paste0(" Error in subset: ", e$message))
80
+ })
81
+ sobj_sig <- capture.output(str(sobj))
82
+ dig_sig <- digest::digest(sobj_sig, algo = "md5")
83
+ dig_sig <- substr(dig_sig, 1, 8)
84
+ cache_dir <- NULL
85
+ if (is.character(envs$cache)) {
86
+ cache_dir <- file.path(envs$cache, paste0(dig_sig, ".seurat_cache"))
87
+ dir.create(cache_dir, recursive = TRUE, showWarnings = FALSE)
88
+ writeLines(sobj_sig, file.path(cache_dir, "signature.txt"))
121
89
  }
122
- sobj <- do_call(FindNeighbors, case$FindNeighbors)
123
90
 
124
- log_info("- Running FindClusters ...")
125
- if (is.null(case$FindClusters$random.seed)) {
126
- case$FindClusters$random.seed <- 8525
91
+ cached <- get_cached(case$RunUMAP, "RunUMAP", cache_dir)
92
+ reduc_name <- case$RunUMAP$reduction.name %||% "umap"
93
+ if (is.null(cached$data)) {
94
+ log_info("- Running RunUMAP ...")
95
+ umap_args <- list_setdefault(
96
+ case$RunUMAP,
97
+ object = sobj,
98
+ dims = 1:30,
99
+ reduction = sobj@misc$integrated_new_reduction %||% "pca"
100
+ )
101
+ ncells <- ncol(sobj)
102
+ umap_args$dims <- 1:min(max(umap_args$dims), ncells - 1)
103
+ umap_method <- case$RunUMAP$umap.method %||% "uwot"
104
+ if (umap_method == "uwot" && is.null(case$RunUMAP$n.neighbors)) {
105
+ # https://github.com/satijalab/seurat/issues/4312
106
+ umap_args$n.neighbors <- min(ncells - 1, 30)
107
+ }
108
+ sobj <- do_call(RunUMAP, umap_args)
109
+ cached$data <- list(reduc = sobj@reductions[[reduc_name]], commands = sobj@commands)
110
+ save_to_cache(cached, "RunUMAP", cache_dir)
111
+ } else {
112
+ log_info("- Loading cached RunUMAP ...")
113
+ sobj@reductions[[reduc_name]] <- cached$data$reduc
114
+ sobj@commands <- cached$data$commands
115
+ }
116
+ reduc <- cached$data$reduc
117
+
118
+ cached <- get_cached(case$FindNeighbors, "FindNeighbors", cache_dir)
119
+ if (is.null(cached$data)) {
120
+ log_info("- Running FindNeighbors ...")
121
+ case$FindNeighbors$object <- sobj
122
+ if (is.null(case$FindNeighbors$reduction)) {
123
+ case$FindNeighbors$reduction <- sobj@misc$integrated_new_reduction %||% "pca"
124
+ }
125
+ sobj <- do_call(FindNeighbors, case$FindNeighbors)
126
+ cached$data <- list(graphs = sobj@graphs, commands = sobj@commands)
127
+ save_to_cache(cached, "FindNeighbors", cache_dir)
128
+ } else {
129
+ log_info("- Loading cached FindNeighbors ...")
130
+ sobj@graphs <- cached$data$graphs
131
+ sobj@commands <- cached$data$commands
127
132
  }
128
- resolution <- case$FindClusters$resolution
133
+
134
+ case$FindClusters$random.seed <- case$FindClusters$random.seed %||% 8525
135
+ resolution <- case$FindClusters$resolution %||% 0.8
129
136
  if (is.character(resolution)) {
130
137
  if (grepl(",", resolution)) {
131
138
  resolution <- as.numeric(trimws(unlist(strsplit(resolution, ","))))
@@ -133,53 +140,30 @@ for (key in names(envs$cases)) {
133
140
  resolution <- as.numeric(resolution)
134
141
  }
135
142
  }
136
- if (is.null(resolution) || length(resolution) == 1) {
137
- case$FindClusters$resolution <- resolution
138
- case$FindClusters$object <- sobj
139
- sobj <- do_call(FindClusters, case$FindClusters)
140
- levels(sobj$seurat_clusters) <- paste0("s", as.numeric(levels(sobj$seurat_clusters)) + 1)
141
- Idents(sobj) <- "seurat_clusters"
142
- sobj[[key]] <- sobj$seurat_clusters
143
- ident_table <- table(sobj[[key]])
144
- log_info("- Found {length(ident_table)} clusters:")
145
- print(ident_table)
146
- cat("\n")
147
-
148
- log_info("- Updating meta.data with subclusters...")
149
- srtobj <- AddMetaData(srtobj, metadata = sobj@meta.data[, key, drop = FALSE])
150
- srtobj[[paste0("sub_umap_", key)]] <- sobj@reductions$umap
151
- } else {
152
- log_info("- Multiple resolutions detected ...")
153
- log_info("")
154
- metadata <- NULL
155
- for (res in resolution) {
156
- findclusters_args <- case$FindClusters
157
- findclusters_args$resolution <- res
158
- findclusters_args$object <- sobj
159
- sobj1 <- do_call(FindClusters, findclusters_args)
160
- res_key <- paste0(key, "_", res)
143
+ for (res in resolution) {
144
+ case$FindClusters$resolution <- res
145
+ cached <- get_cached(case$FindClusters, paste0("FindClusters_", res), cache_dir)
146
+ res_key <- paste0("seurat_clusters_", res)
147
+ if (is.null(cached$data)) {
148
+ log_info("- Running FindClusters at resolution: {res} ...")
149
+ case$FindClusters$object <- sobj
150
+ sobj1 <- do_call(FindClusters, case$FindClusters)
161
151
  levels(sobj1$seurat_clusters) <- paste0("s", as.numeric(levels(sobj1$seurat_clusters)) + 1)
162
- Idents(sobj1) <- "seurat_clusters"
163
152
  sobj1[[res_key]] <- sobj1$seurat_clusters
164
- ident_table <- table(sobj1[[res_key]])
165
- log_info("- Found {length(ident_table)} at resolution: {res}:")
166
- print(ident_table)
167
- cat("\n")
168
-
169
- log_info("- Updating meta.data with subclusters...")
170
- metadata <- sobj1@meta.data[, res_key, drop = FALSE]
171
- srtobj <- AddMetaData(srtobj, metadata = metadata)
172
- srtobj[[paste0("sub_umap_", res_key)]] <- sobj1@reductions$umap
153
+ cached$data <- sobj1@meta.data[, res_key, drop = FALSE]
154
+ save_to_cache(cached, paste0("FindClusters_", res), cache_dir)
155
+ } else {
156
+ log_info("- Using cached FindClusters at resolution: {res} ...")
173
157
  }
174
- srtobj <- AddMetaData(srtobj, metadata = metadata, col.name = key)
175
- srtobj[[paste0("sub_umap_", key)]] <- sobj1@reductions$umap
158
+ ident_table <- table(cached$data[[res_key]])
159
+ log_info(" Found {length(ident_table)} clusters")
160
+ print(ident_table)
161
+ cat("\n")
176
162
  }
163
+ log_info("- Updating meta.data with subclusters...")
164
+ srtobj <- AddMetaData(srtobj, metadata = cached$data, col.name = key)
165
+ srtobj[[paste0("sub_umap_", key)]] <- reduc
177
166
  }
178
167
 
179
168
  log_info("Saving results ...")
180
169
  saveRDS(srtobj, file = rdsfile)
181
-
182
- if (is.character(envs$cache) && nchar(envs$cache) > 0) {
183
- log_info("Caching results to {cached_file} ...")
184
- invisible(file.copy(rdsfile, cached_file, overwrite = TRUE))
185
- }