biopipen 0.32.3__py3-none-any.whl → 0.33.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of biopipen might be problematic. Click here for more details.

Files changed (117) hide show
  1. biopipen/__init__.py +1 -1
  2. biopipen/core/config.toml +6 -0
  3. biopipen/core/filters.py +35 -23
  4. biopipen/core/testing.py +6 -1
  5. biopipen/ns/bam.py +39 -0
  6. biopipen/ns/cellranger.py +5 -0
  7. biopipen/ns/cellranger_pipeline.py +2 -2
  8. biopipen/ns/cnvkit_pipeline.py +4 -1
  9. biopipen/ns/delim.py +33 -27
  10. biopipen/ns/protein.py +99 -0
  11. biopipen/ns/scrna.py +411 -250
  12. biopipen/ns/snp.py +16 -3
  13. biopipen/ns/tcr.py +125 -1
  14. biopipen/ns/vcf.py +34 -0
  15. biopipen/ns/web.py +5 -1
  16. biopipen/reports/scrna/SeuratClusterStats.svelte +1 -1
  17. biopipen/reports/scrna/SeuratMap2Ref.svelte +15 -2
  18. biopipen/reports/tcr/ClonalStats.svelte +15 -0
  19. biopipen/reports/utils/misc.liq +20 -7
  20. biopipen/scripts/bam/BamMerge.py +2 -2
  21. biopipen/scripts/bam/BamSampling.py +4 -4
  22. biopipen/scripts/bam/BamSort.py +141 -0
  23. biopipen/scripts/bam/BamSplitChroms.py +10 -10
  24. biopipen/scripts/bam/BamSubsetByBed.py +3 -3
  25. biopipen/scripts/bam/CNVpytor.py +10 -10
  26. biopipen/scripts/bam/ControlFREEC.py +11 -11
  27. biopipen/scripts/bed/Bed2Vcf.py +5 -5
  28. biopipen/scripts/bed/BedConsensus.py +5 -5
  29. biopipen/scripts/bed/BedLiftOver.sh +6 -4
  30. biopipen/scripts/bed/BedtoolsIntersect.py +4 -4
  31. biopipen/scripts/bed/BedtoolsMakeWindows.py +3 -3
  32. biopipen/scripts/bed/BedtoolsMerge.py +4 -4
  33. biopipen/scripts/cellranger/CellRangerCount.py +20 -9
  34. biopipen/scripts/cellranger/CellRangerSummary.R +20 -29
  35. biopipen/scripts/cellranger/CellRangerVdj.py +8 -8
  36. biopipen/scripts/cnvkit/CNVkitAccess.py +6 -6
  37. biopipen/scripts/cnvkit/CNVkitAutobin.py +25 -18
  38. biopipen/scripts/cnvkit/CNVkitBatch.py +5 -5
  39. biopipen/scripts/cnvkit/CNVkitCall.py +3 -3
  40. biopipen/scripts/cnvkit/CNVkitCoverage.py +2 -2
  41. biopipen/scripts/cnvkit/CNVkitDiagram.py +5 -5
  42. biopipen/scripts/cnvkit/CNVkitFix.py +3 -3
  43. biopipen/scripts/cnvkit/CNVkitGuessBaits.py +9 -5
  44. biopipen/scripts/cnvkit/CNVkitHeatmap.py +4 -4
  45. biopipen/scripts/cnvkit/CNVkitReference.py +2 -2
  46. biopipen/scripts/cnvkit/CNVkitScatter.py +5 -5
  47. biopipen/scripts/cnvkit/CNVkitSegment.py +5 -5
  48. biopipen/scripts/cnvkit/guess_baits.py +166 -93
  49. biopipen/scripts/delim/SampleInfo.R +85 -148
  50. biopipen/scripts/misc/Config2File.py +2 -2
  51. biopipen/scripts/misc/Str2File.py +2 -2
  52. biopipen/scripts/protein/MMCIF2PDB.py +33 -0
  53. biopipen/scripts/protein/PDB2Fasta.py +60 -0
  54. biopipen/scripts/protein/Prodigy.py +4 -4
  55. biopipen/scripts/protein/RMSD.py +178 -0
  56. biopipen/scripts/regulatory/MotifScan.py +8 -8
  57. biopipen/scripts/scrna/CellCellCommunication.py +59 -22
  58. biopipen/scripts/scrna/MarkersFinder.R +273 -654
  59. biopipen/scripts/scrna/RadarPlots.R +73 -53
  60. biopipen/scripts/scrna/SCP-plot.R +15202 -0
  61. biopipen/scripts/scrna/ScVelo.py +0 -0
  62. biopipen/scripts/scrna/SeuratClusterStats-clustree.R +23 -31
  63. biopipen/scripts/scrna/SeuratClusterStats-dimplots.R +26 -54
  64. biopipen/scripts/scrna/SeuratClusterStats-features.R +85 -403
  65. biopipen/scripts/scrna/SeuratClusterStats-ngenes.R +32 -17
  66. biopipen/scripts/scrna/SeuratClusterStats-stats.R +45 -239
  67. biopipen/scripts/scrna/SeuratClusterStats.R +13 -19
  68. biopipen/scripts/scrna/SeuratMap2Ref.R +16 -12
  69. biopipen/scripts/scrna/SeuratPreparing.R +138 -81
  70. biopipen/scripts/scrna/SlingShot.R +71 -0
  71. biopipen/scripts/scrna/celltypist-wrapper.py +7 -6
  72. biopipen/scripts/snp/Plink2GTMat.py +26 -11
  73. biopipen/scripts/snp/PlinkFilter.py +7 -7
  74. biopipen/scripts/snp/PlinkFromVcf.py +8 -5
  75. biopipen/scripts/snp/PlinkSimulation.py +4 -4
  76. biopipen/scripts/snp/PlinkUpdateName.py +4 -4
  77. biopipen/scripts/stats/ChowTest.R +48 -22
  78. biopipen/scripts/tcgamaf/Maf2Vcf.py +2 -2
  79. biopipen/scripts/tcgamaf/MafAddChr.py +2 -2
  80. biopipen/scripts/tcr/ClonalStats.R +484 -0
  81. biopipen/scripts/tcr/ScRepLoading.R +127 -0
  82. biopipen/scripts/tcr/TCRDock.py +10 -6
  83. biopipen/scripts/tcr/vdjtools-patch.sh +1 -1
  84. biopipen/scripts/vcf/BcftoolsAnnotate.py +8 -8
  85. biopipen/scripts/vcf/BcftoolsFilter.py +3 -3
  86. biopipen/scripts/vcf/BcftoolsMerge.py +31 -0
  87. biopipen/scripts/vcf/BcftoolsSort.py +4 -4
  88. biopipen/scripts/vcf/BcftoolsView.py +5 -5
  89. biopipen/scripts/vcf/Vcf2Bed.py +2 -2
  90. biopipen/scripts/vcf/VcfAnno.py +11 -11
  91. biopipen/scripts/vcf/VcfDownSample.sh +22 -10
  92. biopipen/scripts/vcf/VcfFilter.py +5 -5
  93. biopipen/scripts/vcf/VcfFix.py +7 -7
  94. biopipen/scripts/vcf/VcfFix_utils.py +12 -3
  95. biopipen/scripts/vcf/VcfIndex.py +3 -3
  96. biopipen/scripts/vcf/VcfIntersect.py +3 -3
  97. biopipen/scripts/vcf/VcfLiftOver.sh +5 -0
  98. biopipen/scripts/vcf/VcfSplitSamples.py +4 -4
  99. biopipen/scripts/vcf/bcftools_utils.py +3 -3
  100. biopipen/scripts/web/Download.py +8 -4
  101. biopipen/scripts/web/DownloadList.py +5 -5
  102. biopipen/scripts/web/GCloudStorageDownloadBucket.py +5 -5
  103. biopipen/scripts/web/GCloudStorageDownloadFile.py +3 -3
  104. biopipen/scripts/web/gcloud_common.py +1 -1
  105. biopipen/utils/gsea.R +75 -35
  106. biopipen/utils/misc.R +205 -7
  107. biopipen/utils/misc.py +17 -8
  108. biopipen/utils/reference.py +11 -11
  109. biopipen/utils/repr.R +146 -0
  110. biopipen/utils/vcf.py +1 -1
  111. {biopipen-0.32.3.dist-info → biopipen-0.33.0.dist-info}/METADATA +8 -8
  112. {biopipen-0.32.3.dist-info → biopipen-0.33.0.dist-info}/RECORD +114 -105
  113. {biopipen-0.32.3.dist-info → biopipen-0.33.0.dist-info}/WHEEL +1 -1
  114. biopipen/scripts/scrna/SeuratClusterStats-hists.R +0 -144
  115. biopipen/scripts/scrna/SeuratPreparing-common.R +0 -467
  116. biopipen/scripts/scrna/SeuratPreparing-doublet_detection.R +0 -204
  117. {biopipen-0.32.3.dist-info → biopipen-0.33.0.dist-info}/entry_points.txt +0 -0
@@ -1,467 +0,0 @@
1
-
2
- stringify_list <- function(x) {
3
- paste(sapply(names(x), function(n) paste(n, x[[n]], sep = " = ") ), collapse = "; ")
4
- }
5
-
6
- format_args <- function(args) {
7
- paste(capture.output(str(args)), collapse = ", ")
8
- }
9
-
10
- rename_files = function(e, sample, path) {
11
- tmpdatadir = file.path(joboutdir, "renamed", sample)
12
- if (dir.exists(tmpdatadir)) {
13
- unlink(tmpdatadir, recursive = TRUE)
14
- }
15
- dir.create(tmpdatadir, recursive = TRUE, showWarnings = FALSE)
16
- barcodefile = Sys.glob(file.path(path, "*barcodes.tsv.gz"))[1]
17
- file.symlink(
18
- normalizePath(barcodefile),
19
- file.path(tmpdatadir, "barcodes.tsv.gz")
20
- )
21
- genefile = glob(file.path(path, "*{genes,features}.tsv.gz"))[1]
22
- file.symlink(
23
- normalizePath(genefile),
24
- file.path(tmpdatadir, "features.tsv.gz")
25
- )
26
- matrixfile = Sys.glob(file.path(path, "*matrix.mtx.gz"))[1]
27
- file.symlink(
28
- normalizePath(matrixfile),
29
- file.path(tmpdatadir, "matrix.mtx.gz")
30
- )
31
- Read10X(data.dir = tmpdatadir)
32
- }
33
-
34
-
35
- perform_cell_qc <- function(sobj, per_sample = FALSE) {
36
- log_prefix <- ifelse(per_sample, " ", "- ")
37
- log_info("{log_prefix}Adding metadata for QC ...")
38
- sobj$percent.mt <- PercentageFeatureSet(sobj, pattern = "^MT-|^Mt-|^mt-")
39
- sobj$percent.ribo <- PercentageFeatureSet(sobj, pattern = "^RP[SL]|^Rp[sl]")
40
- sobj$percent.hb <- PercentageFeatureSet(sobj, pattern = "^HB[^P]|^Hb[^p]")
41
- sobj$percent.plat <- PercentageFeatureSet(sobj, pattern = "PECAM1|PF4|Pecam1|Pf4")
42
-
43
- if (is.null(envs$cell_qc) || length(envs$cell_qc) == 0) {
44
- log_warn("{log_prefix}No cell QC criteria is provided. All cells will be kept.")
45
- cell_qc <- "TRUE"
46
- } else {
47
- cell_qc <- envs$cell_qc
48
- }
49
-
50
- sobj@meta.data <- sobj@meta.data %>% mutate(.QC = !!rlang::parse_expr(cell_qc))
51
-
52
- if (is.null(cell_qc_df)) {
53
- cell_qc_df <<- sobj@meta.data[, c("Sample", ".QC", feats), drop = FALSE]
54
- } else {
55
- cell_qc_df <<- rbind(cell_qc_df, sobj@meta.data[, c("Sample", ".QC", feats), drop = FALSE])
56
- }
57
-
58
- # Do the filtering
59
- log_info("{log_prefix}Filtering cells using QC criteria ...")
60
- sobj <- subset(sobj, subset = .QC)
61
- sobj$.QC <- NULL
62
-
63
- return(sobj)
64
- }
65
-
66
- report_cell_qc = function(ngenes) {
67
- # uses cell_qc_df
68
-
69
- # Violin plots
70
- log_info("- Plotting violin plots ...")
71
- add_report(
72
- list(
73
- kind = "descr",
74
- content = paste(
75
- "The violin plots for each feature. The cells are grouped by sample.",
76
- "The cells that fail the QC criteria are colored in red, and",
77
- "the cells that pass the QC criteria are colored in black.",
78
- "The cells that fail the QC criteria are filtered out in the returned Seurat object."
79
- )
80
- ),
81
- h1 = "Violin Plots"
82
- )
83
- for (feat in feats) {
84
- log_info(" For feature: {feat}")
85
- vln_p <- ggplot(cell_qc_df, aes(x = Sample, y = !!sym(feat), color = .QC)) +
86
- geom_violin(fill = "white", width = 0.5) +
87
- geom_jitter(width = 0.2, height = 0, alpha = 0.5) +
88
- scale_color_manual(values = c("#181818", pal_biopipen()(1)), breaks = c(TRUE, FALSE)) +
89
- labs(x = "Sample", y = feat) +
90
- theme_minimal()
91
-
92
- vlnplot = file.path(plotsdir, paste0(slugify(feat), ".vln.png"))
93
- png(
94
- vlnplot,
95
- width = 800 + length(samples) * 15, height = 600, res = 100
96
- )
97
- print(vln_p)
98
- dev.off()
99
-
100
- vlnplot_pdf = file.path(plotsdir, paste0(slugify(feat), ".vln.pdf"))
101
- pdf(
102
- vlnplot_pdf,
103
- width = (800 + length(samples) * 15) / 100, height = 600 / 100
104
- )
105
- print(vln_p)
106
- dev.off()
107
-
108
- add_report(
109
- list(
110
- src = vlnplot,
111
- name = feat,
112
- download = vlnplot_pdf,
113
- descr = paste0("Distribution of ", feat, " for each sample.")
114
- ),
115
- h1 = "Violin Plots",
116
- ui = "table_of_images"
117
- )
118
- }
119
-
120
- # Scatter plots against nCount_RNA
121
- log_info("- Plotting scatter plots ...")
122
- add_report(
123
- list(
124
- kind = "descr",
125
- content = paste(
126
- "The scatter plots for each feature against nCount_RNA. ",
127
- "The cells that fail the QC criteria are colored in red, and",
128
- "the cells that pass the QC criteria are colored in black.",
129
- "The cells that fail the QC criteria are filtered out in the returned Seurat object."
130
- )
131
- ),
132
- h1 = "Scatter Plots"
133
- )
134
- for (feat in setdiff(feats, "nCount_RNA")) {
135
- log_info(" For feature: {feat}, against nCount_RNA")
136
- scat_p <- ggplot(cell_qc_df, aes(x = nCount_RNA, y = !!sym(feat), color = .QC)) +
137
- geom_point() +
138
- scale_color_manual(values = c("#181818", pal_biopipen()(1)), breaks = c(TRUE, FALSE)) +
139
- labs(x = "nCount_RNA", y = feat) +
140
- theme_minimal()
141
-
142
- scatfile = file.path(plotsdir, paste0(slugify(feat), "-nCount_RNA.scatter.png"))
143
- png(scatfile, width = 800, height = 600, res = 100)
144
- print(scat_p)
145
- dev.off()
146
-
147
- scatfile_pdf = file.path(plotsdir, paste0(slugify(feat), "-nCount_RNA.scatter.pdf"))
148
- pdf(scatfile_pdf, width = 8, height = 6)
149
- print(scat_p)
150
- dev.off()
151
-
152
- add_report(
153
- list(
154
- src = scatfile,
155
- name = paste0(feat, " vs nCount_RNA"),
156
- download = scatfile_pdf,
157
- descr = paste0("Scatter plot for ", feat, " against nCount_RNA")
158
- ),
159
- h1 = "Scatter Plots",
160
- ui = "table_of_images"
161
- )
162
- }
163
-
164
- # return the dim_df calculated from the cell_qc_df
165
- rbind(
166
- cell_qc_df %>%
167
- # group_by(Sample) %>%
168
- summarise(
169
- when = "Before_Cell_QC",
170
- nCells = dplyr::n(),
171
- nGenes = ngenes
172
- ) %>%
173
- ungroup(),
174
- cell_qc_df %>%
175
- filter(.QC) %>%
176
- # group_by(Sample) %>%
177
- summarise(
178
- when = "After_Cell_QC",
179
- nCells = dplyr::n(),
180
- nGenes = ngenes
181
- ) %>%
182
- ungroup()
183
- )
184
- }
185
-
186
- load_sample = function(sample) {
187
- log_info("- Loading sample: {sample} ...")
188
- mdata = as.data.frame(metadata)[metadata$Sample == sample, , drop=TRUE]
189
- path = as.character(mdata$RNAData)
190
- if (is.na(path) || !is.character(path) || nchar(path) == 0 || path == "NA") {
191
- warning(paste0("No path found for sample: ", sample))
192
- return (NULL)
193
- }
194
-
195
- # obj_list = list()
196
- if (dir.exists(path)) {
197
- exprs = tryCatch(
198
- # Read10X requires
199
- # - barcodes.tsv.gz
200
- # - genes.tsv.gz
201
- # - matrix.mtx.gz
202
- # But sometimes, they are prefixed with sample name
203
- # e.g.GSM4143656_SAM24345863-ln1.barcodes.tsv.gz
204
- { Read10X(data.dir = path) },
205
- error = function(e) rename_files(e, sample, path)
206
- )
207
- } else {
208
- exprs = Read10X_h5(path)
209
- }
210
- if ("Gene Expression" %in% names(exprs)) {
211
- exprs = exprs[["Gene Expression"]]
212
- }
213
- obj <- CreateSeuratObject(exprs, project=sample)
214
- # filter the cells that don't have any gene expressions
215
- # cell_exprs = colSums(obj@assays$RNA)
216
- # obj = subset(obj, cells = names(cell_exprs[cell_exprs > 0]))
217
- obj = RenameCells(obj, add.cell.id = sample)
218
- # Attach meta data
219
- for (mname in names(mdata)) {
220
- if (mname %in% c("RNAData", "TCRData")) { next }
221
- mdt = mdata[[mname]]
222
- if (is.factor(mdt)) { mdt = levels(mdt)[mdt] }
223
- obj[[mname]] = mdt
224
- }
225
-
226
- if (isTRUE(envs$cell_qc_per_sample)) {
227
- log_info("- Perform cell QC for sample: {sample} ...")
228
- obj = perform_cell_qc(obj, per_sample = TRUE)
229
- }
230
-
231
- if (isTRUE(envs$use_sct)) {
232
- # so that we have data and scale.data layers on RNA assay
233
- # useful for visualization in case some genes are not in
234
- # the SCT assay
235
- obj = NormalizeData(obj, verbose = FALSE)
236
- obj = FindVariableFeatures(obj, verbose = FALSE)
237
- obj = ScaleData(obj, verbose = FALSE)
238
- }
239
- obj
240
- }
241
-
242
- run_gene_qc <- function(sobj) {
243
- cached <- get_cached(
244
- list(
245
- cell_qc = envs$cell_qc,
246
- gene_qc = envs$gene_qc,
247
- cell_qc_per_sample = envs$cell_qc_per_sample,
248
- use_sct = envs$use_sct
249
- ),
250
- "GeneQC",
251
- cache_dir
252
- )
253
- if (!is.null(cached$data)) {
254
- log_info("Loading gene-QC'ed object from cache ...")
255
- sobj <- cached$data
256
- } else {
257
- log_info("Filtering genes ...")
258
- genes <- rownames(sobj)
259
- filtered <- FALSE
260
- if (!is.null(envs$gene_qc$min_cells) && envs$gene_qc$min_cells > 0) {
261
- genes = genes[Matrix::rowSums(sobj) >= envs$gene_qc$min_cells]
262
- filtered <- TRUE
263
- }
264
- excludes <- envs$gene_qc$excludes
265
- if (!is.null(excludes)) {
266
- if (length(excludes) == 1) {
267
- excludes <- trimws(unlist(strsplit(excludes, ",")))
268
- }
269
- for (ex in excludes) {
270
- genes <- genes[!grepl(ex, genes)]
271
- }
272
- filtered <- TRUE
273
- }
274
- if (filtered) {
275
- sobj = subset(sobj, features = genes)
276
- }
277
- cached$data <- sobj
278
- save_to_cache(cached, "GeneQC", cache_dir)
279
- }
280
- sobj
281
- }
282
-
283
- run_cell_qc <- function(sobj) {
284
- cached <- get_cached(
285
- list(cell_qc = envs$cell_qc, cell_qc_per_sample = envs$cell_qc_per_sample, use_sct = envs$use_sct),
286
- "CellQC",
287
- cache_dir
288
- )
289
- if (!is.null(cached$data)) {
290
- log_info("Loading cell-QC'ed object from cache ...")
291
- sobj <- cached$data$sobj
292
- cell_qc_df <<- cached$data$cell_qc_df
293
- } else {
294
- # Load data
295
- log_info("Reading samples individually ...")
296
- obj_list = lapply(samples, load_sample)
297
-
298
- log_info("Merging samples ...")
299
- sobj = Reduce(merge, obj_list)
300
- rm(obj_list)
301
- gc()
302
-
303
- if (!envs$cell_qc_per_sample) {
304
- log_info("Performing cell QC ...")
305
- sobj = perform_cell_qc(sobj, per_sample = FALSE)
306
- }
307
-
308
- cached$data <- list(sobj = sobj, cell_qc_df = cell_qc_df)
309
- save_to_cache(cached, "CellQC", cache_dir)
310
- }
311
- sobj
312
- }
313
-
314
- run_transformation <- function(sobj) {
315
- envs_cache <- envs
316
- envs_cache$ncores <- NULL
317
- envs_cache$doublet_detector <- NULL
318
- envs_cache$DoubletFinder <- NULL
319
- envs_cache$scDblFinder <- NULL
320
- envs_cache$IntegrateLayers <- NULL
321
- cached <- get_cached(envs_cache, "Transformed", cache_dir)
322
- if (!is.null(cached$data)) {
323
- log_info("Loading transformed object from cache ...")
324
- sobj <- cached$data
325
- } else {
326
- log_info("Performing transformation/scaling ...")
327
- # Not joined yet
328
- # sobj[["RNA"]] <- split(sobj[["RNA"]], f = sobj$Sample)
329
- if (envs$use_sct) {
330
- log_info("- Running SCTransform ...")
331
- SCTransformArgs <- envs$SCTransform
332
- # log to stdout but don't populate it to running log
333
- print(paste0(" SCTransform: ", format_args(SCTransformArgs)))
334
- log_debug(" SCTransform: {format_args(SCTransformArgs)}")
335
- SCTransformArgs$object <- sobj
336
- sobj <- do_call(SCTransform, SCTransformArgs)
337
- # Default is to use the SCT assay
338
-
339
- # Cleanup memory
340
- SCTransformArgs$object <- NULL
341
- rm(SCTransformArgs)
342
- gc()
343
- } else {
344
- log_info("- Running NormalizeData ...")
345
- NormalizeDataArgs <- envs$NormalizeData
346
- print(paste0(" NormalizeData: ", format_args(NormalizeDataArgs)))
347
- log_debug(" NormalizeData: {format_args(NormalizeDataArgs)}")
348
- NormalizeDataArgs$object <- sobj
349
- sobj <- do_call(NormalizeData, NormalizeDataArgs)
350
-
351
- # Cleanup memory
352
- NormalizeDataArgs$object <- NULL
353
- rm(NormalizeDataArgs)
354
- gc()
355
-
356
- log_info("- Running FindVariableFeatures ...")
357
- FindVariableFeaturesArgs <- envs$FindVariableFeatures
358
- print(paste0(" FindVariableFeatures: ", format_args(FindVariableFeaturesArgs)))
359
- log_debug(" FindVariableFeatures: {format_args(FindVariableFeaturesArgs)}")
360
- FindVariableFeaturesArgs$object <- sobj
361
- sobj <- do_call(FindVariableFeatures, FindVariableFeaturesArgs)
362
-
363
- # Cleanup memory
364
- FindVariableFeaturesArgs$object <- NULL
365
- rm(FindVariableFeaturesArgs)
366
- gc()
367
-
368
- log_info("- Running ScaleData ...")
369
- ScaleDataArgs <- envs$ScaleData
370
- print(paste0(" ScaleData: ", format_args(ScaleDataArgs)))
371
- log_debug(" ScaleData: {format_args(ScaleDataArgs)}")
372
- ScaleDataArgs$object <- sobj
373
- sobj <- do_call(ScaleData, ScaleDataArgs)
374
-
375
- # Cleanup memory
376
- ScaleDataArgs$object <- NULL
377
- rm(ScaleDataArgs)
378
- gc()
379
- }
380
-
381
- log_info("- Running RunPCA ...")
382
- RunPCAArgs <- envs$RunPCA
383
- RunPCAArgs$npcs <- if (is.null(RunPCAArgs$npcs)) { 50 } else { min(RunPCAArgs$npcs, ncol(sobj) - 1) }
384
- print(paste0(" RunPCA: ", format_args(RunPCAArgs)))
385
- log_debug(" RunPCA: {format_args(RunPCAArgs)}")
386
- RunPCAArgs$object <- sobj
387
- sobj <- do_call(RunPCA, RunPCAArgs)
388
-
389
- # Cleanup memory
390
- RunPCAArgs$object <- NULL
391
- rm(RunPCAArgs)
392
- gc()
393
-
394
- cached$data <- sobj
395
- save_to_cache(cached, "Transformed", cache_dir)
396
- }
397
-
398
- sobj
399
- }
400
-
401
- run_integration <- function(sobj) {
402
-
403
- envs_cache <- envs
404
- envs_cache$ncores <- NULL
405
- envs_cache$doublet_detector <- NULL
406
- envs_cache$DoubletFinder <- NULL
407
- envs_cache$scDblFinder <- NULL
408
- cached <- get_cached(envs_cache, "Integrated", cache_dir)
409
-
410
- if (!is.null(cached$data)) {
411
- log_info("Loading integrated/layer-joined object from cache ...")
412
- sobj <- cached$data
413
- } else {
414
-
415
- if (!envs$no_integration) {
416
- log_info("- Running IntegrateLayers (method = {envs$IntegrateLayers$method}) ...")
417
- IntegrateLayersArgs <- envs$IntegrateLayers
418
- method <- IntegrateLayersArgs$method
419
- if (!is.null(IntegrateLayersArgs$reference) && is.character(IntegrateLayersArgs$reference)) {
420
- log_info(" Using reference samples: {paste(IntegrateLayersArgs$reference, collapse = ', ')}")
421
- IntegrateLayersArgs$reference <- match(IntegrateLayersArgs$reference, samples)
422
- log_info(" Transferred to indices: {paste(IntegrateLayersArgs$reference, collapse = ', ')}")
423
- }
424
- if (method %in% c("CCA", "cca")) { method <- "CCAIntegration" } else
425
- if (method %in% c("RPCA", "rpca")) { method <- "RPCAIntegration" } else
426
- if (method %in% c("Harmony", "harmony")) { method <- "HarmonyIntegration" } else
427
- if (method %in% c("FastMNN", "fastmnn")) { method <- "FastMNNIntegration" } else
428
- if (method %in% c("scVI", "scvi")) { method <- "scVIIntegration" } else
429
- { stop(paste0("Unknown integration method: ", method)) }
430
- if (envs$use_sct && is.null(IntegrateLayersArgs$normalization.method)) {
431
- IntegrateLayersArgs$normalization.method <- "SCT"
432
- }
433
- IntegrateLayersArgs$method <- eval(parse(text = method))
434
- new_reductions <- list(
435
- "CCAIntegration" = "integrated.cca",
436
- "RPCAIntegration" = "integrated.rpca",
437
- "HarmonyIntegration" = "harmony",
438
- "FastMNNIntegration" = "integration.mnn",
439
- "scVIIntegration" = "integrated.scvi"
440
- )
441
- if (is.null(IntegrateLayersArgs$new.reduction)) {
442
- IntegrateLayersArgs$new.reduction <- new_reductions[[method]]
443
- }
444
- print(paste0(" IntegrateLayers: ", format_args(IntegrateLayersArgs)))
445
- log_debug(" IntegrateLayers: {format_args(IntegrateLayersArgs)}")
446
- IntegrateLayersArgs$object <- sobj
447
- sobj <- do_call(IntegrateLayers, IntegrateLayersArgs)
448
- # Save it for dimension reduction plots
449
- sobj@misc$integrated_new_reduction <- IntegrateLayersArgs$new.reduction
450
-
451
- # Cleanup memory
452
- IntegrateLayersArgs$object <- NULL
453
- rm(IntegrateLayersArgs)
454
- gc()
455
- }
456
-
457
- if (!envs$use_sct) {
458
- log_info("- Joining layers ...")
459
- sobj <- JoinLayers(sobj)
460
- }
461
-
462
- cached$data <- sobj
463
- save_to_cache(cached, "Integrated", cache_dir)
464
- }
465
-
466
- sobj
467
- }
@@ -1,204 +0,0 @@
1
- .get_envs_cached_doubletfinder <- function() {
2
- envs_cache <- envs
3
- envs_cache$ncores <- NULL
4
- envs_cache$doublet_detector <- NULL
5
- envs_cache$scDblFinder <- NULL
6
- envs_cache$DoubletFinder$ncores <- NULL
7
- envs_cache
8
- }
9
-
10
- .get_envs_cached_scdblfinder <- function() {
11
- envs_cache <- envs
12
- envs_cache$ncores <- NULL
13
- envs_cache$doublet_detector <- NULL
14
- envs_cache$DoubletFinder <- NULL
15
- envs_cache$scDblFinder$ncores <- NULL
16
- envs_cache
17
- }
18
-
19
- .run_doubletfinder <- function() {
20
- library(DoubletFinder)
21
- log_info("- Preparing Seurat object ...")
22
-
23
- if (is.null(envs$DoubletFinder$ncores)) {
24
- envs$DoubletFinder$ncores <- envs$ncores
25
- }
26
-
27
- # More controls from envs?
28
- sobj <- FindNeighbors(sobj, dims = 1:envs$DoubletFinder$PCs)
29
- sobj <- FindClusters(sobj)
30
-
31
- log_info("- pK Indentification ...")
32
- sweep.res.list <- paramSweep(
33
- sobj,
34
- PCs = 1:envs$DoubletFinder$PCs,
35
- sct = envs$use_sct,
36
- num.cores = envs$DoubletFinder$ncores
37
- )
38
- sweep.stats <- summarizeSweep(sweep.res.list, GT = FALSE)
39
- bcmvn <- find.pK(sweep.stats)
40
- bcmvn$Selected <- bcmvn$pK == bcmvn$pK[which.max(bcmvn$BCmetric)[1]]
41
-
42
- pK <- bcmvn$pK[which.max(bcmvn$BCmetric)[1]]
43
- pK <- as.numeric(as.character(pK))
44
- pN <- envs$DoubletFinder$pN
45
- log_info("- Homotypic Doublet Proportion Estimate ...")
46
- homotypic.prop <- modelHomotypic(Idents(sobj))
47
- nExp_poi <- round(nrow(sobj@meta.data) * envs$DoubletFinder$doublets)
48
- nExp_poi.adj <- round(nExp_poi * (1 - homotypic.prop))
49
-
50
- log_info("- Running DoubletFinder ...")
51
- sobj <- doubletFinder(
52
- sobj,
53
- PCs = 1:envs$DoubletFinder$PCs,
54
- pN = pN,
55
- pK = pK,
56
- nExp = nExp_poi.adj,
57
- reuse.pANN = FALSE,
58
- sct = envs$use_sct
59
- )
60
- pANN_col <- paste0("pANN_", pN, "_", pK)
61
- pANN_col <- colnames(sobj@meta.data)[grepl(pANN_col, colnames(sobj@meta.data))]
62
- DF_col <- paste0("DF.classifications_", pN, "_", pK)
63
- DF_col <- colnames(sobj@meta.data)[grepl(DF_col, colnames(sobj@meta.data))]
64
- doublets <- sobj@meta.data[, c(pANN_col, DF_col), drop = FALSE]
65
- colnames(doublets) <- c("DoubletFinder_score","DoubletFinder_DropletType")
66
- doublets$DoubletFinder_DropletType <- tolower(doublets$DoubletFinder_DropletType)
67
-
68
- pk_plot <- ggplot(bcmvn, aes(x = pK, y = BCmetric, color = Selected)) +
69
- geom_point() +
70
- # rotate x axis labels
71
- theme(axis.text.x = element_text(angle = 90, hjust = 1))
72
- list(doublets = doublets, pk_plot = pk_plot)
73
- }
74
-
75
- .run_scdblfinder <- function() {
76
- library(scDblFinder)
77
- if (is.null(envs$scDblFinder$ncores)) {
78
- envs$scDblFinder$ncores <- envs$ncores
79
- }
80
-
81
- envs$scDblFinder$sce <- GetAssayData(sobj, layer = "counts")
82
- if (envs$scDblFinder$ncores > 1) {
83
- envs$scDblFinder$BPPARAM <- BiocParallel::MulticoreParam(envs$scDblFinder$ncores, RNGseed = 8525)
84
- }
85
- envs$scDblFinder$returnType <- "table"
86
- envs$scDblFinder$ncores <- NULL
87
-
88
- doublets <- do_call(scDblFinder, envs$scDblFinder)
89
- doublets <- doublets[doublets$type == "real", , drop = FALSE]
90
- doublets <- doublets[, c("score", "class"), drop = FALSE]
91
- colnames(doublets) <- c("scDblFinder_score", "scDblFinder_DropletType")
92
-
93
- list(doublets = doublets)
94
- }
95
-
96
- run_dd <- function(detector) {
97
- log_info("Running {detector} ...")
98
- if (detector == "DoubletFinder") {
99
- envs_cache_fun <- .get_envs_cached_doubletfinder
100
- run_fun <- .run_doubletfinder
101
- } else if (detector == "scDblFinder") {
102
- envs_cache_fun <- .get_envs_cached_scdblfinder
103
- run_fun <- .run_scdblfinder
104
- } else {
105
- stop("Unknown doublet detector: ", detector)
106
- }
107
-
108
- cached <- get_cached(envs_cache_fun(), detector, cache_dir)
109
- if (!is.null(cached$data)) {
110
- log_info("- Loading cached results ...")
111
- results <- cached$data
112
- } else {
113
- results <- run_fun()
114
-
115
- cached$data <- results
116
- save_to_cache(cached, detector, cache_dir)
117
- }
118
-
119
- results
120
- }
121
-
122
- save_dd <- function(dd, detector) {
123
- doublets <- dd$doublets
124
- write.table(
125
- doublets,
126
- file.path(joboutdir, paste0(detector, "_doublets_singlets.txt")),
127
- row.names = FALSE,
128
- quote = FALSE,
129
- sep = "\t"
130
- )
131
-
132
- summary <- as.data.frame(table(dd$doublets[[paste0(detector, "_DropletType")]]))
133
- colnames(summary) <- c("Classification", "Droplet_N")
134
- write.table(
135
- summary,
136
- file.path(joboutdir, paste0(detector, "_summary.txt")),
137
- row.names = FALSE,
138
- quote = FALSE,
139
- sep = "\t"
140
- )
141
-
142
- n_doublet <- summary$Droplet_N[summary$Classification == 'doublet']
143
- log_info("- {n_doublet}/{sum(summary$Droplet_N)} doublets detected.")
144
- }
145
-
146
- add_dd_to_seurat <- function(sobj, dd) {
147
- AddMetaData(sobj, metadata = as.data.frame(dd$doublets))
148
- }
149
-
150
- plot_dd <- function(sobj, dd, detector) {
151
- if (detector == "DoubletFinder") {
152
- log_debug("- Plotting pK vs BCmetric ...")
153
- ggsave(dd$pk_plot, filename = file.path(plotsdir, "DoubletFinder_pK_BCmetric.png"))
154
- }
155
-
156
- log_info("- Plotting dimension reduction ...")
157
- dimp <- DimPlot(
158
- sobj, group.by = paste0(detector, "_DropletType"), order = "doublet",
159
- cols = c("#333333", "#FF3333"), pt.size = 0.8, alpha = 0.5)
160
- ggsave(dimp, filename = file.path(plotsdir, paste0(detector, "_dimplot.png")))
161
- ggsave(dimp, filename = file.path(plotsdir, paste0(detector, "_dimplot.pdf")))
162
- }
163
-
164
- filter_dd <- function(sobj, dd, detector) {
165
- subset(sobj,
166
- cells = rownames(dd$doublets[
167
- dd$doublets[[paste0(detector, "_DropletType")]] == "singlet", ,
168
- drop = FALSE
169
- ]))
170
- }
171
-
172
- report_dd <- function(detector) {
173
- add_report(
174
- list(
175
- kind = "descr",
176
- content = "The table contains the number of cells classified as singlets and doublets."
177
- ),
178
- list(
179
- kind = "table",
180
- data = list(path = file.path(joboutdir, paste0(detector, "_summary.txt")))
181
- ),
182
- h1 = paste0(detector, " Results"),
183
- h2 = paste0("The ", detector, " Summary")
184
- )
185
-
186
- if (detector == "DoubletFinder") {
187
- add_report(
188
- list(name = "pK vs BCmetric", src = file.path(plotsdir, "DoubletFinder_pK_BCmetric.png")),
189
- list(name = "Dimension Reduction Plot", src = file.path(plotsdir, "DoubletFinder_dimplot.png"),
190
- download = file.path(plotsdir, "DoubletFinder_dimplot.pdf")),
191
- ui = "table_of_images",
192
- h1 = "DoubletFinder Results",
193
- h2 = "Plots"
194
- )
195
- } else {
196
- add_report(
197
- list(name = "Dimension Reduction Plot", src = file.path(plotsdir, "scDblFinder_dimplot.png"),
198
- download = file.path(plotsdir, "scDblFinder_dimplot.pdf")),
199
- ui = "table_of_images",
200
- h1 = "scDblFinder Results",
201
- h2 = "Plots"
202
- )
203
- }
204
- }