biopipen 0.32.1__py3-none-any.whl → 0.33.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of biopipen might be problematic. Click here for more details.

Files changed (134) hide show
  1. biopipen/__init__.py +1 -1
  2. biopipen/core/config.toml +6 -0
  3. biopipen/core/filters.py +77 -26
  4. biopipen/core/testing.py +6 -1
  5. biopipen/ns/bam.py +39 -0
  6. biopipen/ns/cellranger.py +5 -0
  7. biopipen/ns/cellranger_pipeline.py +2 -2
  8. biopipen/ns/cnvkit_pipeline.py +4 -1
  9. biopipen/ns/delim.py +33 -27
  10. biopipen/ns/protein.py +99 -0
  11. biopipen/ns/scrna.py +411 -250
  12. biopipen/ns/snp.py +16 -3
  13. biopipen/ns/tcr.py +125 -1
  14. biopipen/ns/vcf.py +34 -0
  15. biopipen/ns/web.py +5 -1
  16. biopipen/reports/scrna/SeuratClusterStats.svelte +1 -1
  17. biopipen/reports/scrna/SeuratMap2Ref.svelte +15 -2
  18. biopipen/reports/tcr/ClonalStats.svelte +15 -0
  19. biopipen/reports/utils/misc.liq +22 -7
  20. biopipen/scripts/bam/BamMerge.py +2 -2
  21. biopipen/scripts/bam/BamSampling.py +4 -4
  22. biopipen/scripts/bam/BamSort.py +141 -0
  23. biopipen/scripts/bam/BamSplitChroms.py +10 -10
  24. biopipen/scripts/bam/BamSubsetByBed.py +3 -3
  25. biopipen/scripts/bam/CNVpytor.py +10 -10
  26. biopipen/scripts/bam/ControlFREEC.py +11 -11
  27. biopipen/scripts/bed/Bed2Vcf.py +5 -5
  28. biopipen/scripts/bed/BedConsensus.py +5 -5
  29. biopipen/scripts/bed/BedLiftOver.sh +6 -4
  30. biopipen/scripts/bed/BedtoolsIntersect.py +4 -4
  31. biopipen/scripts/bed/BedtoolsMakeWindows.py +3 -3
  32. biopipen/scripts/bed/BedtoolsMerge.py +4 -4
  33. biopipen/scripts/cellranger/CellRangerCount.py +20 -9
  34. biopipen/scripts/cellranger/CellRangerSummary.R +20 -29
  35. biopipen/scripts/cellranger/CellRangerVdj.py +8 -8
  36. biopipen/scripts/cnvkit/CNVkitAccess.py +6 -6
  37. biopipen/scripts/cnvkit/CNVkitAutobin.py +25 -18
  38. biopipen/scripts/cnvkit/CNVkitBatch.py +5 -5
  39. biopipen/scripts/cnvkit/CNVkitCall.py +3 -3
  40. biopipen/scripts/cnvkit/CNVkitCoverage.py +2 -2
  41. biopipen/scripts/cnvkit/CNVkitDiagram.py +5 -5
  42. biopipen/scripts/cnvkit/CNVkitFix.py +3 -3
  43. biopipen/scripts/cnvkit/CNVkitGuessBaits.py +9 -5
  44. biopipen/scripts/cnvkit/CNVkitHeatmap.py +4 -4
  45. biopipen/scripts/cnvkit/CNVkitReference.py +2 -2
  46. biopipen/scripts/cnvkit/CNVkitScatter.py +5 -5
  47. biopipen/scripts/cnvkit/CNVkitSegment.py +5 -5
  48. biopipen/scripts/cnvkit/guess_baits.py +166 -93
  49. biopipen/scripts/delim/SampleInfo.R +85 -139
  50. biopipen/scripts/misc/Config2File.py +2 -2
  51. biopipen/scripts/misc/Str2File.py +2 -2
  52. biopipen/scripts/protein/MMCIF2PDB.py +33 -0
  53. biopipen/scripts/protein/PDB2Fasta.py +60 -0
  54. biopipen/scripts/protein/Prodigy.py +4 -4
  55. biopipen/scripts/protein/RMSD.py +178 -0
  56. biopipen/scripts/regulatory/MotifScan.py +8 -8
  57. biopipen/scripts/scrna/CellCellCommunication.py +59 -22
  58. biopipen/scripts/scrna/CellsDistribution.R +31 -6
  59. biopipen/scripts/scrna/MarkersFinder.R +272 -602
  60. biopipen/scripts/scrna/MetaMarkers.R +16 -7
  61. biopipen/scripts/scrna/RadarPlots.R +75 -35
  62. biopipen/scripts/scrna/SCP-plot.R +15202 -0
  63. biopipen/scripts/scrna/ScVelo.py +0 -0
  64. biopipen/scripts/scrna/SeuratClusterStats-clustree.R +23 -25
  65. biopipen/scripts/scrna/SeuratClusterStats-dimplots.R +26 -47
  66. biopipen/scripts/scrna/SeuratClusterStats-features.R +85 -385
  67. biopipen/scripts/scrna/SeuratClusterStats-ngenes.R +33 -13
  68. biopipen/scripts/scrna/SeuratClusterStats-stats.R +45 -228
  69. biopipen/scripts/scrna/SeuratClusterStats.R +13 -19
  70. biopipen/scripts/scrna/SeuratMap2Ref.R +16 -6
  71. biopipen/scripts/scrna/SeuratPreparing.R +138 -81
  72. biopipen/scripts/scrna/SlingShot.R +71 -0
  73. biopipen/scripts/scrna/TopExpressingGenes.R +9 -7
  74. biopipen/scripts/scrna/celltypist-wrapper.py +7 -6
  75. biopipen/scripts/snp/Plink2GTMat.py +26 -11
  76. biopipen/scripts/snp/PlinkFilter.py +7 -7
  77. biopipen/scripts/snp/PlinkFromVcf.py +8 -5
  78. biopipen/scripts/snp/PlinkSimulation.py +4 -4
  79. biopipen/scripts/snp/PlinkUpdateName.py +4 -4
  80. biopipen/scripts/stats/ChowTest.R +48 -22
  81. biopipen/scripts/tcgamaf/Maf2Vcf.py +2 -2
  82. biopipen/scripts/tcgamaf/MafAddChr.py +2 -2
  83. biopipen/scripts/tcr/CDR3AAPhyschem.R +12 -2
  84. biopipen/scripts/tcr/ClonalStats.R +484 -0
  85. biopipen/scripts/tcr/CloneResidency.R +23 -5
  86. biopipen/scripts/tcr/Immunarch-basic.R +8 -1
  87. biopipen/scripts/tcr/Immunarch-clonality.R +5 -0
  88. biopipen/scripts/tcr/Immunarch-diversity.R +25 -4
  89. biopipen/scripts/tcr/Immunarch-geneusage.R +15 -1
  90. biopipen/scripts/tcr/Immunarch-kmer.R +14 -1
  91. biopipen/scripts/tcr/Immunarch-overlap.R +15 -1
  92. biopipen/scripts/tcr/Immunarch-spectratyping.R +10 -1
  93. biopipen/scripts/tcr/Immunarch-tracking.R +6 -0
  94. biopipen/scripts/tcr/Immunarch-vjjunc.R +33 -0
  95. biopipen/scripts/tcr/ScRepLoading.R +127 -0
  96. biopipen/scripts/tcr/TCRClusterStats.R +24 -7
  97. biopipen/scripts/tcr/TCRDock.py +10 -6
  98. biopipen/scripts/tcr/TESSA.R +6 -1
  99. biopipen/scripts/tcr/vdjtools-patch.sh +1 -1
  100. biopipen/scripts/vcf/BcftoolsAnnotate.py +8 -8
  101. biopipen/scripts/vcf/BcftoolsFilter.py +3 -3
  102. biopipen/scripts/vcf/BcftoolsMerge.py +31 -0
  103. biopipen/scripts/vcf/BcftoolsSort.py +4 -4
  104. biopipen/scripts/vcf/BcftoolsView.py +5 -5
  105. biopipen/scripts/vcf/Vcf2Bed.py +2 -2
  106. biopipen/scripts/vcf/VcfAnno.py +11 -11
  107. biopipen/scripts/vcf/VcfDownSample.sh +22 -10
  108. biopipen/scripts/vcf/VcfFilter.py +5 -5
  109. biopipen/scripts/vcf/VcfFix.py +7 -7
  110. biopipen/scripts/vcf/VcfFix_utils.py +12 -3
  111. biopipen/scripts/vcf/VcfIndex.py +3 -3
  112. biopipen/scripts/vcf/VcfIntersect.py +3 -3
  113. biopipen/scripts/vcf/VcfLiftOver.sh +5 -0
  114. biopipen/scripts/vcf/VcfSplitSamples.py +4 -4
  115. biopipen/scripts/vcf/bcftools_utils.py +3 -3
  116. biopipen/scripts/web/Download.py +8 -4
  117. biopipen/scripts/web/DownloadList.py +5 -5
  118. biopipen/scripts/web/GCloudStorageDownloadBucket.py +5 -5
  119. biopipen/scripts/web/GCloudStorageDownloadFile.py +3 -3
  120. biopipen/scripts/web/gcloud_common.py +1 -1
  121. biopipen/utils/gsea.R +96 -42
  122. biopipen/utils/misc.R +205 -7
  123. biopipen/utils/misc.py +17 -8
  124. biopipen/utils/plot.R +53 -17
  125. biopipen/utils/reference.py +11 -11
  126. biopipen/utils/repr.R +146 -0
  127. biopipen/utils/vcf.py +1 -1
  128. {biopipen-0.32.1.dist-info → biopipen-0.33.0.dist-info}/METADATA +9 -9
  129. {biopipen-0.32.1.dist-info → biopipen-0.33.0.dist-info}/RECORD +131 -122
  130. {biopipen-0.32.1.dist-info → biopipen-0.33.0.dist-info}/WHEEL +1 -1
  131. biopipen/scripts/scrna/SeuratClusterStats-hists.R +0 -139
  132. biopipen/scripts/scrna/SeuratPreparing-common.R +0 -452
  133. biopipen/scripts/scrna/SeuratPreparing-doublet_detection.R +0 -201
  134. {biopipen-0.32.1.dist-info → biopipen-0.33.0.dist-info}/entry_points.txt +0 -0
@@ -1,452 +0,0 @@
1
-
2
- stringify_list <- function(x) {
3
- paste(sapply(names(x), function(n) paste(n, x[[n]], sep = " = ") ), collapse = "; ")
4
- }
5
-
6
- format_args <- function(args) {
7
- paste(capture.output(str(args)), collapse = ", ")
8
- }
9
-
10
- rename_files = function(e, sample, path) {
11
- tmpdatadir = file.path(joboutdir, "renamed", sample)
12
- if (dir.exists(tmpdatadir)) {
13
- unlink(tmpdatadir, recursive = TRUE)
14
- }
15
- dir.create(tmpdatadir, recursive = TRUE, showWarnings = FALSE)
16
- barcodefile = Sys.glob(file.path(path, "*barcodes.tsv.gz"))[1]
17
- file.symlink(
18
- normalizePath(barcodefile),
19
- file.path(tmpdatadir, "barcodes.tsv.gz")
20
- )
21
- genefile = glob(file.path(path, "*{genes,features}.tsv.gz"))[1]
22
- file.symlink(
23
- normalizePath(genefile),
24
- file.path(tmpdatadir, "features.tsv.gz")
25
- )
26
- matrixfile = Sys.glob(file.path(path, "*matrix.mtx.gz"))[1]
27
- file.symlink(
28
- normalizePath(matrixfile),
29
- file.path(tmpdatadir, "matrix.mtx.gz")
30
- )
31
- Read10X(data.dir = tmpdatadir)
32
- }
33
-
34
-
35
- perform_cell_qc <- function(sobj, per_sample = FALSE) {
36
- log_prefix <- ifelse(per_sample, " ", "- ")
37
- log_info("{log_prefix}Adding metadata for QC ...")
38
- sobj$percent.mt <- PercentageFeatureSet(sobj, pattern = "^MT-|^Mt-|^mt-")
39
- sobj$percent.ribo <- PercentageFeatureSet(sobj, pattern = "^RP[SL]|^Rp[sl]")
40
- sobj$percent.hb <- PercentageFeatureSet(sobj, pattern = "^HB[^P]|^Hb[^p]")
41
- sobj$percent.plat <- PercentageFeatureSet(sobj, pattern = "PECAM1|PF4|Pecam1|Pf4")
42
-
43
- if (is.null(envs$cell_qc) || length(envs$cell_qc) == 0) {
44
- log_warn("{log_prefix}No cell QC criteria is provided. All cells will be kept.")
45
- cell_qc <- "TRUE"
46
- } else {
47
- cell_qc <- envs$cell_qc
48
- }
49
-
50
- sobj@meta.data <- sobj@meta.data %>% mutate(.QC = !!rlang::parse_expr(cell_qc))
51
-
52
- if (is.null(cell_qc_df)) {
53
- cell_qc_df <<- sobj@meta.data[, c("Sample", ".QC", feats), drop = FALSE]
54
- } else {
55
- cell_qc_df <<- rbind(cell_qc_df, sobj@meta.data[, c("Sample", ".QC", feats), drop = FALSE])
56
- }
57
-
58
- # Do the filtering
59
- log_info("{log_prefix}Filtering cells using QC criteria ...")
60
- sobj <- subset(sobj, subset = .QC)
61
- sobj$.QC <- NULL
62
-
63
- return(sobj)
64
- }
65
-
66
- report_cell_qc = function(ngenes) {
67
- # uses cell_qc_df
68
-
69
- # Violin plots
70
- log_info("- Plotting violin plots ...")
71
- add_report(
72
- list(
73
- kind = "descr",
74
- content = paste(
75
- "The violin plots for each feature. The cells are grouped by sample.",
76
- "The cells that fail the QC criteria are colored in red, and",
77
- "the cells that pass the QC criteria are colored in black.",
78
- "The cells that fail the QC criteria are filtered out in the returned Seurat object."
79
- )
80
- ),
81
- h1 = "Violin Plots"
82
- )
83
- for (feat in feats) {
84
- log_info(" For feature: {feat}")
85
- vln_p <- ggplot(cell_qc_df, aes(x = Sample, y = !!sym(feat), color = .QC)) +
86
- geom_violin(fill = "white", width = 0.5) +
87
- geom_jitter(width = 0.2, height = 0, alpha = 0.5) +
88
- scale_color_manual(values = c("#181818", pal_biopipen()(1)), breaks = c(TRUE, FALSE)) +
89
- labs(x = "Sample", y = feat) +
90
- theme_minimal()
91
-
92
- vlnplot = file.path(plotsdir, paste0(slugify(feat), ".vln.png"))
93
- png(
94
- vlnplot,
95
- width = 800 + length(samples) * 15, height = 600, res = 100
96
- )
97
- print(vln_p)
98
- dev.off()
99
-
100
- add_report(
101
- list(
102
- src = vlnplot,
103
- name = feat,
104
- descr = paste0("Distribution of ", feat, " for each sample.")
105
- ),
106
- h1 = "Violin Plots",
107
- ui = "table_of_images"
108
- )
109
- }
110
-
111
- # Scatter plots against nCount_RNA
112
- log_info("- Plotting scatter plots ...")
113
- add_report(
114
- list(
115
- kind = "descr",
116
- content = paste(
117
- "The scatter plots for each feature against nCount_RNA. ",
118
- "The cells that fail the QC criteria are colored in red, and",
119
- "the cells that pass the QC criteria are colored in black.",
120
- "The cells that fail the QC criteria are filtered out in the returned Seurat object."
121
- )
122
- ),
123
- h1 = "Scatter Plots"
124
- )
125
- for (feat in setdiff(feats, "nCount_RNA")) {
126
- log_info(" For feature: {feat}, against nCount_RNA")
127
- scat_p <- ggplot(cell_qc_df, aes(x = nCount_RNA, y = !!sym(feat), color = .QC)) +
128
- geom_point() +
129
- scale_color_manual(values = c("#181818", pal_biopipen()(1)), breaks = c(TRUE, FALSE)) +
130
- labs(x = "nCount_RNA", y = feat) +
131
- theme_minimal()
132
-
133
- scatfile = file.path(plotsdir, paste0(slugify(feat), "-nCount_RNA.scatter.png"))
134
- png(scatfile, width = 800, height = 600, res = 100)
135
- print(scat_p)
136
- dev.off()
137
-
138
- add_report(
139
- list(
140
- src = scatfile,
141
- name = paste0(feat, " vs nCount_RNA"),
142
- descr = paste0("Scatter plot for ", feat, " against nCount_RNA")
143
- ),
144
- h1 = "Scatter Plots",
145
- ui = "table_of_images"
146
- )
147
- }
148
-
149
- # return the dim_df calculated from the cell_qc_df
150
- rbind(
151
- cell_qc_df %>%
152
- # group_by(Sample) %>%
153
- summarise(
154
- when = "Before_Cell_QC",
155
- nCells = dplyr::n(),
156
- nGenes = ngenes
157
- ) %>%
158
- ungroup(),
159
- cell_qc_df %>%
160
- filter(.QC) %>%
161
- # group_by(Sample) %>%
162
- summarise(
163
- when = "After_Cell_QC",
164
- nCells = dplyr::n(),
165
- nGenes = ngenes
166
- ) %>%
167
- ungroup()
168
- )
169
- }
170
-
171
- load_sample = function(sample) {
172
- log_info("- Loading sample: {sample} ...")
173
- mdata = as.data.frame(metadata)[metadata$Sample == sample, , drop=TRUE]
174
- path = as.character(mdata$RNAData)
175
- if (is.na(path) || !is.character(path) || nchar(path) == 0 || path == "NA") {
176
- warning(paste0("No path found for sample: ", sample))
177
- return (NULL)
178
- }
179
-
180
- # obj_list = list()
181
- if (dir.exists(path)) {
182
- exprs = tryCatch(
183
- # Read10X requires
184
- # - barcodes.tsv.gz
185
- # - genes.tsv.gz
186
- # - matrix.mtx.gz
187
- # But sometimes, they are prefixed with sample name
188
- # e.g.GSM4143656_SAM24345863-ln1.barcodes.tsv.gz
189
- { Read10X(data.dir = path) },
190
- error = function(e) rename_files(e, sample, path)
191
- )
192
- } else {
193
- exprs = Read10X_h5(path)
194
- }
195
- if ("Gene Expression" %in% names(exprs)) {
196
- exprs = exprs[["Gene Expression"]]
197
- }
198
- obj <- CreateSeuratObject(exprs, project=sample)
199
- # filter the cells that don't have any gene expressions
200
- # cell_exprs = colSums(obj@assays$RNA)
201
- # obj = subset(obj, cells = names(cell_exprs[cell_exprs > 0]))
202
- obj = RenameCells(obj, add.cell.id = sample)
203
- # Attach meta data
204
- for (mname in names(mdata)) {
205
- if (mname %in% c("RNAData", "TCRData")) { next }
206
- mdt = mdata[[mname]]
207
- if (is.factor(mdt)) { mdt = levels(mdt)[mdt] }
208
- obj[[mname]] = mdt
209
- }
210
-
211
- if (isTRUE(envs$cell_qc_per_sample)) {
212
- log_info("- Perform cell QC for sample: {sample} ...")
213
- obj = perform_cell_qc(obj, per_sample = TRUE)
214
- }
215
-
216
- if (isTRUE(envs$use_sct)) {
217
- # so that we have data and scale.data layers on RNA assay
218
- # useful for visualization in case some genes are not in
219
- # the SCT assay
220
- obj = NormalizeData(obj, verbose = FALSE)
221
- obj = FindVariableFeatures(obj, verbose = FALSE)
222
- obj = ScaleData(obj, verbose = FALSE)
223
- }
224
- obj
225
- }
226
-
227
- run_gene_qc <- function(sobj) {
228
- cached <- get_cached(
229
- list(
230
- cell_qc = envs$cell_qc,
231
- gene_qc = envs$gene_qc,
232
- cell_qc_per_sample = envs$cell_qc_per_sample,
233
- use_sct = envs$use_sct
234
- ),
235
- "GeneQC",
236
- cache_dir
237
- )
238
- if (!is.null(cached$data)) {
239
- log_info("Loading gene-QC'ed object from cache ...")
240
- sobj <- cached$data
241
- } else {
242
- log_info("Filtering genes ...")
243
- genes <- rownames(sobj)
244
- filtered <- FALSE
245
- if (!is.null(envs$gene_qc$min_cells) && envs$gene_qc$min_cells > 0) {
246
- genes = genes[Matrix::rowSums(sobj) >= envs$gene_qc$min_cells]
247
- filtered <- TRUE
248
- }
249
- excludes <- envs$gene_qc$excludes
250
- if (!is.null(excludes)) {
251
- if (length(excludes) == 1) {
252
- excludes <- trimws(unlist(strsplit(excludes, ",")))
253
- }
254
- for (ex in excludes) {
255
- genes <- genes[!grepl(ex, genes)]
256
- }
257
- filtered <- TRUE
258
- }
259
- if (filtered) {
260
- sobj = subset(sobj, features = genes)
261
- }
262
- cached$data <- sobj
263
- save_to_cache(cached, "GeneQC", cache_dir)
264
- }
265
- sobj
266
- }
267
-
268
- run_cell_qc <- function(sobj) {
269
- cached <- get_cached(
270
- list(cell_qc = envs$cell_qc, cell_qc_per_sample = envs$cell_qc_per_sample, use_sct = envs$use_sct),
271
- "CellQC",
272
- cache_dir
273
- )
274
- if (!is.null(cached$data)) {
275
- log_info("Loading cell-QC'ed object from cache ...")
276
- sobj <- cached$data$sobj
277
- cell_qc_df <<- cached$data$cell_qc_df
278
- } else {
279
- # Load data
280
- log_info("Reading samples individually ...")
281
- obj_list = lapply(samples, load_sample)
282
-
283
- log_info("Merging samples ...")
284
- sobj = Reduce(merge, obj_list)
285
- rm(obj_list)
286
- gc()
287
-
288
- if (!envs$cell_qc_per_sample) {
289
- log_info("Performing cell QC ...")
290
- sobj = perform_cell_qc(sobj, per_sample = FALSE)
291
- }
292
-
293
- cached$data <- list(sobj = sobj, cell_qc_df = cell_qc_df)
294
- save_to_cache(cached, "CellQC", cache_dir)
295
- }
296
- sobj
297
- }
298
-
299
- run_transformation <- function(sobj) {
300
- envs_cache <- envs
301
- envs_cache$ncores <- NULL
302
- envs_cache$doublet_detector <- NULL
303
- envs_cache$DoubletFinder <- NULL
304
- envs_cache$scDblFinder <- NULL
305
- envs_cache$IntegrateLayers <- NULL
306
- cached <- get_cached(envs_cache, "Transformed", cache_dir)
307
- if (!is.null(cached$data)) {
308
- log_info("Loading transformed object from cache ...")
309
- sobj <- cached$data
310
- } else {
311
- log_info("Performing transformation/scaling ...")
312
- # Not joined yet
313
- # sobj[["RNA"]] <- split(sobj[["RNA"]], f = sobj$Sample)
314
- if (envs$use_sct) {
315
- log_info("- Running SCTransform ...")
316
- SCTransformArgs <- envs$SCTransform
317
- # log to stdout but don't populate it to running log
318
- print(paste0(" SCTransform: ", format_args(SCTransformArgs)))
319
- log_debug(" SCTransform: {format_args(SCTransformArgs)}")
320
- SCTransformArgs$object <- sobj
321
- sobj <- do_call(SCTransform, SCTransformArgs)
322
- # Default is to use the SCT assay
323
-
324
- # Cleanup memory
325
- SCTransformArgs$object <- NULL
326
- rm(SCTransformArgs)
327
- gc()
328
- } else {
329
- log_info("- Running NormalizeData ...")
330
- NormalizeDataArgs <- envs$NormalizeData
331
- print(paste0(" NormalizeData: ", format_args(NormalizeDataArgs)))
332
- log_debug(" NormalizeData: {format_args(NormalizeDataArgs)}")
333
- NormalizeDataArgs$object <- sobj
334
- sobj <- do_call(NormalizeData, NormalizeDataArgs)
335
-
336
- # Cleanup memory
337
- NormalizeDataArgs$object <- NULL
338
- rm(NormalizeDataArgs)
339
- gc()
340
-
341
- log_info("- Running FindVariableFeatures ...")
342
- FindVariableFeaturesArgs <- envs$FindVariableFeatures
343
- print(paste0(" FindVariableFeatures: ", format_args(FindVariableFeaturesArgs)))
344
- log_debug(" FindVariableFeatures: {format_args(FindVariableFeaturesArgs)}")
345
- FindVariableFeaturesArgs$object <- sobj
346
- sobj <- do_call(FindVariableFeatures, FindVariableFeaturesArgs)
347
-
348
- # Cleanup memory
349
- FindVariableFeaturesArgs$object <- NULL
350
- rm(FindVariableFeaturesArgs)
351
- gc()
352
-
353
- log_info("- Running ScaleData ...")
354
- ScaleDataArgs <- envs$ScaleData
355
- print(paste0(" ScaleData: ", format_args(ScaleDataArgs)))
356
- log_debug(" ScaleData: {format_args(ScaleDataArgs)}")
357
- ScaleDataArgs$object <- sobj
358
- sobj <- do_call(ScaleData, ScaleDataArgs)
359
-
360
- # Cleanup memory
361
- ScaleDataArgs$object <- NULL
362
- rm(ScaleDataArgs)
363
- gc()
364
- }
365
-
366
- log_info("- Running RunPCA ...")
367
- RunPCAArgs <- envs$RunPCA
368
- RunPCAArgs$npcs <- if (is.null(RunPCAArgs$npcs)) { 50 } else { min(RunPCAArgs$npcs, ncol(sobj) - 1) }
369
- print(paste0(" RunPCA: ", format_args(RunPCAArgs)))
370
- log_debug(" RunPCA: {format_args(RunPCAArgs)}")
371
- RunPCAArgs$object <- sobj
372
- sobj <- do_call(RunPCA, RunPCAArgs)
373
-
374
- # Cleanup memory
375
- RunPCAArgs$object <- NULL
376
- rm(RunPCAArgs)
377
- gc()
378
-
379
- cached$data <- sobj
380
- save_to_cache(cached, "Transformed", cache_dir)
381
- }
382
-
383
- sobj
384
- }
385
-
386
- run_integration <- function(sobj) {
387
-
388
- envs_cache <- envs
389
- envs_cache$ncores <- NULL
390
- envs_cache$doublet_detector <- NULL
391
- envs_cache$DoubletFinder <- NULL
392
- envs_cache$scDblFinder <- NULL
393
- cached <- get_cached(envs_cache, "Integrated", cache_dir)
394
-
395
- if (!is.null(cached$data)) {
396
- log_info("Loading integrated/layer-joined object from cache ...")
397
- sobj <- cached$data
398
- } else {
399
-
400
- if (!envs$no_integration) {
401
- log_info("- Running IntegrateLayers (method = {envs$IntegrateLayers$method}) ...")
402
- IntegrateLayersArgs <- envs$IntegrateLayers
403
- method <- IntegrateLayersArgs$method
404
- if (!is.null(IntegrateLayersArgs$reference) && is.character(IntegrateLayersArgs$reference)) {
405
- log_info(" Using reference samples: {paste(IntegrateLayersArgs$reference, collapse = ', ')}")
406
- IntegrateLayersArgs$reference <- match(IntegrateLayersArgs$reference, samples)
407
- log_info(" Transferred to indices: {paste(IntegrateLayersArgs$reference, collapse = ', ')}")
408
- }
409
- if (method %in% c("CCA", "cca")) { method <- "CCAIntegration" } else
410
- if (method %in% c("RPCA", "rpca")) { method <- "RPCAIntegration" } else
411
- if (method %in% c("Harmony", "harmony")) { method <- "HarmonyIntegration" } else
412
- if (method %in% c("FastMNN", "fastmnn")) { method <- "FastMNNIntegration" } else
413
- if (method %in% c("scVI", "scvi")) { method <- "scVIIntegration" } else
414
- { stop(paste0("Unknown integration method: ", method)) }
415
- if (envs$use_sct && is.null(IntegrateLayersArgs$normalization.method)) {
416
- IntegrateLayersArgs$normalization.method <- "SCT"
417
- }
418
- IntegrateLayersArgs$method <- eval(parse(text = method))
419
- new_reductions <- list(
420
- "CCAIntegration" = "integrated.cca",
421
- "RPCAIntegration" = "integrated.rpca",
422
- "HarmonyIntegration" = "harmony",
423
- "FastMNNIntegration" = "integration.mnn",
424
- "scVIIntegration" = "integrated.scvi"
425
- )
426
- if (is.null(IntegrateLayersArgs$new.reduction)) {
427
- IntegrateLayersArgs$new.reduction <- new_reductions[[method]]
428
- }
429
- print(paste0(" IntegrateLayers: ", format_args(IntegrateLayersArgs)))
430
- log_debug(" IntegrateLayers: {format_args(IntegrateLayersArgs)}")
431
- IntegrateLayersArgs$object <- sobj
432
- sobj <- do_call(IntegrateLayers, IntegrateLayersArgs)
433
- # Save it for dimension reduction plots
434
- sobj@misc$integrated_new_reduction <- IntegrateLayersArgs$new.reduction
435
-
436
- # Cleanup memory
437
- IntegrateLayersArgs$object <- NULL
438
- rm(IntegrateLayersArgs)
439
- gc()
440
- }
441
-
442
- if (!envs$use_sct) {
443
- log_info("- Joining layers ...")
444
- sobj <- JoinLayers(sobj)
445
- }
446
-
447
- cached$data <- sobj
448
- save_to_cache(cached, "Integrated", cache_dir)
449
- }
450
-
451
- sobj
452
- }
@@ -1,201 +0,0 @@
1
- .get_envs_cached_doubletfinder <- function() {
2
- envs_cache <- envs
3
- envs_cache$ncores <- NULL
4
- envs_cache$doublet_detector <- NULL
5
- envs_cache$scDblFinder <- NULL
6
- envs_cache$DoubletFinder$ncores <- NULL
7
- envs_cache
8
- }
9
-
10
- .get_envs_cached_scdblfinder <- function() {
11
- envs_cache <- envs
12
- envs_cache$ncores <- NULL
13
- envs_cache$doublet_detector <- NULL
14
- envs_cache$DoubletFinder <- NULL
15
- envs_cache$scDblFinder$ncores <- NULL
16
- envs_cache
17
- }
18
-
19
- .run_doubletfinder <- function() {
20
- library(DoubletFinder)
21
- log_info("- Preparing Seurat object ...")
22
-
23
- if (is.null(envs$DoubletFinder$ncores)) {
24
- envs$DoubletFinder$ncores <- envs$ncores
25
- }
26
-
27
- # More controls from envs?
28
- sobj <- FindNeighbors(sobj, dims = 1:envs$DoubletFinder$PCs)
29
- sobj <- FindClusters(sobj)
30
-
31
- log_info("- pK Indentification ...")
32
- sweep.res.list <- paramSweep(
33
- sobj,
34
- PCs = 1:envs$DoubletFinder$PCs,
35
- sct = envs$use_sct,
36
- num.cores = envs$DoubletFinder$ncores
37
- )
38
- sweep.stats <- summarizeSweep(sweep.res.list, GT = FALSE)
39
- bcmvn <- find.pK(sweep.stats)
40
- bcmvn$Selected <- bcmvn$pK == bcmvn$pK[which.max(bcmvn$BCmetric)[1]]
41
-
42
- pK <- bcmvn$pK[which.max(bcmvn$BCmetric)[1]]
43
- pK <- as.numeric(as.character(pK))
44
- pN <- envs$DoubletFinder$pN
45
- log_info("- Homotypic Doublet Proportion Estimate ...")
46
- homotypic.prop <- modelHomotypic(Idents(sobj))
47
- nExp_poi <- round(nrow(sobj@meta.data) * envs$DoubletFinder$doublets)
48
- nExp_poi.adj <- round(nExp_poi * (1 - homotypic.prop))
49
-
50
- log_info("- Running DoubletFinder ...")
51
- sobj <- doubletFinder(
52
- sobj,
53
- PCs = 1:envs$DoubletFinder$PCs,
54
- pN = pN,
55
- pK = pK,
56
- nExp = nExp_poi.adj,
57
- reuse.pANN = FALSE,
58
- sct = envs$use_sct
59
- )
60
- pANN_col <- paste0("pANN_", pN, "_", pK)
61
- pANN_col <- colnames(sobj@meta.data)[grepl(pANN_col, colnames(sobj@meta.data))]
62
- DF_col <- paste0("DF.classifications_", pN, "_", pK)
63
- DF_col <- colnames(sobj@meta.data)[grepl(DF_col, colnames(sobj@meta.data))]
64
- doublets <- sobj@meta.data[, c(pANN_col, DF_col), drop = FALSE]
65
- colnames(doublets) <- c("DoubletFinder_score","DoubletFinder_DropletType")
66
- doublets$DoubletFinder_DropletType <- tolower(doublets$DoubletFinder_DropletType)
67
-
68
- pk_plot <- ggplot(bcmvn, aes(x = pK, y = BCmetric, color = Selected)) +
69
- geom_point() +
70
- # rotate x axis labels
71
- theme(axis.text.x = element_text(angle = 90, hjust = 1))
72
- list(doublets = doublets, pk_plot = pk_plot)
73
- }
74
-
75
- .run_scdblfinder <- function() {
76
- library(scDblFinder)
77
- if (is.null(envs$scDblFinder$ncores)) {
78
- envs$scDblFinder$ncores <- envs$ncores
79
- }
80
-
81
- envs$scDblFinder$sce <- GetAssayData(sobj, layer = "counts")
82
- if (envs$scDblFinder$ncores > 1) {
83
- envs$scDblFinder$BPPARAM <- BiocParallel::MulticoreParam(envs$scDblFinder$ncores, RNGseed = 8525)
84
- }
85
- envs$scDblFinder$returnType <- "table"
86
- envs$scDblFinder$ncores <- NULL
87
-
88
- doublets <- do_call(scDblFinder, envs$scDblFinder)
89
- doublets <- doublets[doublets$type == "real", , drop = FALSE]
90
- doublets <- doublets[, c("score", "class"), drop = FALSE]
91
- colnames(doublets) <- c("scDblFinder_score", "scDblFinder_DropletType")
92
-
93
- list(doublets = doublets)
94
- }
95
-
96
- run_dd <- function(detector) {
97
- log_info("Running {detector} ...")
98
- if (detector == "DoubletFinder") {
99
- envs_cache_fun <- .get_envs_cached_doubletfinder
100
- run_fun <- .run_doubletfinder
101
- } else if (detector == "scDblFinder") {
102
- envs_cache_fun <- .get_envs_cached_scdblfinder
103
- run_fun <- .run_scdblfinder
104
- } else {
105
- stop("Unknown doublet detector: ", detector)
106
- }
107
-
108
- cached <- get_cached(envs_cache_fun(), detector, cache_dir)
109
- if (!is.null(cached$data)) {
110
- log_info("- Loading cached results ...")
111
- results <- cached$data
112
- } else {
113
- results <- run_fun()
114
-
115
- cached$data <- results
116
- save_to_cache(cached, detector, cache_dir)
117
- }
118
-
119
- results
120
- }
121
-
122
- save_dd <- function(dd, detector) {
123
- doublets <- dd$doublets
124
- write.table(
125
- doublets,
126
- file.path(joboutdir, paste0(detector, "_doublets_singlets.txt")),
127
- row.names = FALSE,
128
- quote = FALSE,
129
- sep = "\t"
130
- )
131
-
132
- summary <- as.data.frame(table(dd$doublets[[paste0(detector, "_DropletType")]]))
133
- colnames(summary) <- c("Classification", "Droplet_N")
134
- write.table(
135
- summary,
136
- file.path(joboutdir, paste0(detector, "_summary.txt")),
137
- row.names = FALSE,
138
- quote = FALSE,
139
- sep = "\t"
140
- )
141
-
142
- n_doublet <- summary$Droplet_N[summary$Classification == 'doublet']
143
- log_info("- {n_doublet}/{sum(summary$Droplet_N)} doublets detected.")
144
- }
145
-
146
- add_dd_to_seurat <- function(sobj, dd) {
147
- AddMetaData(sobj, metadata = as.data.frame(dd$doublets))
148
- }
149
-
150
- plot_dd <- function(sobj, dd, detector) {
151
- if (detector == "DoubletFinder") {
152
- log_debug("- Plotting pK vs BCmetric ...")
153
- ggsave(dd$pk_plot, filename = file.path(plotsdir, "DoubletFinder_pK_BCmetric.png"))
154
- }
155
-
156
- log_info("- Plotting dimension reduction ...")
157
- dimp <- DimPlot(
158
- sobj, group.by = paste0(detector, "_DropletType"), order = "doublet",
159
- cols = c("#333333", "#FF3333"), pt.size = 0.8, alpha = 0.5)
160
- ggsave(dimp, filename = file.path(plotsdir, paste0(detector, "_dimplot.png")))
161
- }
162
-
163
- filter_dd <- function(sobj, dd, detector) {
164
- subset(sobj,
165
- cells = rownames(dd$doublets[
166
- dd$doublets[[paste0(detector, "_DropletType")]] == "singlet", ,
167
- drop = FALSE
168
- ]))
169
- }
170
-
171
- report_dd <- function(detector) {
172
- add_report(
173
- list(
174
- kind = "descr",
175
- content = "The table contains the number of cells classified as singlets and doublets."
176
- ),
177
- list(
178
- kind = "table",
179
- data = list(path = file.path(joboutdir, paste0(detector, "_summary.txt")))
180
- ),
181
- h1 = paste0(detector, " Results"),
182
- h2 = paste0("The ", detector, " Summary")
183
- )
184
-
185
- if (detector == "DoubletFinder") {
186
- add_report(
187
- list(name = "pK vs BCmetric", src = file.path(plotsdir, "pK_BCmetric.png")),
188
- list(name = "Dimension Reduction Plot", src = file.path(plotsdir, "DoubletFinder_dimplot.png")),
189
- ui = "table_of_images",
190
- h1 = "DoubletFinder Results",
191
- h2 = "Plots"
192
- )
193
- } else {
194
- add_report(
195
- list(name = "Dimension Reduction Plot",src = file.path(plotsdir, "scDblFinder_dimplot.png")),
196
- ui = "table_of_images",
197
- h1 = "scDblFinder Results",
198
- h2 = "Plots"
199
- )
200
- }
201
- }