biopipen 0.29.2__py3-none-any.whl → 0.31.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of biopipen might be problematic. Click here for more details.
- biopipen/__init__.py +1 -1
- biopipen/core/config.toml +2 -0
- biopipen/core/filters.py +21 -0
- biopipen/ns/plot.py +55 -0
- biopipen/ns/scrna.py +110 -21
- biopipen/ns/web.py +87 -5
- biopipen/scripts/bam/CNAClinic.R +2 -1
- biopipen/scripts/cellranger/CellRangerCount.py +3 -3
- biopipen/scripts/cellranger/CellRangerSummary.R +2 -1
- biopipen/scripts/cnv/AneuploidyScore.R +1 -1
- biopipen/scripts/cnv/AneuploidyScoreSummary.R +2 -2
- biopipen/scripts/delim/RowsBinder.R +1 -1
- biopipen/scripts/delim/SampleInfo.R +3 -2
- biopipen/scripts/gene/GeneNameConversion.R +2 -2
- biopipen/scripts/gsea/Enrichr.R +3 -3
- biopipen/scripts/gsea/FGSEA.R +2 -2
- biopipen/scripts/gsea/GSEA.R +2 -2
- biopipen/scripts/gsea/PreRank.R +2 -2
- biopipen/scripts/plot/Heatmap.R +3 -3
- biopipen/scripts/plot/Manhattan.R +2 -1
- biopipen/scripts/plot/QQPlot.R +1 -1
- biopipen/scripts/plot/ROC.R +1 -1
- biopipen/scripts/plot/Scatter.R +112 -0
- biopipen/scripts/plot/VennDiagram.R +3 -3
- biopipen/scripts/regulatory/MotifAffinityTest.R +3 -7
- biopipen/scripts/rnaseq/Simulation.R +1 -1
- biopipen/scripts/rnaseq/UnitConversion.R +2 -1
- biopipen/scripts/scrna/AnnData2Seurat.R +1 -1
- biopipen/scripts/scrna/CellTypeAnnotation-celltypist.R +24 -8
- biopipen/scripts/scrna/CellTypeAnnotation-common.R +10 -0
- biopipen/scripts/scrna/CellTypeAnnotation-direct.R +9 -1
- biopipen/scripts/scrna/CellTypeAnnotation-hitype.R +12 -8
- biopipen/scripts/scrna/CellTypeAnnotation-sccatch.R +15 -2
- biopipen/scripts/scrna/CellTypeAnnotation-sctype.R +38 -15
- biopipen/scripts/scrna/CellTypeAnnotation.R +3 -0
- biopipen/scripts/scrna/CellsDistribution.R +4 -3
- biopipen/scripts/scrna/DimPlots.R +1 -1
- biopipen/scripts/scrna/ExprImputation-alra.R +1 -1
- biopipen/scripts/scrna/MarkersFinder.R +5 -5
- biopipen/scripts/scrna/MetaMarkers.R +4 -4
- biopipen/scripts/scrna/ModuleScoreCalculator.R +2 -1
- biopipen/scripts/scrna/RadarPlots.R +1 -1
- biopipen/scripts/scrna/ScFGSEA.R +4 -3
- biopipen/scripts/scrna/ScSimulation.R +64 -0
- biopipen/scripts/scrna/Seurat2AnnData.R +1 -1
- biopipen/scripts/scrna/SeuratClusterStats-clustree.R +73 -0
- biopipen/scripts/scrna/SeuratClusterStats-dimplots.R +4 -3
- biopipen/scripts/scrna/SeuratClusterStats-features.R +6 -5
- biopipen/scripts/scrna/SeuratClusterStats-hists.R +6 -5
- biopipen/scripts/scrna/SeuratClusterStats-ngenes.R +4 -3
- biopipen/scripts/scrna/SeuratClusterStats-stats.R +20 -25
- biopipen/scripts/scrna/SeuratClusterStats.R +24 -8
- biopipen/scripts/scrna/SeuratClustering-common.R +213 -0
- biopipen/scripts/scrna/SeuratClustering.R +10 -170
- biopipen/scripts/scrna/SeuratMap2Ref.R +98 -54
- biopipen/scripts/scrna/SeuratMetadataMutater.R +2 -2
- biopipen/scripts/scrna/SeuratPreparing-common.R +452 -0
- biopipen/scripts/scrna/SeuratPreparing-doublet_detection.R +201 -0
- biopipen/scripts/scrna/SeuratPreparing.R +22 -562
- biopipen/scripts/scrna/SeuratSubClustering.R +24 -39
- biopipen/scripts/scrna/TopExpressingGenes.R +1 -1
- biopipen/scripts/scrna_metabolic_landscape/MetabolicFeatures.R +2 -2
- biopipen/scripts/scrna_metabolic_landscape/MetabolicFeaturesIntraSubset.R +2 -2
- biopipen/scripts/scrna_metabolic_landscape/MetabolicPathwayActivity.R +3 -3
- biopipen/scripts/scrna_metabolic_landscape/MetabolicPathwayHeterogeneity.R +3 -3
- biopipen/scripts/snp/MatrixEQTL.R +1 -1
- biopipen/scripts/snp/PlinkCallRate.R +2 -2
- biopipen/scripts/snp/PlinkFreq.R +2 -2
- biopipen/scripts/snp/PlinkHWE.R +2 -2
- biopipen/scripts/snp/PlinkHet.R +2 -2
- biopipen/scripts/snp/PlinkIBD.R +2 -2
- biopipen/scripts/stats/ChowTest.R +1 -1
- biopipen/scripts/stats/DiffCoexpr.R +1 -1
- biopipen/scripts/stats/LiquidAssoc.R +1 -1
- biopipen/scripts/stats/Mediation.R +11 -9
- biopipen/scripts/stats/MetaPvalue.R +4 -1
- biopipen/scripts/stats/MetaPvalue1.R +4 -1
- biopipen/scripts/tcr/Attach2Seurat.R +1 -1
- biopipen/scripts/tcr/CDR3AAPhyschem.R +1 -1
- biopipen/scripts/tcr/CloneResidency.R +2 -2
- biopipen/scripts/tcr/CloneSizeQQPlot.R +2 -2
- biopipen/scripts/tcr/Immunarch-basic.R +0 -4
- biopipen/scripts/tcr/Immunarch-clonality.R +0 -4
- biopipen/scripts/tcr/Immunarch-diversity.R +2 -24
- biopipen/scripts/tcr/Immunarch-geneusage.R +0 -2
- biopipen/scripts/tcr/Immunarch-kmer.R +0 -2
- biopipen/scripts/tcr/Immunarch-overlap.R +0 -2
- biopipen/scripts/tcr/Immunarch-spectratyping.R +0 -2
- biopipen/scripts/tcr/Immunarch-tracking.R +0 -2
- biopipen/scripts/tcr/Immunarch-vjjunc.R +0 -2
- biopipen/scripts/tcr/Immunarch.R +43 -11
- biopipen/scripts/tcr/ImmunarchFilter.R +1 -1
- biopipen/scripts/tcr/ImmunarchLoading.R +2 -2
- biopipen/scripts/tcr/SampleDiversity.R +1 -1
- biopipen/scripts/tcr/TCRClusterStats.R +2 -2
- biopipen/scripts/tcr/TCRClustering.R +2 -2
- biopipen/scripts/tcr/TESSA.R +2 -2
- biopipen/scripts/vcf/TruvariBenchSummary.R +2 -2
- biopipen/scripts/vcf/TruvariConsistency.R +1 -1
- biopipen/scripts/web/GCloudStorageDownloadBucket.py +82 -0
- biopipen/scripts/web/GCloudStorageDownloadFile.py +23 -0
- biopipen/scripts/web/gcloud_common.py +49 -0
- {biopipen-0.29.2.dist-info → biopipen-0.31.0.dist-info}/METADATA +7 -7
- {biopipen-0.29.2.dist-info → biopipen-0.31.0.dist-info}/RECORD +106 -96
- {biopipen-0.29.2.dist-info → biopipen-0.31.0.dist-info}/WHEEL +0 -0
- {biopipen-0.29.2.dist-info → biopipen-0.31.0.dist-info}/entry_points.txt +0 -0
|
@@ -0,0 +1,452 @@
|
|
|
1
|
+
|
|
2
|
+
# Render a named list as one string of "name = value" pairs joined by "; ".
# Only named elements are visited, since the iteration is over names(x).
stringify_list <- function(x) {
    keys <- names(x)
    pairs <- sapply(keys, function(key) {
        paste(key, x[[key]], sep = " = ")
    })
    paste(pairs, collapse = "; ")
}
|
|
5
|
+
|
|
6
|
+
# Produce a one-line description of an argument list: the lines printed by
# str(args) are captured and joined with ", ".
format_args <- function(args) {
    str_lines <- capture.output(str(args))
    paste(str_lines, collapse = ", ")
}
|
|
9
|
+
|
|
10
|
+
# Fallback loader used when Read10X() fails on `path`, e.g. because the 10X
# files carry a sample-name prefix.
#
# Symlinks the first barcodes / genes-or-features / matrix file found in
# `path` into <joboutdir>/renamed/<sample>/ under the canonical names and
# reads that directory with Read10X().
#
# @param e      The condition raised by the first Read10X() attempt; its
#               message is included in the error if a file cannot be found.
# @param sample Sample name, used for the temporary directory name.
# @param path   Directory holding the (possibly prefixed) 10X files.
# @return Whatever Read10X() returns for the renamed directory.
rename_files <- function(e, sample, path) {
    tmpdatadir <- file.path(joboutdir, "renamed", sample)
    # Always start from an empty directory so stale links are never reused
    if (dir.exists(tmpdatadir)) {
        unlink(tmpdatadir, recursive = TRUE)
    }
    dir.create(tmpdatadir, recursive = TRUE, showWarnings = FALSE)

    original_error <- if (inherits(e, "condition")) {
        conditionMessage(e)
    } else {
        as.character(e)
    }

    # Link the first file matching any of `patterns` (tried in order) to
    # `target` inside tmpdatadir; fail loudly when nothing matches.
    link_first_match <- function(patterns, target) {
        hits <- unlist(lapply(patterns, function(pattern) {
            Sys.glob(file.path(path, pattern))
        }))
        if (length(hits) == 0) {
            stop(
                "Cannot find a file matching ",
                paste(patterns, collapse = " or "),
                " in '", path, "' for sample '", sample, "'",
                " (original error: ", original_error, ")",
                call. = FALSE
            )
        }
        file.symlink(normalizePath(hits[1]), file.path(tmpdatadir, target))
    }

    link_first_match("*barcodes.tsv.gz", "barcodes.tsv.gz")
    # Separate patterns instead of a "{genes,features}" brace pattern, so the
    # lookup does not depend on brace expansion being available
    link_first_match(
        c("*genes.tsv.gz", "*features.tsv.gz"),
        "features.tsv.gz"
    )
    link_first_match("*matrix.mtx.gz", "matrix.mtx.gz")

    Read10X(data.dir = tmpdatadir)
}
|
|
33
|
+
|
|
34
|
+
|
|
35
|
+
# Annotate a Seurat object with QC percentages, flag every cell with the
# user-supplied QC expression, record the flags and drop the failing cells.
#
# Reads `envs$cell_qc` and `feats` from the enclosing scope and appends the
# per-cell QC table to `cell_qc_df` in the enclosing scope.
#
# @param sobj       Seurat object to filter.
# @param per_sample Only changes the prefix of the log messages.
# @return The Seurat object with failing cells removed and `.QC` dropped.
perform_cell_qc <- function(sobj, per_sample = FALSE) {
    log_prefix <- ifelse(per_sample, " ", "- ")
    log_info("{log_prefix}Adding metadata for QC ...")
    sobj[["percent.mt"]] <- PercentageFeatureSet(sobj, pattern = "^MT-")
    sobj[["percent.ribo"]] <- PercentageFeatureSet(sobj, pattern = "^RP[SL]")
    sobj[["percent.hb"]] <- PercentageFeatureSet(sobj, pattern = "^HB[^(P)]")
    sobj[["percent.plat"]] <- PercentageFeatureSet(sobj, pattern = "PECAM1|PF4")

    # Without criteria the expression "TRUE" keeps every cell
    qc_expr <- envs$cell_qc
    if (is.null(qc_expr) || length(qc_expr) == 0) {
        log_warn("{log_prefix}No cell QC criteria is provided. All cells will be kept.")
        qc_expr <- "TRUE"
    }

    sobj@meta.data <- mutate(sobj@meta.data, .QC = !!rlang::parse_expr(qc_expr))

    # Accumulate the QC table across calls (one call per sample, or one for
    # the merged object)
    qc_rows <- sobj@meta.data[, c("Sample", ".QC", feats), drop = FALSE]
    cell_qc_df <<- if (is.null(cell_qc_df)) {
        qc_rows
    } else {
        rbind(cell_qc_df, qc_rows)
    }

    # Do the filtering
    log_info("{log_prefix}Filtering cells using QC criteria ...")
    sobj <- subset(sobj, subset = .QC)
    sobj$.QC <- NULL

    sobj
}
|
|
65
|
+
|
|
66
|
+
# Plot the cell-QC results stored in `cell_qc_df` and register them in the
# report: one violin plot per feature (grouped by sample) and one scatter plot
# per feature against nCount_RNA.
#
# Uses `cell_qc_df`, `feats`, `samples` and `plotsdir` from the enclosing scope.
#
# @param ngenes Value stored in the `nGenes` column of the returned summary.
# @return A two-row data frame (Before_Cell_QC / After_Cell_QC) with the
#         number of cells and `ngenes`.
report_cell_qc <- function(ngenes) {
    # Sentences shared by the violin and the scatter descriptions
    qc_colour_note <- c(
        "The cells that fail the QC criteria are colored in red, and",
        "the cells that pass the QC criteria are colored in black.",
        "The cells that fail the QC criteria are filtered out in the returned Seurat object."
    )
    describe <- function(lead) {
        paste(c(lead, qc_colour_note), collapse = " ")
    }
    # A fresh colour scale per plot
    qc_colours <- function() {
        scale_color_manual(
            values = c("#181818", pal_biopipen()(1)),
            breaks = c(TRUE, FALSE)
        )
    }
    save_png <- function(plot, file, width) {
        png(file, width = width, height = 600, res = 100)
        print(plot)
        dev.off()
    }

    # Violin plots
    log_info("- Plotting violin plots ...")
    add_report(
        list(
            kind = "descr",
            content = describe(
                "The violin plots for each feature. The cells are grouped by sample."
            )
        ),
        h1 = "Violin Plots"
    )
    for (feat in feats) {
        log_info(" For feature: {feat}")
        vlnplot <- file.path(plotsdir, paste0(slugify(feat), ".vln.png"))
        vln_p <- ggplot(cell_qc_df, aes(x = Sample, y = !!sym(feat), color = .QC)) +
            geom_violin(fill = "white", width = 0.5) +
            geom_jitter(width = 0.2, height = 0, alpha = 0.5) +
            qc_colours() +
            labs(x = "Sample", y = feat) +
            theme_minimal()
        # Widen the canvas with the number of samples
        save_png(vln_p, vlnplot, 800 + length(samples) * 15)

        add_report(
            list(
                src = vlnplot,
                name = feat,
                descr = paste0("Distribution of ", feat, " for each sample.")
            ),
            h1 = "Violin Plots",
            ui = "table_of_images"
        )
    }

    # Scatter plots against nCount_RNA
    log_info("- Plotting scatter plots ...")
    add_report(
        list(
            kind = "descr",
            content = describe(
                "The scatter plots for each feature against nCount_RNA. "
            )
        ),
        h1 = "Scatter Plots"
    )
    for (feat in setdiff(feats, "nCount_RNA")) {
        log_info(" For feature: {feat}, against nCount_RNA")
        scatfile <- file.path(plotsdir, paste0(slugify(feat), "-nCount_RNA.scatter.png"))
        scat_p <- ggplot(cell_qc_df, aes(x = nCount_RNA, y = !!sym(feat), color = .QC)) +
            geom_point() +
            qc_colours() +
            labs(x = "nCount_RNA", y = feat) +
            theme_minimal()
        save_png(scat_p, scatfile, 800)

        add_report(
            list(
                src = scatfile,
                name = paste0(feat, " vs nCount_RNA"),
                descr = paste0("Scatter plot for ", feat, " against nCount_RNA")
            ),
            h1 = "Scatter Plots",
            ui = "table_of_images"
        )
    }

    # Summarise the number of cells before and after QC
    count_cells <- function(df, stage) {
        df %>%
            summarise(
                when = !!stage,
                nCells = dplyr::n(),
                nGenes = ngenes
            ) %>%
            ungroup()
    }
    rbind(
        count_cells(cell_qc_df, "Before_Cell_QC"),
        count_cells(filter(cell_qc_df, .QC), "After_Cell_QC")
    )
}
|
|
170
|
+
|
|
171
|
+
# Load one sample into a Seurat object.
#
# The sample's row is taken from `metadata`; its `RNAData` entry is read with
# Read10X() when it is a directory (falling back to rename_files() on error)
# and with Read10X_h5() otherwise. All other metadata columns except
# RNAData/TCRData are attached to the object.
#
# @param sample Sample name, matched against `metadata$Sample`.
# @return A Seurat object, or NULL (with a warning) when no path is available.
load_sample <- function(sample) {
    log_info("- Loading sample: {sample} ...")
    sample_meta <- as.data.frame(metadata)[metadata$Sample == sample, , drop = TRUE]
    rna_path <- as.character(sample_meta$RNAData)
    path_missing <- is.na(rna_path) || !is.character(rna_path) ||
        nchar(rna_path) == 0 || rna_path == "NA"
    if (path_missing) {
        warning(paste0("No path found for sample: ", sample))
        return(NULL)
    }

    if (dir.exists(rna_path)) {
        # Read10X requires barcodes.tsv.gz, genes.tsv.gz and matrix.mtx.gz,
        # but sometimes they are prefixed with the sample name,
        # e.g. GSM4143656_SAM24345863-ln1.barcodes.tsv.gz
        exprs <- tryCatch(
            Read10X(data.dir = rna_path),
            error = function(e) rename_files(e, sample, rna_path)
        )
    } else {
        exprs <- Read10X_h5(rna_path)
    }
    if ("Gene Expression" %in% names(exprs)) {
        exprs <- exprs[["Gene Expression"]]
    }

    obj <- CreateSeuratObject(exprs, project = sample)
    obj <- RenameCells(obj, add.cell.id = sample)

    # Attach meta data
    for (mname in setdiff(names(sample_meta), c("RNAData", "TCRData"))) {
        value <- sample_meta[[mname]]
        if (is.factor(value)) {
            value <- as.character(value)
        }
        obj[[mname]] <- value
    }

    if (isTRUE(envs$cell_qc_per_sample)) {
        log_info("- Perform cell QC for sample: {sample} ...")
        obj <- perform_cell_qc(obj, TRUE)
    }

    if (isTRUE(envs$use_sct)) {
        # So that we have data and scale.data layers on the RNA assay; useful
        # for visualization in case some genes are not in the SCT assay
        obj <- NormalizeData(obj, verbose = FALSE)
        obj <- FindVariableFeatures(obj, verbose = FALSE)
        obj <- ScaleData(obj, verbose = FALSE)
    }
    obj
}
|
|
226
|
+
|
|
227
|
+
# Filter genes according to `envs$gene_qc`, with caching under "GeneQC".
#
# Genes are kept when their row sum reaches `min_cells` (if set and > 0) and
# when they match none of the `excludes` patterns (a single string is split on
# commas). The object is only subset when at least one filter is configured.
#
# @param sobj Seurat object to filter.
# @return The gene-filtered Seurat object (from cache when available).
run_gene_qc <- function(sobj) {
    cache_signature <- list(
        cell_qc = envs$cell_qc,
        gene_qc = envs$gene_qc,
        cell_qc_per_sample = envs$cell_qc_per_sample,
        use_sct = envs$use_sct
    )
    cached <- get_cached(cache_signature, "GeneQC", cache_dir)
    if (!is.null(cached$data)) {
        log_info("Loading gene-QC'ed object from cache ...")
        return(cached$data)
    }

    log_info("Filtering genes ...")
    kept <- rownames(sobj)

    by_min_cells <- !is.null(envs$gene_qc$min_cells) && envs$gene_qc$min_cells > 0
    if (by_min_cells) {
        kept <- kept[Matrix::rowSums(sobj) >= envs$gene_qc$min_cells]
    }

    patterns <- envs$gene_qc$excludes
    by_excludes <- !is.null(patterns)
    if (by_excludes) {
        if (length(patterns) == 1) {
            patterns <- trimws(unlist(strsplit(patterns, ",")))
        }
        for (pattern in patterns) {
            kept <- kept[!grepl(pattern, kept)]
        }
    }

    if (by_min_cells || by_excludes) {
        sobj <- subset(sobj, features = kept)
    }

    cached$data <- sobj
    save_to_cache(cached, "GeneQC", cache_dir)
    sobj
}
|
|
267
|
+
|
|
268
|
+
# Load every sample, merge them and apply the cell QC, with caching under
# "CellQC".
#
# On a cache hit both the Seurat object and the per-cell QC table are
# restored; the table is written to `cell_qc_df` in the enclosing scope.
#
# @param sobj Ignored on input (kept for interface compatibility); the object
#             is rebuilt from `samples`.
# @return The merged, cell-QC'ed Seurat object.
run_cell_qc <- function(sobj) {
    cached <- get_cached(
        list(
            cell_qc = envs$cell_qc,
            cell_qc_per_sample = envs$cell_qc_per_sample,
            use_sct = envs$use_sct
        ),
        "CellQC",
        cache_dir
    )
    if (!is.null(cached$data)) {
        log_info("Loading cell-QC'ed object from cache ...")
        cell_qc_df <<- cached$data$cell_qc_df
        return(cached$data$sobj)
    }

    # Load data
    log_info("Reading samples individually ...")
    obj_list <- lapply(samples, load_sample)
    # load_sample() returns NULL (after warning) for samples without a usable
    # path; those must not reach merge()
    obj_list <- Filter(Negate(is.null), obj_list)
    if (length(obj_list) == 0) {
        stop(
            "None of the samples could be loaded; check the RNAData paths.",
            call. = FALSE
        )
    }

    log_info("Merging samples ...")
    sobj <- Reduce(merge, obj_list)
    rm(obj_list)
    gc()

    # isTRUE() mirrors the check in load_sample(): an unset flag means the QC
    # was not done per sample, so it is done here on the merged object
    if (!isTRUE(envs$cell_qc_per_sample)) {
        log_info("Performing cell QC ...")
        sobj <- perform_cell_qc(sobj)
    }

    cached$data <- list(sobj = sobj, cell_qc_df = cell_qc_df)
    save_to_cache(cached, "CellQC", cache_dir)
    sobj
}
|
|
298
|
+
|
|
299
|
+
# Normalize/scale the object and run PCA, with caching under "Transformed".
#
# With `envs$use_sct` the object goes through SCTransform; otherwise through
# NormalizeData, FindVariableFeatures and ScaleData. Each step takes its
# arguments from the `envs` entry of the same name. The cache key is `envs`
# without the settings that do not influence this step.
#
# @param sobj Seurat object (layers not joined yet).
# @return The transformed Seurat object with a PCA reduction.
run_transformation <- function(sobj) {
    envs_cache <- envs
    envs_cache$ncores <- NULL
    envs_cache$doublet_detector <- NULL
    envs_cache$DoubletFinder <- NULL
    envs_cache$scDblFinder <- NULL
    envs_cache$IntegrateLayers <- NULL
    cached <- get_cached(envs_cache, "Transformed", cache_dir)
    if (!is.null(cached$data)) {
        log_info("Loading transformed object from cache ...")
        return(cached$data)
    }

    log_info("Performing transformation/scaling ...")
    # Not joined yet
    # isTRUE() matches how load_sample() interprets the same flag
    if (isTRUE(envs$use_sct)) {
        log_info("- Running SCTransform ...")
        SCTransformArgs <- envs$SCTransform
        # log to stdout but don't populate it to running log
        print(paste0(" SCTransform: ", format_args(SCTransformArgs)))
        log_debug(" SCTransform: {format_args(SCTransformArgs)}")
        SCTransformArgs$object <- sobj
        sobj <- do_call(SCTransform, SCTransformArgs)
        # Default is to use the SCT assay

        # Cleanup memory
        rm(SCTransformArgs)
        gc()
    } else {
        log_info("- Running NormalizeData ...")
        NormalizeDataArgs <- envs$NormalizeData
        print(paste0(" NormalizeData: ", format_args(NormalizeDataArgs)))
        log_debug(" NormalizeData: {format_args(NormalizeDataArgs)}")
        NormalizeDataArgs$object <- sobj
        sobj <- do_call(NormalizeData, NormalizeDataArgs)

        # Cleanup memory
        rm(NormalizeDataArgs)
        gc()

        log_info("- Running FindVariableFeatures ...")
        FindVariableFeaturesArgs <- envs$FindVariableFeatures
        print(paste0(" FindVariableFeatures: ", format_args(FindVariableFeaturesArgs)))
        log_debug(" FindVariableFeatures: {format_args(FindVariableFeaturesArgs)}")
        FindVariableFeaturesArgs$object <- sobj
        sobj <- do_call(FindVariableFeatures, FindVariableFeaturesArgs)

        # Cleanup memory
        rm(FindVariableFeaturesArgs)
        gc()

        log_info("- Running ScaleData ...")
        ScaleDataArgs <- envs$ScaleData
        print(paste0(" ScaleData: ", format_args(ScaleDataArgs)))
        log_debug(" ScaleData: {format_args(ScaleDataArgs)}")
        ScaleDataArgs$object <- sobj
        sobj <- do_call(ScaleData, ScaleDataArgs)

        # Cleanup memory
        rm(ScaleDataArgs)
        gc()
    }

    log_info("- Running RunPCA ...")
    RunPCAArgs <- envs$RunPCA
    # Cap the number of PCs at (number of cells - 1); the cap applies to the
    # default of 50 as well, so small objects do not request too many PCs
    requested_npcs <- if (is.null(RunPCAArgs$npcs)) 50 else RunPCAArgs$npcs
    RunPCAArgs$npcs <- min(requested_npcs, ncol(sobj) - 1)
    print(paste0(" RunPCA: ", format_args(RunPCAArgs)))
    log_debug(" RunPCA: {format_args(RunPCAArgs)}")
    RunPCAArgs$object <- sobj
    sobj <- do_call(RunPCA, RunPCAArgs)

    # Cleanup memory
    rm(RunPCAArgs)
    gc()

    cached$data <- sobj
    save_to_cache(cached, "Transformed", cache_dir)

    sobj
}
|
|
385
|
+
|
|
386
|
+
# Integrate the per-sample layers (unless `envs$no_integration` is TRUE) and
# join the layers when SCT is not used, with caching under "Integrated".
#
# `envs$IntegrateLayers$method` is a short alias (CCA, RPCA, Harmony, FastMNN
# or scVI, matched case-insensitively) that is resolved to the integration
# function of the corresponding name. Character `reference` values are sample
# names and are translated to their positions in `samples`.
#
# @param sobj Transformed Seurat object.
# @return The integrated / layer-joined Seurat object; the name of the new
#         reduction is stored in `sobj@misc$integrated_new_reduction`.
run_integration <- function(sobj) {
    envs_cache <- envs
    envs_cache$ncores <- NULL
    envs_cache$doublet_detector <- NULL
    envs_cache$DoubletFinder <- NULL
    envs_cache$scDblFinder <- NULL
    cached <- get_cached(envs_cache, "Integrated", cache_dir)

    if (!is.null(cached$data)) {
        log_info("Loading integrated/layer-joined object from cache ...")
        return(cached$data)
    }

    if (!isTRUE(envs$no_integration)) {
        log_info("- Running IntegrateLayers (method = {envs$IntegrateLayers$method}) ...")
        IntegrateLayersArgs <- envs$IntegrateLayers
        method <- IntegrateLayersArgs$method
        if (!is.character(method) || length(method) != 1) {
            stop(
                "envs$IntegrateLayers$method must be a single string.",
                call. = FALSE
            )
        }

        if (!is.null(IntegrateLayersArgs$reference) && is.character(IntegrateLayersArgs$reference)) {
            log_info(" Using reference samples: {paste(IntegrateLayersArgs$reference, collapse = ', ')}")
            reference_idx <- match(IntegrateLayersArgs$reference, samples)
            # match() yields NA for names that are not samples; fail here
            # rather than pass NA indices on
            if (anyNA(reference_idx)) {
                stop(
                    "Unknown reference sample(s): ",
                    paste(
                        IntegrateLayersArgs$reference[is.na(reference_idx)],
                        collapse = ", "
                    ),
                    call. = FALSE
                )
            }
            IntegrateLayersArgs$reference <- reference_idx
            log_info(" Transferred to indices: {paste(IntegrateLayersArgs$reference, collapse = ', ')}")
        }

        integration_methods <- c(
            cca = "CCAIntegration",
            rpca = "RPCAIntegration",
            harmony = "HarmonyIntegration",
            fastmnn = "FastMNNIntegration",
            scvi = "scVIIntegration"
        )
        method_key <- tolower(method)
        if (!method_key %in% names(integration_methods)) {
            stop(paste0("Unknown integration method: ", method))
        }
        method <- integration_methods[[method_key]]

        if (isTRUE(envs$use_sct) && is.null(IntegrateLayersArgs$normalization.method)) {
            IntegrateLayersArgs$normalization.method <- "SCT"
        }
        # Look the function up by name instead of evaluating parsed text
        IntegrateLayersArgs$method <- get(method, mode = "function")

        new_reductions <- list(
            "CCAIntegration" = "integrated.cca",
            "RPCAIntegration" = "integrated.rpca",
            "HarmonyIntegration" = "harmony",
            "FastMNNIntegration" = "integration.mnn",
            "scVIIntegration" = "integrated.scvi"
        )
        if (is.null(IntegrateLayersArgs$new.reduction)) {
            IntegrateLayersArgs$new.reduction <- new_reductions[[method]]
        }
        print(paste0(" IntegrateLayers: ", format_args(IntegrateLayersArgs)))
        log_debug(" IntegrateLayers: {format_args(IntegrateLayersArgs)}")
        IntegrateLayersArgs$object <- sobj
        sobj <- do_call(IntegrateLayers, IntegrateLayersArgs)
        # Save it for dimension reduction plots
        sobj@misc$integrated_new_reduction <- IntegrateLayersArgs$new.reduction

        # Cleanup memory
        rm(IntegrateLayersArgs)
        gc()
    }

    if (!isTRUE(envs$use_sct)) {
        log_info("- Joining layers ...")
        sobj <- JoinLayers(sobj)
    }

    cached$data <- sobj
    save_to_cache(cached, "Integrated", cache_dir)

    sobj
}
|
|
@@ -0,0 +1,201 @@
|
|
|
1
|
+
# Build the cache signature for DoubletFinder: a copy of `envs` without the
# entries that are removed below (core counts, the detector choice and the
# scDblFinder settings).
.get_envs_cached_doubletfinder <- function() {
    envs_cache <- envs
    for (ignored in c("ncores", "doublet_detector", "scDblFinder")) {
        envs_cache[[ignored]] <- NULL
    }
    envs_cache$DoubletFinder$ncores <- NULL
    envs_cache
}
|
|
9
|
+
|
|
10
|
+
# Build the cache signature for scDblFinder: a copy of `envs` without the
# entries that are removed below (core counts, the detector choice and the
# DoubletFinder settings).
.get_envs_cached_scdblfinder <- function() {
    envs_cache <- envs
    for (ignored in c("ncores", "doublet_detector", "DoubletFinder")) {
        envs_cache[[ignored]] <- NULL
    }
    envs_cache$scDblFinder$ncores <- NULL
    envs_cache
}
|
|
18
|
+
|
|
19
|
+
# Run DoubletFinder on `sobj` (taken from the enclosing scope).
#
# Settings come from `envs$DoubletFinder` (PCs, pN, doublets, ncores); the
# core count falls back to `envs$ncores`. The pK with the highest BCmetric is
# selected from the parameter sweep.
#
# @return A list with
#   - doublets: data frame (one row per cell) with DoubletFinder_score and
#     DoubletFinder_DropletType (lower-cased classification);
#   - pk_plot: ggplot of BCmetric against pK, highlighting the selected pK.
.run_doubletfinder <- function() {
    library(DoubletFinder)
    log_info("- Preparing Seurat object ...")

    n_cores <- envs$DoubletFinder$ncores
    if (is.null(n_cores)) {
        n_cores <- envs$ncores
    }
    pcs <- seq_len(envs$DoubletFinder$PCs)

    # More controls from envs?
    sobj <- FindNeighbors(sobj, dims = pcs)
    sobj <- FindClusters(sobj)

    log_info("- pK Identification ...")
    sweep.res.list <- paramSweep(
        sobj,
        PCs = pcs,
        sct = envs$use_sct,
        num.cores = n_cores
    )
    sweep.stats <- summarizeSweep(sweep.res.list, GT = FALSE)
    bcmvn <- find.pK(sweep.stats)
    best <- which.max(bcmvn$BCmetric)[1]
    bcmvn$Selected <- bcmvn$pK == bcmvn$pK[best]

    # as.character() first: presumably pK is a factor, so a direct
    # as.numeric() would return level codes - TODO confirm
    pK <- as.numeric(as.character(bcmvn$pK[best]))
    pN <- envs$DoubletFinder$pN

    log_info("- Homotypic Doublet Proportion Estimate ...")
    homotypic.prop <- modelHomotypic(Idents(sobj))
    nExp_poi <- round(nrow(sobj@meta.data) * envs$DoubletFinder$doublets)
    nExp_poi.adj <- round(nExp_poi * (1 - homotypic.prop))

    log_info("- Running DoubletFinder ...")
    sobj <- doubletFinder(
        sobj,
        PCs = pcs,
        pN = pN,
        pK = pK,
        nExp = nExp_poi.adj,
        reuse.pANN = FALSE,
        sct = envs$use_sct
    )

    # Find the result columns by literal prefix; a regex would treat the "."
    # in the pN/pK values as a wildcard
    meta_cols <- colnames(sobj@meta.data)
    pANN_col <- meta_cols[startsWith(meta_cols, paste0("pANN_", pN, "_", pK))]
    DF_col <- meta_cols[
        startsWith(meta_cols, paste0("DF.classifications_", pN, "_", pK))
    ]
    if (length(pANN_col) != 1 || length(DF_col) != 1) {
        stop(
            "Expected exactly one pANN and one DF.classifications column for ",
            "pN = ", pN, " and pK = ", pK, "; found ",
            length(pANN_col), " and ", length(DF_col), ".",
            call. = FALSE
        )
    }

    doublets <- sobj@meta.data[, c(pANN_col, DF_col), drop = FALSE]
    colnames(doublets) <- c("DoubletFinder_score", "DoubletFinder_DropletType")
    doublets$DoubletFinder_DropletType <- tolower(doublets$DoubletFinder_DropletType)

    pk_plot <- ggplot(bcmvn, aes(x = pK, y = BCmetric, color = Selected)) +
        geom_point() +
        # rotate x axis labels
        theme(axis.text.x = element_text(angle = 90, hjust = 1))

    list(doublets = doublets, pk_plot = pk_plot)
}
|
|
74
|
+
|
|
75
|
+
# Run scDblFinder on the counts of `sobj` (taken from the enclosing scope).
#
# Arguments come from `envs$scDblFinder`; `ncores` (falling back to
# `envs$ncores`) only decides whether a MulticoreParam backend is passed and
# is removed before the call.
#
# @return A list with `doublets`: the rows of type "real" with the columns
#         scDblFinder_score and scDblFinder_DropletType.
.run_scdblfinder <- function() {
    library(scDblFinder)

    dbl_args <- envs$scDblFinder
    if (is.null(dbl_args$ncores)) {
        dbl_args$ncores <- envs$ncores
    }

    dbl_args$sce <- GetAssayData(sobj, layer = "counts")
    if (dbl_args$ncores > 1) {
        dbl_args$BPPARAM <- BiocParallel::MulticoreParam(dbl_args$ncores, RNGseed = 8525)
    }
    dbl_args$returnType <- "table"
    # ncores is not passed on to scDblFinder
    dbl_args$ncores <- NULL

    calls <- do_call(scDblFinder, dbl_args)
    real_calls <- calls[calls$type == "real", , drop = FALSE]
    doublets <- real_calls[, c("score", "class"), drop = FALSE]
    colnames(doublets) <- c("scDblFinder_score", "scDblFinder_DropletType")

    list(doublets = doublets)
}
|
|
95
|
+
|
|
96
|
+
# Run the requested doublet detector, reusing cached results when available.
#
# @param detector Either "DoubletFinder" or "scDblFinder"; anything else is
#                 an error.
# @return The result list of the detector's runner.
run_dd <- function(detector) {
    log_info("Running {detector} ...")

    # Per-detector pair of (cache signature builder, runner)
    if (detector == "DoubletFinder") {
        handlers <- list(
            signature = .get_envs_cached_doubletfinder,
            run = .run_doubletfinder
        )
    } else if (detector == "scDblFinder") {
        handlers <- list(
            signature = .get_envs_cached_scdblfinder,
            run = .run_scdblfinder
        )
    } else {
        stop("Unknown doublet detector: ", detector)
    }

    cached <- get_cached(handlers$signature(), detector, cache_dir)
    if (!is.null(cached$data)) {
        log_info("- Loading cached results ...")
        return(cached$data)
    }

    results <- handlers$run()
    cached$data <- results
    save_to_cache(cached, detector, cache_dir)

    results
}
|
|
121
|
+
|
|
122
|
+
# Write the doublet calls and their summary to the job output directory and
# log the number of doublets.
#
# Files written to `joboutdir`:
#   - <detector>_doublets_singlets.txt: the per-cell table (without row names)
#   - <detector>_summary.txt: number of droplets per classification
#
# @param dd       Result list of the detector; `dd$doublets` must contain the
#                 column <detector>_DropletType.
# @param detector Detector name, used in file and column names.
save_dd <- function(dd, detector) {
    doublets <- dd$doublets
    # NOTE(review): row.names = FALSE means any row names of the table are
    # not written - confirm that is intended
    write.table(
        doublets,
        file.path(joboutdir, paste0(detector, "_doublets_singlets.txt")),
        row.names = FALSE,
        quote = FALSE,
        sep = "\t"
    )

    type_counts <- as.data.frame(table(doublets[[paste0(detector, "_DropletType")]]))
    colnames(type_counts) <- c("Classification", "Droplet_N")
    write.table(
        type_counts,
        file.path(joboutdir, paste0(detector, "_summary.txt")),
        row.names = FALSE,
        quote = FALSE,
        sep = "\t"
    )

    # sum() gives 0 when no droplet is classified as a doublet, where plain
    # subsetting would give a zero-length value with nothing to log
    n_doublet <- sum(type_counts$Droplet_N[type_counts$Classification == "doublet"])
    n_total <- sum(type_counts$Droplet_N)
    log_info("- {n_doublet}/{n_total} doublets detected.")
}
|
|
145
|
+
|
|
146
|
+
# Attach the doublet-detection table `dd$doublets` to the metadata of `sobj`
# and return the updated object.
add_dd_to_seurat <- function(sobj, dd) {
    doublet_meta <- as.data.frame(dd$doublets)
    AddMetaData(sobj, metadata = doublet_meta)
}
|
|
149
|
+
|
|
150
|
+
# Save the doublet-detection plots into `plotsdir`.
#
# For DoubletFinder the pK-vs-BCmetric plot stored in `dd$pk_plot` is saved as
# DoubletFinder_pK_BCmetric.png. For every detector a dimension reduction plot
# grouped by <detector>_DropletType is saved as <detector>_dimplot.png.
#
# @param sobj     Seurat object carrying the <detector>_DropletType metadata.
# @param dd       Result list of the detector.
# @param detector Detector name.
plot_dd <- function(sobj, dd, detector) {
    if (detector == "DoubletFinder") {
        log_debug("- Plotting pK vs BCmetric ...")
        pk_file <- file.path(plotsdir, "DoubletFinder_pK_BCmetric.png")
        ggsave(filename = pk_file, plot = dd$pk_plot)
    }

    log_info("- Plotting dimension reduction ...")
    type_col <- paste0(detector, "_DropletType")
    dimp <- DimPlot(
        sobj,
        group.by = type_col,
        order = "doublet",
        cols = c("#333333", "#FF3333"),
        pt.size = 0.8,
        alpha = 0.5
    )
    dim_file <- file.path(plotsdir, paste0(detector, "_dimplot.png"))
    ggsave(filename = dim_file, plot = dimp)
}
|
|
162
|
+
|
|
163
|
+
# Keep only the cells that the detector classified as "singlet".
#
# @param sobj     Seurat object to subset.
# @param dd       Result list of the detector; the row names of `dd$doublets`
#                 are used as cell names.
# @param detector Detector name, selecting the <detector>_DropletType column.
# @return The Seurat object restricted to singlets.
filter_dd <- function(sobj, dd, detector) {
    type_col <- paste0(detector, "_DropletType")
    is_singlet <- dd$doublets[[type_col]] == "singlet"
    singlets <- dd$doublets[is_singlet, , drop = FALSE]
    subset(sobj, cells = rownames(singlets))
}
|
|
170
|
+
|
|
171
|
+
# Register the doublet-detection outputs in the report: the summary table
# (<detector>_summary.txt in `joboutdir`) and the plots in `plotsdir`.
#
# @param detector Detector name; "DoubletFinder" additionally reports the
#                 pK-vs-BCmetric plot.
report_dd <- function(detector) {
    add_report(
        list(
            kind = "descr",
            content = "The table contains the number of cells classified as singlets and doublets."
        ),
        list(
            kind = "table",
            data = list(path = file.path(joboutdir, paste0(detector, "_summary.txt")))
        ),
        h1 = paste0(detector, " Results"),
        h2 = paste0("The ", detector, " Summary")
    )

    if (detector == "DoubletFinder") {
        add_report(
            # The file name must match the one written for the pK plot,
            # which is DoubletFinder_pK_BCmetric.png
            list(
                name = "pK vs BCmetric",
                src = file.path(plotsdir, "DoubletFinder_pK_BCmetric.png")
            ),
            list(
                name = "Dimension Reduction Plot",
                src = file.path(plotsdir, "DoubletFinder_dimplot.png")
            ),
            ui = "table_of_images",
            h1 = "DoubletFinder Results",
            h2 = "Plots"
        )
    } else {
        add_report(
            list(
                name = "Dimension Reduction Plot",
                src = file.path(plotsdir, "scDblFinder_dimplot.png")
            ),
            ui = "table_of_images",
            h1 = "scDblFinder Results",
            h2 = "Plots"
        )
    }
}
|