biopipen 0.29.2__py3-none-any.whl → 0.30.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of biopipen might be problematic. Click here for more details.
- biopipen/__init__.py +1 -1
- biopipen/core/config.toml +2 -0
- biopipen/core/filters.py +21 -0
- biopipen/ns/plot.py +55 -0
- biopipen/ns/scrna.py +49 -13
- biopipen/ns/web.py +87 -5
- biopipen/scripts/bam/CNAClinic.R +2 -1
- biopipen/scripts/cellranger/CellRangerCount.py +3 -3
- biopipen/scripts/cellranger/CellRangerSummary.R +2 -1
- biopipen/scripts/cnv/AneuploidyScore.R +1 -1
- biopipen/scripts/cnv/AneuploidyScoreSummary.R +2 -2
- biopipen/scripts/delim/RowsBinder.R +1 -1
- biopipen/scripts/delim/SampleInfo.R +3 -2
- biopipen/scripts/gene/GeneNameConversion.R +2 -2
- biopipen/scripts/gsea/Enrichr.R +3 -3
- biopipen/scripts/gsea/FGSEA.R +2 -2
- biopipen/scripts/gsea/GSEA.R +2 -2
- biopipen/scripts/gsea/PreRank.R +2 -2
- biopipen/scripts/plot/Heatmap.R +3 -3
- biopipen/scripts/plot/Manhattan.R +2 -1
- biopipen/scripts/plot/QQPlot.R +1 -1
- biopipen/scripts/plot/ROC.R +1 -1
- biopipen/scripts/plot/Scatter.R +112 -0
- biopipen/scripts/plot/VennDiagram.R +3 -3
- biopipen/scripts/regulatory/MotifAffinityTest.R +3 -7
- biopipen/scripts/rnaseq/Simulation.R +1 -1
- biopipen/scripts/rnaseq/UnitConversion.R +2 -1
- biopipen/scripts/scrna/AnnData2Seurat.R +1 -1
- biopipen/scripts/scrna/CellTypeAnnotation-celltypist.R +24 -8
- biopipen/scripts/scrna/CellTypeAnnotation-common.R +10 -0
- biopipen/scripts/scrna/CellTypeAnnotation-direct.R +9 -1
- biopipen/scripts/scrna/CellTypeAnnotation-hitype.R +12 -8
- biopipen/scripts/scrna/CellTypeAnnotation-sccatch.R +15 -2
- biopipen/scripts/scrna/CellTypeAnnotation-sctype.R +38 -15
- biopipen/scripts/scrna/CellTypeAnnotation.R +3 -0
- biopipen/scripts/scrna/CellsDistribution.R +3 -2
- biopipen/scripts/scrna/DimPlots.R +1 -1
- biopipen/scripts/scrna/ExprImputation-alra.R +1 -1
- biopipen/scripts/scrna/MarkersFinder.R +5 -5
- biopipen/scripts/scrna/MetaMarkers.R +4 -4
- biopipen/scripts/scrna/ModuleScoreCalculator.R +2 -1
- biopipen/scripts/scrna/RadarPlots.R +1 -1
- biopipen/scripts/scrna/ScFGSEA.R +4 -3
- biopipen/scripts/scrna/Seurat2AnnData.R +1 -1
- biopipen/scripts/scrna/SeuratClusterStats-clustree.R +73 -0
- biopipen/scripts/scrna/SeuratClusterStats-dimplots.R +4 -3
- biopipen/scripts/scrna/SeuratClusterStats-features.R +6 -5
- biopipen/scripts/scrna/SeuratClusterStats-hists.R +6 -5
- biopipen/scripts/scrna/SeuratClusterStats-ngenes.R +4 -3
- biopipen/scripts/scrna/SeuratClusterStats-stats.R +4 -3
- biopipen/scripts/scrna/SeuratClusterStats.R +24 -8
- biopipen/scripts/scrna/SeuratClustering-common.R +213 -0
- biopipen/scripts/scrna/SeuratClustering.R +10 -170
- biopipen/scripts/scrna/SeuratMap2Ref.R +65 -31
- biopipen/scripts/scrna/SeuratMetadataMutater.R +2 -2
- biopipen/scripts/scrna/SeuratPreparing-common.R +452 -0
- biopipen/scripts/scrna/SeuratPreparing-doublet_detection.R +201 -0
- biopipen/scripts/scrna/SeuratPreparing.R +22 -562
- biopipen/scripts/scrna/SeuratSubClustering.R +24 -39
- biopipen/scripts/scrna/TopExpressingGenes.R +1 -1
- biopipen/scripts/scrna_metabolic_landscape/MetabolicFeatures.R +2 -2
- biopipen/scripts/scrna_metabolic_landscape/MetabolicFeaturesIntraSubset.R +2 -2
- biopipen/scripts/scrna_metabolic_landscape/MetabolicPathwayActivity.R +3 -3
- biopipen/scripts/scrna_metabolic_landscape/MetabolicPathwayHeterogeneity.R +3 -3
- biopipen/scripts/snp/MatrixEQTL.R +1 -1
- biopipen/scripts/snp/PlinkCallRate.R +2 -2
- biopipen/scripts/snp/PlinkFreq.R +2 -2
- biopipen/scripts/snp/PlinkHWE.R +2 -2
- biopipen/scripts/snp/PlinkHet.R +2 -2
- biopipen/scripts/snp/PlinkIBD.R +2 -2
- biopipen/scripts/stats/ChowTest.R +1 -1
- biopipen/scripts/stats/DiffCoexpr.R +1 -1
- biopipen/scripts/stats/LiquidAssoc.R +1 -1
- biopipen/scripts/stats/Mediation.R +11 -9
- biopipen/scripts/stats/MetaPvalue.R +4 -1
- biopipen/scripts/stats/MetaPvalue1.R +4 -1
- biopipen/scripts/tcr/Attach2Seurat.R +1 -1
- biopipen/scripts/tcr/CDR3AAPhyschem.R +1 -1
- biopipen/scripts/tcr/CloneResidency.R +2 -2
- biopipen/scripts/tcr/CloneSizeQQPlot.R +2 -2
- biopipen/scripts/tcr/Immunarch-basic.R +0 -4
- biopipen/scripts/tcr/Immunarch-clonality.R +0 -4
- biopipen/scripts/tcr/Immunarch-diversity.R +2 -24
- biopipen/scripts/tcr/Immunarch-geneusage.R +0 -2
- biopipen/scripts/tcr/Immunarch-kmer.R +0 -2
- biopipen/scripts/tcr/Immunarch-overlap.R +0 -2
- biopipen/scripts/tcr/Immunarch-spectratyping.R +0 -2
- biopipen/scripts/tcr/Immunarch-tracking.R +0 -2
- biopipen/scripts/tcr/Immunarch-vjjunc.R +0 -2
- biopipen/scripts/tcr/Immunarch.R +43 -11
- biopipen/scripts/tcr/ImmunarchFilter.R +1 -1
- biopipen/scripts/tcr/ImmunarchLoading.R +2 -2
- biopipen/scripts/tcr/SampleDiversity.R +1 -1
- biopipen/scripts/tcr/TCRClusterStats.R +2 -2
- biopipen/scripts/tcr/TCRClustering.R +2 -2
- biopipen/scripts/tcr/TESSA.R +2 -2
- biopipen/scripts/vcf/TruvariBenchSummary.R +2 -2
- biopipen/scripts/vcf/TruvariConsistency.R +1 -1
- biopipen/scripts/web/GCloudStorageDownloadBucket.py +82 -0
- biopipen/scripts/web/GCloudStorageDownloadFile.py +23 -0
- biopipen/scripts/web/gcloud_common.py +49 -0
- {biopipen-0.29.2.dist-info → biopipen-0.30.0.dist-info}/METADATA +1 -1
- {biopipen-0.29.2.dist-info → biopipen-0.30.0.dist-info}/RECORD +105 -96
- {biopipen-0.29.2.dist-info → biopipen-0.30.0.dist-info}/WHEEL +0 -0
- {biopipen-0.29.2.dist-info → biopipen-0.30.0.dist-info}/entry_points.txt +0 -0
|
@@ -1,5 +1,5 @@
|
|
|
1
|
-
|
|
2
|
-
|
|
1
|
+
{{ biopipen_dir | joinpaths: "utils", "misc.R" | source_r }}
|
|
2
|
+
{{ biopipen_dir | joinpaths: "utils", "caching.R" | source_r }}
|
|
3
3
|
|
|
4
4
|
library(Seurat)
|
|
5
5
|
library(future)
|
|
@@ -26,9 +26,7 @@ options(future.rng.onMisuse="ignore")
|
|
|
26
26
|
options(Seurat.object.assay.version = "v5")
|
|
27
27
|
plan(strategy = "multicore", workers = envs$ncores)
|
|
28
28
|
|
|
29
|
-
.
|
|
30
|
-
paste(sapply(names(x), function(n) paste(n, x[[n]], sep = " = ") ), collapse = "; ")
|
|
31
|
-
}
|
|
29
|
+
{{ biopipen_dir | joinpaths: "scripts", "scrna", "SeuratPreparing-common.R" | source_r }}
|
|
32
30
|
|
|
33
31
|
add_report(
|
|
34
32
|
list(
|
|
@@ -36,7 +34,7 @@ add_report(
|
|
|
36
34
|
name = "Filters applied",
|
|
37
35
|
content = paste0(
|
|
38
36
|
"<p>Cell filters: ", html_escape(envs$cell_qc), "</p>",
|
|
39
|
-
"<p>Gene filters: ", html_escape(
|
|
37
|
+
"<p>Gene filters: ", html_escape(stringify_list(envs$gene_qc)), "</p>"
|
|
40
38
|
)
|
|
41
39
|
),
|
|
42
40
|
h1 = "Filters and QC"
|
|
@@ -82,302 +80,16 @@ feats = c(
|
|
|
82
80
|
"percent.mt", "percent.ribo", "percent.hb", "percent.plat"
|
|
83
81
|
)
|
|
84
82
|
|
|
85
|
-
|
|
86
|
-
tmpdatadir = file.path(joboutdir, "renamed", sample)
|
|
87
|
-
if (dir.exists(tmpdatadir)) {
|
|
88
|
-
unlink(tmpdatadir, recursive = TRUE)
|
|
89
|
-
}
|
|
90
|
-
dir.create(tmpdatadir, recursive = TRUE, showWarnings = FALSE)
|
|
91
|
-
barcodefile = Sys.glob(file.path(path, "*barcodes.tsv.gz"))[1]
|
|
92
|
-
file.symlink(
|
|
93
|
-
normalizePath(barcodefile),
|
|
94
|
-
file.path(tmpdatadir, "barcodes.tsv.gz")
|
|
95
|
-
)
|
|
96
|
-
genefile = glob(file.path(path, "*{genes,features}.tsv.gz"))[1]
|
|
97
|
-
file.symlink(
|
|
98
|
-
normalizePath(genefile),
|
|
99
|
-
file.path(tmpdatadir, "features.tsv.gz")
|
|
100
|
-
)
|
|
101
|
-
matrixfile = Sys.glob(file.path(path, "*matrix.mtx.gz"))[1]
|
|
102
|
-
file.symlink(
|
|
103
|
-
normalizePath(matrixfile),
|
|
104
|
-
file.path(tmpdatadir, "matrix.mtx.gz")
|
|
105
|
-
)
|
|
106
|
-
Read10X(data.dir = tmpdatadir)
|
|
107
|
-
}
|
|
108
|
-
|
|
109
|
-
|
|
110
|
-
perform_cell_qc <- function(sobj, per_sample = FALSE) {
|
|
111
|
-
log_prefix <- ifelse(per_sample, " ", "- ")
|
|
112
|
-
log_info("{log_prefix}Adding metadata for QC ...")
|
|
113
|
-
sobj$percent.mt <- PercentageFeatureSet(sobj, pattern = "^MT-")
|
|
114
|
-
sobj$percent.ribo <- PercentageFeatureSet(sobj, pattern = "^RP[SL]")
|
|
115
|
-
sobj$percent.hb <- PercentageFeatureSet(sobj, pattern = "^HB[^(P)]")
|
|
116
|
-
sobj$percent.plat <- PercentageFeatureSet(sobj, pattern = "PECAM1|PF4")
|
|
117
|
-
|
|
118
|
-
if (is.null(envs$cell_qc) || length(envs$cell_qc) == 0) {
|
|
119
|
-
log_warn("{log_prefix}No cell QC criteria is provided. All cells will be kept.")
|
|
120
|
-
cell_qc <- "TRUE"
|
|
121
|
-
} else {
|
|
122
|
-
cell_qc <- envs$cell_qc
|
|
123
|
-
}
|
|
124
|
-
|
|
125
|
-
sobj@meta.data <- sobj@meta.data %>% mutate(.QC = !!rlang::parse_expr(cell_qc))
|
|
126
|
-
|
|
127
|
-
if (is.null(cell_qc_df)) {
|
|
128
|
-
cell_qc_df <<- sobj@meta.data[, c("Sample", ".QC", feats), drop = FALSE]
|
|
129
|
-
} else {
|
|
130
|
-
cell_qc_df <<- rbind(cell_qc_df, sobj@meta.data[, c("Sample", ".QC", feats), drop = FALSE])
|
|
131
|
-
}
|
|
132
|
-
|
|
133
|
-
# Do the filtering
|
|
134
|
-
log_info("{log_prefix}Filtering cells using QC criteria ...")
|
|
135
|
-
sobj <- subset(sobj, subset = .QC)
|
|
136
|
-
sobj$.QC <- NULL
|
|
137
|
-
|
|
138
|
-
return(sobj)
|
|
139
|
-
}
|
|
140
|
-
|
|
141
|
-
report_cell_qc = function(ngenes) {
|
|
142
|
-
# uses cell_qc_df
|
|
143
|
-
|
|
144
|
-
# Violin plots
|
|
145
|
-
log_info("- Plotting violin plots ...")
|
|
146
|
-
add_report(
|
|
147
|
-
list(
|
|
148
|
-
kind = "descr",
|
|
149
|
-
content = paste(
|
|
150
|
-
"The violin plots for each feature. The cells are grouped by sample.",
|
|
151
|
-
"The cells that fail the QC criteria are colored in red, and",
|
|
152
|
-
"the cells that pass the QC criteria are colored in black.",
|
|
153
|
-
"The cells that fail the QC criteria are filtered out in the returned Seurat object."
|
|
154
|
-
)
|
|
155
|
-
),
|
|
156
|
-
h1 = "Violin Plots"
|
|
157
|
-
)
|
|
158
|
-
for (feat in feats) {
|
|
159
|
-
log_info(" For feature: {feat}")
|
|
160
|
-
vln_p <- ggplot(cell_qc_df, aes(x = Sample, y = !!sym(feat), color = .QC)) +
|
|
161
|
-
geom_violin(fill = "white", width = 0.5) +
|
|
162
|
-
geom_jitter(width = 0.2, height = 0, alpha = 0.5) +
|
|
163
|
-
scale_color_manual(values = c("#181818", pal_biopipen()(1)), breaks = c(TRUE, FALSE)) +
|
|
164
|
-
labs(x = "Sample", y = feat) +
|
|
165
|
-
theme_minimal()
|
|
166
|
-
|
|
167
|
-
vlnplot = file.path(plotsdir, paste0(slugify(feat), ".vln.png"))
|
|
168
|
-
png(
|
|
169
|
-
vlnplot,
|
|
170
|
-
width = 800 + length(samples) * 15, height = 600, res = 100
|
|
171
|
-
)
|
|
172
|
-
print(vln_p)
|
|
173
|
-
dev.off()
|
|
174
|
-
|
|
175
|
-
add_report(
|
|
176
|
-
list(
|
|
177
|
-
src = vlnplot,
|
|
178
|
-
name = feat,
|
|
179
|
-
descr = paste0("Distribution of ", feat, " for each sample.")
|
|
180
|
-
),
|
|
181
|
-
h1 = "Violin Plots",
|
|
182
|
-
ui = "table_of_images"
|
|
183
|
-
)
|
|
184
|
-
}
|
|
185
|
-
|
|
186
|
-
# Scatter plots against nCount_RNA
|
|
187
|
-
log_info("- Plotting scatter plots ...")
|
|
188
|
-
add_report(
|
|
189
|
-
list(
|
|
190
|
-
kind = "descr",
|
|
191
|
-
content = paste(
|
|
192
|
-
"The scatter plots for each feature against nCount_RNA. ",
|
|
193
|
-
"The cells that fail the QC criteria are colored in red, and",
|
|
194
|
-
"the cells that pass the QC criteria are colored in black.",
|
|
195
|
-
"The cells that fail the QC criteria are filtered out in the returned Seurat object."
|
|
196
|
-
)
|
|
197
|
-
),
|
|
198
|
-
h1 = "Scatter Plots"
|
|
199
|
-
)
|
|
200
|
-
for (feat in setdiff(feats, "nCount_RNA")) {
|
|
201
|
-
log_info(" For feature: {feat}, against nCount_RNA")
|
|
202
|
-
scat_p <- ggplot(cell_qc_df, aes(x = nCount_RNA, y = !!sym(feat), color = .QC)) +
|
|
203
|
-
geom_point() +
|
|
204
|
-
scale_color_manual(values = c("#181818", pal_biopipen()(1)), breaks = c(TRUE, FALSE)) +
|
|
205
|
-
labs(x = "nCount_RNA", y = feat) +
|
|
206
|
-
theme_minimal()
|
|
207
|
-
|
|
208
|
-
scatfile = file.path(plotsdir, paste0(slugify(feat), "-nCount_RNA.scatter.png"))
|
|
209
|
-
png(scatfile, width = 800, height = 600, res = 100)
|
|
210
|
-
print(scat_p)
|
|
211
|
-
dev.off()
|
|
212
|
-
|
|
213
|
-
add_report(
|
|
214
|
-
list(
|
|
215
|
-
src = scatfile,
|
|
216
|
-
name = paste0(feat, " vs nCount_RNA"),
|
|
217
|
-
descr = paste0("Scatter plot for ", feat, " against nCount_RNA")
|
|
218
|
-
),
|
|
219
|
-
h1 = "Scatter Plots",
|
|
220
|
-
ui = "table_of_images"
|
|
221
|
-
)
|
|
222
|
-
}
|
|
223
|
-
|
|
224
|
-
# return the dim_df calculated from the cell_qc_df
|
|
225
|
-
rbind(
|
|
226
|
-
cell_qc_df %>%
|
|
227
|
-
# group_by(Sample) %>%
|
|
228
|
-
summarise(
|
|
229
|
-
when = "Before_Cell_QC",
|
|
230
|
-
nCells = dplyr::n(),
|
|
231
|
-
nGenes = ngenes
|
|
232
|
-
) %>%
|
|
233
|
-
ungroup(),
|
|
234
|
-
cell_qc_df %>%
|
|
235
|
-
filter(.QC) %>%
|
|
236
|
-
# group_by(Sample) %>%
|
|
237
|
-
summarise(
|
|
238
|
-
when = "After_Cell_QC",
|
|
239
|
-
nCells = dplyr::n(),
|
|
240
|
-
nGenes = ngenes
|
|
241
|
-
) %>%
|
|
242
|
-
ungroup()
|
|
243
|
-
)
|
|
244
|
-
}
|
|
245
|
-
|
|
246
|
-
load_sample = function(sample) {
|
|
247
|
-
log_info("- Loading sample: {sample} ...")
|
|
248
|
-
mdata = as.data.frame(metadata)[metadata$Sample == sample, , drop=TRUE]
|
|
249
|
-
path = as.character(mdata$RNAData)
|
|
250
|
-
if (is.na(path) || !is.character(path) || nchar(path) == 0 || path == "NA") {
|
|
251
|
-
warning(paste0("No path found for sample: ", sample))
|
|
252
|
-
return (NULL)
|
|
253
|
-
}
|
|
254
|
-
|
|
255
|
-
# obj_list = list()
|
|
256
|
-
if (dir.exists(path)) {
|
|
257
|
-
exprs = tryCatch(
|
|
258
|
-
# Read10X requires
|
|
259
|
-
# - barcodes.tsv.gz
|
|
260
|
-
# - genes.tsv.gz
|
|
261
|
-
# - matrix.mtx.gz
|
|
262
|
-
# But sometimes, they are prefixed with sample name
|
|
263
|
-
# e.g.GSM4143656_SAM24345863-ln1.barcodes.tsv.gz
|
|
264
|
-
{ Read10X(data.dir = path) },
|
|
265
|
-
error = function(e) rename_files(e, sample, path)
|
|
266
|
-
)
|
|
267
|
-
} else {
|
|
268
|
-
exprs = Read10X_h5(path)
|
|
269
|
-
}
|
|
270
|
-
if ("Gene Expression" %in% names(exprs)) {
|
|
271
|
-
exprs = exprs[["Gene Expression"]]
|
|
272
|
-
}
|
|
273
|
-
obj <- CreateSeuratObject(exprs, project=sample)
|
|
274
|
-
# filter the cells that don't have any gene expressions
|
|
275
|
-
# cell_exprs = colSums(obj@assays$RNA)
|
|
276
|
-
# obj = subset(obj, cells = names(cell_exprs[cell_exprs > 0]))
|
|
277
|
-
obj = RenameCells(obj, add.cell.id = sample)
|
|
278
|
-
# Attach meta data
|
|
279
|
-
for (mname in names(mdata)) {
|
|
280
|
-
if (mname %in% c("RNAData", "TCRData")) { next }
|
|
281
|
-
mdt = mdata[[mname]]
|
|
282
|
-
if (is.factor(mdt)) { mdt = levels(mdt)[mdt] }
|
|
283
|
-
obj[[mname]] = mdt
|
|
284
|
-
}
|
|
285
|
-
|
|
286
|
-
if (isTRUE(envs$cell_qc_per_sample)) {
|
|
287
|
-
log_info("- Perform cell QC for sample: {sample} ...")
|
|
288
|
-
obj = perform_cell_qc(obj, TRUE)
|
|
289
|
-
}
|
|
290
|
-
|
|
291
|
-
if (isTRUE(envs$use_sct)) {
|
|
292
|
-
# so that we have data and scale.data layers on RNA assay
|
|
293
|
-
# useful for visualization in case some genes are not in
|
|
294
|
-
# the SCT assay
|
|
295
|
-
obj = NormalizeData(obj, verbose = FALSE)
|
|
296
|
-
obj = FindVariableFeatures(obj, verbose = FALSE)
|
|
297
|
-
obj = ScaleData(obj, verbose = FALSE)
|
|
298
|
-
}
|
|
299
|
-
obj
|
|
300
|
-
}
|
|
301
|
-
|
|
302
|
-
cached <- get_cached(
|
|
303
|
-
list(cell_qc = envs$cell_qc, cell_qc_per_sample = envs$cell_qc_per_sample, use_sct = envs$use_sct),
|
|
304
|
-
"CellQC",
|
|
305
|
-
cache_dir
|
|
306
|
-
)
|
|
307
|
-
if (!is.null(cached$data)) {
|
|
308
|
-
log_info("Loading cell-QC'ed object from cache ...")
|
|
309
|
-
sobj <- cached$data$sobj
|
|
310
|
-
cell_qc_df <- cached$data$cell_qc_df
|
|
311
|
-
cached$data$sobj <- NULL
|
|
312
|
-
cached$data$cell_qc_df <- NULL
|
|
313
|
-
cached$data <- NULL
|
|
314
|
-
rm(cached)
|
|
315
|
-
gc()
|
|
316
|
-
} else {
|
|
317
|
-
# Load data
|
|
318
|
-
log_info("Reading samples individually ...")
|
|
319
|
-
obj_list = lapply(samples, load_sample)
|
|
320
|
-
|
|
321
|
-
log_info("Merging samples ...")
|
|
322
|
-
sobj = Reduce(merge, obj_list)
|
|
323
|
-
rm(obj_list)
|
|
324
|
-
gc()
|
|
325
|
-
|
|
326
|
-
if (!envs$cell_qc_per_sample) {
|
|
327
|
-
log_info("Performing cell QC ...")
|
|
328
|
-
sobj = perform_cell_qc(sobj)
|
|
329
|
-
}
|
|
330
|
-
|
|
331
|
-
cached$data = list(sobj = sobj, cell_qc_df = cell_qc_df)
|
|
332
|
-
save_to_cache(cached, "CellQC", cache_dir)
|
|
333
|
-
}
|
|
83
|
+
sobj <- run_cell_qc(sobj)
|
|
334
84
|
|
|
335
85
|
# plot and report the QC
|
|
336
86
|
log_info("Plotting and reporting QC ...")
|
|
337
87
|
dim_df = report_cell_qc(nrow(sobj))
|
|
338
88
|
|
|
339
89
|
if (is.list(envs$gene_qc)) {
|
|
340
|
-
|
|
341
|
-
list(
|
|
342
|
-
cell_qc = envs$cell_qc,
|
|
343
|
-
gene_qc = envs$gene_qc,
|
|
344
|
-
cell_qc_per_sample = envs$cell_qc_per_sample,
|
|
345
|
-
use_sct = envs$use_sct
|
|
346
|
-
),
|
|
347
|
-
"GeneQC",
|
|
348
|
-
cache_dir
|
|
349
|
-
)
|
|
350
|
-
if (!is.null(cached$data)) {
|
|
351
|
-
log_info("Loading gene-QC'ed object from cache ...")
|
|
352
|
-
sobj <- cached$data
|
|
353
|
-
cached$data <- NULL
|
|
354
|
-
rm(cached)
|
|
355
|
-
gc()
|
|
356
|
-
} else {
|
|
357
|
-
log_info("Filtering genes ...")
|
|
358
|
-
genes <- rownames(sobj)
|
|
359
|
-
filtered <- FALSE
|
|
360
|
-
if (!is.null(envs$gene_qc$min_cells) && envs$gene_qc$min_cells > 0) {
|
|
361
|
-
genes = genes[Matrix::rowSums(sobj) >= envs$gene_qc$min_cells]
|
|
362
|
-
filtered <- TRUE
|
|
363
|
-
}
|
|
364
|
-
excludes <- envs$gene_qc$excludes
|
|
365
|
-
if (!is.null(excludes)) {
|
|
366
|
-
if (length(excludes) == 1) {
|
|
367
|
-
excludes <- trimws(unlist(strsplit(excludes, ",")))
|
|
368
|
-
}
|
|
369
|
-
for (ex in excludes) {
|
|
370
|
-
genes <- genes[!grepl(ex, genes)]
|
|
371
|
-
}
|
|
372
|
-
filtered <- TRUE
|
|
373
|
-
}
|
|
374
|
-
if (filtered) {
|
|
375
|
-
sobj = subset(sobj, features = genes)
|
|
376
|
-
}
|
|
377
|
-
cached$data <- sobj
|
|
378
|
-
save_to_cache(cached, "GeneQC", cache_dir)
|
|
379
|
-
}
|
|
90
|
+
sobj <- run_gene_qc(sobj)
|
|
380
91
|
}
|
|
92
|
+
|
|
381
93
|
dim_df = rbind(
|
|
382
94
|
dim_df,
|
|
383
95
|
data.frame(
|
|
@@ -405,277 +117,25 @@ add_report(
|
|
|
405
117
|
h1 = "Filters and QC"
|
|
406
118
|
)
|
|
407
119
|
|
|
408
|
-
|
|
409
|
-
|
|
410
|
-
}
|
|
411
|
-
|
|
412
|
-
envs_cache <- envs
|
|
413
|
-
envs_cache$ncores <- NULL
|
|
414
|
-
envs_cache$DoubletFinder <- NULL
|
|
415
|
-
envs_cache$IntegrateLayers <- NULL
|
|
416
|
-
cached <- get_cached(envs_cache, "Transformed", cache_dir)
|
|
417
|
-
if (!is.null(cached$data)) {
|
|
418
|
-
log_info("Loading transformed object from cache ...")
|
|
419
|
-
sobj <- cached$data
|
|
420
|
-
cached$data <- NULL
|
|
421
|
-
rm(cached)
|
|
422
|
-
gc()
|
|
423
|
-
} else {
|
|
424
|
-
log_info("Performing transformation/scaling ...")
|
|
425
|
-
# Not joined yet
|
|
426
|
-
# sobj[["RNA"]] <- split(sobj[["RNA"]], f = sobj$Sample)
|
|
427
|
-
if (envs$use_sct) {
|
|
428
|
-
log_info("- Running SCTransform ...")
|
|
429
|
-
SCTransformArgs <- envs$SCTransform
|
|
430
|
-
# log to stdout but don't populate it to running log
|
|
431
|
-
print(paste0(" SCTransform: ", .formatArgs(SCTransformArgs)))
|
|
432
|
-
log_debug(" SCTransform: {.formatArgs(SCTransformArgs)}")
|
|
433
|
-
SCTransformArgs$object <- sobj
|
|
434
|
-
sobj <- do_call(SCTransform, SCTransformArgs)
|
|
435
|
-
# Default is to use the SCT assay
|
|
436
|
-
|
|
437
|
-
# Cleanup memory
|
|
438
|
-
SCTransformArgs$object <- NULL
|
|
439
|
-
rm(SCTransformArgs)
|
|
440
|
-
gc()
|
|
441
|
-
} else {
|
|
442
|
-
log_info("- Running NormalizeData ...")
|
|
443
|
-
NormalizeDataArgs <- envs$NormalizeData
|
|
444
|
-
print(paste0(" NormalizeData: ", .formatArgs(NormalizeDataArgs)))
|
|
445
|
-
log_debug(" NormalizeData: {.formatArgs(NormalizeDataArgs)}")
|
|
446
|
-
NormalizeDataArgs$object <- sobj
|
|
447
|
-
sobj <- do_call(NormalizeData, NormalizeDataArgs)
|
|
448
|
-
|
|
449
|
-
# Cleanup memory
|
|
450
|
-
NormalizeDataArgs$object <- NULL
|
|
451
|
-
rm(NormalizeDataArgs)
|
|
452
|
-
gc()
|
|
453
|
-
|
|
454
|
-
log_info("- Running FindVariableFeatures ...")
|
|
455
|
-
FindVariableFeaturesArgs <- envs$FindVariableFeatures
|
|
456
|
-
print(paste0(" FindVariableFeatures: ", .formatArgs(FindVariableFeaturesArgs)))
|
|
457
|
-
log_debug(" FindVariableFeatures: {.formatArgs(FindVariableFeaturesArgs)}")
|
|
458
|
-
FindVariableFeaturesArgs$object <- sobj
|
|
459
|
-
sobj <- do_call(FindVariableFeatures, FindVariableFeaturesArgs)
|
|
460
|
-
|
|
461
|
-
# Cleanup memory
|
|
462
|
-
FindVariableFeaturesArgs$object <- NULL
|
|
463
|
-
rm(FindVariableFeaturesArgs)
|
|
464
|
-
gc()
|
|
465
|
-
|
|
466
|
-
log_info("- Running ScaleData ...")
|
|
467
|
-
ScaleDataArgs <- envs$ScaleData
|
|
468
|
-
print(paste0(" ScaleData: ", .formatArgs(ScaleDataArgs)))
|
|
469
|
-
log_debug(" ScaleData: {.formatArgs(ScaleDataArgs)}")
|
|
470
|
-
ScaleDataArgs$object <- sobj
|
|
471
|
-
sobj <- do_call(ScaleData, ScaleDataArgs)
|
|
472
|
-
|
|
473
|
-
# Cleanup memory
|
|
474
|
-
ScaleDataArgs$object <- NULL
|
|
475
|
-
rm(ScaleDataArgs)
|
|
476
|
-
gc()
|
|
477
|
-
}
|
|
478
|
-
|
|
479
|
-
log_info("- Running RunPCA ...")
|
|
480
|
-
RunPCAArgs <- envs$RunPCA
|
|
481
|
-
RunPCAArgs$npcs <- if (is.null(RunPCAArgs$npcs)) { 50 } else { min(RunPCAArgs$npcs, ncol(sobj) - 1) }
|
|
482
|
-
print(paste0(" RunPCA: ", .formatArgs(RunPCAArgs)))
|
|
483
|
-
log_debug(" RunPCA: {.formatArgs(RunPCAArgs)}")
|
|
484
|
-
RunPCAArgs$object <- sobj
|
|
485
|
-
sobj <- do_call(RunPCA, RunPCAArgs)
|
|
486
|
-
|
|
487
|
-
# Cleanup memory
|
|
488
|
-
RunPCAArgs$object <- NULL
|
|
489
|
-
rm(RunPCAArgs)
|
|
490
|
-
gc()
|
|
491
|
-
|
|
492
|
-
cached$data <- sobj
|
|
493
|
-
save_to_cache(cached, "Transformed", cache_dir)
|
|
494
|
-
}
|
|
495
|
-
|
|
496
|
-
envs_cache <- envs
|
|
497
|
-
envs_cache$ncores <- NULL
|
|
498
|
-
envs_cache$DoubletFinder <- NULL
|
|
499
|
-
cached <- get_cached(envs_cache, "Integrated", cache_dir)
|
|
500
|
-
|
|
501
|
-
if (!is.null(cached$data)) {
|
|
502
|
-
log_info("Loading integrated/layer-joined object from cache ...")
|
|
503
|
-
sobj <- cached$data
|
|
504
|
-
cached$data <- NULL
|
|
505
|
-
rm(cached)
|
|
506
|
-
gc()
|
|
507
|
-
|
|
508
|
-
} else {
|
|
509
|
-
|
|
510
|
-
if (!envs$no_integration) {
|
|
511
|
-
log_info("- Running IntegrateLayers (method = {envs$IntegrateLayers$method}) ...")
|
|
512
|
-
IntegrateLayersArgs <- envs$IntegrateLayers
|
|
513
|
-
method <- IntegrateLayersArgs$method
|
|
514
|
-
if (!is.null(IntegrateLayersArgs$reference) && is.character(IntegrateLayersArgs$reference)) {
|
|
515
|
-
log_info(" Using reference samples: {paste(IntegrateLayersArgs$reference, collapse = ', ')}")
|
|
516
|
-
IntegrateLayersArgs$reference <- match(IntegrateLayersArgs$reference, samples)
|
|
517
|
-
log_info(" Transferred to indices: {paste(IntegrateLayersArgs$reference, collapse = ', ')}")
|
|
518
|
-
}
|
|
519
|
-
if (method %in% c("CCA", "cca")) { method <- "CCAIntegration" } else
|
|
520
|
-
if (method %in% c("RPCA", "rpca")) { method <- "RPCAIntegration" } else
|
|
521
|
-
if (method %in% c("Harmony", "harmony")) { method <- "HarmonyIntegration" } else
|
|
522
|
-
if (method %in% c("FastMNN", "fastmnn")) { method <- "FastMNNIntegration" } else
|
|
523
|
-
if (method %in% c("scVI", "scvi")) { method <- "scVIIntegration" } else
|
|
524
|
-
{ stop(paste0("Unknown integration method: ", method)) }
|
|
525
|
-
if (envs$use_sct && is.null(IntegrateLayersArgs$normalization.method)) {
|
|
526
|
-
IntegrateLayersArgs$normalization.method <- "SCT"
|
|
527
|
-
}
|
|
528
|
-
IntegrateLayersArgs$method <- eval(parse(text = method))
|
|
529
|
-
new_reductions <- list(
|
|
530
|
-
"CCAIntegration" = "integrated.cca",
|
|
531
|
-
"RPCAIntegration" = "integrated.rpca",
|
|
532
|
-
"HarmonyIntegration" = "harmony",
|
|
533
|
-
"FastMNNIntegration" = "integration.mnn",
|
|
534
|
-
"scVIIntegration" = "integrated.scvi"
|
|
535
|
-
)
|
|
536
|
-
if (is.null(IntegrateLayersArgs$new.reduction)) {
|
|
537
|
-
IntegrateLayersArgs$new.reduction <- new_reductions[[method]]
|
|
538
|
-
}
|
|
539
|
-
print(paste0(" IntegrateLayers: ", .formatArgs(IntegrateLayersArgs)))
|
|
540
|
-
log_debug(" IntegrateLayers: {.formatArgs(IntegrateLayersArgs)}")
|
|
541
|
-
IntegrateLayersArgs$object <- sobj
|
|
542
|
-
sobj <- do_call(IntegrateLayers, IntegrateLayersArgs)
|
|
543
|
-
# Save it for dimension reduction plots
|
|
544
|
-
sobj@misc$integrated_new_reduction <- IntegrateLayersArgs$new.reduction
|
|
545
|
-
|
|
546
|
-
# Cleanup memory
|
|
547
|
-
IntegrateLayersArgs$object <- NULL
|
|
548
|
-
rm(IntegrateLayersArgs)
|
|
549
|
-
gc()
|
|
550
|
-
}
|
|
551
|
-
|
|
552
|
-
if (!envs$use_sct) {
|
|
553
|
-
log_info("- Joining layers ...")
|
|
554
|
-
sobj <- JoinLayers(sobj)
|
|
555
|
-
}
|
|
556
|
-
|
|
557
|
-
cached$data <- sobj
|
|
558
|
-
save_to_cache(cached, "Integrated", cache_dir)
|
|
559
|
-
}
|
|
560
|
-
|
|
120
|
+
sobj <- run_transformation(sobj)
|
|
121
|
+
sobj <- run_integration(sobj)
|
|
561
122
|
|
|
562
123
|
# This is the last step, doesn't need to be cached
|
|
563
|
-
if (!is.null(envs$
|
|
564
|
-
|
|
565
|
-
|
|
566
|
-
|
|
567
|
-
|
|
568
|
-
|
|
569
|
-
|
|
570
|
-
|
|
571
|
-
|
|
572
|
-
|
|
573
|
-
|
|
574
|
-
|
|
575
|
-
sobj <- FindClusters(sobj)
|
|
576
|
-
|
|
577
|
-
log_info("- pK Indentification ...")
|
|
578
|
-
sweep.res.list <- paramSweep(
|
|
579
|
-
sobj,
|
|
580
|
-
PCs = 1:envs$DoubletFinder$PCs,
|
|
581
|
-
sct = envs$use_sct,
|
|
582
|
-
num.cores = envs$DoubletFinder$ncores
|
|
583
|
-
)
|
|
584
|
-
sweep.stats <- summarizeSweep(sweep.res.list, GT = FALSE)
|
|
585
|
-
bcmvn <- find.pK(sweep.stats)
|
|
586
|
-
|
|
587
|
-
bcmvn$Selected <- bcmvn$pK == bcmvn$pK[which.max(bcmvn$BCmetric)[1]]
|
|
588
|
-
plot <- ggplot(bcmvn, aes(x = pK, y = BCmetric, color = Selected)) +
|
|
589
|
-
geom_point() +
|
|
590
|
-
# rotate x axis labels
|
|
591
|
-
theme(axis.text.x = element_text(angle = 90, hjust = 1))
|
|
592
|
-
ggsave(plot, filename = file.path(plotsdir, "pK_BCmetric.png"))
|
|
593
|
-
|
|
594
|
-
pK <- bcmvn$pK[which.max(bcmvn$BCmetric)[1]]
|
|
595
|
-
pK <- as.numeric(as.character(pK))
|
|
596
|
-
pN <- envs$DoubletFinder$pN
|
|
597
|
-
log_info("- Homotypic Doublet Proportion Estimate ...")
|
|
598
|
-
homotypic.prop <- modelHomotypic(Idents(sobj))
|
|
599
|
-
nExp_poi <- round(nrow(sobj@meta.data) * envs$DoubletFinder$doublets)
|
|
600
|
-
nExp_poi.adj <- round(nExp_poi * (1 - homotypic.prop))
|
|
601
|
-
|
|
602
|
-
log_info("- Running DoubletFinder ...")
|
|
603
|
-
sobj <- doubletFinder(
|
|
604
|
-
sobj,
|
|
605
|
-
PCs = 1:envs$DoubletFinder$PCs,
|
|
606
|
-
pN = pN,
|
|
607
|
-
pK = pK,
|
|
608
|
-
nExp = nExp_poi.adj,
|
|
609
|
-
reuse.pANN = FALSE,
|
|
610
|
-
sct = envs$use_sct
|
|
611
|
-
)
|
|
612
|
-
pANN_col <- paste0("pANN_", pN, "_", pK)
|
|
613
|
-
pANN_col <- colnames(sobj@meta.data)[grepl(pANN_col, colnames(sobj@meta.data))]
|
|
614
|
-
DF_col <- paste0("DF.classifications_", pN, "_", pK)
|
|
615
|
-
DF_col <- colnames(sobj@meta.data)[grepl(DF_col, colnames(sobj@meta.data))]
|
|
616
|
-
doublets <- as.data.frame(
|
|
617
|
-
cbind(
|
|
618
|
-
colnames(sobj),
|
|
619
|
-
sobj@meta.data[, pANN_col],
|
|
620
|
-
sobj@meta.data[, DF_col]
|
|
621
|
-
)
|
|
622
|
-
)
|
|
623
|
-
colnames(doublets) <- c("Barcode","DoubletFinder_score","DoubletFinder_DropletType")
|
|
624
|
-
write.table(
|
|
625
|
-
doublets,
|
|
626
|
-
file.path(joboutdir, "DoubletFinder_doublets_singlets.txt"),
|
|
627
|
-
row.names = FALSE,
|
|
628
|
-
quote = FALSE,
|
|
629
|
-
sep = "\t"
|
|
630
|
-
)
|
|
631
|
-
|
|
632
|
-
summary <- as.data.frame(table(doublets$DoubletFinder_DropletType))
|
|
633
|
-
colnames(summary) <- c("Classification", "Droplet_N")
|
|
634
|
-
write.table(
|
|
635
|
-
summary,
|
|
636
|
-
file.path(joboutdir, "DoubletFinder_summary.txt"),
|
|
637
|
-
row.names = FALSE,
|
|
638
|
-
quote = FALSE,
|
|
639
|
-
sep = "\t"
|
|
640
|
-
)
|
|
641
|
-
|
|
642
|
-
# Do a dimplot
|
|
643
|
-
log_info("- Plotting dimension reduction ...")
|
|
644
|
-
dimp <- DimPlot(
|
|
645
|
-
sobj, group.by = DF_col, order = "Doublet",
|
|
646
|
-
cols = c("#333333", "#FF3333"), pt.size = 0.8, alpha = 0.5)
|
|
647
|
-
ggsave(dimp, filename = file.path(plotsdir, "DoubletFinder_dimplot.png"))
|
|
648
|
-
|
|
649
|
-
log_info("- Filtering doublets ...")
|
|
650
|
-
sobj <- subset(sobj, cells = doublets$Barcode[doublets$DoubletFinder_DropletType == "Singlet"])
|
|
651
|
-
|
|
652
|
-
add_report(
|
|
653
|
-
list(
|
|
654
|
-
kind = "descr",
|
|
655
|
-
content = "The table contains the number of cells classified as singlets and doublets."
|
|
656
|
-
),
|
|
657
|
-
list(
|
|
658
|
-
kind = "table",
|
|
659
|
-
data = list(path = file.path(joboutdir, "DoubletFinder_summary.txt"))
|
|
660
|
-
),
|
|
661
|
-
h1 = "DoubletFinder Results",
|
|
662
|
-
h2 = "The DoubletFinder Summary"
|
|
663
|
-
)
|
|
664
|
-
add_report(
|
|
665
|
-
list(
|
|
666
|
-
name = "pK vs BCmetric",
|
|
667
|
-
src = file.path(plotsdir, "pK_BCmetric.png")
|
|
668
|
-
),
|
|
669
|
-
list(
|
|
670
|
-
name = "Dimension Reduction Plot",
|
|
671
|
-
src = file.path(plotsdir, "DoubletFinder_dimplot.png")
|
|
672
|
-
),
|
|
673
|
-
ui = "table_of_images",
|
|
674
|
-
h1 = "DoubletFinder Results",
|
|
675
|
-
h2 = "Plots"
|
|
676
|
-
)
|
|
124
|
+
if (!is.null(envs$doublet_detector) && envs$doublet_detector != "none") {
|
|
125
|
+
{{* biopipen_dir | joinpaths: "scripts", "scrna", "SeuratPreparing-doublet_detection.R" | source_r }}
|
|
126
|
+
|
|
127
|
+
detector <- tolower(envs$doublet_detector)
|
|
128
|
+
if (detector == "doubletfinder") detector <- "DoubletFinder"
|
|
129
|
+
if (detector == "scdblfinder") detector <- "scDblFinder"
|
|
130
|
+
dd <- run_dd(detector)
|
|
131
|
+
save_dd(dd, detector)
|
|
132
|
+
sobj <- add_dd_to_seurat(sobj, dd)
|
|
133
|
+
plot_dd(sobj, dd, detector)
|
|
134
|
+
sobj <- filter_dd(sobj, dd, detector)
|
|
135
|
+
report_dd(detector)
|
|
677
136
|
}
|
|
678
137
|
|
|
138
|
+
|
|
679
139
|
log_info("Saving QC'ed seurat object ...")
|
|
680
140
|
saveRDS(sobj, rdsfile)
|
|
681
141
|
|