biopipen 0.29.2__py3-none-any.whl → 0.30.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of biopipen might be problematic. Click here for more details.
- biopipen/__init__.py +1 -1
- biopipen/core/config.toml +2 -0
- biopipen/core/filters.py +21 -0
- biopipen/ns/plot.py +55 -0
- biopipen/ns/scrna.py +49 -13
- biopipen/ns/web.py +87 -5
- biopipen/scripts/bam/CNAClinic.R +2 -1
- biopipen/scripts/cellranger/CellRangerCount.py +3 -3
- biopipen/scripts/cellranger/CellRangerSummary.R +2 -1
- biopipen/scripts/cnv/AneuploidyScore.R +1 -1
- biopipen/scripts/cnv/AneuploidyScoreSummary.R +2 -2
- biopipen/scripts/delim/RowsBinder.R +1 -1
- biopipen/scripts/delim/SampleInfo.R +3 -2
- biopipen/scripts/gene/GeneNameConversion.R +2 -2
- biopipen/scripts/gsea/Enrichr.R +3 -3
- biopipen/scripts/gsea/FGSEA.R +2 -2
- biopipen/scripts/gsea/GSEA.R +2 -2
- biopipen/scripts/gsea/PreRank.R +2 -2
- biopipen/scripts/plot/Heatmap.R +3 -3
- biopipen/scripts/plot/Manhattan.R +2 -1
- biopipen/scripts/plot/QQPlot.R +1 -1
- biopipen/scripts/plot/ROC.R +1 -1
- biopipen/scripts/plot/Scatter.R +112 -0
- biopipen/scripts/plot/VennDiagram.R +3 -3
- biopipen/scripts/regulatory/MotifAffinityTest.R +3 -7
- biopipen/scripts/rnaseq/Simulation.R +1 -1
- biopipen/scripts/rnaseq/UnitConversion.R +2 -1
- biopipen/scripts/scrna/AnnData2Seurat.R +1 -1
- biopipen/scripts/scrna/CellTypeAnnotation-celltypist.R +24 -8
- biopipen/scripts/scrna/CellTypeAnnotation-common.R +10 -0
- biopipen/scripts/scrna/CellTypeAnnotation-direct.R +9 -1
- biopipen/scripts/scrna/CellTypeAnnotation-hitype.R +12 -8
- biopipen/scripts/scrna/CellTypeAnnotation-sccatch.R +15 -2
- biopipen/scripts/scrna/CellTypeAnnotation-sctype.R +38 -15
- biopipen/scripts/scrna/CellTypeAnnotation.R +3 -0
- biopipen/scripts/scrna/CellsDistribution.R +3 -2
- biopipen/scripts/scrna/DimPlots.R +1 -1
- biopipen/scripts/scrna/ExprImputation-alra.R +1 -1
- biopipen/scripts/scrna/MarkersFinder.R +5 -5
- biopipen/scripts/scrna/MetaMarkers.R +4 -4
- biopipen/scripts/scrna/ModuleScoreCalculator.R +2 -1
- biopipen/scripts/scrna/RadarPlots.R +1 -1
- biopipen/scripts/scrna/ScFGSEA.R +4 -3
- biopipen/scripts/scrna/Seurat2AnnData.R +1 -1
- biopipen/scripts/scrna/SeuratClusterStats-clustree.R +73 -0
- biopipen/scripts/scrna/SeuratClusterStats-dimplots.R +4 -3
- biopipen/scripts/scrna/SeuratClusterStats-features.R +6 -5
- biopipen/scripts/scrna/SeuratClusterStats-hists.R +6 -5
- biopipen/scripts/scrna/SeuratClusterStats-ngenes.R +4 -3
- biopipen/scripts/scrna/SeuratClusterStats-stats.R +4 -3
- biopipen/scripts/scrna/SeuratClusterStats.R +24 -8
- biopipen/scripts/scrna/SeuratClustering-common.R +213 -0
- biopipen/scripts/scrna/SeuratClustering.R +10 -170
- biopipen/scripts/scrna/SeuratMap2Ref.R +65 -31
- biopipen/scripts/scrna/SeuratMetadataMutater.R +2 -2
- biopipen/scripts/scrna/SeuratPreparing-common.R +452 -0
- biopipen/scripts/scrna/SeuratPreparing-doublet_detection.R +201 -0
- biopipen/scripts/scrna/SeuratPreparing.R +22 -562
- biopipen/scripts/scrna/SeuratSubClustering.R +24 -39
- biopipen/scripts/scrna/TopExpressingGenes.R +1 -1
- biopipen/scripts/scrna_metabolic_landscape/MetabolicFeatures.R +2 -2
- biopipen/scripts/scrna_metabolic_landscape/MetabolicFeaturesIntraSubset.R +2 -2
- biopipen/scripts/scrna_metabolic_landscape/MetabolicPathwayActivity.R +3 -3
- biopipen/scripts/scrna_metabolic_landscape/MetabolicPathwayHeterogeneity.R +3 -3
- biopipen/scripts/snp/MatrixEQTL.R +1 -1
- biopipen/scripts/snp/PlinkCallRate.R +2 -2
- biopipen/scripts/snp/PlinkFreq.R +2 -2
- biopipen/scripts/snp/PlinkHWE.R +2 -2
- biopipen/scripts/snp/PlinkHet.R +2 -2
- biopipen/scripts/snp/PlinkIBD.R +2 -2
- biopipen/scripts/stats/ChowTest.R +1 -1
- biopipen/scripts/stats/DiffCoexpr.R +1 -1
- biopipen/scripts/stats/LiquidAssoc.R +1 -1
- biopipen/scripts/stats/Mediation.R +11 -9
- biopipen/scripts/stats/MetaPvalue.R +4 -1
- biopipen/scripts/stats/MetaPvalue1.R +4 -1
- biopipen/scripts/tcr/Attach2Seurat.R +1 -1
- biopipen/scripts/tcr/CDR3AAPhyschem.R +1 -1
- biopipen/scripts/tcr/CloneResidency.R +2 -2
- biopipen/scripts/tcr/CloneSizeQQPlot.R +2 -2
- biopipen/scripts/tcr/Immunarch-basic.R +0 -4
- biopipen/scripts/tcr/Immunarch-clonality.R +0 -4
- biopipen/scripts/tcr/Immunarch-diversity.R +2 -24
- biopipen/scripts/tcr/Immunarch-geneusage.R +0 -2
- biopipen/scripts/tcr/Immunarch-kmer.R +0 -2
- biopipen/scripts/tcr/Immunarch-overlap.R +0 -2
- biopipen/scripts/tcr/Immunarch-spectratyping.R +0 -2
- biopipen/scripts/tcr/Immunarch-tracking.R +0 -2
- biopipen/scripts/tcr/Immunarch-vjjunc.R +0 -2
- biopipen/scripts/tcr/Immunarch.R +43 -11
- biopipen/scripts/tcr/ImmunarchFilter.R +1 -1
- biopipen/scripts/tcr/ImmunarchLoading.R +2 -2
- biopipen/scripts/tcr/SampleDiversity.R +1 -1
- biopipen/scripts/tcr/TCRClusterStats.R +2 -2
- biopipen/scripts/tcr/TCRClustering.R +2 -2
- biopipen/scripts/tcr/TESSA.R +2 -2
- biopipen/scripts/vcf/TruvariBenchSummary.R +2 -2
- biopipen/scripts/vcf/TruvariConsistency.R +1 -1
- biopipen/scripts/web/GCloudStorageDownloadBucket.py +82 -0
- biopipen/scripts/web/GCloudStorageDownloadFile.py +23 -0
- biopipen/scripts/web/gcloud_common.py +49 -0
- {biopipen-0.29.2.dist-info → biopipen-0.30.0.dist-info}/METADATA +1 -1
- {biopipen-0.29.2.dist-info → biopipen-0.30.0.dist-info}/RECORD +105 -96
- {biopipen-0.29.2.dist-info → biopipen-0.30.0.dist-info}/WHEEL +0 -0
- {biopipen-0.29.2.dist-info → biopipen-0.30.0.dist-info}/entry_points.txt +0 -0
|
@@ -1,4 +1,4 @@
|
|
|
1
|
-
|
|
1
|
+
{{ biopipen_dir | joinpaths: "utils", "misc.R" | source_r }}
|
|
2
2
|
|
|
3
3
|
library(parallel)
|
|
4
4
|
library(Seurat)
|
|
@@ -17,6 +17,7 @@ refnorm = {{envs.refnorm | r}}
|
|
|
17
17
|
ncores = {{envs.ncores | r}}
|
|
18
18
|
split_by = {{envs.split_by | r}}
|
|
19
19
|
mutaters = {{envs.mutaters | r}}
|
|
20
|
+
skip_if_normalized = {{envs.skip_if_normalized | r}}
|
|
20
21
|
sctransform_args = {{envs.SCTransform | r: todot="-"}}
|
|
21
22
|
normalizedata_args = {{envs.NormalizeData | r: todot="-"}}
|
|
22
23
|
findtransferanchors_args = {{envs.FindTransferAnchors | r: todot="-"}}
|
|
@@ -40,7 +41,7 @@ mapquery_args$refdata[[use]] = use
|
|
|
40
41
|
|
|
41
42
|
outdir = dirname(outfile)
|
|
42
43
|
if (is.null(split_by)) {
|
|
43
|
-
options(future.globals.maxSize =
|
|
44
|
+
options(future.globals.maxSize = 8 * 1024 ^ 4)
|
|
44
45
|
future::plan(strategy = "multicore", workers = ncores)
|
|
45
46
|
}
|
|
46
47
|
|
|
@@ -98,6 +99,7 @@ if (refnorm == "SCTransform") {
|
|
|
98
99
|
# Load Seurat object
|
|
99
100
|
log_info("- Loading Seurat object")
|
|
100
101
|
sobj = readRDS(sobjfile)
|
|
102
|
+
defassay <- DefaultAssay(sobj)
|
|
101
103
|
|
|
102
104
|
if (!is.null(mutaters) && length(mutaters) > 0) {
|
|
103
105
|
log_info("- Applying mutaters")
|
|
@@ -126,43 +128,61 @@ if (!is.null(split_by)) {
|
|
|
126
128
|
# Normalize data
|
|
127
129
|
log_info("- Normalizing data")
|
|
128
130
|
if (refnorm == "SCTransform") {
|
|
129
|
-
|
|
130
|
-
|
|
131
|
-
|
|
132
|
-
sctransform_args$object = sobj
|
|
133
|
-
query = do_call(SCTransform, sctransform_args)
|
|
131
|
+
if (defassay == "SCT" && skip_if_normalized) {
|
|
132
|
+
log_warn(" Skipping normalization as the object is already SCTransform'ed")
|
|
133
|
+
query = sobj
|
|
134
134
|
} else {
|
|
135
|
-
|
|
136
|
-
|
|
137
|
-
|
|
138
|
-
|
|
139
|
-
|
|
140
|
-
|
|
141
|
-
|
|
142
|
-
|
|
143
|
-
|
|
144
|
-
|
|
135
|
+
log_info(" Using SCTransform normalization")
|
|
136
|
+
sctransform_args$residual.features = rownames(x = reference)
|
|
137
|
+
if (is.null(split_by)) {
|
|
138
|
+
sctransform_args$object = sobj
|
|
139
|
+
query = do_call(SCTransform, sctransform_args)
|
|
140
|
+
sctransform_args$object <- NULL
|
|
141
|
+
rm(sctransform_args)
|
|
142
|
+
gc()
|
|
143
|
+
} else {
|
|
144
|
+
query = mclapply(
|
|
145
|
+
X = sobj,
|
|
146
|
+
FUN = function(x) {
|
|
147
|
+
sctransform_args$object = x
|
|
148
|
+
do_call(SCTransform, sctransform_args)
|
|
149
|
+
},
|
|
150
|
+
mc.cores = ncores
|
|
151
|
+
)
|
|
152
|
+
if (any(unlist(lapply(query, class)) == "try-error")) {
|
|
153
|
+
stop(paste0("\nmclapply (SCTransform) error:", query))
|
|
154
|
+
}
|
|
145
155
|
}
|
|
146
156
|
}
|
|
147
157
|
} else {
|
|
148
|
-
|
|
149
|
-
|
|
150
|
-
|
|
151
|
-
query = do_call(NormalizeData, normalizedata_args)
|
|
158
|
+
if (defassay == "RNA" && skip_if_normalized) {
|
|
159
|
+
log_warn(" Skipping normalization as the object is already LogNormalize'd")
|
|
160
|
+
query = sobj
|
|
152
161
|
} else {
|
|
153
|
-
|
|
154
|
-
|
|
155
|
-
|
|
156
|
-
|
|
157
|
-
|
|
158
|
-
|
|
159
|
-
|
|
160
|
-
|
|
161
|
-
|
|
162
|
-
|
|
162
|
+
log_info(" Using NormalizeData normalization")
|
|
163
|
+
if (is.null(split_by)) {
|
|
164
|
+
normalizedata_args$object = sobj
|
|
165
|
+
query = do_call(NormalizeData, normalizedata_args)
|
|
166
|
+
} else {
|
|
167
|
+
query = mclapply(
|
|
168
|
+
X = sobj,
|
|
169
|
+
FUN = function(x) {
|
|
170
|
+
normalizedata_args$object = x
|
|
171
|
+
do_call(NormalizeData, normalizedata_args)
|
|
172
|
+
},
|
|
173
|
+
mc.cores = ncores
|
|
174
|
+
)
|
|
175
|
+
if (any(unlist(lapply(query, class)) == "try-error")) {
|
|
176
|
+
stop(paste0("\nmclapply (NormalizeData) error:", query))
|
|
177
|
+
}
|
|
163
178
|
}
|
|
179
|
+
normalizedata_args$object <- NULL
|
|
180
|
+
rm(normalizedata_args)
|
|
181
|
+
gc()
|
|
164
182
|
}
|
|
165
183
|
}
|
|
184
|
+
rm(sobj)
|
|
185
|
+
gc()
|
|
166
186
|
|
|
167
187
|
# Find anchors between query and reference
|
|
168
188
|
log_info("- Finding anchors")
|
|
@@ -170,6 +190,10 @@ findtransferanchors_args$reference = reference
|
|
|
170
190
|
if (is.null(split_by)) {
|
|
171
191
|
findtransferanchors_args$query = query
|
|
172
192
|
anchors = do_call(FindTransferAnchors, findtransferanchors_args)
|
|
193
|
+
findtransferanchors_args$reference = NULL
|
|
194
|
+
findtransferanchors_args$query = NULL
|
|
195
|
+
rm(findtransferanchors_args)
|
|
196
|
+
gc()
|
|
173
197
|
} else {
|
|
174
198
|
anchors = mclapply(
|
|
175
199
|
X = query,
|
|
@@ -191,6 +215,10 @@ if (is.null(split_by)) {
|
|
|
191
215
|
mapquery_args$query = query
|
|
192
216
|
mapquery_args$anchorset = anchors
|
|
193
217
|
query = do_call(MapQuery, mapquery_args)
|
|
218
|
+
mapquery_args$reference = NULL
|
|
219
|
+
mapquery_args$query = NULL
|
|
220
|
+
mapquery_args$anchorset = NULL
|
|
221
|
+
gc()
|
|
194
222
|
} else {
|
|
195
223
|
query = mclapply(
|
|
196
224
|
X = seq_along(query),
|
|
@@ -221,6 +249,9 @@ if (is.null(split_by)) {
|
|
|
221
249
|
if (e$message == "subscript out of bounds") stop(mappingscore_sob_msg)
|
|
222
250
|
stop(e)
|
|
223
251
|
})
|
|
252
|
+
mappingscore_args$anchors = NULL
|
|
253
|
+
rm(mappingscore_args)
|
|
254
|
+
gc()
|
|
224
255
|
} else {
|
|
225
256
|
mappingscore = mclapply(
|
|
226
257
|
X = seq_along(query),
|
|
@@ -266,6 +297,9 @@ if (is.null(split_by)) {
|
|
|
266
297
|
|
|
267
298
|
# Combine the results
|
|
268
299
|
log_info("- Merging the results")
|
|
300
|
+
gc()
|
|
301
|
+
# Memory efficient way to merge the results
|
|
302
|
+
# query = Reduce(function(x, y) merge(x, y, merge.dr = "ref.umap"), query)
|
|
269
303
|
query = merge(query[[1]], query[2:length(query)], merge.dr = "ref.umap")
|
|
270
304
|
}
|
|
271
305
|
|
|
@@ -0,0 +1,452 @@
|
|
|
1
|
+
|
|
2
|
+
#' Render a named list as a single "name = value; name = value" string
#'
#' @param x A named list (or named vector).
#' @return A length-one character vector with one "name = value" entry per
#'   name of `x`, joined by "; ".
stringify_list <- function(x) {
  render_entry <- function(key) {
    paste(key, x[[key]], sep = " = ")
  }
  entries <- sapply(names(x), render_entry)
  paste(entries, collapse = "; ")
}
|
|
5
|
+
|
|
6
|
+
#' Produce a compact one-line description of an argument list
#'
#' Captures what `str()` prints for `args` and joins the printed lines
#' with ", " so the structure fits on a single log line.
#'
#' @param args Any R object, typically a list of function arguments.
#' @return A length-one character vector.
format_args <- function(args) {
  structure_lines <- capture.output(str(args))
  paste(structure_lines, collapse = ", ")
}
|
|
9
|
+
|
|
10
|
+
#' Fallback loader for 10X directories whose files carry a name prefix
#'
#' Stages symlinks named `barcodes.tsv.gz`, `features.tsv.gz` and
#' `matrix.mtx.gz` under `<joboutdir>/renamed/<sample>` that point at the
#' matching files found in `path`, then reads the staged directory with
#' `Read10X()`. Any previously staged directory for the sample is removed
#' first.
#'
#' @param e The condition that triggered this fallback. Its message is
#'   included in the error raised when a required file cannot be found.
#' @param sample Sample name, used as the staging sub-directory name.
#' @param path Directory containing the (possibly prefixed) 10X files.
#' @return The value returned by `Read10X()` for the staged directory.
rename_files <- function(e, sample, path) {
  # `joboutdir` is not an argument; it is read from the enclosing environment.
  tmpdatadir <- file.path(joboutdir, "renamed", sample)
  if (dir.exists(tmpdatadir)) {
    unlink(tmpdatadir, recursive = TRUE)
  }
  dir.create(tmpdatadir, recursive = TRUE, showWarnings = FALSE)

  # Canonical file name -> first matching source file (NA when nothing matches).
  # NOTE(review): `glob()` is not base R; it is presumably provided by a
  # package loaded by the calling script (brace expansion is needed for the
  # genes/features alternative) -- confirm.
  sources <- list(
    "barcodes.tsv.gz" = Sys.glob(file.path(path, "*barcodes.tsv.gz"))[1],
    "features.tsv.gz" = glob(file.path(path, "*{genes,features}.tsv.gz"))[1],
    "matrix.mtx.gz" = Sys.glob(file.path(path, "*matrix.mtx.gz"))[1]
  )

  original_msg <- if (inherits(e, "condition")) conditionMessage(e) else ""

  for (target in names(sources)) {
    src <- sources[[target]]
    # Fail here with context instead of passing NA on to normalizePath() /
    # file.symlink() and surfacing an unrelated error later.
    if (length(src) == 0 || is.na(src)) {
      stop(
        "Cannot find a file for '", target, "' of sample '", sample,
        "' in: ", path, " (original error: ", original_msg, ")",
        call. = FALSE
      )
    }
    linked <- file.symlink(normalizePath(src), file.path(tmpdatadir, target))
    if (!isTRUE(linked)) {
      stop(
        "Failed to link '", src, "' as '", target, "' in: ", tmpdatadir,
        call. = FALSE
      )
    }
  }

  Read10X(data.dir = tmpdatadir)
}
|
|
33
|
+
|
|
34
|
+
|
|
35
|
+
#' Compute QC metrics, record the per-cell QC verdict, and drop failing cells
#'
#' Adds the `percent.mt`, `percent.ribo`, `percent.hb` and `percent.plat`
#' metadata columns, evaluates the expression in `envs$cell_qc` against the
#' metadata to obtain a logical `.QC` column, appends the `Sample`, `.QC` and
#' `feats` columns to the global `cell_qc_df`, and returns the object
#' restricted to the cells where `.QC` holds.
#'
#' Reads `envs`, `feats` and `cell_qc_df` from the enclosing environment and
#' updates `cell_qc_df` there via `<<-`.
#'
#' @param sobj A Seurat object with a `Sample` metadata column.
#' @param per_sample Whether this call handles a single sample; only affects
#'   the prefix of the log messages.
#' @return The filtered object, without the temporary `.QC` column.
perform_cell_qc <- function(sobj, per_sample = FALSE) {
  # `log_prefix` is interpolated by name inside the log messages below.
  log_prefix <- if (per_sample) " " else "- "

  log_info("{log_prefix}Adding metadata for QC ...")
  sobj$percent.mt <- PercentageFeatureSet(sobj, pattern = "^MT-")
  sobj$percent.ribo <- PercentageFeatureSet(sobj, pattern = "^RP[SL]")
  sobj$percent.hb <- PercentageFeatureSet(sobj, pattern = "^HB[^(P)]")
  sobj$percent.plat <- PercentageFeatureSet(sobj, pattern = "PECAM1|PF4")

  # Without any criteria the expression "TRUE" keeps every cell.
  cell_qc <- envs$cell_qc
  if (is.null(cell_qc) || length(cell_qc) == 0) {
    log_warn("{log_prefix}No cell QC criteria is provided. All cells will be kept.")
    cell_qc <- "TRUE"
  }

  sobj@meta.data <- mutate(sobj@meta.data, .QC = !!rlang::parse_expr(cell_qc))

  # Keep the pre-filter verdicts so the report can show failing cells too.
  qc_snapshot <- sobj@meta.data[, c("Sample", ".QC", feats), drop = FALSE]
  cell_qc_df <<- if (is.null(cell_qc_df)) {
    qc_snapshot
  } else {
    rbind(cell_qc_df, qc_snapshot)
  }

  log_info("{log_prefix}Filtering cells using QC criteria ...")
  sobj <- subset(sobj, subset = .QC)
  sobj$.QC <- NULL

  sobj
}
|
|
65
|
+
|
|
66
|
+
#' Plot the cell-QC metrics and summarise cell counts before/after QC
#'
#' For every entry of `feats`, writes a violin plot (grouped by sample) and,
#' except for `nCount_RNA` itself, a scatter plot against `nCount_RNA` into
#' `plotsdir`, registering each image with `add_report()`. Points are colored
#' by the `.QC` column of the global `cell_qc_df`.
#'
#' Reads `cell_qc_df`, `feats`, `plotsdir` and `samples` from the enclosing
#' environment.
#'
#' @param ngenes Gene count reported in the `nGenes` column of both rows.
#' @return A two-row data frame with columns `when`, `nCells` and `nGenes`
#'   ("Before_Cell_QC" counts all cells, "After_Cell_QC" only those with
#'   `.QC` set).
report_cell_qc <- function(ngenes) {
  # Sentences shared by both section descriptions.
  shared_descr <- c(
    "The cells that fail the QC criteria are colored in red, and",
    "the cells that pass the QC criteria are colored in black.",
    "The cells that fail the QC criteria are filtered out in the returned Seurat object."
  )
  # Same color scale for every plot; built per plot, as before.
  qc_color_scale <- function() {
    scale_color_manual(
      values = c("#181818", pal_biopipen()(1)),
      breaks = c(TRUE, FALSE)
    )
  }

  # ---- Violin plots ----
  log_info("- Plotting violin plots ...")
  violin_descr <- paste(
    c(
      "The violin plots for each feature. The cells are grouped by sample.",
      shared_descr
    ),
    collapse = " "
  )
  add_report(
    list(kind = "descr", content = violin_descr),
    h1 = "Violin Plots"
  )

  # The loop variable must stay `feat`: the log messages interpolate it.
  for (feat in feats) {
    log_info(" For feature: {feat}")
    violin <- ggplot(
      cell_qc_df,
      aes(x = Sample, y = !!sym(feat), color = .QC)
    ) +
      geom_violin(fill = "white", width = 0.5) +
      geom_jitter(width = 0.2, height = 0, alpha = 0.5) +
      qc_color_scale() +
      labs(x = "Sample", y = feat) +
      theme_minimal()

    violin_file <- file.path(plotsdir, paste0(slugify(feat), ".vln.png"))
    # Widen the canvas with the number of samples on the x axis.
    png(
      violin_file,
      width = 800 + length(samples) * 15,
      height = 600,
      res = 100
    )
    print(violin)
    dev.off()

    add_report(
      list(
        src = violin_file,
        name = feat,
        descr = paste0("Distribution of ", feat, " for each sample.")
      ),
      h1 = "Violin Plots",
      ui = "table_of_images"
    )
  }

  # ---- Scatter plots against nCount_RNA ----
  log_info("- Plotting scatter plots ...")
  scatter_descr <- paste(
    c(
      "The scatter plots for each feature against nCount_RNA. ",
      shared_descr
    ),
    collapse = " "
  )
  add_report(
    list(kind = "descr", content = scatter_descr),
    h1 = "Scatter Plots"
  )

  for (feat in setdiff(feats, "nCount_RNA")) {
    log_info(" For feature: {feat}, against nCount_RNA")
    scatter <- ggplot(
      cell_qc_df,
      aes(x = nCount_RNA, y = !!sym(feat), color = .QC)
    ) +
      geom_point() +
      qc_color_scale() +
      labs(x = "nCount_RNA", y = feat) +
      theme_minimal()

    scatter_file <- file.path(
      plotsdir,
      paste0(slugify(feat), "-nCount_RNA.scatter.png")
    )
    png(scatter_file, width = 800, height = 600, res = 100)
    print(scatter)
    dev.off()

    add_report(
      list(
        src = scatter_file,
        name = paste0(feat, " vs nCount_RNA"),
        descr = paste0("Scatter plot for ", feat, " against nCount_RNA")
      ),
      h1 = "Scatter Plots",
      ui = "table_of_images"
    )
  }

  # ---- Cell counts before and after QC (not split by sample) ----
  before_qc <- cell_qc_df %>%
    summarise(
      when = "Before_Cell_QC",
      nCells = dplyr::n(),
      nGenes = ngenes
    ) %>%
    ungroup()

  after_qc <- cell_qc_df %>%
    filter(.QC) %>%
    summarise(
      when = "After_Cell_QC",
      nCells = dplyr::n(),
      nGenes = ngenes
    ) %>%
    ungroup()

  rbind(before_qc, after_qc)
}
|
|
170
|
+
|
|
171
|
+
#' Load one sample into a Seurat object and attach its metadata
#'
#' Looks the sample up in the global `metadata` table, reads its expression
#' data from the `RNAData` path (a 10X directory via `Read10X()`, falling
#' back to `rename_files()` when that fails; anything else via
#' `Read10X_h5()`), prefixes the cell names with the sample name and copies
#' every other metadata column onto the object. Depending on `envs`, it also
#' runs per-sample cell QC and prepares the RNA assay for later SCT use.
#'
#' @param sample Sample name, matched against `metadata$Sample`.
#' @return A Seurat object, or `NULL` (with a warning) when the sample has no
#'   usable `RNAData` path.
load_sample <- function(sample) {
  log_info("- Loading sample: {sample} ...")

  # `%in%` never yields NA, so rows with a missing sample name cannot leak in
  # as all-NA rows the way they do with `==`.
  mdata <- as.data.frame(metadata)[metadata$Sample %in% sample, , drop = TRUE]
  path <- as.character(mdata$RNAData)

  # The scalar checks below need exactly one path.
  if (length(path) > 1) {
    stop(
      "Multiple metadata rows found for sample: ", sample,
      call. = FALSE
    )
  }
  if (length(path) == 0 || is.na(path) || nchar(path) == 0 || path == "NA") {
    warning(paste0("No path found for sample: ", sample))
    return(NULL)
  }

  if (dir.exists(path)) {
    # Read10X expects barcodes.tsv.gz, genes.tsv.gz and matrix.mtx.gz, but the
    # files are sometimes prefixed with the sample name, e.g.
    # GSM4143656_SAM24345863-ln1.barcodes.tsv.gz; rename_files() handles that.
    exprs <- tryCatch(
      Read10X(data.dir = path),
      error = function(e) rename_files(e, sample, path)
    )
  } else {
    exprs <- Read10X_h5(path)
  }

  # Keep only the gene-expression matrix when several modalities are returned.
  if ("Gene Expression" %in% names(exprs)) {
    exprs <- exprs[["Gene Expression"]]
  }

  obj <- CreateSeuratObject(exprs, project = sample)
  obj <- RenameCells(obj, add.cell.id = sample)

  # Attach the sample-level metadata, skipping the data-path columns.
  for (mname in setdiff(names(mdata), c("RNAData", "TCRData"))) {
    mdt <- mdata[[mname]]
    if (is.factor(mdt)) {
      mdt <- as.character(mdt)
    }
    obj[[mname]] <- mdt
  }

  if (isTRUE(envs$cell_qc_per_sample)) {
    log_info("- Perform cell QC for sample: {sample} ...")
    obj <- perform_cell_qc(obj, TRUE)
  }

  if (isTRUE(envs$use_sct)) {
    # Gives the RNA assay data and scale.data layers as well, which is useful
    # for visualising genes that are not in the SCT assay.
    obj <- NormalizeData(obj, verbose = FALSE)
    obj <- FindVariableFeatures(obj, verbose = FALSE)
    obj <- ScaleData(obj, verbose = FALSE)
  }

  obj
}
|
|
226
|
+
|
|
227
|
+
#' Filter genes according to `envs$gene_qc`, with caching
#'
#' On a cache hit (non-NULL `data` in the record returned by `get_cached()`)
#' the cached object is returned. Otherwise genes are dropped when their row
#' sum is below `envs$gene_qc$min_cells` (if set and positive) or when their
#' name matches any pattern in `envs$gene_qc$excludes`, and the result is
#' stored with `save_to_cache()`.
#'
#' Reads `envs` and `cache_dir` from the enclosing environment.
#'
#' @param sobj A Seurat object.
#' @return The gene-filtered object (or `sobj` unchanged when no gene filter
#'   is configured).
run_gene_qc <- function(sobj) {
  cache_signature <- list(
    cell_qc = envs$cell_qc,
    gene_qc = envs$gene_qc,
    cell_qc_per_sample = envs$cell_qc_per_sample,
    use_sct = envs$use_sct
  )
  cached <- get_cached(cache_signature, "GeneQC", cache_dir)
  if (!is.null(cached$data)) {
    log_info("Loading gene-QC'ed object from cache ...")
    return(cached$data)
  }

  log_info("Filtering genes ...")
  genes <- rownames(sobj)

  # NOTE(review): this thresholds the per-gene row sum; whether that equals
  # the number of expressing cells depends on what rowSums() returns for this
  # object -- confirm.
  use_min_cells <- !is.null(envs$gene_qc$min_cells) &&
    envs$gene_qc$min_cells > 0
  if (use_min_cells) {
    genes <- genes[Matrix::rowSums(sobj) >= envs$gene_qc$min_cells]
  }

  excludes <- envs$gene_qc$excludes
  use_excludes <- !is.null(excludes)
  if (use_excludes) {
    # A single string may hold several comma-separated patterns.
    if (length(excludes) == 1) {
      excludes <- trimws(unlist(strsplit(excludes, ",")))
    }
    for (ex in excludes) {
      genes <- genes[!grepl(ex, genes)]
    }
  }

  if (use_min_cells || use_excludes) {
    sobj <- subset(sobj, features = genes)
  }

  cached$data <- sobj
  save_to_cache(cached, "GeneQC", cache_dir)
  sobj
}
|
|
267
|
+
|
|
268
|
+
#' Load all samples, merge them and apply cell QC, with caching
#'
#' On a cache hit both the object and the global `cell_qc_df` are restored
#' from the cache record. Otherwise every entry of `samples` is loaded with
#' `load_sample()`, the loaded objects are merged, cell QC is run on the
#' merged object unless it was already done per sample, and the object plus
#' `cell_qc_df` are stored with `save_to_cache()`.
#'
#' Reads `envs`, `samples`, `cache_dir` and `cell_qc_df` from the enclosing
#' environment and may overwrite `cell_qc_df` there.
#'
#' @param sobj Overwritten before use; the incoming value is never read.
#' @return The merged, cell-QC'ed Seurat object.
run_cell_qc <- function(sobj) {
  cached <- get_cached(
    list(
      cell_qc = envs$cell_qc,
      cell_qc_per_sample = envs$cell_qc_per_sample,
      use_sct = envs$use_sct
    ),
    "CellQC",
    cache_dir
  )

  if (!is.null(cached$data)) {
    log_info("Loading cell-QC'ed object from cache ...")
    sobj <- cached$data$sobj
    cell_qc_df <<- cached$data$cell_qc_df
    return(sobj)
  }

  log_info("Reading samples individually ...")
  obj_list <- lapply(samples, load_sample)

  # load_sample() returns NULL (with a warning) for samples without a data
  # path; those must not reach merge().
  obj_list <- Filter(Negate(is.null), obj_list)
  if (length(obj_list) == 0) {
    stop("No sample could be loaded; nothing to merge.", call. = FALSE)
  }

  log_info("Merging samples ...")
  sobj <- Reduce(merge, obj_list)
  rm(obj_list)
  gc()

  # isTRUE() mirrors the test used when QC is applied per sample, so an unset
  # flag means "QC the merged object" instead of raising an error.
  if (!isTRUE(envs$cell_qc_per_sample)) {
    log_info("Performing cell QC ...")
    sobj <- perform_cell_qc(sobj)
  }

  cached$data <- list(sobj = sobj, cell_qc_df = cell_qc_df)
  save_to_cache(cached, "CellQC", cache_dir)

  sobj
}
|
|
298
|
+
|
|
299
|
+
#' Normalise/scale the object and run PCA, with caching
#'
#' On a cache hit the cached object is returned. Otherwise the object goes
#' through either `SCTransform()` (when `envs$use_sct` is TRUE) or
#' `NormalizeData()` -> `FindVariableFeatures()` -> `ScaleData()`, followed by
#' `RunPCA()`. Each step takes its arguments from the `envs` entry of the same
#' name. The result is stored with `save_to_cache()`.
#'
#' The cache signature is `envs` minus the `ncores`, `doublet_detector`,
#' `DoubletFinder`, `scDblFinder` and `IntegrateLayers` entries.
#'
#' Reads `envs` and `cache_dir` from the enclosing environment.
#'
#' @param sobj A Seurat object.
#' @return The transformed object with a PCA reduction.
run_transformation <- function(sobj) {
  envs_cache <- envs
  envs_cache$ncores <- NULL
  envs_cache$doublet_detector <- NULL
  envs_cache$DoubletFinder <- NULL
  envs_cache$scDblFinder <- NULL
  envs_cache$IntegrateLayers <- NULL
  cached <- get_cached(envs_cache, "Transformed", cache_dir)

  if (!is.null(cached$data)) {
    log_info("Loading transformed object from cache ...")
    return(cached$data)
  }

  log_info("Performing transformation/scaling ...")
  # Not joined yet
  # sobj[["RNA"]] <- split(sobj[["RNA"]], f = sobj$Sample)
  if (isTRUE(envs$use_sct)) {
    log_info("- Running SCTransform ...")
    SCTransformArgs <- envs$SCTransform
    # print() goes to stdout only; log_debug() goes through the logger.
    print(paste0(" SCTransform: ", format_args(SCTransformArgs)))
    log_debug(" SCTransform: {format_args(SCTransformArgs)}")
    SCTransformArgs$object <- sobj
    sobj <- do_call(SCTransform, SCTransformArgs)

    # Drop the extra reference to the input object before moving on.
    SCTransformArgs$object <- NULL
    rm(SCTransformArgs)
    gc()
  } else {
    log_info("- Running NormalizeData ...")
    NormalizeDataArgs <- envs$NormalizeData
    print(paste0(" NormalizeData: ", format_args(NormalizeDataArgs)))
    log_debug(" NormalizeData: {format_args(NormalizeDataArgs)}")
    NormalizeDataArgs$object <- sobj
    sobj <- do_call(NormalizeData, NormalizeDataArgs)

    NormalizeDataArgs$object <- NULL
    rm(NormalizeDataArgs)
    gc()

    log_info("- Running FindVariableFeatures ...")
    FindVariableFeaturesArgs <- envs$FindVariableFeatures
    print(paste0(" FindVariableFeatures: ", format_args(FindVariableFeaturesArgs)))
    log_debug(" FindVariableFeatures: {format_args(FindVariableFeaturesArgs)}")
    FindVariableFeaturesArgs$object <- sobj
    sobj <- do_call(FindVariableFeatures, FindVariableFeaturesArgs)

    FindVariableFeaturesArgs$object <- NULL
    rm(FindVariableFeaturesArgs)
    gc()

    log_info("- Running ScaleData ...")
    ScaleDataArgs <- envs$ScaleData
    print(paste0(" ScaleData: ", format_args(ScaleDataArgs)))
    log_debug(" ScaleData: {format_args(ScaleDataArgs)}")
    ScaleDataArgs$object <- sobj
    sobj <- do_call(ScaleData, ScaleDataArgs)

    ScaleDataArgs$object <- NULL
    rm(ScaleDataArgs)
    gc()
  }

  log_info("- Running RunPCA ...")
  RunPCAArgs <- envs$RunPCA
  # Cap the number of PCs at (cells - 1) for the default of 50 as well, not
  # only for a user-supplied value.
  requested_npcs <- if (is.null(RunPCAArgs$npcs)) 50 else RunPCAArgs$npcs
  RunPCAArgs$npcs <- min(requested_npcs, ncol(sobj) - 1)
  print(paste0(" RunPCA: ", format_args(RunPCAArgs)))
  log_debug(" RunPCA: {format_args(RunPCAArgs)}")
  RunPCAArgs$object <- sobj
  sobj <- do_call(RunPCA, RunPCAArgs)

  RunPCAArgs$object <- NULL
  rm(RunPCAArgs)
  gc()

  cached$data <- sobj
  save_to_cache(cached, "Transformed", cache_dir)

  sobj
}
|
|
385
|
+
|
|
386
|
+
#' Integrate the per-sample layers and/or join them, with caching
#'
#' On a cache hit the cached object is returned. Otherwise, unless
#' `envs$no_integration` is set, `IntegrateLayers()` is run with the arguments
#' in `envs$IntegrateLayers`: the method alias is resolved to the integration
#' function of that name, reference sample names are converted to their
#' positions in `samples`, and the name of the new reduction is stored in
#' `sobj@misc$integrated_new_reduction`. When SCT is not used the layers are
#' joined afterwards with `JoinLayers()`. The result is stored with
#' `save_to_cache()`.
#'
#' The cache signature is `envs` minus the `ncores`, `doublet_detector`,
#' `DoubletFinder` and `scDblFinder` entries.
#'
#' Reads `envs`, `samples` and `cache_dir` from the enclosing environment.
#'
#' @param sobj A Seurat object.
#' @return The integrated and/or layer-joined object.
run_integration <- function(sobj) {
  envs_cache <- envs
  envs_cache$ncores <- NULL
  envs_cache$doublet_detector <- NULL
  envs_cache$DoubletFinder <- NULL
  envs_cache$scDblFinder <- NULL
  cached <- get_cached(envs_cache, "Integrated", cache_dir)

  if (!is.null(cached$data)) {
    log_info("Loading integrated/layer-joined object from cache ...")
    return(cached$data)
  }

  if (!envs$no_integration) {
    log_info("- Running IntegrateLayers (method = {envs$IntegrateLayers$method}) ...")
    IntegrateLayersArgs <- envs$IntegrateLayers
    method <- IntegrateLayersArgs$method

    if (!is.null(IntegrateLayersArgs$reference) && is.character(IntegrateLayersArgs$reference)) {
      log_info(" Using reference samples: {paste(IntegrateLayersArgs$reference, collapse = ', ')}")
      reference_idx <- match(IntegrateLayersArgs$reference, samples)
      # An unknown sample name would otherwise become an NA index.
      if (anyNA(reference_idx)) {
        stop(
          "Unknown reference sample(s): ",
          paste(
            IntegrateLayersArgs$reference[is.na(reference_idx)],
            collapse = ", "
          ),
          call. = FALSE
        )
      }
      IntegrateLayersArgs$reference <- reference_idx
      log_info(" Transferred to indices: {paste(IntegrateLayersArgs$reference, collapse = ', ')}")
    }

    # Accepted spellings -> name of the integration function.
    method_aliases <- c(
      CCA = "CCAIntegration", cca = "CCAIntegration",
      RPCA = "RPCAIntegration", rpca = "RPCAIntegration",
      Harmony = "HarmonyIntegration", harmony = "HarmonyIntegration",
      FastMNN = "FastMNNIntegration", fastmnn = "FastMNNIntegration",
      scVI = "scVIIntegration", scvi = "scVIIntegration"
    )
    known_method <- is.character(method) && length(method) == 1 &&
      method %in% names(method_aliases)
    if (!known_method) {
      stop(paste0("Unknown integration method: ", method))
    }
    method <- method_aliases[[method]]

    # isTRUE() matches the `use_sct` test used when samples are loaded.
    if (isTRUE(envs$use_sct) && is.null(IntegrateLayersArgs$normalization.method)) {
      IntegrateLayersArgs$normalization.method <- "SCT"
    }

    # Resolve the function by name; the name comes from the fixed table above.
    IntegrateLayersArgs$method <- get(method, mode = "function")

    new_reductions <- list(
      "CCAIntegration" = "integrated.cca",
      "RPCAIntegration" = "integrated.rpca",
      "HarmonyIntegration" = "harmony",
      "FastMNNIntegration" = "integration.mnn",
      "scVIIntegration" = "integrated.scvi"
    )
    if (is.null(IntegrateLayersArgs$new.reduction)) {
      IntegrateLayersArgs$new.reduction <- new_reductions[[method]]
    }

    print(paste0(" IntegrateLayers: ", format_args(IntegrateLayersArgs)))
    log_debug(" IntegrateLayers: {format_args(IntegrateLayersArgs)}")
    IntegrateLayersArgs$object <- sobj
    sobj <- do_call(IntegrateLayers, IntegrateLayersArgs)

    # Save it for dimension reduction plots
    sobj@misc$integrated_new_reduction <- IntegrateLayersArgs$new.reduction

    # Drop the extra reference to the input object before moving on.
    IntegrateLayersArgs$object <- NULL
    rm(IntegrateLayersArgs)
    gc()
  }

  if (!isTRUE(envs$use_sct)) {
    log_info("- Joining layers ...")
    sobj <- JoinLayers(sobj)
  }

  cached$data <- sobj
  save_to_cache(cached, "Integrated", cache_dir)

  sobj
}
|