biopipen 0.32.3__py3-none-any.whl → 0.33.1__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of biopipen might be problematic. Click here for more details.
- biopipen/__init__.py +1 -1
- biopipen/core/config.toml +6 -0
- biopipen/core/filters.py +35 -23
- biopipen/core/testing.py +6 -1
- biopipen/ns/bam.py +39 -0
- biopipen/ns/cellranger.py +5 -0
- biopipen/ns/cellranger_pipeline.py +2 -2
- biopipen/ns/cnvkit_pipeline.py +4 -1
- biopipen/ns/delim.py +33 -27
- biopipen/ns/protein.py +99 -0
- biopipen/ns/scrna.py +428 -250
- biopipen/ns/snp.py +16 -3
- biopipen/ns/tcr.py +125 -1
- biopipen/ns/vcf.py +34 -0
- biopipen/ns/web.py +5 -1
- biopipen/reports/scrna/SeuratClusterStats.svelte +1 -1
- biopipen/reports/scrna/SeuratMap2Ref.svelte +15 -2
- biopipen/reports/tcr/ClonalStats.svelte +15 -0
- biopipen/reports/utils/misc.liq +20 -7
- biopipen/scripts/bam/BamMerge.py +2 -2
- biopipen/scripts/bam/BamSampling.py +4 -4
- biopipen/scripts/bam/BamSort.py +141 -0
- biopipen/scripts/bam/BamSplitChroms.py +10 -10
- biopipen/scripts/bam/BamSubsetByBed.py +3 -3
- biopipen/scripts/bam/CNVpytor.py +10 -10
- biopipen/scripts/bam/ControlFREEC.py +11 -11
- biopipen/scripts/bed/Bed2Vcf.py +5 -5
- biopipen/scripts/bed/BedConsensus.py +5 -5
- biopipen/scripts/bed/BedLiftOver.sh +6 -4
- biopipen/scripts/bed/BedtoolsIntersect.py +4 -4
- biopipen/scripts/bed/BedtoolsMakeWindows.py +3 -3
- biopipen/scripts/bed/BedtoolsMerge.py +4 -4
- biopipen/scripts/cellranger/CellRangerCount.py +20 -9
- biopipen/scripts/cellranger/CellRangerSummary.R +20 -29
- biopipen/scripts/cellranger/CellRangerVdj.py +8 -8
- biopipen/scripts/cnvkit/CNVkitAccess.py +6 -6
- biopipen/scripts/cnvkit/CNVkitAutobin.py +25 -18
- biopipen/scripts/cnvkit/CNVkitBatch.py +5 -5
- biopipen/scripts/cnvkit/CNVkitCall.py +3 -3
- biopipen/scripts/cnvkit/CNVkitCoverage.py +2 -2
- biopipen/scripts/cnvkit/CNVkitDiagram.py +5 -5
- biopipen/scripts/cnvkit/CNVkitFix.py +3 -3
- biopipen/scripts/cnvkit/CNVkitGuessBaits.py +9 -5
- biopipen/scripts/cnvkit/CNVkitHeatmap.py +4 -4
- biopipen/scripts/cnvkit/CNVkitReference.py +2 -2
- biopipen/scripts/cnvkit/CNVkitScatter.py +5 -5
- biopipen/scripts/cnvkit/CNVkitSegment.py +5 -5
- biopipen/scripts/cnvkit/guess_baits.py +166 -93
- biopipen/scripts/delim/SampleInfo.R +94 -148
- biopipen/scripts/misc/Config2File.py +2 -2
- biopipen/scripts/misc/Str2File.py +2 -2
- biopipen/scripts/protein/MMCIF2PDB.py +33 -0
- biopipen/scripts/protein/PDB2Fasta.py +60 -0
- biopipen/scripts/protein/Prodigy.py +4 -4
- biopipen/scripts/protein/RMSD.py +178 -0
- biopipen/scripts/regulatory/MotifScan.py +8 -8
- biopipen/scripts/scrna/CellCellCommunication.py +59 -22
- biopipen/scripts/scrna/LoomTo10X.R +51 -0
- biopipen/scripts/scrna/MarkersFinder.R +273 -654
- biopipen/scripts/scrna/RadarPlots.R +73 -53
- biopipen/scripts/scrna/SCP-plot.R +15202 -0
- biopipen/scripts/scrna/ScVelo.py +0 -0
- biopipen/scripts/scrna/SeuratClusterStats-clustree.R +23 -31
- biopipen/scripts/scrna/SeuratClusterStats-dimplots.R +26 -54
- biopipen/scripts/scrna/SeuratClusterStats-features.R +85 -403
- biopipen/scripts/scrna/SeuratClusterStats-ngenes.R +32 -17
- biopipen/scripts/scrna/SeuratClusterStats-stats.R +45 -239
- biopipen/scripts/scrna/SeuratClusterStats.R +13 -19
- biopipen/scripts/scrna/SeuratMap2Ref.R +16 -12
- biopipen/scripts/scrna/SeuratPreparing.R +138 -81
- biopipen/scripts/scrna/SlingShot.R +71 -0
- biopipen/scripts/scrna/celltypist-wrapper.py +7 -6
- biopipen/scripts/snp/Plink2GTMat.py +26 -11
- biopipen/scripts/snp/PlinkFilter.py +7 -7
- biopipen/scripts/snp/PlinkFromVcf.py +8 -5
- biopipen/scripts/snp/PlinkSimulation.py +4 -4
- biopipen/scripts/snp/PlinkUpdateName.py +4 -4
- biopipen/scripts/stats/ChowTest.R +48 -22
- biopipen/scripts/tcgamaf/Maf2Vcf.py +2 -2
- biopipen/scripts/tcgamaf/MafAddChr.py +2 -2
- biopipen/scripts/tcr/ClonalStats.R +484 -0
- biopipen/scripts/tcr/ScRepLoading.R +127 -0
- biopipen/scripts/tcr/TCRDock.py +10 -6
- biopipen/scripts/tcr/vdjtools-patch.sh +1 -1
- biopipen/scripts/vcf/BcftoolsAnnotate.py +8 -8
- biopipen/scripts/vcf/BcftoolsFilter.py +3 -3
- biopipen/scripts/vcf/BcftoolsMerge.py +31 -0
- biopipen/scripts/vcf/BcftoolsSort.py +4 -4
- biopipen/scripts/vcf/BcftoolsView.py +5 -5
- biopipen/scripts/vcf/Vcf2Bed.py +2 -2
- biopipen/scripts/vcf/VcfAnno.py +11 -11
- biopipen/scripts/vcf/VcfDownSample.sh +22 -10
- biopipen/scripts/vcf/VcfFilter.py +5 -5
- biopipen/scripts/vcf/VcfFix.py +7 -7
- biopipen/scripts/vcf/VcfFix_utils.py +12 -3
- biopipen/scripts/vcf/VcfIndex.py +3 -3
- biopipen/scripts/vcf/VcfIntersect.py +3 -3
- biopipen/scripts/vcf/VcfLiftOver.sh +5 -0
- biopipen/scripts/vcf/VcfSplitSamples.py +4 -4
- biopipen/scripts/vcf/bcftools_utils.py +3 -3
- biopipen/scripts/web/Download.py +8 -4
- biopipen/scripts/web/DownloadList.py +5 -5
- biopipen/scripts/web/GCloudStorageDownloadBucket.py +5 -5
- biopipen/scripts/web/GCloudStorageDownloadFile.py +3 -3
- biopipen/scripts/web/gcloud_common.py +1 -1
- biopipen/utils/gsea.R +75 -35
- biopipen/utils/misc.R +205 -7
- biopipen/utils/misc.py +17 -8
- biopipen/utils/reference.py +11 -11
- biopipen/utils/repr.R +146 -0
- biopipen/utils/vcf.py +1 -1
- {biopipen-0.32.3.dist-info → biopipen-0.33.1.dist-info}/METADATA +8 -8
- {biopipen-0.32.3.dist-info → biopipen-0.33.1.dist-info}/RECORD +115 -105
- {biopipen-0.32.3.dist-info → biopipen-0.33.1.dist-info}/WHEEL +1 -1
- biopipen/scripts/scrna/SeuratClusterStats-hists.R +0 -144
- biopipen/scripts/scrna/SeuratPreparing-common.R +0 -467
- biopipen/scripts/scrna/SeuratPreparing-doublet_detection.R +0 -204
- {biopipen-0.32.3.dist-info → biopipen-0.33.1.dist-info}/entry_points.txt +0 -0
|
@@ -1,467 +0,0 @@
|
|
|
1
|
-
|
|
2
|
-
# Render a named list as a single "name = value; name = value" string.
#
# Args:
#   x: A named list (or named vector). Entries holding several values are
#      joined with ", " so that every entry yields exactly one
#      "name = value" pair.
#
# Returns:
#   A length-one character string; "" when `x` has no names.
stringify_list <- function(x) {
    pairs <- vapply(
        names(x),
        # Collapse the value first so that a vector-valued entry cannot turn
        # the result into a ragged list (which paste() would deparse).
        function(n) paste(n, paste(x[[n]], collapse = ", "), sep = " = "),
        character(1)
    )
    paste(pairs, collapse = "; ")
}
|
|
5
|
-
|
|
6
|
-
# Describe an argument list on a single line.
#
# The structure of `args` is printed with str(); the captured output lines
# are then joined with ", ".
#
# Args:
#   args: Any R object (typically a list of function arguments).
#
# Returns:
#   A length-one character string.
format_args <- function(args) {
    str_lines <- capture.output(str(args))
    paste(str_lines, collapse = ", ")
}
|
|
9
|
-
|
|
10
|
-
# Fallback loader for 10X directories whose files carry a prefix
# (e.g. "<prefix>.barcodes.tsv.gz").
#
# The three files are symlinked under the canonical names
# (barcodes.tsv.gz, features.tsv.gz, matrix.mtx.gz) into
# <joboutdir>/renamed/<sample>, and that directory is then read with
# Read10X(). Any previous content of that directory is removed first.
#
# Args:
#   e:      The condition raised by the first Read10X() attempt; its message
#           is included in the error raised when a file cannot be located.
#   sample: Sample name, used for the name of the temporary directory.
#   path:   Directory holding the original (prefixed) files.
#
# Returns:
#   Whatever Read10X() returns for the directory of symlinks.
#
# Uses the global `joboutdir`.
rename_files <- function(e, sample, path) {
    tmpdatadir <- file.path(joboutdir, "renamed", sample)
    if (dir.exists(tmpdatadir)) {
        unlink(tmpdatadir, recursive = TRUE)
    }
    dir.create(tmpdatadir, recursive = TRUE, showWarnings = FALSE)

    original_error <- if (inherits(e, "condition")) conditionMessage(e) else ""

    # Symlink the first match to its canonical name, failing early (with the
    # original Read10X error attached) when nothing matched.
    link_first_match <- function(matches, what, target) {
        src <- matches[1]
        if (length(src) == 0 || is.na(src)) {
            stop(
                "No ", what, " file found in '", path, "' for sample '", sample,
                "'. Original error: ", original_error,
                call. = FALSE
            )
        }
        file.symlink(normalizePath(src), file.path(tmpdatadir, target))
    }

    link_first_match(
        Sys.glob(file.path(path, "*barcodes.tsv.gz")),
        "barcodes", "barcodes.tsv.gz"
    )
    link_first_match(
        glob(file.path(path, "*{genes,features}.tsv.gz")),
        "genes/features", "features.tsv.gz"
    )
    link_first_match(
        Sys.glob(file.path(path, "*matrix.mtx.gz")),
        "matrix", "matrix.mtx.gz"
    )
    Read10X(data.dir = tmpdatadir)
}
|
|
33
|
-
|
|
34
|
-
|
|
35
|
-
# Annotate a Seurat object with QC metrics and drop the cells failing QC.
#
# Adds percent.mt, percent.ribo, percent.hb and percent.plat to the object,
# evaluates the expression in `envs$cell_qc` against the cell metadata
# (keeping every cell when no expression is configured), appends the
# Sample/.QC/feature columns to the global `cell_qc_df`, and finally subsets
# the object to the passing cells.
#
# Args:
#   sobj:       The Seurat object to filter.
#   per_sample: Whether this is a per-sample run; only affects the prefix of
#               the log messages.
#
# Returns:
#   The filtered Seurat object, without the temporary `.QC` column.
#
# Uses the globals `envs`, `feats` and `cell_qc_df` (the latter is modified).
perform_cell_qc <- function(sobj, per_sample = FALSE) {
    log_prefix <- if (per_sample) " " else "- "
    log_info("{log_prefix}Adding metadata for QC ...")
    sobj$percent.mt <- PercentageFeatureSet(sobj, pattern = "^MT-|^Mt-|^mt-")
    sobj$percent.ribo <- PercentageFeatureSet(sobj, pattern = "^RP[SL]|^Rp[sl]")
    sobj$percent.hb <- PercentageFeatureSet(sobj, pattern = "^HB[^P]|^Hb[^p]")
    sobj$percent.plat <- PercentageFeatureSet(sobj, pattern = "PECAM1|PF4|Pecam1|Pf4")

    cell_qc <- envs$cell_qc
    if (is.null(cell_qc) || length(cell_qc) == 0) {
        log_warn("{log_prefix}No cell QC criteria is provided. All cells will be kept.")
        # An always-true expression keeps every cell.
        cell_qc <- "TRUE"
    }

    qc_expr <- rlang::parse_expr(cell_qc)
    sobj@meta.data <- mutate(sobj@meta.data, .QC = !!qc_expr)

    # Accumulate the pre-filter QC table across calls for later reporting.
    if (is.null(cell_qc_df)) {
        cell_qc_df <<- sobj@meta.data[, c("Sample", ".QC", feats), drop = FALSE]
    } else {
        cell_qc_df <<- rbind(
            cell_qc_df,
            sobj@meta.data[, c("Sample", ".QC", feats), drop = FALSE]
        )
    }

    log_info("{log_prefix}Filtering cells using QC criteria ...")
    sobj <- subset(sobj, subset = .QC)
    sobj$.QC <- NULL

    sobj
}
|
|
65
|
-
|
|
66
|
-
# Plot the cell-QC metrics and summarise the cell counts before/after QC.
#
# For every feature in `feats` a violin plot (grouped by sample) is written
# as PNG and PDF and registered in the report; for every feature except
# nCount_RNA a scatter plot against nCount_RNA is produced the same way.
#
# Args:
#   ngenes: Number of genes, copied verbatim into the returned summary.
#
# Returns:
#   A two-row data frame (Before_Cell_QC / After_Cell_QC) with the columns
#   when, nCells and nGenes.
#
# Uses the globals `cell_qc_df`, `feats`, `samples` and `plotsdir`.
report_cell_qc <- function(ngenes) {
    # Sentences shared by the descriptions of both report sections.
    qc_note <- c(
        "The cells that fail the QC criteria are colored in red, and",
        "the cells that pass the QC criteria are colored in black.",
        "The cells that fail the QC criteria are filtered out in the returned Seurat object."
    )

    # Write one plot as PNG (pixel size, 100 dpi) and as PDF (pixel size / 100,
    # i.e. the same size expressed in inches).
    save_png_pdf <- function(p, png_path, pdf_path, width_px, height_px) {
        png(png_path, width = width_px, height = height_px, res = 100)
        print(p)
        dev.off()

        pdf(pdf_path, width = width_px / 100, height = height_px / 100)
        print(p)
        dev.off()
    }

    # One summary row with the number of cells in `df`.
    count_cells <- function(df, stage) {
        df %>%
            summarise(
                when = !!stage,
                nCells = dplyr::n(),
                nGenes = ngenes
            ) %>%
            ungroup()
    }

    # ---- Violin plots ----
    log_info("- Plotting violin plots ...")
    add_report(
        list(
            kind = "descr",
            content = paste(
                c(
                    "The violin plots for each feature. The cells are grouped by sample.",
                    qc_note
                ),
                collapse = " "
            )
        ),
        h1 = "Violin Plots"
    )
    for (feat in feats) {
        log_info(" For feature: {feat}")
        vln_p <- ggplot(cell_qc_df, aes(x = Sample, y = !!sym(feat), color = .QC)) +
            geom_violin(fill = "white", width = 0.5) +
            geom_jitter(width = 0.2, height = 0, alpha = 0.5) +
            scale_color_manual(
                values = c("#181818", pal_biopipen()(1)),
                breaks = c(TRUE, FALSE)
            ) +
            labs(x = "Sample", y = feat) +
            theme_minimal()

        vlnplot <- file.path(plotsdir, paste0(slugify(feat), ".vln.png"))
        vlnplot_pdf <- file.path(plotsdir, paste0(slugify(feat), ".vln.pdf"))
        # Widen the canvas with the number of samples on the x axis.
        save_png_pdf(vln_p, vlnplot, vlnplot_pdf, 800 + length(samples) * 15, 600)

        add_report(
            list(
                src = vlnplot,
                name = feat,
                download = vlnplot_pdf,
                descr = paste0("Distribution of ", feat, " for each sample.")
            ),
            h1 = "Violin Plots",
            ui = "table_of_images"
        )
    }

    # ---- Scatter plots against nCount_RNA ----
    log_info("- Plotting scatter plots ...")
    add_report(
        list(
            kind = "descr",
            content = paste(
                c(
                    "The scatter plots for each feature against nCount_RNA. ",
                    qc_note
                ),
                collapse = " "
            )
        ),
        h1 = "Scatter Plots"
    )
    for (feat in setdiff(feats, "nCount_RNA")) {
        log_info(" For feature: {feat}, against nCount_RNA")
        scat_p <- ggplot(cell_qc_df, aes(x = nCount_RNA, y = !!sym(feat), color = .QC)) +
            geom_point() +
            scale_color_manual(
                values = c("#181818", pal_biopipen()(1)),
                breaks = c(TRUE, FALSE)
            ) +
            labs(x = "nCount_RNA", y = feat) +
            theme_minimal()

        scatfile <- file.path(plotsdir, paste0(slugify(feat), "-nCount_RNA.scatter.png"))
        scatfile_pdf <- file.path(plotsdir, paste0(slugify(feat), "-nCount_RNA.scatter.pdf"))
        save_png_pdf(scat_p, scatfile, scatfile_pdf, 800, 600)

        add_report(
            list(
                src = scatfile,
                name = paste0(feat, " vs nCount_RNA"),
                download = scatfile_pdf,
                descr = paste0("Scatter plot for ", feat, " against nCount_RNA")
            ),
            h1 = "Scatter Plots",
            ui = "table_of_images"
        )
    }

    # Cell counts before and after QC, derived from cell_qc_df.
    rbind(
        count_cells(cell_qc_df, "Before_Cell_QC"),
        count_cells(cell_qc_df %>% filter(.QC), "After_Cell_QC")
    )
}
|
|
185
|
-
|
|
186
|
-
# Load the expression data of one sample into a Seurat object.
#
# The sample's row is looked up in the global `metadata`; its RNAData entry
# is read with Read10X() when it is a directory (falling back to
# rename_files() when that fails) and with Read10X_h5() otherwise. The cells
# are prefixed with the sample name and all remaining metadata columns are
# attached to the object. Depending on `envs`, per-sample cell QC and a
# basic normalize/variable-features/scale pass on the RNA assay are applied.
#
# Args:
#   sample: Sample name, matched against metadata$Sample.
#
# Returns:
#   A Seurat object, or NULL (with a warning) when the sample has no usable
#   RNAData path.
#
# Raises:
#   An error when the sample does not match exactly one metadata row.
#
# Uses the globals `metadata` and `envs`.
load_sample <- function(sample) {
    log_info("- Loading sample: {sample} ...")

    # which() ignores NA sample names; anything but a single match would make
    # the scalar checks below fail with an unrelated error.
    rows <- which(metadata$Sample == sample)
    if (length(rows) != 1) {
        stop(
            "Expected exactly one metadata row for sample: ", sample,
            ", found ", length(rows),
            call. = FALSE
        )
    }
    mdata <- as.data.frame(metadata)[rows, , drop = TRUE]

    path <- as.character(mdata$RNAData)
    if (length(path) != 1 || is.na(path) || nchar(path) == 0 || path == "NA") {
        warning(paste0("No path found for sample: ", sample))
        return(NULL)
    }

    if (dir.exists(path)) {
        exprs <- tryCatch(
            # Read10X requires
            # - barcodes.tsv.gz
            # - genes.tsv.gz
            # - matrix.mtx.gz
            # But sometimes, they are prefixed with sample name
            # e.g.GSM4143656_SAM24345863-ln1.barcodes.tsv.gz
            {
                Read10X(data.dir = path)
            },
            error = function(e) rename_files(e, sample, path)
        )
    } else {
        exprs <- Read10X_h5(path)
    }
    # Multi-modal inputs: keep only the gene expression matrix.
    if ("Gene Expression" %in% names(exprs)) {
        exprs <- exprs[["Gene Expression"]]
    }

    obj <- CreateSeuratObject(exprs, project = sample)
    obj <- RenameCells(obj, add.cell.id = sample)

    # Attach meta data (everything except the data paths)
    for (mname in names(mdata)) {
        if (mname %in% c("RNAData", "TCRData")) {
            next
        }
        mdt <- mdata[[mname]]
        if (is.factor(mdt)) {
            mdt <- as.character(mdt)
        }
        obj[[mname]] <- mdt
    }

    if (isTRUE(envs$cell_qc_per_sample)) {
        log_info("- Perform cell QC for sample: {sample} ...")
        obj <- perform_cell_qc(obj, per_sample = TRUE)
    }

    if (isTRUE(envs$use_sct)) {
        # so that we have data and scale.data layers on RNA assay
        # useful for visualization in case some genes are not in
        # the SCT assay
        obj <- NormalizeData(obj, verbose = FALSE)
        obj <- FindVariableFeatures(obj, verbose = FALSE)
        obj <- ScaleData(obj, verbose = FALSE)
    }
    obj
}
|
|
241
|
-
|
|
242
|
-
# Filter the genes of a Seurat object according to `envs$gene_qc`, with caching.
#
# Two optional criteria are applied: `min_cells` (genes whose row sum is
# below the threshold are dropped) and `excludes` (regular expressions,
# given as a vector or as one comma-separated string; matching genes are
# dropped). The result is stored in the "GeneQC" cache and reused when the
# relevant settings have not changed.
#
# Args:
#   sobj: The Seurat object to filter.
#
# Returns:
#   The gene-filtered (or cached) Seurat object.
#
# Uses the globals `envs` and `cache_dir`.
run_gene_qc <- function(sobj) {
    cache_signature <- list(
        cell_qc = envs$cell_qc,
        gene_qc = envs$gene_qc,
        cell_qc_per_sample = envs$cell_qc_per_sample,
        use_sct = envs$use_sct
    )
    cached <- get_cached(cache_signature, "GeneQC", cache_dir)
    if (!is.null(cached$data)) {
        log_info("Loading gene-QC'ed object from cache ...")
        return(cached$data)
    }

    log_info("Filtering genes ...")
    genes <- rownames(sobj)

    by_min_cells <- !is.null(envs$gene_qc$min_cells) && envs$gene_qc$min_cells > 0
    if (by_min_cells) {
        genes <- genes[Matrix::rowSums(sobj) >= envs$gene_qc$min_cells]
    }

    excludes <- envs$gene_qc$excludes
    by_excludes <- !is.null(excludes)
    if (by_excludes) {
        # A single string may carry several comma-separated patterns.
        if (length(excludes) == 1) {
            excludes <- trimws(unlist(strsplit(excludes, ",")))
        }
        for (ex in excludes) {
            genes <- genes[!grepl(ex, genes)]
        }
    }

    # Only subset when at least one criterion was configured.
    if (by_min_cells || by_excludes) {
        sobj <- subset(sobj, features = genes)
    }

    cached$data <- sobj
    save_to_cache(cached, "GeneQC", cache_dir)
    sobj
}
|
|
282
|
-
|
|
283
|
-
# Load all samples, merge them and perform cell QC, with caching.
#
# On a cache miss every entry of the global `samples` is loaded with
# load_sample(), the resulting objects are merged, and - unless QC was
# already applied per sample - perform_cell_qc() is run on the merged
# object. Both the object and the global `cell_qc_df` are stored in the
# "CellQC" cache; on a cache hit both are restored from it.
#
# Args:
#   sobj: Not read; the value is replaced by the loaded/cached object.
#
# Returns:
#   The merged, cell-QC'ed Seurat object.
#
# Raises:
#   An error when none of the samples could be loaded.
#
# Uses the globals `envs`, `samples`, `cache_dir` and `cell_qc_df`.
run_cell_qc <- function(sobj) {
    cached <- get_cached(
        list(
            cell_qc = envs$cell_qc,
            cell_qc_per_sample = envs$cell_qc_per_sample,
            use_sct = envs$use_sct
        ),
        "CellQC",
        cache_dir
    )
    if (!is.null(cached$data)) {
        log_info("Loading cell-QC'ed object from cache ...")
        sobj <- cached$data$sobj
        cell_qc_df <<- cached$data$cell_qc_df
    } else {
        # Load data
        log_info("Reading samples individually ...")
        obj_list <- lapply(samples, load_sample)

        # load_sample() returns NULL for samples without an RNA data path;
        # those entries cannot be merged and are dropped here.
        skipped <- vapply(obj_list, is.null, logical(1))
        if (all(skipped)) {
            stop("No sample could be loaded, nothing to merge.", call. = FALSE)
        }
        if (any(skipped)) {
            log_warn("- Skipping samples without RNA data: {paste(samples[skipped], collapse = ', ')}")
            obj_list <- obj_list[!skipped]
        }

        log_info("Merging samples ...")
        sobj <- Reduce(merge, obj_list)
        rm(obj_list)
        gc()

        # Same test as in load_sample(): QC runs here exactly when it did
        # not run per sample.
        if (!isTRUE(envs$cell_qc_per_sample)) {
            log_info("Performing cell QC ...")
            sobj <- perform_cell_qc(sobj, per_sample = FALSE)
        }

        cached$data <- list(sobj = sobj, cell_qc_df = cell_qc_df)
        save_to_cache(cached, "CellQC", cache_dir)
    }
    sobj
}
|
|
313
|
-
|
|
314
|
-
# Normalize/scale a Seurat object and run PCA, with caching.
#
# With `envs$use_sct` the object goes through SCTransform; otherwise through
# NormalizeData, FindVariableFeatures and ScaleData. RunPCA is applied in
# both cases. The arguments of each step come from the equally named entry
# of `envs`. The result is stored in the "Transformed" cache, whose key is
# `envs` without the options that do not affect this stage.
#
# Args:
#   sobj: The Seurat object to transform.
#
# Returns:
#   The transformed (or cached) Seurat object.
#
# Uses the globals `envs` and `cache_dir`.
run_transformation <- function(sobj) {
    # Options that do not influence the transformation are left out of the
    # cache key.
    envs_cache <- envs
    envs_cache$ncores <- NULL
    envs_cache$doublet_detector <- NULL
    envs_cache$DoubletFinder <- NULL
    envs_cache$scDblFinder <- NULL
    envs_cache$IntegrateLayers <- NULL
    cached <- get_cached(envs_cache, "Transformed", cache_dir)
    if (!is.null(cached$data)) {
        log_info("Loading transformed object from cache ...")
        sobj <- cached$data
    } else {
        log_info("Performing transformation/scaling ...")
        if (envs$use_sct) {
            log_info("- Running SCTransform ...")
            SCTransformArgs <- envs$SCTransform
            # log to stdout but don't populate it to running log
            print(paste0(" SCTransform: ", format_args(SCTransformArgs)))
            log_debug(" SCTransform: {format_args(SCTransformArgs)}")
            SCTransformArgs$object <- sobj
            sobj <- do_call(SCTransform, SCTransformArgs)
            # Default is to use the SCT assay

            # Cleanup memory
            SCTransformArgs$object <- NULL
            rm(SCTransformArgs)
            gc()
        } else {
            log_info("- Running NormalizeData ...")
            NormalizeDataArgs <- envs$NormalizeData
            print(paste0(" NormalizeData: ", format_args(NormalizeDataArgs)))
            log_debug(" NormalizeData: {format_args(NormalizeDataArgs)}")
            NormalizeDataArgs$object <- sobj
            sobj <- do_call(NormalizeData, NormalizeDataArgs)

            # Cleanup memory
            NormalizeDataArgs$object <- NULL
            rm(NormalizeDataArgs)
            gc()

            log_info("- Running FindVariableFeatures ...")
            FindVariableFeaturesArgs <- envs$FindVariableFeatures
            print(paste0(" FindVariableFeatures: ", format_args(FindVariableFeaturesArgs)))
            log_debug(" FindVariableFeatures: {format_args(FindVariableFeaturesArgs)}")
            FindVariableFeaturesArgs$object <- sobj
            sobj <- do_call(FindVariableFeatures, FindVariableFeaturesArgs)

            # Cleanup memory
            FindVariableFeaturesArgs$object <- NULL
            rm(FindVariableFeaturesArgs)
            gc()

            log_info("- Running ScaleData ...")
            ScaleDataArgs <- envs$ScaleData
            print(paste0(" ScaleData: ", format_args(ScaleDataArgs)))
            log_debug(" ScaleData: {format_args(ScaleDataArgs)}")
            ScaleDataArgs$object <- sobj
            sobj <- do_call(ScaleData, ScaleDataArgs)

            # Cleanup memory
            ScaleDataArgs$object <- NULL
            rm(ScaleDataArgs)
            gc()
        }

        log_info("- Running RunPCA ...")
        RunPCAArgs <- envs$RunPCA
        # Fall back to 50 components, and cap both the default and a
        # configured value at the number of cells minus one.
        requested_npcs <- if (is.null(RunPCAArgs$npcs)) 50 else RunPCAArgs$npcs
        RunPCAArgs$npcs <- min(requested_npcs, ncol(sobj) - 1)
        print(paste0(" RunPCA: ", format_args(RunPCAArgs)))
        log_debug(" RunPCA: {format_args(RunPCAArgs)}")
        RunPCAArgs$object <- sobj
        sobj <- do_call(RunPCA, RunPCAArgs)

        # Cleanup memory
        RunPCAArgs$object <- NULL
        rm(RunPCAArgs)
        gc()

        cached$data <- sobj
        save_to_cache(cached, "Transformed", cache_dir)
    }

    sobj
}
|
|
400
|
-
|
|
401
|
-
# Integrate the layers of a Seurat object and/or join them, with caching.
#
# Unless `envs$no_integration` is set, IntegrateLayers is run with the
# arguments in `envs$IntegrateLayers`: the method alias is resolved to the
# corresponding integration function, reference sample names are translated
# to their positions in the global `samples`, and a default name for the new
# reduction is filled in. The name of that reduction is stored in
# `sobj@misc$integrated_new_reduction`. Without SCT the layers are joined
# afterwards. The result is stored in the "Integrated" cache.
#
# Args:
#   sobj: The transformed Seurat object.
#
# Returns:
#   The integrated/layer-joined (or cached) Seurat object.
#
# Raises:
#   An error for an unknown integration method or for reference samples that
#   are not part of `samples`.
#
# Uses the globals `envs`, `samples` and `cache_dir`.
run_integration <- function(sobj) {
    # Options that do not influence the integration are left out of the
    # cache key.
    envs_cache <- envs
    envs_cache$ncores <- NULL
    envs_cache$doublet_detector <- NULL
    envs_cache$DoubletFinder <- NULL
    envs_cache$scDblFinder <- NULL
    cached <- get_cached(envs_cache, "Integrated", cache_dir)

    if (!is.null(cached$data)) {
        log_info("Loading integrated/layer-joined object from cache ...")
        sobj <- cached$data
    } else {
        if (!envs$no_integration) {
            log_info("- Running IntegrateLayers (method = {envs$IntegrateLayers$method}) ...")
            IntegrateLayersArgs <- envs$IntegrateLayers
            method <- IntegrateLayersArgs$method

            if (!is.null(IntegrateLayersArgs$reference) && is.character(IntegrateLayersArgs$reference)) {
                log_info(" Using reference samples: {paste(IntegrateLayersArgs$reference, collapse = ', ')}")
                reference_idx <- match(IntegrateLayersArgs$reference, samples)
                # A name that is not a known sample would otherwise be
                # passed on as an NA index.
                if (anyNA(reference_idx)) {
                    stop(
                        "Unknown reference sample(s): ",
                        paste(IntegrateLayersArgs$reference[is.na(reference_idx)], collapse = ", "),
                        call. = FALSE
                    )
                }
                IntegrateLayersArgs$reference <- reference_idx
                log_info(" Transferred to indices: {paste(IntegrateLayersArgs$reference, collapse = ', ')}")
            }

            # Accepted aliases and the integration function each one denotes.
            method_aliases <- c(
                CCA = "CCAIntegration", cca = "CCAIntegration",
                RPCA = "RPCAIntegration", rpca = "RPCAIntegration",
                Harmony = "HarmonyIntegration", harmony = "HarmonyIntegration",
                FastMNN = "FastMNNIntegration", fastmnn = "FastMNNIntegration",
                scVI = "scVIIntegration", scvi = "scVIIntegration"
            )
            if (length(method) != 1 || !method %in% names(method_aliases)) {
                stop("Unknown integration method: ", paste(method, collapse = ", "))
            }
            method <- method_aliases[[method]]

            if (envs$use_sct && is.null(IntegrateLayersArgs$normalization.method)) {
                IntegrateLayersArgs$normalization.method <- "SCT"
            }
            # Look the function up by name instead of evaluating parsed text.
            IntegrateLayersArgs$method <- get(method, mode = "function")

            new_reductions <- list(
                "CCAIntegration" = "integrated.cca",
                "RPCAIntegration" = "integrated.rpca",
                "HarmonyIntegration" = "harmony",
                "FastMNNIntegration" = "integration.mnn",
                "scVIIntegration" = "integrated.scvi"
            )
            if (is.null(IntegrateLayersArgs$new.reduction)) {
                IntegrateLayersArgs$new.reduction <- new_reductions[[method]]
            }
            print(paste0(" IntegrateLayers: ", format_args(IntegrateLayersArgs)))
            log_debug(" IntegrateLayers: {format_args(IntegrateLayersArgs)}")
            IntegrateLayersArgs$object <- sobj
            sobj <- do_call(IntegrateLayers, IntegrateLayersArgs)
            # Save it for dimension reduction plots
            sobj@misc$integrated_new_reduction <- IntegrateLayersArgs$new.reduction

            # Cleanup memory
            IntegrateLayersArgs$object <- NULL
            rm(IntegrateLayersArgs)
            gc()
        }

        if (!envs$use_sct) {
            log_info("- Joining layers ...")
            sobj <- JoinLayers(sobj)
        }

        cached$data <- sobj
        save_to_cache(cached, "Integrated", cache_dir)
    }

    sobj
}
|
|
@@ -1,204 +0,0 @@
|
|
|
1
|
-
# Build the cache key used for DoubletFinder results.
#
# Returns:
#   A copy of the global `envs` without `ncores`, `doublet_detector`,
#   `scDblFinder` and `DoubletFinder$ncores`.
.get_envs_cached_doubletfinder <- function() {
    envs_cache <- envs
    for (key in c("ncores", "doublet_detector", "scDblFinder")) {
        envs_cache[[key]] <- NULL
    }
    envs_cache$DoubletFinder$ncores <- NULL
    envs_cache
}
|
|
9
|
-
|
|
10
|
-
# Build the cache key used for scDblFinder results.
#
# Returns:
#   A copy of the global `envs` without `ncores`, `doublet_detector`,
#   `DoubletFinder` and `scDblFinder$ncores`.
.get_envs_cached_scdblfinder <- function() {
    envs_cache <- envs
    for (key in c("ncores", "doublet_detector", "DoubletFinder")) {
        envs_cache[[key]] <- NULL
    }
    envs_cache$scDblFinder$ncores <- NULL
    envs_cache
}
|
|
18
|
-
|
|
19
|
-
# Detect doublets with DoubletFinder.
#
# Works on the global `sobj` (only a local copy is modified): neighbors and
# clusters are computed, the pK with the highest BCmetric is selected from a
# parameter sweep, the expected number of doublets is adjusted by the
# homotypic proportion, and doubletFinder() is run with these settings.
#
# Returns:
#   A list with
#   - doublets: data frame with the columns DoubletFinder_score and
#               DoubletFinder_DropletType (lower-cased classification);
#   - pk_plot:  ggplot of BCmetric against pK, the selected pK highlighted.
#
# Uses the globals `sobj` and `envs` (`envs$DoubletFinder$PCs`, `$pN`,
# `$doublets`, `$ncores`, and `envs$ncores`, `envs$use_sct`).
.run_doubletfinder <- function() {
    library(DoubletFinder)
    log_info("- Preparing Seurat object ...")

    if (is.null(envs$DoubletFinder$ncores)) {
        envs$DoubletFinder$ncores <- envs$ncores
    }
    pcs <- seq_len(envs$DoubletFinder$PCs)

    # More controls from envs?
    sobj <- FindNeighbors(sobj, dims = pcs)
    sobj <- FindClusters(sobj)

    log_info("- pK Indentification ...")
    sweep.res.list <- paramSweep(
        sobj,
        PCs = pcs,
        sct = envs$use_sct,
        num.cores = envs$DoubletFinder$ncores
    )
    sweep.stats <- summarizeSweep(sweep.res.list, GT = FALSE)
    bcmvn <- find.pK(sweep.stats)
    best <- which.max(bcmvn$BCmetric)[1]
    bcmvn$Selected <- bcmvn$pK == bcmvn$pK[best]

    # Convert via character so that a factor pK yields its label rather
    # than its level index.
    pK <- as.numeric(as.character(bcmvn$pK[best]))
    pN <- envs$DoubletFinder$pN

    log_info("- Homotypic Doublet Proportion Estimate ...")
    homotypic.prop <- modelHomotypic(Idents(sobj))
    nExp_poi <- round(nrow(sobj@meta.data) * envs$DoubletFinder$doublets)
    nExp_poi.adj <- round(nExp_poi * (1 - homotypic.prop))

    log_info("- Running DoubletFinder ...")
    sobj <- doubletFinder(
        sobj,
        PCs = pcs,
        pN = pN,
        pK = pK,
        nExp = nExp_poi.adj,
        reuse.pANN = FALSE,
        sct = envs$use_sct
    )

    # Locate the result columns by their literal name prefix; pN/pK contain
    # dots, which must not act as regex wildcards.
    meta_cols <- colnames(sobj@meta.data)
    pANN_col <- meta_cols[grepl(paste0("pANN_", pN, "_", pK), meta_cols, fixed = TRUE)]
    DF_col <- meta_cols[grepl(paste0("DF.classifications_", pN, "_", pK), meta_cols, fixed = TRUE)]
    doublets <- sobj@meta.data[, c(pANN_col, DF_col), drop = FALSE]
    colnames(doublets) <- c("DoubletFinder_score", "DoubletFinder_DropletType")
    doublets$DoubletFinder_DropletType <- tolower(doublets$DoubletFinder_DropletType)

    pk_plot <- ggplot(bcmvn, aes(x = pK, y = BCmetric, color = Selected)) +
        geom_point() +
        # rotate x axis labels
        theme(axis.text.x = element_text(angle = 90, hjust = 1))
    list(doublets = doublets, pk_plot = pk_plot)
}
|
|
74
|
-
|
|
75
|
-
# Detect doublets with scDblFinder.
#
# The arguments in `envs$scDblFinder` are passed to scDblFinder(), together
# with the counts of the global `sobj`, a table return type and - when more
# than one core is requested - a multicore BiocParallel backend.
#
# Returns:
#   A list with one element `doublets`: the rows of type "real" with the
#   columns scDblFinder_score and scDblFinder_DropletType.
#
# Uses the globals `sobj` and `envs`.
.run_scdblfinder <- function() {
    library(scDblFinder)

    dd_args <- envs$scDblFinder
    if (is.null(dd_args$ncores)) {
        dd_args$ncores <- envs$ncores
    }

    dd_args$sce <- GetAssayData(sobj, layer = "counts")
    if (dd_args$ncores > 1) {
        dd_args$BPPARAM <- BiocParallel::MulticoreParam(dd_args$ncores, RNGseed = 8525)
    }
    dd_args$returnType <- "table"
    # ncores is only used to configure the backend above; it is not passed on.
    dd_args$ncores <- NULL

    doublets <- do_call(scDblFinder, dd_args)
    is_real <- doublets$type == "real"
    doublets <- doublets[is_real, , drop = FALSE]
    doublets <- doublets[, c("score", "class"), drop = FALSE]
    colnames(doublets) <- c("scDblFinder_score", "scDblFinder_DropletType")

    list(doublets = doublets)
}
|
|
95
|
-
|
|
96
|
-
# Run a doublet detector, reusing cached results when available.
#
# Args:
#   detector: Either "DoubletFinder" or "scDblFinder".
#
# Returns:
#   The result list of the selected detector (freshly computed or cached).
#
# Raises:
#   An error for any other detector name.
#
# Uses the global `cache_dir`.
run_dd <- function(detector) {
    log_info("Running {detector} ...")

    # Pick the cache-key builder and the runner belonging to the detector.
    handlers <- switch(detector,
        DoubletFinder = list(
            envs_cache_fun = .get_envs_cached_doubletfinder,
            run_fun = .run_doubletfinder
        ),
        scDblFinder = list(
            envs_cache_fun = .get_envs_cached_scdblfinder,
            run_fun = .run_scdblfinder
        ),
        stop("Unknown doublet detector: ", detector)
    )

    cached <- get_cached(handlers$envs_cache_fun(), detector, cache_dir)
    if (!is.null(cached$data)) {
        log_info("- Loading cached results ...")
        return(cached$data)
    }

    results <- handlers$run_fun()
    cached$data <- results
    save_to_cache(cached, detector, cache_dir)
    results
}
|
|
121
|
-
|
|
122
|
-
# Write the doublet detection results and a per-class summary to disk.
#
# Two tab-separated files are written to the global `joboutdir`:
# "<detector>_doublets_singlets.txt" (the per-cell table) and
# "<detector>_summary.txt" (the number of droplets per classification).
# The number of detected doublets is logged.
#
# Args:
#   dd:       Result list of a detector; `dd$doublets` must contain the
#             column "<detector>_DropletType".
#   detector: Name of the detector, used for file and column names.
save_dd <- function(dd, detector) {
    doublets <- dd$doublets
    # NOTE(review): row names are not written, so this file carries no cell
    # identifiers - confirm that this is intended.
    write.table(
        doublets,
        file.path(joboutdir, paste0(detector, "_doublets_singlets.txt")),
        row.names = FALSE,
        quote = FALSE,
        sep = "\t"
    )

    summary <- as.data.frame(table(dd$doublets[[paste0(detector, "_DropletType")]]))
    colnames(summary) <- c("Classification", "Droplet_N")
    write.table(
        summary,
        file.path(joboutdir, paste0(detector, "_summary.txt")),
        row.names = FALSE,
        quote = FALSE,
        sep = "\t"
    )

    # sum() turns "no doublet row at all" into 0 instead of a zero-length
    # value, which would break the message below.
    n_doublet <- sum(summary$Droplet_N[summary$Classification == "doublet"])
    log_info("- {n_doublet}/{sum(summary$Droplet_N)} doublets detected.")
}
|
|
145
|
-
|
|
146
|
-
# Attach the doublet detection table to the cell metadata.
#
# Args:
#   sobj: The Seurat object.
#   dd:   Result list of a detector, with the per-cell table in `dd$doublets`.
#
# Returns:
#   The result of AddMetaData() for `sobj` and the table.
add_dd_to_seurat <- function(sobj, dd) {
    doublet_meta <- as.data.frame(dd$doublets)
    AddMetaData(sobj, metadata = doublet_meta)
}
|
|
149
|
-
|
|
150
|
-
# Save the diagnostic plots of a doublet detection run.
#
# For DoubletFinder the pK/BCmetric plot is saved first. For every detector
# a dimension reduction plot grouped by the droplet type is saved as PNG and
# as PDF.
#
# Args:
#   sobj:     Seurat object carrying the "<detector>_DropletType" metadata.
#   dd:       Result list of the detector (`dd$pk_plot` for DoubletFinder).
#   detector: Name of the detector.
#
# Uses the global `plotsdir`.
plot_dd <- function(sobj, dd, detector) {
    if (detector == "DoubletFinder") {
        log_debug("- Plotting pK vs BCmetric ...")
        pk_file <- file.path(plotsdir, "DoubletFinder_pK_BCmetric.png")
        ggsave(dd$pk_plot, filename = pk_file)
    }

    log_info("- Plotting dimension reduction ...")
    droplet_col <- paste0(detector, "_DropletType")
    dimp <- DimPlot(
        sobj,
        group.by = droplet_col,
        order = "doublet",
        cols = c("#333333", "#FF3333"),
        pt.size = 0.8,
        alpha = 0.5
    )
    png_file <- file.path(plotsdir, paste0(detector, "_dimplot.png"))
    ggsave(dimp, filename = png_file)
    pdf_file <- file.path(plotsdir, paste0(detector, "_dimplot.pdf"))
    ggsave(dimp, filename = pdf_file)
}
|
|
163
|
-
|
|
164
|
-
# Keep only the cells classified as singlets.
#
# Args:
#   sobj:     The Seurat object to subset.
#   dd:       Result list of the detector; the row names of `dd$doublets`
#             are used as cell names.
#   detector: Name of the detector, selecting the "<detector>_DropletType"
#             column.
#
# Returns:
#   `sobj` restricted to the cells whose droplet type is "singlet".
filter_dd <- function(sobj, dd, detector) {
    droplet_col <- paste0(detector, "_DropletType")
    is_singlet <- dd$doublets[[droplet_col]] == "singlet"
    singlets <- dd$doublets[is_singlet, , drop = FALSE]
    subset(sobj, cells = rownames(singlets))
}
|
|
171
|
-
|
|
172
|
-
# Register the doublet detection results in the report.
#
# Adds the summary table, followed by the plots: the pK/BCmetric plot and
# the dimension reduction plot for DoubletFinder, and the scDblFinder
# dimension reduction plot for any other detector.
#
# Args:
#   detector: Name of the detector.
#
# Uses the globals `joboutdir` and `plotsdir`.
report_dd <- function(detector) {
    summary_descr <- list(
        kind = "descr",
        content = "The table contains the number of cells classified as singlets and doublets."
    )
    summary_table <- list(
        kind = "table",
        data = list(path = file.path(joboutdir, paste0(detector, "_summary.txt")))
    )
    add_report(
        summary_descr,
        summary_table,
        h1 = paste0(detector, " Results"),
        h2 = paste0("The ", detector, " Summary")
    )

    if (detector == "DoubletFinder") {
        pk_image <- list(
            name = "pK vs BCmetric",
            src = file.path(plotsdir, "DoubletFinder_pK_BCmetric.png")
        )
        dim_image <- list(
            name = "Dimension Reduction Plot",
            src = file.path(plotsdir, "DoubletFinder_dimplot.png"),
            download = file.path(plotsdir, "DoubletFinder_dimplot.pdf")
        )
        add_report(
            pk_image,
            dim_image,
            ui = "table_of_images",
            h1 = "DoubletFinder Results",
            h2 = "Plots"
        )
    } else {
        dim_image <- list(
            name = "Dimension Reduction Plot",
            src = file.path(plotsdir, "scDblFinder_dimplot.png"),
            download = file.path(plotsdir, "scDblFinder_dimplot.pdf")
        )
        add_report(
            dim_image,
            ui = "table_of_images",
            h1 = "scDblFinder Results",
            h2 = "Plots"
        )
    }
}
|
|
File without changes
|