PyPI - biopipen - Versions diffs - 0.29.2__py3-none-any.whl → 0.30.0__py3-none-any.whl - Mend

biopipen 0.29.2__py3-none-any.whl → 0.30.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.

This version of biopipen might be problematic. Click here for more details.

Files changed (105) hide show

biopipen/__init__.py +1 -1
biopipen/core/config.toml +2 -0
biopipen/core/filters.py +21 -0
biopipen/ns/plot.py +55 -0
biopipen/ns/scrna.py +49 -13
biopipen/ns/web.py +87 -5
biopipen/scripts/bam/CNAClinic.R +2 -1
biopipen/scripts/cellranger/CellRangerCount.py +3 -3
biopipen/scripts/cellranger/CellRangerSummary.R +2 -1
biopipen/scripts/cnv/AneuploidyScore.R +1 -1
biopipen/scripts/cnv/AneuploidyScoreSummary.R +2 -2
biopipen/scripts/delim/RowsBinder.R +1 -1
biopipen/scripts/delim/SampleInfo.R +3 -2
biopipen/scripts/gene/GeneNameConversion.R +2 -2
biopipen/scripts/gsea/Enrichr.R +3 -3
biopipen/scripts/gsea/FGSEA.R +2 -2
biopipen/scripts/gsea/GSEA.R +2 -2
biopipen/scripts/gsea/PreRank.R +2 -2
biopipen/scripts/plot/Heatmap.R +3 -3
biopipen/scripts/plot/Manhattan.R +2 -1
biopipen/scripts/plot/QQPlot.R +1 -1
biopipen/scripts/plot/ROC.R +1 -1
biopipen/scripts/plot/Scatter.R +112 -0
biopipen/scripts/plot/VennDiagram.R +3 -3
biopipen/scripts/regulatory/MotifAffinityTest.R +3 -7
biopipen/scripts/rnaseq/Simulation.R +1 -1
biopipen/scripts/rnaseq/UnitConversion.R +2 -1
biopipen/scripts/scrna/AnnData2Seurat.R +1 -1
biopipen/scripts/scrna/CellTypeAnnotation-celltypist.R +24 -8
biopipen/scripts/scrna/CellTypeAnnotation-common.R +10 -0
biopipen/scripts/scrna/CellTypeAnnotation-direct.R +9 -1
biopipen/scripts/scrna/CellTypeAnnotation-hitype.R +12 -8
biopipen/scripts/scrna/CellTypeAnnotation-sccatch.R +15 -2
biopipen/scripts/scrna/CellTypeAnnotation-sctype.R +38 -15
biopipen/scripts/scrna/CellTypeAnnotation.R +3 -0
biopipen/scripts/scrna/CellsDistribution.R +3 -2
biopipen/scripts/scrna/DimPlots.R +1 -1
biopipen/scripts/scrna/ExprImputation-alra.R +1 -1
biopipen/scripts/scrna/MarkersFinder.R +5 -5
biopipen/scripts/scrna/MetaMarkers.R +4 -4
biopipen/scripts/scrna/ModuleScoreCalculator.R +2 -1
biopipen/scripts/scrna/RadarPlots.R +1 -1
biopipen/scripts/scrna/ScFGSEA.R +4 -3
biopipen/scripts/scrna/Seurat2AnnData.R +1 -1
biopipen/scripts/scrna/SeuratClusterStats-clustree.R +73 -0
biopipen/scripts/scrna/SeuratClusterStats-dimplots.R +4 -3
biopipen/scripts/scrna/SeuratClusterStats-features.R +6 -5
biopipen/scripts/scrna/SeuratClusterStats-hists.R +6 -5
biopipen/scripts/scrna/SeuratClusterStats-ngenes.R +4 -3
biopipen/scripts/scrna/SeuratClusterStats-stats.R +4 -3
biopipen/scripts/scrna/SeuratClusterStats.R +24 -8
biopipen/scripts/scrna/SeuratClustering-common.R +213 -0
biopipen/scripts/scrna/SeuratClustering.R +10 -170
biopipen/scripts/scrna/SeuratMap2Ref.R +65 -31
biopipen/scripts/scrna/SeuratMetadataMutater.R +2 -2
biopipen/scripts/scrna/SeuratPreparing-common.R +452 -0
biopipen/scripts/scrna/SeuratPreparing-doublet_detection.R +201 -0
biopipen/scripts/scrna/SeuratPreparing.R +22 -562
biopipen/scripts/scrna/SeuratSubClustering.R +24 -39
biopipen/scripts/scrna/TopExpressingGenes.R +1 -1
biopipen/scripts/scrna_metabolic_landscape/MetabolicFeatures.R +2 -2
biopipen/scripts/scrna_metabolic_landscape/MetabolicFeaturesIntraSubset.R +2 -2
biopipen/scripts/scrna_metabolic_landscape/MetabolicPathwayActivity.R +3 -3
biopipen/scripts/scrna_metabolic_landscape/MetabolicPathwayHeterogeneity.R +3 -3
biopipen/scripts/snp/MatrixEQTL.R +1 -1
biopipen/scripts/snp/PlinkCallRate.R +2 -2
biopipen/scripts/snp/PlinkFreq.R +2 -2
biopipen/scripts/snp/PlinkHWE.R +2 -2
biopipen/scripts/snp/PlinkHet.R +2 -2
biopipen/scripts/snp/PlinkIBD.R +2 -2
biopipen/scripts/stats/ChowTest.R +1 -1
biopipen/scripts/stats/DiffCoexpr.R +1 -1
biopipen/scripts/stats/LiquidAssoc.R +1 -1
biopipen/scripts/stats/Mediation.R +11 -9
biopipen/scripts/stats/MetaPvalue.R +4 -1
biopipen/scripts/stats/MetaPvalue1.R +4 -1
biopipen/scripts/tcr/Attach2Seurat.R +1 -1
biopipen/scripts/tcr/CDR3AAPhyschem.R +1 -1
biopipen/scripts/tcr/CloneResidency.R +2 -2
biopipen/scripts/tcr/CloneSizeQQPlot.R +2 -2
biopipen/scripts/tcr/Immunarch-basic.R +0 -4
biopipen/scripts/tcr/Immunarch-clonality.R +0 -4
biopipen/scripts/tcr/Immunarch-diversity.R +2 -24
biopipen/scripts/tcr/Immunarch-geneusage.R +0 -2
biopipen/scripts/tcr/Immunarch-kmer.R +0 -2
biopipen/scripts/tcr/Immunarch-overlap.R +0 -2
biopipen/scripts/tcr/Immunarch-spectratyping.R +0 -2
biopipen/scripts/tcr/Immunarch-tracking.R +0 -2
biopipen/scripts/tcr/Immunarch-vjjunc.R +0 -2
biopipen/scripts/tcr/Immunarch.R +43 -11
biopipen/scripts/tcr/ImmunarchFilter.R +1 -1
biopipen/scripts/tcr/ImmunarchLoading.R +2 -2
biopipen/scripts/tcr/SampleDiversity.R +1 -1
biopipen/scripts/tcr/TCRClusterStats.R +2 -2
biopipen/scripts/tcr/TCRClustering.R +2 -2
biopipen/scripts/tcr/TESSA.R +2 -2
biopipen/scripts/vcf/TruvariBenchSummary.R +2 -2
biopipen/scripts/vcf/TruvariConsistency.R +1 -1
biopipen/scripts/web/GCloudStorageDownloadBucket.py +82 -0
biopipen/scripts/web/GCloudStorageDownloadFile.py +23 -0
biopipen/scripts/web/gcloud_common.py +49 -0
{biopipen-0.29.2.dist-info → biopipen-0.30.0.dist-info}/METADATA +1 -1
{biopipen-0.29.2.dist-info → biopipen-0.30.0.dist-info}/RECORD +105 -96
{biopipen-0.29.2.dist-info → biopipen-0.30.0.dist-info}/WHEEL +0 -0
{biopipen-0.29.2.dist-info → biopipen-0.30.0.dist-info}/entry_points.txt +0 -0

biopipen/scripts/scrna/SeuratMap2Ref.R CHANGED Viewed

@@ -1,4 +1,4 @@
-source("{{biopipen_dir}}/utils/misc.R")
+{{ biopipen_dir | joinpaths: "utils", "misc.R" | source_r }}
 library(parallel)
 library(Seurat)
@@ -17,6 +17,7 @@ refnorm = {{envs.refnorm | r}}
 ncores = {{envs.ncores | r}}
 split_by = {{envs.split_by | r}}
 mutaters = {{envs.mutaters | r}}
+skip_if_normalized = {{envs.skip_if_normalized | r}}
 sctransform_args = {{envs.SCTransform | r: todot="-"}}
 normalizedata_args = {{envs.NormalizeData | r: todot="-"}}
 findtransferanchors_args = {{envs.FindTransferAnchors | r: todot="-"}}
@@ -40,7 +41,7 @@ mapquery_args$refdata[[use]] = use
 outdir = dirname(outfile)
 if (is.null(split_by)) {
-    options(future.globals.maxSize = 80000 * 1024^2)
+    options(future.globals.maxSize = 8 * 1024 ^ 4)
     future::plan(strategy = "multicore", workers = ncores)
 }
@@ -98,6 +99,7 @@ if (refnorm == "SCTransform") {
 # Load Seurat object
 log_info("- Loading Seurat object")
 sobj = readRDS(sobjfile)
+defassay <- DefaultAssay(sobj)
 if (!is.null(mutaters) && length(mutaters) > 0) {
     log_info("- Applying mutaters")
@@ -126,43 +128,61 @@ if (!is.null(split_by)) {
 # Normalize data
 log_info("- Normalizing data")
 if (refnorm == "SCTransform") {
-    log_info("  Using SCTransform normalization")
-    sctransform_args$residual.features = rownames(x = reference)
-    if (is.null(split_by)) {
-        sctransform_args$object = sobj
-        query = do_call(SCTransform, sctransform_args)
+    if (defassay == "SCT" && skip_if_normalized) {
+        log_warn("  Skipping normalization as the object is already SCTransform'ed")
+        query = sobj
     } else {
-        query = mclapply(
-            X = sobj,
-            FUN = function(x) {
-                sctransform_args$object = x
-                do_call(SCTransform, sctransform_args)
-            },
-            mc.cores = ncores
-        )
-        if (any(unlist(lapply(query, class)) == "try-error")) {
-            stop(paste0("\nmclapply (SCTransform) error:", query))
+        log_info("  Using SCTransform normalization")
+        sctransform_args$residual.features = rownames(x = reference)
+        if (is.null(split_by)) {
+            sctransform_args$object = sobj
+            query = do_call(SCTransform, sctransform_args)
+            sctransform_args$object <- NULL
+            rm(sctransform_args)
+            gc()
+        } else {
+            query = mclapply(
+                X = sobj,
+                FUN = function(x) {
+                    sctransform_args$object = x
+                    do_call(SCTransform, sctransform_args)
+                },
+                mc.cores = ncores
+            )
+            if (any(unlist(lapply(query, class)) == "try-error")) {
+                stop(paste0("\nmclapply (SCTransform) error:", query))
+            }
         }
     }
 } else {
-    log_info("  Using NormalizeData normalization")
-    if (is.null(split_by)) {
-        normalizedata_args$object = sobj
-        query = do_call(NormalizeData, normalizedata_args)
+    if (defassay == "RNA" && skip_if_normalized) {
+        log_warn("  Skipping normalization as the object is already LogNormalize'd")
+        query = sobj
     } else {
-        query = mclapply(
-            X = sobj,
-            FUN = function(x) {
-                normalizedata_args$object = x
-                do_call(NormalizeData, normalizedata_args)
-            },
-            mc.cores = ncores
-        )
-        if (any(unlist(lapply(query, class)) == "try-error")) {
-            stop(paste0("\nmclapply (NormalizeData) error:", query))
+        log_info("  Using NormalizeData normalization")
+        if (is.null(split_by)) {
+            normalizedata_args$object = sobj
+            query = do_call(NormalizeData, normalizedata_args)
+        } else {
+            query = mclapply(
+                X = sobj,
+                FUN = function(x) {
+                    normalizedata_args$object = x
+                    do_call(NormalizeData, normalizedata_args)
+                },
+                mc.cores = ncores
+            )
+            if (any(unlist(lapply(query, class)) == "try-error")) {
+                stop(paste0("\nmclapply (NormalizeData) error:", query))
+            }
         }
+        normalizedata_args$object <- NULL
+        rm(normalizedata_args)
+        gc()
     }
 }
+rm(sobj)
+gc()
 # Find anchors between query and reference
 log_info("- Finding anchors")
@@ -170,6 +190,10 @@ findtransferanchors_args$reference = reference
 if (is.null(split_by)) {
     findtransferanchors_args$query = query
     anchors = do_call(FindTransferAnchors, findtransferanchors_args)
+    findtransferanchors_args$reference = NULL
+    findtransferanchors_args$query = NULL
+    rm(findtransferanchors_args)
+    gc()
 } else {
     anchors = mclapply(
         X = query,
@@ -191,6 +215,10 @@ if (is.null(split_by)) {
     mapquery_args$query = query
     mapquery_args$anchorset = anchors
     query = do_call(MapQuery, mapquery_args)
+    mapquery_args$reference = NULL
+    mapquery_args$query = NULL
+    mapquery_args$anchorset = NULL
+    gc()
 } else {
     query = mclapply(
         X = seq_along(query),
@@ -221,6 +249,9 @@ if (is.null(split_by)) {
         if (e$message == "subscript out of bounds") stop(mappingscore_sob_msg)
         stop(e)
     })
+    mappingscore_args$anchors = NULL
+    rm(mappingscore_args)
+    gc()
 } else {
     mappingscore = mclapply(
         X = seq_along(query),
@@ -266,6 +297,9 @@ if (is.null(split_by)) {
     # Combine the results
     log_info("- Merging the results")
+    gc()
+    # Memory efficient way to merge the results
+    # query = Reduce(function(x, y) merge(x, y, merge.dr = "ref.umap"), query)
     query = merge(query[[1]], query[2:length(query)], merge.dr = "ref.umap")
 }

biopipen/scripts/scrna/SeuratMetadataMutater.R CHANGED Viewed

@@ -1,5 +1,5 @@
-source("{{biopipen_dir}}/utils/misc.R")
-source("{{biopipen_dir}}/utils/mutate_helpers.R")
+{{ biopipen_dir | joinpaths: "utils", "misc.R" | source_r }}
+{{ biopipen_dir | joinpaths: "utils", "mutate_helpers.R" | source_r }}
 library(rlang)
 library(tibble)

biopipen/scripts/scrna/SeuratPreparing-common.R ADDED Viewed

@@ -0,0 +1,452 @@
+# Render a named list as one string of "name = value" entries joined by "; ".
+stringify_list <- function(x) {
+    entries <- sapply(names(x), function(key) paste(key, x[[key]], sep = " = "))
+    paste(entries, collapse = "; ")
+}
+# One-line summary of an argument list: the lines printed by str() are
+# captured and joined with ", ".
+format_args <- function(args) {
+    str_lines <- capture.output(str(args))
+    paste(str_lines, collapse = ", ")
+}
+# Fallback reader for a 10X directory whose files carry a prefix
+# (e.g. "<prefix>.barcodes.tsv.gz"): the three files are symlinked under
+# their canonical names in a fresh per-sample directory, which is then read.
+#   e      - condition passed in by the tryCatch() error handler (not used)
+#   sample - sample name; becomes the sub-directory under <joboutdir>/renamed
+#   path   - directory that holds the original files
+# Returns the result of Read10X() on the directory of symlinks.
+rename_files = function(e, sample, path) {
+    tmpdatadir = file.path(joboutdir, "renamed", sample)
+    # Always start from an empty directory
+    if (dir.exists(tmpdatadir)) unlink(tmpdatadir, recursive = TRUE)
+    dir.create(tmpdatadir, recursive = TRUE, showWarnings = FALSE)
+    # Symlink the first of `found` as `canonical` inside tmpdatadir
+    link_first = function(found, canonical) {
+        file.symlink(normalizePath(found[1]), file.path(tmpdatadir, canonical))
+    }
+    link_first(Sys.glob(file.path(path, "*barcodes.tsv.gz")), "barcodes.tsv.gz")
+    # The brace pattern is resolved with glob(), not Sys.glob()
+    link_first(glob(file.path(path, "*{genes,features}.tsv.gz")), "features.tsv.gz")
+    link_first(Sys.glob(file.path(path, "*matrix.mtx.gz")), "matrix.mtx.gz")
+    Read10X(data.dir = tmpdatadir)
+}
+# Flag cells against the QC expression in envs$cell_qc and drop the failures.
+#   sobj       - Seurat object (one sample or the merged object)
+#   per_sample - TRUE when called for a single sample; only changes the
+#                prefix of the log messages
+# Side effect: the Sample, .QC and `feats` columns of every cell (kept and
+# dropped) are appended to the global `cell_qc_df`.
+# Returns the object restricted to cells passing QC, with `.QC` removed.
+perform_cell_qc <- function(sobj, per_sample = FALSE) {
+    log_prefix <- ifelse(per_sample, "  ", "- ")
+    log_info("{log_prefix}Adding metadata for QC ...")
+    # QC metrics from PercentageFeatureSet() for four gene-name patterns
+    sobj$percent.mt <- PercentageFeatureSet(sobj, pattern = "^MT-")
+    sobj$percent.ribo <- PercentageFeatureSet(sobj, pattern = "^RP[SL]")
+    sobj$percent.hb <- PercentageFeatureSet(sobj, pattern = "^HB[^(P)]")
+    sobj$percent.plat <- PercentageFeatureSet(sobj, pattern = "PECAM1|PF4")
+    # With no criteria the expression "TRUE" marks every cell as passing
+    cell_qc <- envs$cell_qc
+    if (is.null(cell_qc) || length(cell_qc) == 0) {
+        log_warn("{log_prefix}No cell QC criteria is provided. All cells will be kept.")
+        cell_qc <- "TRUE"
+    }
+    sobj@meta.data <- mutate(sobj@meta.data, .QC = !!rlang::parse_expr(cell_qc))
+    # Record the verdict for all cells before any of them are removed
+    qc_cols <- sobj@meta.data[, c("Sample", ".QC", feats), drop = FALSE]
+    cell_qc_df <<- if (is.null(cell_qc_df)) qc_cols else rbind(cell_qc_df, qc_cols)
+    log_info("{log_prefix}Filtering cells using QC criteria ...")
+    sobj <- subset(sobj, subset = .QC)
+    sobj$.QC <- NULL
+    sobj
+}
+# Write the cell-QC section of the report and summarise the cell counts.
+# Reads the globals `cell_qc_df`, `feats`, `plotsdir` and `samples`.
+#   ngenes - value stored as-is in the nGenes column of the returned table
+# For every feature in `feats` a violin plot (x = Sample) is saved as PNG, and
+# for every feature except nCount_RNA a scatter plot against nCount_RNA; each
+# image is registered through add_report().
+# Returns the row-bound summaries of `cell_qc_df` before and after applying
+# the .QC flag (columns: when, nCells, nGenes).
+report_cell_qc = function(ngenes) {
+    # uses cell_qc_df
+    # Violin plots
+    log_info("- Plotting violin plots ...")
+    add_report(
+        list(
+            kind = "descr",
+            content = paste(
+                "The violin plots for each feature. The cells are grouped by sample.",
+                "The cells that fail the QC criteria are colored in red, and",
+                "the cells that pass the QC criteria are colored in black.",
+                "The cells that fail the QC criteria are filtered out in the returned Seurat object."
+            )
+        ),
+        h1 = "Violin Plots"
+    )
+    for (feat in feats) {
+        log_info("  For feature: {feat}")
+        # breaks = c(TRUE, FALSE): passing cells get "#181818", failing cells
+        # the first colour of pal_biopipen()
+        vln_p <- ggplot(cell_qc_df, aes(x = Sample, y = !!sym(feat), color = .QC)) +
+            geom_violin(fill = "white", width = 0.5) +
+            geom_jitter(width = 0.2, height = 0, alpha = 0.5) +
+            scale_color_manual(values = c("#181818", pal_biopipen()(1)), breaks = c(TRUE, FALSE)) +
+            labs(x = "Sample", y = feat) +
+            theme_minimal()
+        vlnplot = file.path(plotsdir, paste0(slugify(feat), ".vln.png"))
+        # Width grows with the number of samples on the x axis
+        png(
+            vlnplot,
+            width = 800 + length(samples) * 15, height = 600, res = 100
+        )
+        print(vln_p)
+        dev.off()
+        add_report(
+            list(
+                src = vlnplot,
+                name = feat,
+                descr = paste0("Distribution of ", feat, " for each sample.")
+            ),
+            h1 = "Violin Plots",
+            ui = "table_of_images"
+        )
+    }
+    # Scatter plots against nCount_RNA
+    log_info("- Plotting scatter plots ...")
+    add_report(
+        list(
+            kind = "descr",
+            content = paste(
+                "The scatter plots for each feature against nCount_RNA. ",
+                "The cells that fail the QC criteria are colored in red, and",
+                "the cells that pass the QC criteria are colored in black.",
+                "The cells that fail the QC criteria are filtered out in the returned Seurat object."
+            )
+        ),
+        h1 = "Scatter Plots"
+    )
+    # nCount_RNA is the x axis, so it is not plotted against itself
+    for (feat in setdiff(feats, "nCount_RNA")) {
+        log_info("  For feature: {feat}, against nCount_RNA")
+        scat_p <- ggplot(cell_qc_df, aes(x = nCount_RNA, y = !!sym(feat), color = .QC)) +
+            geom_point() +
+            scale_color_manual(values = c("#181818", pal_biopipen()(1)), breaks = c(TRUE, FALSE)) +
+            labs(x = "nCount_RNA", y = feat) +
+            theme_minimal()
+        scatfile = file.path(plotsdir, paste0(slugify(feat), "-nCount_RNA.scatter.png"))
+        png(scatfile, width = 800, height = 600, res = 100)
+        print(scat_p)
+        dev.off()
+        add_report(
+            list(
+                src = scatfile,
+                name = paste0(feat, " vs nCount_RNA"),
+                descr = paste0("Scatter plot for ", feat, " against nCount_RNA")
+            ),
+            h1 = "Scatter Plots",
+            ui = "table_of_images"
+        )
+    }
+    # return the dim_df calculated from the cell_qc_df
+    # (per-sample grouping is commented out, so the counts cover all cells)
+    rbind(
+        cell_qc_df %>%
+            # group_by(Sample) %>%
+            summarise(
+                when = "Before_Cell_QC",
+                nCells = dplyr::n(),
+                nGenes = ngenes
+            ) %>%
+            ungroup(),
+        cell_qc_df %>%
+            filter(.QC) %>%
+            # group_by(Sample) %>%
+            summarise(
+                when = "After_Cell_QC",
+                nCells = dplyr::n(),
+                nGenes = ngenes
+            ) %>%
+            ungroup()
+    )
+}
+# Load one sample into a Seurat object.
+# Reads the globals `metadata` (needs Sample and RNAData columns; presumably
+# one row per sample - confirm against the caller) and `envs`.
+#   sample - sample name to look up in metadata$Sample
+# Returns the Seurat object, with cell names prefixed by the sample name and
+# the remaining metadata columns attached; or NULL (with a warning) when the
+# sample has no usable RNAData path.
+load_sample = function(sample) {
+    log_info("- Loading sample: {sample} ...")
+    mdata = as.data.frame(metadata)[metadata$Sample == sample, , drop=TRUE]
+    path = as.character(mdata$RNAData)
+    # A sample without a metadata row yields a zero-length path. That case is
+    # tested first because `||` requires length-one operands, so the checks
+    # below would otherwise raise an error instead of reaching the warning.
+    if (length(path) == 0 || is.na(path) || !is.character(path) || nchar(path) == 0 || path == "NA") {
+        warning(paste0("No path found for sample: ", sample))
+        return (NULL)
+    }
+    if (dir.exists(path)) {
+        exprs = tryCatch(
+            # Read10X requires
+            # - barcodes.tsv.gz
+            # - genes.tsv.gz
+            # - matrix.mtx.gz
+            # But sometimes, they are prefixed with sample name
+            # e.g.GSM4143656_SAM24345863-ln1.barcodes.tsv.gz
+            { Read10X(data.dir = path) },
+            error = function(e) rename_files(e, sample, path)
+        )
+    } else {
+        # Not a directory: the path is read as a 10X HDF5 file
+        exprs = Read10X_h5(path)
+    }
+    # When several modalities are returned, keep only "Gene Expression"
+    if ("Gene Expression" %in% names(exprs)) {
+        exprs = exprs[["Gene Expression"]]
+    }
+    obj <- CreateSeuratObject(exprs, project=sample)
+    # filter the cells that don't have any gene expressions
+    # cell_exprs = colSums(obj@assays$RNA)
+    # obj = subset(obj, cells = names(cell_exprs[cell_exprs > 0]))
+    obj = RenameCells(obj, add.cell.id = sample)
+    # Attach meta data (the data-path columns are not copied)
+    for (mname in names(mdata)) {
+        if (mname %in% c("RNAData", "TCRData")) { next }
+        mdt = mdata[[mname]]
+        # Store factor columns by their labels
+        if (is.factor(mdt)) { mdt = levels(mdt)[mdt] }
+        obj[[mname]] = mdt
+    }
+    if (isTRUE(envs$cell_qc_per_sample)) {
+        log_info("- Perform cell QC for sample: {sample} ...")
+        obj = perform_cell_qc(obj, TRUE)
+    }
+    if (isTRUE(envs$use_sct)) {
+        # so that we have data and scale.data layers on RNA assay
+        # useful for visualization in case some genes are not in
+        # the SCT assay
+        obj = NormalizeData(obj, verbose = FALSE)
+        obj = FindVariableFeatures(obj, verbose = FALSE)
+        obj = ScaleData(obj, verbose = FALSE)
+    }
+    obj
+}
+# Gene-level QC with caching.
+# The cache entry "GeneQC" is keyed on the cell/gene QC settings and use_sct;
+# a cached object is returned directly when present.
+#   sobj - Seurat object after cell QC
+# Returns the object restricted to the genes that survive
+# envs$gene_qc$min_cells and envs$gene_qc$excludes (unchanged when neither
+# filter is configured).
+run_gene_qc <- function(sobj) {
+    cache_sig <- list(
+        cell_qc = envs$cell_qc,
+        gene_qc = envs$gene_qc,
+        cell_qc_per_sample = envs$cell_qc_per_sample,
+        use_sct = envs$use_sct
+    )
+    cached <- get_cached(cache_sig, "GeneQC", cache_dir)
+    if (!is.null(cached$data)) {
+        log_info("Loading gene-QC'ed object from cache ...")
+        return(cached$data)
+    }
+    log_info("Filtering genes ...")
+    genes <- rownames(sobj)
+    filtered <- FALSE
+    min_cells <- envs$gene_qc$min_cells
+    if (!is.null(min_cells) && min_cells > 0) {
+        genes <- genes[Matrix::rowSums(sobj) >= min_cells]
+        filtered <- TRUE
+    }
+    excludes <- envs$gene_qc$excludes
+    if (!is.null(excludes)) {
+        # A single string may hold several comma-separated patterns
+        if (length(excludes) == 1) {
+            excludes <- trimws(unlist(strsplit(excludes, ",")))
+        }
+        # Apply the patterns one after another to the surviving genes
+        genes <- Reduce(function(kept, ex) kept[!grepl(ex, kept)], excludes, genes)
+        filtered <- TRUE
+    }
+    if (filtered) {
+        sobj <- subset(sobj, features = genes)
+    }
+    cached$data <- sobj
+    save_to_cache(cached, "GeneQC", cache_dir)
+    sobj
+}
+# Load all samples, merge them and run cell QC, with caching.
+# The cache entry "CellQC" is keyed on cell_qc, cell_qc_per_sample and
+# use_sct; it stores both the Seurat object and the global `cell_qc_df`.
+#   sobj - ignored on input; overwritten by the cached or freshly built object
+# Side effect: sets the global `cell_qc_df` when loading from the cache.
+# Returns the merged, cell-QC'ed Seurat object.
+# Raises an error when none of the samples could be loaded.
+run_cell_qc <- function(sobj) {
+    cached <- get_cached(
+        list(cell_qc = envs$cell_qc, cell_qc_per_sample = envs$cell_qc_per_sample, use_sct = envs$use_sct),
+        "CellQC",
+        cache_dir
+    )
+    if (!is.null(cached$data)) {
+        log_info("Loading cell-QC'ed object from cache ...")
+        sobj <- cached$data$sobj
+        cell_qc_df <<- cached$data$cell_qc_df
+    } else {
+        # Load data
+        log_info("Reading samples individually ...")
+        obj_list = lapply(samples, load_sample)
+        # load_sample() returns NULL (after a warning) for samples without a
+        # usable path; those must not be handed to merge()
+        obj_list = Filter(Negate(is.null), obj_list)
+        if (length(obj_list) == 0) {
+            stop("No samples could be loaded; check the RNAData paths in the metadata.")
+        }
+        log_info("Merging samples ...")
+        sobj = Reduce(merge, obj_list)
+        rm(obj_list)
+        gc()
+        # isTRUE() mirrors the test in load_sample(): QC runs exactly once,
+        # either per sample there or on the merged object here, and an unset
+        # option no longer fails with "argument is of length zero"
+        if (!isTRUE(envs$cell_qc_per_sample)) {
+            log_info("Performing cell QC ...")
+            sobj = perform_cell_qc(sobj)
+        }
+        cached$data <- list(sobj = sobj, cell_qc_df = cell_qc_df)
+        save_to_cache(cached, "CellQC", cache_dir)
+    }
+    sobj
+}
+# Normalise/scale the object and run PCA, with caching.
+# The cache entry "Transformed" is keyed on `envs` minus the options that do
+# not influence this step (ncores, doublet detection, IntegrateLayers).
+#   sobj - Seurat object after gene QC
+# With envs$use_sct the object goes through SCTransform; otherwise through
+# NormalizeData, FindVariableFeatures and ScaleData. RunPCA follows in both
+# cases. Each step takes its arguments from the matching entry of `envs`.
+# Returns the transformed Seurat object.
+run_transformation <- function(sobj) {
+    envs_cache <- envs
+    envs_cache$ncores <- NULL
+    envs_cache$doublet_detector <- NULL
+    envs_cache$DoubletFinder <- NULL
+    envs_cache$scDblFinder <- NULL
+    envs_cache$IntegrateLayers <- NULL
+    cached <- get_cached(envs_cache, "Transformed", cache_dir)
+    if (!is.null(cached$data)) {
+        log_info("Loading transformed object from cache ...")
+        sobj <- cached$data
+    } else {
+        log_info("Performing transformation/scaling ...")
+        # Not joined yet
+        # sobj[["RNA"]] <- split(sobj[["RNA"]], f = sobj$Sample)
+        if (envs$use_sct) {
+            log_info("- Running SCTransform ...")
+            SCTransformArgs <- envs$SCTransform
+            # log to stdout but don't populate it to running log
+            print(paste0("  SCTransform: ", format_args(SCTransformArgs)))
+            log_debug("  SCTransform: {format_args(SCTransformArgs)}")
+            SCTransformArgs$object <- sobj
+            sobj <- do_call(SCTransform, SCTransformArgs)
+            # Default is to use the SCT assay
+            # Cleanup memory: drop the extra reference to the input object
+            SCTransformArgs$object <- NULL
+            rm(SCTransformArgs)
+            gc()
+        } else {
+            log_info("- Running NormalizeData ...")
+            NormalizeDataArgs <- envs$NormalizeData
+            print(paste0("  NormalizeData: ", format_args(NormalizeDataArgs)))
+            log_debug("  NormalizeData: {format_args(NormalizeDataArgs)}")
+            NormalizeDataArgs$object <- sobj
+            sobj <- do_call(NormalizeData, NormalizeDataArgs)
+            # Cleanup memory
+            NormalizeDataArgs$object <- NULL
+            rm(NormalizeDataArgs)
+            gc()
+            log_info("- Running FindVariableFeatures ...")
+            FindVariableFeaturesArgs <- envs$FindVariableFeatures
+            print(paste0("  FindVariableFeatures: ", format_args(FindVariableFeaturesArgs)))
+            log_debug("  FindVariableFeatures: {format_args(FindVariableFeaturesArgs)}")
+            FindVariableFeaturesArgs$object <- sobj
+            sobj <- do_call(FindVariableFeatures, FindVariableFeaturesArgs)
+            # Cleanup memory
+            FindVariableFeaturesArgs$object <- NULL
+            rm(FindVariableFeaturesArgs)
+            gc()
+            log_info("- Running ScaleData ...")
+            ScaleDataArgs <- envs$ScaleData
+            print(paste0("  ScaleData: ", format_args(ScaleDataArgs)))
+            log_debug("  ScaleData: {format_args(ScaleDataArgs)}")
+            ScaleDataArgs$object <- sobj
+            sobj <- do_call(ScaleData, ScaleDataArgs)
+            # Cleanup memory
+            ScaleDataArgs$object <- NULL
+            rm(ScaleDataArgs)
+            gc()
+        }
+        log_info("- Running RunPCA ...")
+        RunPCAArgs <- envs$RunPCA
+        # npcs can be at most (number of cells - 1). The limit is applied to
+        # the default of 50 as well, so that small objects do not request more
+        # components than can be computed.
+        requested_npcs <- if (is.null(RunPCAArgs$npcs)) { 50 } else { RunPCAArgs$npcs }
+        RunPCAArgs$npcs <- min(requested_npcs, ncol(sobj) - 1)
+        print(paste0("  RunPCA: ", format_args(RunPCAArgs)))
+        log_debug("  RunPCA: {format_args(RunPCAArgs)}")
+        RunPCAArgs$object <- sobj
+        sobj <- do_call(RunPCA, RunPCAArgs)
+        # Cleanup memory
+        RunPCAArgs$object <- NULL
+        rm(RunPCAArgs)
+        gc()
+        cached$data <- sobj
+        save_to_cache(cached, "Transformed", cache_dir)
+    }
+    sobj
+}
+# Integrate the layers (unless envs$no_integration) and join them, with
+# caching. The cache entry "Integrated" is keyed on `envs` minus ncores and
+# the doublet-detection options.
+#   sobj - Seurat object after run_transformation()
+# The name of the new reduction is stored in sobj@misc$integrated_new_reduction.
+# Layers are joined only when SCTransform is not in use.
+# Returns the integrated / layer-joined Seurat object.
+run_integration <- function(sobj) {
+    envs_cache <- envs
+    for (ignored in c("ncores", "doublet_detector", "DoubletFinder", "scDblFinder")) {
+        envs_cache[[ignored]] <- NULL
+    }
+    cached <- get_cached(envs_cache, "Integrated", cache_dir)
+    if (!is.null(cached$data)) {
+        log_info("Loading integrated/layer-joined object from cache ...")
+        return(cached$data)
+    }
+    if (!envs$no_integration) {
+        log_info("- Running IntegrateLayers (method = {envs$IntegrateLayers$method}) ...")
+        IntegrateLayersArgs <- envs$IntegrateLayers
+        method <- IntegrateLayersArgs$method
+        # Reference samples given by name are converted to positions in `samples`
+        if (!is.null(IntegrateLayersArgs$reference) && is.character(IntegrateLayersArgs$reference)) {
+            log_info("  Using reference samples: {paste(IntegrateLayersArgs$reference, collapse = ', ')}")
+            IntegrateLayersArgs$reference <- match(IntegrateLayersArgs$reference, samples)
+            log_info("  Transferred to indices: {paste(IntegrateLayersArgs$reference, collapse = ', ')}")
+        }
+        # Accepted spellings -> name of the integration function
+        method_aliases <- list(
+            CCA = "CCAIntegration", cca = "CCAIntegration",
+            RPCA = "RPCAIntegration", rpca = "RPCAIntegration",
+            Harmony = "HarmonyIntegration", harmony = "HarmonyIntegration",
+            FastMNN = "FastMNNIntegration", fastmnn = "FastMNNIntegration",
+            scVI = "scVIIntegration", scvi = "scVIIntegration"
+        )
+        if (!(method %in% names(method_aliases))) {
+            stop(paste0("Unknown integration method: ", method))
+        }
+        method <- method_aliases[[method]]
+        if (envs$use_sct && is.null(IntegrateLayersArgs$normalization.method)) {
+            IntegrateLayersArgs$normalization.method <- "SCT"
+        }
+        # Resolve the function object from its (whitelisted) name
+        IntegrateLayersArgs$method <- eval(parse(text = method))
+        # Default reduction name per method, unless the caller chose one
+        if (is.null(IntegrateLayersArgs$new.reduction)) {
+            IntegrateLayersArgs$new.reduction <- switch(
+                method,
+                CCAIntegration = "integrated.cca",
+                RPCAIntegration = "integrated.rpca",
+                HarmonyIntegration = "harmony",
+                FastMNNIntegration = "integration.mnn",
+                scVIIntegration = "integrated.scvi"
+            )
+        }
+        print(paste0("  IntegrateLayers: ", format_args(IntegrateLayersArgs)))
+        log_debug("  IntegrateLayers: {format_args(IntegrateLayersArgs)}")
+        IntegrateLayersArgs$object <- sobj
+        sobj <- do_call(IntegrateLayers, IntegrateLayersArgs)
+        # Save it for dimension reduction plots
+        sobj@misc$integrated_new_reduction <- IntegrateLayersArgs$new.reduction
+        # Cleanup memory
+        IntegrateLayersArgs$object <- NULL
+        rm(IntegrateLayersArgs)
+        gc()
+    }
+    if (!envs$use_sct) {
+        log_info("- Joining layers ...")
+        sobj <- JoinLayers(sobj)
+    }
+    cached$data <- sobj
+    save_to_cache(cached, "Integrated", cache_dir)
+    sobj
+}

biopipen 0.29.2__py3-none-any.whl → 0.30.0__py3-none-any.whl

Potentially problematic release.

biopipen 0.29.2__py3-none-any.whl → 0.30.0__py3-none-any.whl