PyPI - biopipen - Versions diffs - 0.29.2__py3-none-any.whl → 0.31.0__py3-none-any.whl - Mend

biopipen 0.29.2__py3-none-any.whl → 0.31.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.

This version of biopipen might be problematic. Click here for more details.

Files changed (106) hide show

biopipen/__init__.py +1 -1
biopipen/core/config.toml +2 -0
biopipen/core/filters.py +21 -0
biopipen/ns/plot.py +55 -0
biopipen/ns/scrna.py +110 -21
biopipen/ns/web.py +87 -5
biopipen/scripts/bam/CNAClinic.R +2 -1
biopipen/scripts/cellranger/CellRangerCount.py +3 -3
biopipen/scripts/cellranger/CellRangerSummary.R +2 -1
biopipen/scripts/cnv/AneuploidyScore.R +1 -1
biopipen/scripts/cnv/AneuploidyScoreSummary.R +2 -2
biopipen/scripts/delim/RowsBinder.R +1 -1
biopipen/scripts/delim/SampleInfo.R +3 -2
biopipen/scripts/gene/GeneNameConversion.R +2 -2
biopipen/scripts/gsea/Enrichr.R +3 -3
biopipen/scripts/gsea/FGSEA.R +2 -2
biopipen/scripts/gsea/GSEA.R +2 -2
biopipen/scripts/gsea/PreRank.R +2 -2
biopipen/scripts/plot/Heatmap.R +3 -3
biopipen/scripts/plot/Manhattan.R +2 -1
biopipen/scripts/plot/QQPlot.R +1 -1
biopipen/scripts/plot/ROC.R +1 -1
biopipen/scripts/plot/Scatter.R +112 -0
biopipen/scripts/plot/VennDiagram.R +3 -3
biopipen/scripts/regulatory/MotifAffinityTest.R +3 -7
biopipen/scripts/rnaseq/Simulation.R +1 -1
biopipen/scripts/rnaseq/UnitConversion.R +2 -1
biopipen/scripts/scrna/AnnData2Seurat.R +1 -1
biopipen/scripts/scrna/CellTypeAnnotation-celltypist.R +24 -8
biopipen/scripts/scrna/CellTypeAnnotation-common.R +10 -0
biopipen/scripts/scrna/CellTypeAnnotation-direct.R +9 -1
biopipen/scripts/scrna/CellTypeAnnotation-hitype.R +12 -8
biopipen/scripts/scrna/CellTypeAnnotation-sccatch.R +15 -2
biopipen/scripts/scrna/CellTypeAnnotation-sctype.R +38 -15
biopipen/scripts/scrna/CellTypeAnnotation.R +3 -0
biopipen/scripts/scrna/CellsDistribution.R +4 -3
biopipen/scripts/scrna/DimPlots.R +1 -1
biopipen/scripts/scrna/ExprImputation-alra.R +1 -1
biopipen/scripts/scrna/MarkersFinder.R +5 -5
biopipen/scripts/scrna/MetaMarkers.R +4 -4
biopipen/scripts/scrna/ModuleScoreCalculator.R +2 -1
biopipen/scripts/scrna/RadarPlots.R +1 -1
biopipen/scripts/scrna/ScFGSEA.R +4 -3
biopipen/scripts/scrna/ScSimulation.R +64 -0
biopipen/scripts/scrna/Seurat2AnnData.R +1 -1
biopipen/scripts/scrna/SeuratClusterStats-clustree.R +73 -0
biopipen/scripts/scrna/SeuratClusterStats-dimplots.R +4 -3
biopipen/scripts/scrna/SeuratClusterStats-features.R +6 -5
biopipen/scripts/scrna/SeuratClusterStats-hists.R +6 -5
biopipen/scripts/scrna/SeuratClusterStats-ngenes.R +4 -3
biopipen/scripts/scrna/SeuratClusterStats-stats.R +20 -25
biopipen/scripts/scrna/SeuratClusterStats.R +24 -8
biopipen/scripts/scrna/SeuratClustering-common.R +213 -0
biopipen/scripts/scrna/SeuratClustering.R +10 -170
biopipen/scripts/scrna/SeuratMap2Ref.R +98 -54
biopipen/scripts/scrna/SeuratMetadataMutater.R +2 -2
biopipen/scripts/scrna/SeuratPreparing-common.R +452 -0
biopipen/scripts/scrna/SeuratPreparing-doublet_detection.R +201 -0
biopipen/scripts/scrna/SeuratPreparing.R +22 -562
biopipen/scripts/scrna/SeuratSubClustering.R +24 -39
biopipen/scripts/scrna/TopExpressingGenes.R +1 -1
biopipen/scripts/scrna_metabolic_landscape/MetabolicFeatures.R +2 -2
biopipen/scripts/scrna_metabolic_landscape/MetabolicFeaturesIntraSubset.R +2 -2
biopipen/scripts/scrna_metabolic_landscape/MetabolicPathwayActivity.R +3 -3
biopipen/scripts/scrna_metabolic_landscape/MetabolicPathwayHeterogeneity.R +3 -3
biopipen/scripts/snp/MatrixEQTL.R +1 -1
biopipen/scripts/snp/PlinkCallRate.R +2 -2
biopipen/scripts/snp/PlinkFreq.R +2 -2
biopipen/scripts/snp/PlinkHWE.R +2 -2
biopipen/scripts/snp/PlinkHet.R +2 -2
biopipen/scripts/snp/PlinkIBD.R +2 -2
biopipen/scripts/stats/ChowTest.R +1 -1
biopipen/scripts/stats/DiffCoexpr.R +1 -1
biopipen/scripts/stats/LiquidAssoc.R +1 -1
biopipen/scripts/stats/Mediation.R +11 -9
biopipen/scripts/stats/MetaPvalue.R +4 -1
biopipen/scripts/stats/MetaPvalue1.R +4 -1
biopipen/scripts/tcr/Attach2Seurat.R +1 -1
biopipen/scripts/tcr/CDR3AAPhyschem.R +1 -1
biopipen/scripts/tcr/CloneResidency.R +2 -2
biopipen/scripts/tcr/CloneSizeQQPlot.R +2 -2
biopipen/scripts/tcr/Immunarch-basic.R +0 -4
biopipen/scripts/tcr/Immunarch-clonality.R +0 -4
biopipen/scripts/tcr/Immunarch-diversity.R +2 -24
biopipen/scripts/tcr/Immunarch-geneusage.R +0 -2
biopipen/scripts/tcr/Immunarch-kmer.R +0 -2
biopipen/scripts/tcr/Immunarch-overlap.R +0 -2
biopipen/scripts/tcr/Immunarch-spectratyping.R +0 -2
biopipen/scripts/tcr/Immunarch-tracking.R +0 -2
biopipen/scripts/tcr/Immunarch-vjjunc.R +0 -2
biopipen/scripts/tcr/Immunarch.R +43 -11
biopipen/scripts/tcr/ImmunarchFilter.R +1 -1
biopipen/scripts/tcr/ImmunarchLoading.R +2 -2
biopipen/scripts/tcr/SampleDiversity.R +1 -1
biopipen/scripts/tcr/TCRClusterStats.R +2 -2
biopipen/scripts/tcr/TCRClustering.R +2 -2
biopipen/scripts/tcr/TESSA.R +2 -2
biopipen/scripts/vcf/TruvariBenchSummary.R +2 -2
biopipen/scripts/vcf/TruvariConsistency.R +1 -1
biopipen/scripts/web/GCloudStorageDownloadBucket.py +82 -0
biopipen/scripts/web/GCloudStorageDownloadFile.py +23 -0
biopipen/scripts/web/gcloud_common.py +49 -0
{biopipen-0.29.2.dist-info → biopipen-0.31.0.dist-info}/METADATA +7 -7
{biopipen-0.29.2.dist-info → biopipen-0.31.0.dist-info}/RECORD +106 -96
{biopipen-0.29.2.dist-info → biopipen-0.31.0.dist-info}/WHEEL +0 -0
{biopipen-0.29.2.dist-info → biopipen-0.31.0.dist-info}/entry_points.txt +0 -0

biopipen/scripts/scrna/SeuratPreparing-common.R ADDED Viewed

@@ -0,0 +1,452 @@
+stringify_list <- function(x) {
+    # Render a named list as "name = value" pairs joined by "; ".
+    keys <- names(x)
+    pairs <- sapply(keys, function(key) paste(key, x[[key]], sep = " = "))
+    paste(pairs, collapse = "; ")
+}
+format_args <- function(args) {
+    # Capture the str() dump of `args` and flatten it into one ", "-separated line.
+    structure_lines <- capture.output(str(args))
+    paste(structure_lines, collapse = ", ")
+}
+rename_files = function(e, sample, path) {
+    # Stage the 10X files found in `path` under the canonical names
+    # (barcodes/features/matrix) inside <joboutdir>/renamed/<sample> via
+    # symlinks, then read that directory with Read10X().
+    # `e` is accepted but not used in the body.
+    renamed_dir = file.path(joboutdir, "renamed", sample)
+    # Start from a clean staging directory on every call
+    if (dir.exists(renamed_dir)) {
+        unlink(renamed_dir, recursive = TRUE)
+    }
+    dir.create(renamed_dir, recursive = TRUE, showWarnings = FALSE)
+    # Symlink one source file into the staging directory under `canonical`
+    link_as = function(src, canonical) {
+        file.symlink(normalizePath(src), file.path(renamed_dir, canonical))
+    }
+    # Only the first match of each pattern is used
+    link_as(Sys.glob(file.path(path, "*barcodes.tsv.gz"))[1], "barcodes.tsv.gz")
+    # `glob` (not Sys.glob) is used here with a brace pattern; it is defined
+    # outside this block -- presumably a brace-expanding glob, TODO confirm
+    link_as(glob(file.path(path, "*{genes,features}.tsv.gz"))[1], "features.tsv.gz")
+    link_as(Sys.glob(file.path(path, "*matrix.mtx.gz"))[1], "matrix.mtx.gz")
+    Read10X(data.dir = renamed_dir)
+}
+# Annotate cells with QC metrics, flag them with the `envs$cell_qc` expression
+# and return the object restricted to the cells that pass.
+#
+# sobj:       object to annotate and filter (presumably a Seurat object; it is
+#             passed to PercentageFeatureSet() and subset()).
+# per_sample: only changes the prefix used for log messages in this function.
+#
+# Side effect: appends the Sample, .QC and `feats` columns of the metadata to
+# the variable `cell_qc_df` in an enclosing scope (via `<<-`). Both
+# `cell_qc_df` and `feats` are defined outside this block.
+perform_cell_qc <- function(sobj, per_sample = FALSE) {
+    log_prefix <- ifelse(per_sample, "  ", "- ")
+    log_info("{log_prefix}Adding metadata for QC ...")
+    # Percentage of counts from features whose names match each pattern
+    sobj$percent.mt <- PercentageFeatureSet(sobj, pattern = "^MT-")
+    sobj$percent.ribo <- PercentageFeatureSet(sobj, pattern = "^RP[SL]")
+    sobj$percent.hb <- PercentageFeatureSet(sobj, pattern = "^HB[^(P)]")
+    sobj$percent.plat <- PercentageFeatureSet(sobj, pattern = "PECAM1|PF4")
+    if (is.null(envs$cell_qc) || length(envs$cell_qc) == 0) {
+        log_warn("{log_prefix}No cell QC criteria is provided. All cells will be kept.")
+        # The literal expression TRUE marks every cell as passing
+        cell_qc <- "TRUE"
+    } else {
+        cell_qc <- envs$cell_qc
+    }
+    # Parse the criteria string into an expression and evaluate it against the
+    # metadata columns to obtain the per-cell pass/fail flag
+    sobj@meta.data <- sobj@meta.data %>% mutate(.QC = !!rlang::parse_expr(cell_qc))
+    # Accumulate the QC table across calls (first call initializes it)
+    if (is.null(cell_qc_df)) {
+        cell_qc_df <<- sobj@meta.data[, c("Sample", ".QC", feats), drop = FALSE]
+    } else {
+        cell_qc_df <<- rbind(cell_qc_df, sobj@meta.data[, c("Sample", ".QC", feats), drop = FALSE])
+    }
+    # Do the filtering
+    log_info("{log_prefix}Filtering cells using QC criteria ...")
+    sobj <- subset(sobj, subset = .QC)
+    # The helper column is not kept on the returned object
+    sobj$.QC <- NULL
+    return(sobj)
+}
+# Plot the accumulated cell-QC table and register the plots in the report.
+#
+# For every entry of `feats` a violin plot (grouped by Sample) is written to
+# `plotsdir`; for every feature except nCount_RNA a scatter plot against
+# nCount_RNA is written as well. Each image is registered via add_report().
+#
+# ngenes: value copied verbatim into the `nGenes` column of the result.
+#
+# Returns a two-row data frame (when = "Before_Cell_QC" / "After_Cell_QC")
+# with the number of rows of `cell_qc_df` before and after keeping `.QC` rows.
+# `cell_qc_df`, `feats`, `plotsdir` and `samples` are defined outside this block.
+report_cell_qc = function(ngenes) {
+    # uses cell_qc_df
+    # Violin plots
+    log_info("- Plotting violin plots ...")
+    add_report(
+        list(
+            kind = "descr",
+            content = paste(
+                "The violin plots for each feature. The cells are grouped by sample.",
+                "The cells that fail the QC criteria are colored in red, and",
+                "the cells that pass the QC criteria are colored in black.",
+                "The cells that fail the QC criteria are filtered out in the returned Seurat object."
+            )
+        ),
+        h1 = "Violin Plots"
+    )
+    for (feat in feats) {
+        log_info("  For feature: {feat}")
+        # `breaks = c(TRUE, FALSE)` pairs the first color with passing cells and
+        # the second (first color of pal_biopipen()) with failing cells
+        vln_p <- ggplot(cell_qc_df, aes(x = Sample, y = !!sym(feat), color = .QC)) +
+            geom_violin(fill = "white", width = 0.5) +
+            geom_jitter(width = 0.2, height = 0, alpha = 0.5) +
+            scale_color_manual(values = c("#181818", pal_biopipen()(1)), breaks = c(TRUE, FALSE)) +
+            labs(x = "Sample", y = feat) +
+            theme_minimal()
+        vlnplot = file.path(plotsdir, paste0(slugify(feat), ".vln.png"))
+        # Widen the canvas by 15px per sample
+        png(
+            vlnplot,
+            width = 800 + length(samples) * 15, height = 600, res = 100
+        )
+        print(vln_p)
+        dev.off()
+        add_report(
+            list(
+                src = vlnplot,
+                name = feat,
+                descr = paste0("Distribution of ", feat, " for each sample.")
+            ),
+            h1 = "Violin Plots",
+            ui = "table_of_images"
+        )
+    }
+    # Scatter plots against nCount_RNA
+    log_info("- Plotting scatter plots ...")
+    add_report(
+        list(
+            kind = "descr",
+            content = paste(
+                "The scatter plots for each feature against nCount_RNA. ",
+                "The cells that fail the QC criteria are colored in red, and",
+                "the cells that pass the QC criteria are colored in black.",
+                "The cells that fail the QC criteria are filtered out in the returned Seurat object."
+            )
+        ),
+        h1 = "Scatter Plots"
+    )
+    # nCount_RNA itself is skipped: it is the x axis of every scatter plot
+    for (feat in setdiff(feats, "nCount_RNA")) {
+        log_info("  For feature: {feat}, against nCount_RNA")
+        scat_p <- ggplot(cell_qc_df, aes(x = nCount_RNA, y = !!sym(feat), color = .QC)) +
+            geom_point() +
+            scale_color_manual(values = c("#181818", pal_biopipen()(1)), breaks = c(TRUE, FALSE)) +
+            labs(x = "nCount_RNA", y = feat) +
+            theme_minimal()
+        scatfile = file.path(plotsdir, paste0(slugify(feat), "-nCount_RNA.scatter.png"))
+        png(scatfile, width = 800, height = 600, res = 100)
+        print(scat_p)
+        dev.off()
+        add_report(
+            list(
+                src = scatfile,
+                name = paste0(feat, " vs nCount_RNA"),
+                descr = paste0("Scatter plot for ", feat, " against nCount_RNA")
+            ),
+            h1 = "Scatter Plots",
+            ui = "table_of_images"
+        )
+    }
+    # return the dim_df calculated from the cell_qc_df
+    # (grouping by Sample is commented out, so each summary is a single row
+    # covering all samples)
+    rbind(
+        cell_qc_df %>%
+            # group_by(Sample) %>%
+            summarise(
+                when = "Before_Cell_QC",
+                nCells = dplyr::n(),
+                nGenes = ngenes
+            ) %>%
+            ungroup(),
+        cell_qc_df %>%
+            filter(.QC) %>%
+            # group_by(Sample) %>%
+            summarise(
+                when = "After_Cell_QC",
+                nCells = dplyr::n(),
+                nGenes = ngenes
+            ) %>%
+            ungroup()
+    )
+}
+# Load the expression data of one sample and wrap it in a Seurat object.
+#
+# sample: value matched against the `Sample` column of `metadata` (defined
+#         outside this block); also used as project name and cell-id prefix.
+#
+# Returns the Seurat object, or NULL (with a warning) when the sample has no
+# usable `RNAData` path. Depending on `envs`, per-sample cell QC and
+# NormalizeData/FindVariableFeatures/ScaleData are applied before returning.
+load_sample = function(sample) {
+    log_info("- Loading sample: {sample} ...")
+    mdata = as.data.frame(metadata)[metadata$Sample == sample, , drop=TRUE]
+    path = as.character(mdata$RNAData)
+    if (is.na(path) || !is.character(path) || nchar(path) == 0 || path == "NA") {
+        warning(paste0("No path found for sample: ", sample))
+        return (NULL)
+    }
+    # obj_list = list()
+    # A directory is read with Read10X(); anything else is handed to Read10X_h5()
+    if (dir.exists(path)) {
+        exprs = tryCatch(
+            # Read10X requires
+            # - barcodes.tsv.gz
+            # - genes.tsv.gz
+            # - matrix.mtx.gz
+            # But sometimes, they are prefixed with sample name
+            # e.g.GSM4143656_SAM24345863-ln1.barcodes.tsv.gz
+            { Read10X(data.dir = path) },
+            # On any error, retry through symlinks with the canonical names
+            error = function(e) rename_files(e, sample, path)
+        )
+    } else {
+        exprs = Read10X_h5(path)
+    }
+    # When the reader returns several named entries, keep only "Gene Expression"
+    if ("Gene Expression" %in% names(exprs)) {
+        exprs = exprs[["Gene Expression"]]
+    }
+    obj <- CreateSeuratObject(exprs, project=sample)
+    # filter the cells that don't have any gene expressions
+    # cell_exprs = colSums(obj@assays$RNA)
+    # obj = subset(obj, cells = names(cell_exprs[cell_exprs > 0]))
+    obj = RenameCells(obj, add.cell.id = sample)
+    # Attach meta data
+    for (mname in names(mdata)) {
+        # The data-path columns are not copied onto the object
+        if (mname %in% c("RNAData", "TCRData")) { next }
+        mdt = mdata[[mname]]
+        # Store factor columns by their labels
+        if (is.factor(mdt)) { mdt = levels(mdt)[mdt] }
+        obj[[mname]] = mdt
+    }
+    if (isTRUE(envs$cell_qc_per_sample)) {
+        log_info("- Perform cell QC for sample: {sample} ...")
+        obj = perform_cell_qc(obj, TRUE)
+    }
+    if (isTRUE(envs$use_sct)) {
+        # so that we have data and scale.data layers on RNA assay
+        # useful for visualization in case some genes are not in
+        # the SCT assay
+        obj = NormalizeData(obj, verbose = FALSE)
+        obj = FindVariableFeatures(obj, verbose = FALSE)
+        obj = ScaleData(obj, verbose = FALSE)
+    }
+    obj
+}
+# Filter the features (genes) of `sobj` according to `envs$gene_qc`, with caching.
+#
+# The cache entry "GeneQC" is keyed on cell_qc, gene_qc, cell_qc_per_sample and
+# use_sct; when it holds data, that object is returned instead of `sobj`.
+# Otherwise genes are filtered by `min_cells` and by the `excludes` patterns,
+# and the result is stored in the cache before being returned.
+run_gene_qc <- function(sobj) {
+    cached <- get_cached(
+        list(
+            cell_qc = envs$cell_qc,
+            gene_qc = envs$gene_qc,
+            cell_qc_per_sample = envs$cell_qc_per_sample,
+            use_sct = envs$use_sct
+        ),
+        "GeneQC",
+        cache_dir
+    )
+    if (!is.null(cached$data)) {
+        log_info("Loading gene-QC'ed object from cache ...")
+        sobj <- cached$data
+    } else {
+        log_info("Filtering genes ...")
+        genes <- rownames(sobj)
+        # Tracks whether any criterion applied, so subset() is skipped otherwise
+        filtered <- FALSE
+        if (!is.null(envs$gene_qc$min_cells) && envs$gene_qc$min_cells > 0) {
+            # NOTE(review): this compares per-gene row sums with min_cells;
+            # confirm whether a count of expressing cells was intended
+            genes = genes[Matrix::rowSums(sobj) >= envs$gene_qc$min_cells]
+            filtered <- TRUE
+        }
+        excludes <- envs$gene_qc$excludes
+        if (!is.null(excludes)) {
+            # A single string may carry several comma-separated patterns
+            if (length(excludes) == 1) {
+                excludes <- trimws(unlist(strsplit(excludes, ",")))
+            }
+            # Each entry is used as a regular expression against the gene names
+            for (ex in excludes) {
+                genes <- genes[!grepl(ex, genes)]
+            }
+            filtered <- TRUE
+        }
+        if (filtered) {
+            sobj = subset(sobj, features = genes)
+        }
+        cached$data <- sobj
+        save_to_cache(cached, "GeneQC", cache_dir)
+    }
+    sobj
+}
+# Load all samples, merge them and apply cell QC, with caching.
+#
+# sobj: ignored on a cache miss (the object is rebuilt from `samples`); kept
+#       for a uniform step signature.
+#
+# The cache entry "CellQC" is keyed on cell_qc, cell_qc_per_sample and use_sct
+# and stores both the Seurat object and the QC table. On a cache hit the
+# variable `cell_qc_df` in an enclosing scope is restored via `<<-`.
+# Returns the merged (and, unless QC ran per sample, QC-filtered) object.
+# Stops with an error when no sample could be loaded.
+run_cell_qc <- function(sobj) {
+    cached <- get_cached(
+        list(cell_qc = envs$cell_qc, cell_qc_per_sample = envs$cell_qc_per_sample, use_sct = envs$use_sct),
+        "CellQC",
+        cache_dir
+    )
+    if (!is.null(cached$data)) {
+        log_info("Loading cell-QC'ed object from cache ...")
+        sobj <- cached$data$sobj
+        cell_qc_df <<- cached$data$cell_qc_df
+    } else {
+        # Load data
+        log_info("Reading samples individually ...")
+        obj_list = lapply(samples, load_sample)
+        # load_sample() returns NULL (with a warning) for samples without a
+        # data path; those entries cannot be merged and are dropped here
+        obj_list = Filter(Negate(is.null), obj_list)
+        if (length(obj_list) == 0) {
+            stop("No samples could be loaded; check the RNAData paths in the metadata.")
+        }
+        log_info("Merging samples ...")
+        sobj = Reduce(merge, obj_list)
+        rm(obj_list)
+        gc()
+        # isTRUE() mirrors the check in load_sample() and keeps an unset
+        # (NULL) option from raising "argument is of length zero"
+        if (!isTRUE(envs$cell_qc_per_sample)) {
+            log_info("Performing cell QC ...")
+            sobj = perform_cell_qc(sobj)
+        }
+        cached$data <- list(sobj = sobj, cell_qc_df = cell_qc_df)
+        save_to_cache(cached, "CellQC", cache_dir)
+    }
+    sobj
+}
+# Normalize/scale `sobj` and run PCA, with caching.
+#
+# With `envs$use_sct` the object goes through SCTransform; otherwise through
+# NormalizeData, FindVariableFeatures and ScaleData. RunPCA is run in both
+# cases. Each step takes its arguments from the `envs` entry of the same name.
+# The cache entry "Transformed" is keyed on `envs` minus the ncores,
+# doublet-detection and IntegrateLayers entries removed below.
+# Returns the transformed object (from cache when available).
+run_transformation <- function(sobj) {
+    # Drop the options that should not invalidate this step's cache
+    envs_cache <- envs
+    envs_cache$ncores <- NULL
+    envs_cache$doublet_detector <- NULL
+    envs_cache$DoubletFinder <- NULL
+    envs_cache$scDblFinder <- NULL
+    envs_cache$IntegrateLayers <- NULL
+    cached <- get_cached(envs_cache, "Transformed", cache_dir)
+    if (!is.null(cached$data)) {
+        log_info("Loading transformed object from cache ...")
+        sobj <- cached$data
+    } else {
+        log_info("Performing transformation/scaling ...")
+        # Not joined yet
+        # sobj[["RNA"]] <- split(sobj[["RNA"]], f = sobj$Sample)
+        if (envs$use_sct) {
+            log_info("- Running SCTransform ...")
+            SCTransformArgs <- envs$SCTransform
+            # log to stdout but don't populate it to running log
+            print(paste0("  SCTransform: ", format_args(SCTransformArgs)))
+            log_debug("  SCTransform: {format_args(SCTransformArgs)}")
+            # The object is added after logging so it is not part of the dump
+            SCTransformArgs$object <- sobj
+            sobj <- do_call(SCTransform, SCTransformArgs)
+            # Default is to use the SCT assay
+            # Cleanup memory
+            SCTransformArgs$object <- NULL
+            rm(SCTransformArgs)
+            gc()
+        } else {
+            log_info("- Running NormalizeData ...")
+            NormalizeDataArgs <- envs$NormalizeData
+            print(paste0("  NormalizeData: ", format_args(NormalizeDataArgs)))
+            log_debug("  NormalizeData: {format_args(NormalizeDataArgs)}")
+            NormalizeDataArgs$object <- sobj
+            sobj <- do_call(NormalizeData, NormalizeDataArgs)
+            # Cleanup memory
+            NormalizeDataArgs$object <- NULL
+            rm(NormalizeDataArgs)
+            gc()
+            log_info("- Running FindVariableFeatures ...")
+            FindVariableFeaturesArgs <- envs$FindVariableFeatures
+            print(paste0("  FindVariableFeatures: ", format_args(FindVariableFeaturesArgs)))
+            log_debug("  FindVariableFeatures: {format_args(FindVariableFeaturesArgs)}")
+            FindVariableFeaturesArgs$object <- sobj
+            sobj <- do_call(FindVariableFeatures, FindVariableFeaturesArgs)
+            # Cleanup memory
+            FindVariableFeaturesArgs$object <- NULL
+            rm(FindVariableFeaturesArgs)
+            gc()
+            log_info("- Running ScaleData ...")
+            ScaleDataArgs <- envs$ScaleData
+            print(paste0("  ScaleData: ", format_args(ScaleDataArgs)))
+            log_debug("  ScaleData: {format_args(ScaleDataArgs)}")
+            ScaleDataArgs$object <- sobj
+            sobj <- do_call(ScaleData, ScaleDataArgs)
+            # Cleanup memory
+            ScaleDataArgs$object <- NULL
+            rm(ScaleDataArgs)
+            gc()
+        }
+        log_info("- Running RunPCA ...")
+        RunPCAArgs <- envs$RunPCA
+        # npcs defaults to 50 when unset; a user-supplied value is capped at
+        # (number of columns of sobj - 1)
+        RunPCAArgs$npcs <- if (is.null(RunPCAArgs$npcs)) { 50 } else { min(RunPCAArgs$npcs, ncol(sobj) - 1) }
+        print(paste0("  RunPCA: ", format_args(RunPCAArgs)))
+        log_debug("  RunPCA: {format_args(RunPCAArgs)}")
+        RunPCAArgs$object <- sobj
+        sobj <- do_call(RunPCA, RunPCAArgs)
+        # Cleanup memory
+        RunPCAArgs$object <- NULL
+        rm(RunPCAArgs)
+        gc()
+        cached$data <- sobj
+        save_to_cache(cached, "Transformed", cache_dir)
+    }
+    sobj
+}
+# Integrate the layers of `sobj` (unless disabled) and join them, with caching.
+#
+# Unless `envs$no_integration` is set, IntegrateLayers is called with the
+# arguments in `envs$IntegrateLayers`; the method alias is resolved to the
+# integration function and the name of the new reduction is stored in
+# `sobj@misc$integrated_new_reduction`. Without SCT, layers are joined with
+# JoinLayers afterwards. The cache entry "Integrated" is keyed on `envs` minus
+# the ncores and doublet-detection entries removed below.
+# Stops with an error for an unrecognized method name.
+run_integration <- function(sobj) {
+    envs_cache <- envs
+    envs_cache$ncores <- NULL
+    envs_cache$doublet_detector <- NULL
+    envs_cache$DoubletFinder <- NULL
+    envs_cache$scDblFinder <- NULL
+    cached <- get_cached(envs_cache, "Integrated", cache_dir)
+    if (!is.null(cached$data)) {
+        log_info("Loading integrated/layer-joined object from cache ...")
+        sobj <- cached$data
+    } else {
+        if (!envs$no_integration) {
+            log_info("- Running IntegrateLayers (method = {envs$IntegrateLayers$method}) ...")
+            IntegrateLayersArgs <- envs$IntegrateLayers
+            method <- IntegrateLayersArgs$method
+            # Reference samples given by name are translated to their positions
+            # in `samples` (names not found become NA)
+            if (!is.null(IntegrateLayersArgs$reference) && is.character(IntegrateLayersArgs$reference)) {
+                log_info("  Using reference samples: {paste(IntegrateLayersArgs$reference, collapse = ', ')}")
+                IntegrateLayersArgs$reference <- match(IntegrateLayersArgs$reference, samples)
+                log_info("  Transferred to indices: {paste(IntegrateLayersArgs$reference, collapse = ', ')}")
+            }
+            # Map the accepted aliases to the name of the integration function
+            if (method %in% c("CCA", "cca")) { method <- "CCAIntegration" } else
+            if (method %in% c("RPCA", "rpca")) { method <- "RPCAIntegration" } else
+            if (method %in% c("Harmony", "harmony")) { method <- "HarmonyIntegration" } else
+            if (method %in% c("FastMNN", "fastmnn")) { method <- "FastMNNIntegration" } else
+            if (method %in% c("scVI", "scvi")) { method <- "scVIIntegration" } else
+            { stop(paste0("Unknown integration method: ", method)) }
+            if (envs$use_sct && is.null(IntegrateLayersArgs$normalization.method)) {
+                IntegrateLayersArgs$normalization.method <- "SCT"
+            }
+            # `method` only holds one of the fixed names above, so the
+            # eval(parse()) just looks up the function of that name
+            IntegrateLayersArgs$method <- eval(parse(text = method))
+            # Default reduction name per method, used when none is configured
+            new_reductions <- list(
+                "CCAIntegration" = "integrated.cca",
+                "RPCAIntegration" = "integrated.rpca",
+                "HarmonyIntegration" = "harmony",
+                "FastMNNIntegration" = "integration.mnn",
+                "scVIIntegration" = "integrated.scvi"
+            )
+            if (is.null(IntegrateLayersArgs$new.reduction)) {
+                IntegrateLayersArgs$new.reduction <- new_reductions[[method]]
+            }
+            print(paste0("  IntegrateLayers: ", format_args(IntegrateLayersArgs)))
+            log_debug("  IntegrateLayers: {format_args(IntegrateLayersArgs)}")
+            IntegrateLayersArgs$object <- sobj
+            sobj <- do_call(IntegrateLayers, IntegrateLayersArgs)
+            # Save it for dimension reduction plots
+            sobj@misc$integrated_new_reduction <- IntegrateLayersArgs$new.reduction
+            # Cleanup memory
+            IntegrateLayersArgs$object <- NULL
+            rm(IntegrateLayersArgs)
+            gc()
+        }
+        if (!envs$use_sct) {
+            log_info("- Joining layers ...")
+            sobj <- JoinLayers(sobj)
+        }
+        cached$data <- sobj
+        save_to_cache(cached, "Integrated", cache_dir)
+    }
+    sobj
+}

biopipen/scripts/scrna/SeuratPreparing-doublet_detection.R ADDED Viewed

@@ -0,0 +1,201 @@
+.get_envs_cached_doubletfinder <- function() {
+    # Cache signature for DoubletFinder: a copy of `envs` without the entries
+    # that are stripped below (core counts, detector choice, scDblFinder options).
+    signature <- envs
+    signature$DoubletFinder$ncores <- NULL
+    for (entry in c("ncores", "doublet_detector", "scDblFinder")) {
+        signature[[entry]] <- NULL
+    }
+    signature
+}
+.get_envs_cached_scdblfinder <- function() {
+    # Cache signature for scDblFinder: a copy of `envs` without the entries
+    # that are stripped below (core counts, detector choice, DoubletFinder options).
+    signature <- envs
+    signature$scDblFinder$ncores <- NULL
+    for (entry in c("ncores", "doublet_detector", "DoubletFinder")) {
+        signature[[entry]] <- NULL
+    }
+    signature
+}
+# Run the DoubletFinder workflow on `sobj`.
+#
+# Takes no arguments: `sobj` and `envs` are read from an enclosing scope.
+# Options come from `envs$DoubletFinder` (PCs, pN, doublets, ncores) and
+# `envs$use_sct`.
+#
+# Returns list(doublets = <data frame with DoubletFinder_score and
+# DoubletFinder_DropletType columns>, pk_plot = <ggplot of BCmetric vs pK>).
+.run_doubletfinder <- function() {
+    library(DoubletFinder)
+    log_info("- Preparing Seurat object ...")
+    # Fall back to the global core count when none is set for DoubletFinder
+    # (presumably `envs` is a plain list, so this only changes a local copy --
+    # TODO confirm)
+    if (is.null(envs$DoubletFinder$ncores)) {
+        envs$DoubletFinder$ncores <- envs$ncores
+    }
+    # More controls from envs?
+    sobj <- FindNeighbors(sobj, dims = 1:envs$DoubletFinder$PCs)
+    sobj <- FindClusters(sobj)
+    log_info("- pK Indentification ...")
+    sweep.res.list <- paramSweep(
+        sobj,
+        PCs = 1:envs$DoubletFinder$PCs,
+        sct = envs$use_sct,
+        num.cores = envs$DoubletFinder$ncores
+    )
+    sweep.stats <- summarizeSweep(sweep.res.list, GT = FALSE)
+    bcmvn <- find.pK(sweep.stats)
+    # Select the pK with the highest BCmetric (first one on ties) and flag it
+    # for the plot
+    bcmvn$Selected <- bcmvn$pK == bcmvn$pK[which.max(bcmvn$BCmetric)[1]]
+    pK <- bcmvn$pK[which.max(bcmvn$BCmetric)[1]]
+    # Go through character so a factor pK yields its label, not its level code
+    pK <- as.numeric(as.character(pK))
+    pN <- envs$DoubletFinder$pN
+    log_info("- Homotypic Doublet Proportion Estimate ...")
+    homotypic.prop <- modelHomotypic(Idents(sobj))
+    # Expected doublet count: cells * configured rate, then reduced by the
+    # estimated homotypic proportion
+    nExp_poi <- round(nrow(sobj@meta.data) * envs$DoubletFinder$doublets)
+    nExp_poi.adj <- round(nExp_poi * (1 - homotypic.prop))
+    log_info("- Running DoubletFinder ...")
+    sobj <- doubletFinder(
+        sobj,
+        PCs = 1:envs$DoubletFinder$PCs,
+        pN = pN,
+        pK = pK,
+        nExp = nExp_poi.adj,
+        reuse.pANN = FALSE,
+        sct = envs$use_sct
+    )
+    # Locate the result columns by their pN/pK-based name prefix
+    pANN_col <- paste0("pANN_", pN, "_", pK)
+    pANN_col <- colnames(sobj@meta.data)[grepl(pANN_col, colnames(sobj@meta.data))]
+    DF_col <- paste0("DF.classifications_", pN, "_", pK)
+    DF_col <- colnames(sobj@meta.data)[grepl(DF_col, colnames(sobj@meta.data))]
+    doublets <- sobj@meta.data[, c(pANN_col, DF_col), drop = FALSE]
+    colnames(doublets) <-  c("DoubletFinder_score","DoubletFinder_DropletType")
+    # Lower-case the classification labels
+    doublets$DoubletFinder_DropletType <- tolower(doublets$DoubletFinder_DropletType)
+    pk_plot <- ggplot(bcmvn, aes(x = pK, y = BCmetric, color = Selected)) +
+        geom_point() +
+        # rotate x axis labels
+        theme(axis.text.x = element_text(angle = 90, hjust = 1))
+    list(doublets = doublets, pk_plot = pk_plot)
+}
+.run_scdblfinder <- function() {
+    # Run scDblFinder on the "counts" layer of `sobj` (read from an enclosing
+    # scope) with the options in envs$scDblFinder, and return
+    # list(doublets = <table with scDblFinder_score / scDblFinder_DropletType>).
+    library(scDblFinder)
+    # Fall back to the global core count when none is set for scDblFinder
+    if (is.null(envs$scDblFinder$ncores)) {
+        envs$scDblFinder$ncores <- envs$ncores
+    }
+    envs$scDblFinder$sce <- GetAssayData(sobj, layer = "counts")
+    n_workers <- envs$scDblFinder$ncores
+    if (n_workers > 1) {
+        envs$scDblFinder$BPPARAM <- BiocParallel::MulticoreParam(n_workers, RNGseed = 8525)
+    }
+    envs$scDblFinder$returnType <- "table"
+    # ncores is not passed on to scDblFinder(); parallelism goes via BPPARAM
+    envs$scDblFinder$ncores <- NULL
+    calls <- do_call(scDblFinder, envs$scDblFinder)
+    # Keep only rows typed "real" and the two columns of interest
+    is_real <- calls$type == "real"
+    calls <- calls[is_real, , drop = FALSE]
+    calls <- calls[, c("score", "class"), drop = FALSE]
+    colnames(calls) <- c("scDblFinder_score", "scDblFinder_DropletType")
+    list(doublets = calls)
+}
+run_dd <- function(detector) {
+    # Run the requested doublet detector ("DoubletFinder" or "scDblFinder"),
+    # reusing cached results when the cache entry named after the detector
+    # already holds data. Stops for any other detector name.
+    log_info("Running {detector} ...")
+    backend <- switch(detector,
+        DoubletFinder = list(
+            signature = .get_envs_cached_doubletfinder,
+            run = .run_doubletfinder
+        ),
+        scDblFinder = list(
+            signature = .get_envs_cached_scdblfinder,
+            run = .run_scdblfinder
+        ),
+        stop("Unknown doublet detector: ", detector)
+    )
+    cached <- get_cached(backend$signature(), detector, cache_dir)
+    if (is.null(cached$data)) {
+        # Cache miss: compute and persist
+        results <- backend$run()
+        cached$data <- results
+        save_to_cache(cached, detector, cache_dir)
+        return(results)
+    }
+    log_info("- Loading cached results ...")
+    cached$data
+}
+# Write the per-cell doublet calls and a classification summary to `joboutdir`.
+#
+# dd:       result list holding a `doublets` table with a
+#           "<detector>_DropletType" column.
+# detector: detector name; used as file-name prefix and column-name prefix.
+#
+# Writes "<detector>_doublets_singlets.txt" and "<detector>_summary.txt"
+# (tab-separated) and logs how many droplets were classified as doublets.
+save_dd <- function(dd, detector) {
+    doublets <- dd$doublets
+    # NOTE(review): row names are not written, so the per-cell file carries
+    # no cell identifier column -- confirm this is intended
+    write.table(
+        doublets,
+        file.path(joboutdir, paste0(detector, "_doublets_singlets.txt")),
+        row.names = FALSE,
+        quote = FALSE,
+        sep = "\t"
+    )
+    summary <- as.data.frame(table(doublets[[paste0(detector, "_DropletType")]]))
+    colnames(summary) <- c("Classification", "Droplet_N")
+    write.table(
+        summary,
+        file.path(joboutdir, paste0(detector, "_summary.txt")),
+        row.names = FALSE,
+        quote = FALSE,
+        sep = "\t"
+    )
+    # sum() turns "no doublet row at all" into 0; a zero-length value here
+    # would make the interpolated log message come out empty
+    n_doublet <- sum(summary$Droplet_N[summary$Classification == "doublet"])
+    n_total <- sum(summary$Droplet_N)
+    log_info("- {n_doublet}/{n_total} doublets detected.")
+}
+add_dd_to_seurat <- function(sobj, dd) {
+    # Attach the doublet-call table to the object's metadata and return it.
+    dd_meta <- as.data.frame(dd$doublets)
+    AddMetaData(sobj, metadata = dd_meta)
+}
+plot_dd <- function(sobj, dd, detector) {
+    # Save the doublet-detection plots into `plotsdir`: the pK/BCmetric plot
+    # (DoubletFinder only) and a DimPlot grouped by the droplet-type column.
+    if (detector == "DoubletFinder") {
+        log_debug("- Plotting pK vs BCmetric ...")
+        pk_file <- file.path(plotsdir, "DoubletFinder_pK_BCmetric.png")
+        ggsave(dd$pk_plot, filename = pk_file)
+    }
+    log_info("- Plotting dimension reduction ...")
+    type_col <- paste0(detector, "_DropletType")
+    dim_file <- file.path(plotsdir, paste0(detector, "_dimplot.png"))
+    dimp <- DimPlot(
+        sobj,
+        group.by = type_col,
+        order = "doublet",
+        cols = c("#333333", "#FF3333"),
+        pt.size = 0.8,
+        alpha = 0.5
+    )
+    ggsave(dimp, filename = dim_file)
+}
+filter_dd <- function(sobj, dd, detector) {
+    # Keep only the cells whose "<detector>_DropletType" is "singlet";
+    # cells are identified by the row names of the doublet-call table.
+    type_col <- paste0(detector, "_DropletType")
+    is_singlet <- dd$doublets[[type_col]] == "singlet"
+    singlets <- dd$doublets[is_singlet, , drop = FALSE]
+    subset(sobj, cells = rownames(singlets))
+}
+# Register the doublet-detection summary table and plots in the report.
+#
+# detector: detector name; "DoubletFinder" additionally gets the pK/BCmetric
+#           plot, any other value is reported as scDblFinder.
+#
+# The referenced files are the ones written into `joboutdir` / `plotsdir`
+# by save_dd() and plot_dd().
+report_dd <- function(detector) {
+    add_report(
+        list(
+            kind = "descr",
+            content = "The table contains the number of cells classified as singlets and doublets."
+        ),
+        list(
+            kind = "table",
+            data = list(path = file.path(joboutdir, paste0(detector, "_summary.txt")))
+        ),
+        h1 = paste0(detector, " Results"),
+        h2 = paste0("The ", detector, " Summary")
+    )
+    if (detector == "DoubletFinder") {
+        add_report(
+            # Must match the file name plot_dd() saves the pK plot under
+            list(name = "pK vs BCmetric", src = file.path(plotsdir, "DoubletFinder_pK_BCmetric.png")),
+            list(name = "Dimension Reduction Plot", src = file.path(plotsdir, "DoubletFinder_dimplot.png")),
+            ui = "table_of_images",
+            h1 = "DoubletFinder Results",
+            h2 = "Plots"
+        )
+    } else {
+        add_report(
+            list(name = "Dimension Reduction Plot",src = file.path(plotsdir, "scDblFinder_dimplot.png")),
+            ui = "table_of_images",
+            h1 = "scDblFinder Results",
+            h2 = "Plots"
+        )
+    }
+}

biopipen 0.29.2__py3-none-any.whl → 0.31.0__py3-none-any.whl

Potentially problematic release.

biopipen 0.29.2__py3-none-any.whl → 0.31.0__py3-none-any.whl