PyPI - biopipen - Versions diffs - 0.28.1__py3-none-any.whl → 0.29.1__py3-none-any.whl - Mend

biopipen 0.28.1__py3-none-any.whl → 0.29.1__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.

This version of biopipen might be problematic. Click here for more details.

Files changed (85)

biopipen/__init__.py +1 -1
biopipen/core/config.toml +8 -0
biopipen/ns/bam.py +0 -2
biopipen/ns/bed.py +35 -0
biopipen/ns/cellranger_pipeline.py +5 -5
biopipen/ns/cnv.py +18 -2
biopipen/ns/cnvkit_pipeline.py +16 -11
biopipen/ns/gene.py +68 -23
biopipen/ns/misc.py +2 -15
biopipen/ns/plot.py +204 -0
biopipen/ns/regulatory.py +214 -0
biopipen/ns/scrna.py +31 -5
biopipen/ns/snp.py +516 -8
biopipen/ns/stats.py +167 -3
biopipen/ns/vcf.py +196 -0
biopipen/reports/snp/PlinkCallRate.svelte +24 -0
biopipen/reports/snp/PlinkFreq.svelte +18 -0
biopipen/reports/snp/PlinkHWE.svelte +18 -0
biopipen/reports/snp/PlinkHet.svelte +18 -0
biopipen/reports/snp/PlinkIBD.svelte +18 -0
biopipen/scripts/bam/CNVpytor.py +144 -46
biopipen/scripts/bed/BedtoolsIntersect.py +54 -0
biopipen/scripts/bed/BedtoolsMerge.py +1 -1
biopipen/scripts/cnv/AneuploidyScore.R +30 -7
biopipen/scripts/cnv/AneuploidyScoreSummary.R +5 -2
biopipen/scripts/cnv/TMADScore.R +21 -5
biopipen/scripts/cnv/TMADScoreSummary.R +6 -2
biopipen/scripts/cnvkit/CNVkitAccess.py +2 -1
biopipen/scripts/cnvkit/CNVkitAutobin.py +3 -2
biopipen/scripts/cnvkit/CNVkitBatch.py +1 -1
biopipen/scripts/cnvkit/CNVkitCoverage.py +2 -1
biopipen/scripts/cnvkit/CNVkitGuessBaits.py +1 -1
biopipen/scripts/cnvkit/CNVkitHeatmap.py +1 -1
biopipen/scripts/cnvkit/CNVkitReference.py +2 -1
biopipen/scripts/delim/SampleInfo.R +10 -5
biopipen/scripts/gene/GeneNameConversion.R +65 -0
biopipen/scripts/gene/GenePromoters.R +61 -0
biopipen/scripts/misc/Shell.sh +15 -0
biopipen/scripts/plot/Manhattan.R +146 -0
biopipen/scripts/plot/QQPlot.R +146 -0
biopipen/scripts/regulatory/MotifAffinityTest.R +226 -0
biopipen/scripts/regulatory/MotifAffinityTest_AtSNP.R +126 -0
biopipen/scripts/regulatory/MotifAffinityTest_MotifBreakR.R +96 -0
biopipen/scripts/regulatory/MotifScan.py +159 -0
biopipen/scripts/regulatory/atSNP.R +33 -0
biopipen/scripts/regulatory/motifBreakR.R +1594 -0
biopipen/scripts/scrna/MarkersFinder.R +69 -67
biopipen/scripts/scrna/SeuratClustering.R +71 -29
biopipen/scripts/scrna/SeuratMap2Ref.R +20 -0
biopipen/scripts/scrna/SeuratPreparing.R +252 -122
biopipen/scripts/scrna/SeuratSubClustering.R +76 -27
biopipen/scripts/snp/MatrixEQTL.R +85 -44
biopipen/scripts/snp/Plink2GTMat.py +133 -0
biopipen/scripts/snp/PlinkCallRate.R +190 -0
biopipen/scripts/snp/PlinkFilter.py +100 -0
biopipen/scripts/snp/PlinkFreq.R +298 -0
biopipen/scripts/snp/PlinkFromVcf.py +78 -0
biopipen/scripts/snp/PlinkHWE.R +80 -0
biopipen/scripts/snp/PlinkHet.R +92 -0
biopipen/scripts/snp/PlinkIBD.R +200 -0
biopipen/scripts/snp/PlinkUpdateName.py +124 -0
biopipen/scripts/stats/Mediation.R +94 -0
biopipen/scripts/stats/MetaPvalue.R +2 -1
biopipen/scripts/stats/MetaPvalue1.R +70 -0
biopipen/scripts/tcr/TCRClusterStats.R +12 -7
biopipen/scripts/vcf/BcftoolsAnnotate.py +91 -0
biopipen/scripts/vcf/BcftoolsFilter.py +90 -0
biopipen/scripts/vcf/BcftoolsSort.py +113 -0
biopipen/scripts/vcf/BcftoolsView.py +73 -0
biopipen/scripts/vcf/VcfFix_utils.py +1 -1
biopipen/scripts/vcf/bcftools_utils.py +52 -0
biopipen/utils/gene.R +83 -37
biopipen/utils/gene.py +108 -60
biopipen/utils/misc.R +56 -0
biopipen/utils/misc.py +5 -2
biopipen/utils/reference.py +54 -10
{biopipen-0.28.1.dist-info → biopipen-0.29.1.dist-info}/METADATA +2 -2
{biopipen-0.28.1.dist-info → biopipen-0.29.1.dist-info}/RECORD +80 -51
{biopipen-0.28.1.dist-info → biopipen-0.29.1.dist-info}/entry_points.txt +1 -1
biopipen/ns/bcftools.py +0 -111
biopipen/scripts/bcftools/BcftoolsAnnotate.py +0 -42
biopipen/scripts/bcftools/BcftoolsFilter.py +0 -79
biopipen/scripts/bcftools/BcftoolsSort.py +0 -19
biopipen/scripts/gene/GeneNameConversion.py +0 -66
{biopipen-0.28.1.dist-info → biopipen-0.29.1.dist-info}/WHEEL +0 -0

biopipen/scripts/scrna/SeuratPreparing.R CHANGED

@@ -1,19 +1,27 @@
 source("{{biopipen_dir}}/utils/misc.R")
+source("{{biopipen_dir}}/utils/caching.R")
 library(Seurat)
 library(future)
 library(bracer)
 library(ggplot2)
 library(dplyr)
-library(tidyseurat)
+# library(tidyseurat)
-metafile = {{in.metafile | quote}}
-rdsfile = {{out.rdsfile | quote}}
-joboutdir = {{job.outdir | quote}}
-envs = {{envs | r: todot = "-", skip = 1}}
+metafile <- {{in.metafile | quote}}
+rdsfile <- {{out.rdsfile | quote}}
+joboutdir <- {{job.outdir | quote}}
+envs <- {{envs | r: todot = "-", skip = 1}}
+if (isTRUE(envs$cache)) { envs$cache <- joboutdir }
+if (length(envs$cache) > 1) {
+    log_warn("Multiple cache directories (envs.cache) detected, using the first one.")
+    envs$cache <- envs$cache[1]
+}
 set.seed(8525)
-options(future.globals.maxSize = 80000 * 1024^2)
+# 8TB
+options(future.globals.maxSize = 8 * 1024 ^ 4)
 options(future.rng.onMisuse="ignore")
 options(Seurat.object.assay.version = "v5")
 plan(strategy = "multicore", workers = envs$ncores)
@@ -34,7 +42,7 @@ add_report(
     h1 = "Filters and QC"
 )
-metadata = read.table(
+metadata <- read.table(
     metafile,
     header = TRUE,
     row.names = NULL,
@@ -42,6 +50,16 @@ metadata = read.table(
     check.names = FALSE
 )
+cache_sig <- capture.output(str(metadata))
+dig_sig <- digest::digest(cache_sig, algo = "md5")
+dig_sig <- substr(dig_sig, 1, 8)
+cache_dir <- NULL
+if (is.character(envs$cache)) {
+    cache_dir <- file.path(envs$cache, paste0(dig_sig, ".seuratpreparing_cache"))
+    dir.create(cache_dir, recursive = TRUE, showWarnings = FALSE)
+    writeLines(cache_sig, file.path(cache_dir, "signature.txt"))
+}
 meta_cols = colnames(metadata)
 if (!"Sample" %in% meta_cols) {
     stop("Error: Column `Sample` is not found in metafile.")
@@ -90,21 +108,21 @@ rename_files = function(e, sample, path) {
 perform_cell_qc <- function(sobj, per_sample = FALSE) {
-    log_prefix = ifelse(per_sample, "  ", "- ")
+    log_prefix <- ifelse(per_sample, "  ", "- ")
     log_info("{log_prefix}Adding metadata for QC ...")
-    sobj$percent.mt = PercentageFeatureSet(sobj, pattern = "^MT-")
-    sobj$percent.ribo = PercentageFeatureSet(sobj, pattern = "^RP[SL]")
-    sobj$percent.hb = PercentageFeatureSet(sobj, pattern = "^HB[^(P)]")
-    sobj$percent.plat = PercentageFeatureSet(sobj, pattern = "PECAM1|PF4")
+    sobj$percent.mt <- PercentageFeatureSet(sobj, pattern = "^MT-")
+    sobj$percent.ribo <- PercentageFeatureSet(sobj, pattern = "^RP[SL]")
+    sobj$percent.hb <- PercentageFeatureSet(sobj, pattern = "^HB[^(P)]")
+    sobj$percent.plat <- PercentageFeatureSet(sobj, pattern = "PECAM1|PF4")
     if (is.null(envs$cell_qc) || length(envs$cell_qc) == 0) {
         log_warn("{log_prefix}No cell QC criteria is provided. All cells will be kept.")
-        cell_qc = "TRUE"
+        cell_qc <- "TRUE"
     } else {
-        cell_qc = envs$cell_qc
+        cell_qc <- envs$cell_qc
     }
-    sobj = sobj %>% mutate(.QC = !!rlang::parse_expr(cell_qc))
+    sobj@meta.data <- sobj@meta.data %>% mutate(.QC = !!rlang::parse_expr(cell_qc))
     if (is.null(cell_qc_df)) {
         cell_qc_df <<- sobj@meta.data[, c("Sample", ".QC", feats), drop = FALSE]
@@ -114,8 +132,8 @@ perform_cell_qc <- function(sobj, per_sample = FALSE) {
     # Do the filtering
     log_info("{log_prefix}Filtering cells using QC criteria ...")
-    sobj = sobj %>% filter(.QC)
-    sobj$.QC = NULL
+    sobj <- subset(sobj, subset = .QC)
+    sobj$.QC <- NULL
     return(sobj)
 }
@@ -281,42 +299,83 @@ load_sample = function(sample) {
     obj
 }
-# Load data
-log_info("Reading samples individually ...")
-obj_list = lapply(samples, load_sample)
-log_info("Merging samples ...")
-sobj = Reduce(merge, obj_list)
+cached <- get_cached(
+    list(cell_qc = envs$cell_qc, cell_qc_per_sample = envs$cell_qc_per_sample, use_sct = envs$use_sct),
+    "CellQC",
+    cache_dir
+)
+if (!is.null(cached$data)) {
+    log_info("Loading cell-QC'ed object from cache ...")
+    sobj <- cached$data$sobj
+    cell_qc_df <- cached$data$cell_qc_df
+    cached$data$sobj <- NULL
+    cached$data$cell_qc_df <- NULL
+    cached$data <- NULL
+    rm(cached)
+    gc()
+} else {
+    # Load data
+    log_info("Reading samples individually ...")
+    obj_list = lapply(samples, load_sample)
+    log_info("Merging samples ...")
+    sobj = Reduce(merge, obj_list)
+    rm(obj_list)
+    gc()
+    if (!envs$cell_qc_per_sample) {
+        log_info("Performing cell QC ...")
+        sobj = perform_cell_qc(sobj)
+    }
-if (!envs$cell_qc_per_sample) {
-    log_info("Performing cell QC ...")
-    sobj = perform_cell_qc(sobj)
+    cached$data = list(sobj = sobj, cell_qc_df = cell_qc_df)
+    save_to_cache(cached, "CellQC", cache_dir)
 }
 # plot and report the QC
 log_info("Plotting and reporting QC ...")
 dim_df = report_cell_qc(nrow(sobj))
-log_info("Filtering genes ...")
 if (is.list(envs$gene_qc)) {
-    genes <- rownames(sobj)
-    filtered <- FALSE
-    if (!is.null(envs$gene_qc$min_cells) && envs$gene_qc$min_cells > 0) {
-        genes = genes[Matrix::rowSums(sobj) >= envs$gene_qc$min_cells]
-        filtered <- TRUE
-    }
-    excludes <- envs$gene_qc$excludes
-    if (!is.null(excludes)) {
-        if (length(excludes) == 1) {
-            excludes <- trimws(unlist(strsplit(excludes, ",")))
+    cached <- get_cached(
+        list(
+            cell_qc = envs$cell_qc,
+            gene_qc = envs$gene_qc,
+            cell_qc_per_sample = envs$cell_qc_per_sample,
+            use_sct = envs$use_sct
+        ),
+        "GeneQC",
+        cache_dir
+    )
+    if (!is.null(cached$data)) {
+        log_info("Loading gene-QC'ed object from cache ...")
+        sobj <- cached$data
+        cached$data <- NULL
+        rm(cached)
+        gc()
+    } else {
+        log_info("Filtering genes ...")
+        genes <- rownames(sobj)
+        filtered <- FALSE
+        if (!is.null(envs$gene_qc$min_cells) && envs$gene_qc$min_cells > 0) {
+            genes = genes[Matrix::rowSums(sobj) >= envs$gene_qc$min_cells]
+            filtered <- TRUE
         }
-        for (ex in excludes) {
-            genes <- genes[!grepl(ex, genes)]
+        excludes <- envs$gene_qc$excludes
+        if (!is.null(excludes)) {
+            if (length(excludes) == 1) {
+                excludes <- trimws(unlist(strsplit(excludes, ",")))
+            }
+            for (ex in excludes) {
+                genes <- genes[!grepl(ex, genes)]
+            }
+            filtered <- TRUE
         }
-        filtered <- TRUE
-    }
-    if (filtered) {
-        sobj = subset(sobj, features = genes)
+        if (filtered) {
+            sobj = subset(sobj, features = genes)
+        }
+        cached$data <- sobj
+        save_to_cache(cached, "GeneQC", cache_dir)
     }
 }
 dim_df = rbind(
@@ -350,96 +409,167 @@ add_report(
     paste(capture.output(str(args)), collapse = ", ")
 }
-log_info("Performing transformation/scaling ...")
-# Not joined yet
-# sobj[["RNA"]] <- split(sobj[["RNA"]], f = sobj$Sample)
-if (envs$use_sct) {
-    log_info("- Running SCTransform ...")
-    SCTransformArgs <- envs$SCTransform
-    # log to stdout but don't populate it to running log
-    print(paste0("  SCTransform: ", .formatArgs(SCTransformArgs)))
-    log_debug("  SCTransform: {.formatArgs(SCTransformArgs)}")
-    SCTransformArgs$object <- sobj
-    sobj <- do_call(SCTransform, SCTransformArgs)
-    # Default is to use the SCT assay
+envs_cache <- envs
+envs_cache$ncores <- NULL
+envs_cache$DoubletFinder <- NULL
+envs_cache$IntegrateLayers <- NULL
+cached <- get_cached(envs_cache, "Transformed", cache_dir)
+if (!is.null(cached$data)) {
+    log_info("Loading transformed object from cache ...")
+    sobj <- cached$data
+    cached$data <- NULL
+    rm(cached)
+    gc()
 } else {
-    log_info("- Running NormalizeData ...")
-    NormalizeDataArgs <- envs$NormalizeData
-    print(paste0("  NormalizeData: ", .formatArgs(NormalizeDataArgs)))
-    log_debug("  NormalizeData: {.formatArgs(NormalizeDataArgs)}")
-    NormalizeDataArgs$object <- sobj
-    sobj <- do_call(NormalizeData, NormalizeDataArgs)
-    log_info("- Running FindVariableFeatures ...")
-    FindVariableFeaturesArgs <- envs$FindVariableFeatures
-    print(paste0("  FindVariableFeatures: ", .formatArgs(FindVariableFeaturesArgs)))
-    log_debug("  FindVariableFeatures: {.formatArgs(FindVariableFeaturesArgs)}")
-    FindVariableFeaturesArgs$object <- sobj
-    sobj <- do_call(FindVariableFeatures, FindVariableFeaturesArgs)
-    log_info("- Running ScaleData ...")
-    ScaleDataArgs <- envs$ScaleData
-    print(paste0("  ScaleData: ", .formatArgs(ScaleDataArgs)))
-    log_debug("  ScaleData: {.formatArgs(ScaleDataArgs)}")
-    ScaleDataArgs$object <- sobj
-    sobj <- do_call(ScaleData, ScaleDataArgs)
+    log_info("Performing transformation/scaling ...")
+    # Not joined yet
+    # sobj[["RNA"]] <- split(sobj[["RNA"]], f = sobj$Sample)
+    if (envs$use_sct) {
+        log_info("- Running SCTransform ...")
+        SCTransformArgs <- envs$SCTransform
+        # log to stdout but don't populate it to running log
+        print(paste0("  SCTransform: ", .formatArgs(SCTransformArgs)))
+        log_debug("  SCTransform: {.formatArgs(SCTransformArgs)}")
+        SCTransformArgs$object <- sobj
+        sobj <- do_call(SCTransform, SCTransformArgs)
+        # Default is to use the SCT assay
+        # Cleanup memory
+        SCTransformArgs$object <- NULL
+        rm(SCTransformArgs)
+        gc()
+    } else {
+        log_info("- Running NormalizeData ...")
+        NormalizeDataArgs <- envs$NormalizeData
+        print(paste0("  NormalizeData: ", .formatArgs(NormalizeDataArgs)))
+        log_debug("  NormalizeData: {.formatArgs(NormalizeDataArgs)}")
+        NormalizeDataArgs$object <- sobj
+        sobj <- do_call(NormalizeData, NormalizeDataArgs)
+        # Cleanup memory
+        NormalizeDataArgs$object <- NULL
+        rm(NormalizeDataArgs)
+        gc()
+        log_info("- Running FindVariableFeatures ...")
+        FindVariableFeaturesArgs <- envs$FindVariableFeatures
+        print(paste0("  FindVariableFeatures: ", .formatArgs(FindVariableFeaturesArgs)))
+        log_debug("  FindVariableFeatures: {.formatArgs(FindVariableFeaturesArgs)}")
+        FindVariableFeaturesArgs$object <- sobj
+        sobj <- do_call(FindVariableFeatures, FindVariableFeaturesArgs)
+        # Cleanup memory
+        FindVariableFeaturesArgs$object <- NULL
+        rm(FindVariableFeaturesArgs)
+        gc()
+        log_info("- Running ScaleData ...")
+        ScaleDataArgs <- envs$ScaleData
+        print(paste0("  ScaleData: ", .formatArgs(ScaleDataArgs)))
+        log_debug("  ScaleData: {.formatArgs(ScaleDataArgs)}")
+        ScaleDataArgs$object <- sobj
+        sobj <- do_call(ScaleData, ScaleDataArgs)
+        # Cleanup memory
+        ScaleDataArgs$object <- NULL
+        rm(ScaleDataArgs)
+        gc()
+    }
+    log_info("- Running RunPCA ...")
+    RunPCAArgs <- envs$RunPCA
+    RunPCAArgs$npcs <- if (is.null(RunPCAArgs$npcs)) { 50 } else { min(RunPCAArgs$npcs, ncol(sobj) - 1) }
+    print(paste0("  RunPCA: ", .formatArgs(RunPCAArgs)))
+    log_debug("  RunPCA: {.formatArgs(RunPCAArgs)}")
+    RunPCAArgs$object <- sobj
+    sobj <- do_call(RunPCA, RunPCAArgs)
+    # Cleanup memory
+    RunPCAArgs$object <- NULL
+    rm(RunPCAArgs)
+    gc()
+    cached$data <- sobj
+    save_to_cache(cached, "Transformed", cache_dir)
 }
-log_info("- Running RunPCA ...")
-RunPCAArgs <- envs$RunPCA
-RunPCAArgs$npcs <- if (is.null(RunPCAArgs$npcs)) { 50 } else { min(RunPCAArgs$npcs, ncol(sobj) - 1) }
-print(paste0("  RunPCA: ", .formatArgs(RunPCAArgs)))
-log_debug("  RunPCA: {.formatArgs(RunPCAArgs)}")
-RunPCAArgs$object <- sobj
-sobj <- do_call(RunPCA, RunPCAArgs)
-if (!envs$no_integration) {
-    log_info("- Running IntegrateLayers (method = {envs$IntegrateLayers$method}) ...")
-    IntegrateLayersArgs <- envs$IntegrateLayers
-    method <- IntegrateLayersArgs$method
-    if (!is.null(IntegrateLayersArgs$reference) && is.character(IntegrateLayersArgs$reference)) {
-        log_info("  Using reference samples: {paste(IntegrateLayersArgs$reference, collapse = ', ')}")
-        IntegrateLayersArgs$reference <- match(IntegrateLayersArgs$reference, samples)
-        log_info("  Transferred to indices: {paste(IntegrateLayersArgs$reference, collapse = ', ')}")
-    }
-    if (method %in% c("CCA", "cca")) { method <- "CCAIntegration" } else
-    if (method %in% c("RPCA", "rpca")) { method <- "RPCAIntegration" } else
-    if (method %in% c("Harmony", "harmony")) { method <- "HarmonyIntegration" } else
-    if (method %in% c("FastMNN", "fastmnn")) { method <- "FastMNNIntegration" } else
-    if (method %in% c("scVI", "scvi")) { method <- "scVIIntegration" } else
-    { stop(paste0("Unknown integration method: ", method)) }
-    if (envs$use_sct && is.null(IntegrateLayersArgs$normalization.method)) {
-        IntegrateLayersArgs$normalization.method <- "SCT"
+envs_cache <- envs
+envs_cache$ncores <- NULL
+envs_cache$DoubletFinder <- NULL
+cached <- get_cached(envs_cache, "Integrated", cache_dir)
+if (!is.null(cached$data)) {
+    log_info("Loading integrated/layer-joined object from cache ...")
+    sobj <- cached$data
+    cached$data <- NULL
+    rm(cached)
+    gc()
+} else {
+    if (!envs$no_integration) {
+        log_info("- Running IntegrateLayers (method = {envs$IntegrateLayers$method}) ...")
+        IntegrateLayersArgs <- envs$IntegrateLayers
+        method <- IntegrateLayersArgs$method
+        if (!is.null(IntegrateLayersArgs$reference) && is.character(IntegrateLayersArgs$reference)) {
+            log_info("  Using reference samples: {paste(IntegrateLayersArgs$reference, collapse = ', ')}")
+            IntegrateLayersArgs$reference <- match(IntegrateLayersArgs$reference, samples)
+            log_info("  Transferred to indices: {paste(IntegrateLayersArgs$reference, collapse = ', ')}")
+        }
+        if (method %in% c("CCA", "cca")) { method <- "CCAIntegration" } else
+        if (method %in% c("RPCA", "rpca")) { method <- "RPCAIntegration" } else
+        if (method %in% c("Harmony", "harmony")) { method <- "HarmonyIntegration" } else
+        if (method %in% c("FastMNN", "fastmnn")) { method <- "FastMNNIntegration" } else
+        if (method %in% c("scVI", "scvi")) { method <- "scVIIntegration" } else
+        { stop(paste0("Unknown integration method: ", method)) }
+        if (envs$use_sct && is.null(IntegrateLayersArgs$normalization.method)) {
+            IntegrateLayersArgs$normalization.method <- "SCT"
+        }
+        IntegrateLayersArgs$method <- eval(parse(text = method))
+        new_reductions <- list(
+            "CCAIntegration" = "integrated.cca",
+            "RPCAIntegration" = "integrated.rpca",
+            "HarmonyIntegration" = "harmony",
+            "FastMNNIntegration" = "integration.mnn",
+            "scVIIntegration" = "integrated.scvi"
+        )
+        if (is.null(IntegrateLayersArgs$new.reduction)) {
+            IntegrateLayersArgs$new.reduction <- new_reductions[[method]]
+        }
+        print(paste0("  IntegrateLayers: ", .formatArgs(IntegrateLayersArgs)))
+        log_debug("  IntegrateLayers: {.formatArgs(IntegrateLayersArgs)}")
+        IntegrateLayersArgs$object <- sobj
+        sobj <- do_call(IntegrateLayers, IntegrateLayersArgs)
+        # Save it for dimension reduction plots
+        sobj@misc$integrated_new_reduction <- IntegrateLayersArgs$new.reduction
+        # Cleanup memory
+        IntegrateLayersArgs$object <- NULL
+        rm(IntegrateLayersArgs)
+        gc()
     }
-    IntegrateLayersArgs$method <- eval(parse(text = method))
-    new_reductions <- list(
-        "CCAIntegration" = "integrated.cca",
-        "RPCAIntegration" = "integrated.rpca",
-        "HarmonyIntegration" = "harmony",
-        "FastMNNIntegration" = "integration.mnn",
-        "scVIIntegration" = "integrated.scvi"
-    )
-    if (is.null(IntegrateLayersArgs$new.reduction)) {
-        IntegrateLayersArgs$new.reduction <- new_reductions[[method]]
+    if (!envs$use_sct) {
+        log_info("- Joining layers ...")
+        sobj <- JoinLayers(sobj)
     }
-    print(paste0("  IntegrateLayers: ", .formatArgs(IntegrateLayersArgs)))
-    log_debug("  IntegrateLayers: {.formatArgs(IntegrateLayersArgs)}")
-    IntegrateLayersArgs$object <- sobj
-    sobj <- do_call(IntegrateLayers, IntegrateLayersArgs)
-    # Save it for dimension reduction plots
-    sobj@misc$integrated_new_reduction <- IntegrateLayersArgs$new.reduction
-}
-if (!envs$use_sct) {
-    log_info("- Joining layers ...")
-    sobj <- JoinLayers(sobj)
+    cached$data <- sobj
+    save_to_cache(cached, "Integrated", cache_dir)
 }
+# This is the last step, doesn't need to be cached
 if (!is.null(envs$DoubletFinder) && is.list(envs$DoubletFinder) && envs$DoubletFinder$PCs > 0) {
     library(DoubletFinder)
     log_info("Running DoubletFinder ...")
     log_info("- Preparing Seurat object ...")
+    if (is.null(envs$DoubletFinder$ncores)) {
+        envs$DoubletFinder$ncores <- envs$ncores
+    }
     # More controls from envs?
     sobj <- FindNeighbors(sobj, dims = 1:envs$DoubletFinder$PCs)
     sobj <- FindClusters(sobj)
@@ -449,7 +579,7 @@ if (!is.null(envs$DoubletFinder) && is.list(envs$DoubletFinder) && envs$DoubletF
         sobj,
         PCs = 1:envs$DoubletFinder$PCs,
         sct = envs$use_sct,
-        num.cores = envs$ncores
+        num.cores = envs$DoubletFinder$ncores
     )
     sweep.stats <- summarizeSweep(sweep.res.list, GT = FALSE)
     bcmvn <- find.pK(sweep.stats)
@@ -546,7 +676,7 @@ if (!is.null(envs$DoubletFinder) && is.list(envs$DoubletFinder) && envs$DoubletF
     )
 }
-log_info("Saving filtered seurat object ...")
+log_info("Saving QC'ed seurat object ...")
 saveRDS(sobj, rdsfile)
 save_report(joboutdir)

biopipen/scripts/scrna/SeuratSubClustering.R CHANGED

@@ -8,6 +8,7 @@ library(tidyr)
 library(dplyr)
 library(tidyseurat)
 library(digest)
+library(clustree)
 set.seed(8525)
@@ -28,6 +29,40 @@ plan(strategy = "multicore", workers = envs$ncores)
     args
 }
+.expand_resolution <- function(resolution) {
+    expanded_res <- c()
+    for (res in resolution) {
+        if (is.numeric(res)) {
+            expanded_res <- c(expanded_res, res)
+        } else {
+            # is.character
+            parts <- trimws(unlist(strsplit(res, ",")))
+            for (part in parts) {
+                if (grepl(":", part)) {
+                    parts <- trimws(unlist(strsplit(part, ":")))
+                    if (length(parts) == 2) { parts <- c(parts, 0.1) }
+                    if (length(parts) != 3) {
+                        stop("Invalid resolution format: {part}. Expected 2 or 3 parts separated by ':' for a range.")
+                    }
+                    parts <- as.numeric(parts)
+                    expanded_res <- c(expanded_res, seq(parts[1], parts[2], by = parts[3]))
+                } else {
+                    expanded_res <- c(expanded_res, as.numeric(part))
+                }
+            }
+        }
+    }
+    # keep the last resolution at last
+    rev(unique(rev(expanded_res)))
+}
+# recode clusters from 0, 1, 2, ... to s1, s2, s3, ...
+.recode_clusters <- function(clusters) {
+    recode <- function(x) paste0("s", as.integer(as.character(x)) + 1)
+    clusters <- factor(recode(clusters), levels = recode(levels(clusters)))
+    clusters
+}
 envs$RunUMAP <- .expand_dims(envs$RunUMAP)
 envs$FindNeighbors <- .expand_dims(envs$FindNeighbors)
@@ -63,7 +98,8 @@ for (key in names(envs$cases)) {
             subset = envs$subset,
             RunUMAP = envs$RunUMAP,
             FindNeighbors = envs$FindNeighbors,
-            FindClusters = envs$FindClusters
+            FindClusters = envs$FindClusters,
+            clustree_devpars = envs$clustree_devpars
         ),
         case
     )
@@ -132,36 +168,49 @@ for (key in names(envs$cases)) {
     }
     case$FindClusters$random.seed <- case$FindClusters$random.seed %||% 8525
-    resolution <- case$FindClusters$resolution %||% 0.8
-    if (is.character(resolution)) {
-        if (grepl(",", resolution)) {
-            resolution <- as.numeric(trimws(unlist(strsplit(resolution, ","))))
-        } else {
-            resolution <- as.numeric(resolution)
+    resolution <- case$FindClusters$resolution <- .expand_resolution(case$FindClusters$resolution %||% 0.8)
+    cached <- get_cached(case$FindClusters, "FindClusters", cache_dir)
+    if (is.null(cached$data)) {
+        log_info("- Running FindClusters at resolution: {paste(resolution, collapse = ',')} ...")
+        case$FindClusters$object <- sobj
+        # avoid overwriting the previous clustering results (as they have the same graph name
+        sobj1 <- do_call(FindClusters, case$FindClusters)
+        graph_name <- case$FindClusters$graph.name %||% paste0(DefaultAssay(sobj), "_snn_res.")
+        for (res in resolution) {
+            cluster_name <- paste0(graph_name, res)
+            new_cluster_name <- paste0(key, ".", res)
+            sobj1@meta.data[[new_cluster_name]] <- .recode_clusters(sobj1@meta.data[[cluster_name]])
         }
+        sobj1@meta.data[[key]] <- .recode_clusters(sobj1@meta.data$seurat_clusters)
+        keys <- sapply(resolution, function(res) paste0(key, ".", res))
+        keys <- c(keys, key)
+        cached$data <- sobj1@meta.data[, keys, drop = FALSE]
+        save_to_cache(cached, "FindClusters", cache_dir)
+        rm(sobj1)
+    } else {
+        log_info("- Using cached FindClusters at resolution: {paste(resolution, collapse = ',')} ...")
     }
-    for (res in resolution) {
-        case$FindClusters$resolution <- res
-        cached <- get_cached(case$FindClusters, paste0("FindClusters_", res), cache_dir)
-        res_key <- paste0("seurat_clusters_", res)
-        if (is.null(cached$data)) {
-            log_info("- Running FindClusters at resolution: {res} ...")
-            case$FindClusters$object <- sobj
-            sobj1 <- do_call(FindClusters, case$FindClusters)
-            levels(sobj1$seurat_clusters) <- paste0("s", as.numeric(levels(sobj1$seurat_clusters)) + 1)
-            sobj1[[res_key]] <- sobj1$seurat_clusters
-            cached$data <- sobj1@meta.data[, res_key, drop = FALSE]
-            save_to_cache(cached, paste0("FindClusters_", res), cache_dir)
-        } else {
-            log_info("- Using cached FindClusters at resolution: {res} ...")
-        }
-        ident_table <- table(cached$data[[res_key]])
-        log_info("  Found {length(ident_table)} clusters")
-        print(ident_table)
-        cat("\n")
+    ident_table <- table(cached$data[[key]])
+    log_info("  Found {length(ident_table)} clusters")
+    print(ident_table)
+    cat("\n")
+    if (length(resolution) > 1) {
+        log_info("- Plotting clustree ...")
+        png(
+            file.path(joboutdir, paste0(key, ".clustree.png")),
+            res = case$clustree_devpars$res,
+            width = case$clustree_devpars$width,
+            height = case$clustree_devpars$height
+        )
+        p <- clustree(cached$data, prefix = paste0(key, "."))
+        print(p)
+        dev.off()
     }
     log_info("- Updating meta.data with subclusters...")
-    srtobj <- AddMetaData(srtobj, metadata = cached$data, col.name = key)
+    srtobj <- AddMetaData(srtobj, metadata = cached$data)
     srtobj[[paste0("sub_umap_", key)]] <- reduc
 }

biopipen 0.28.1__py3-none-any.whl → 0.29.1__py3-none-any.whl

Potentially problematic release.

biopipen 0.28.1__py3-none-any.whl → 0.29.1__py3-none-any.whl