PyPI - biopipen - Versions diffs - 0.22.0__py3-none-any.whl → 0.22.2__py3-none-any.whl - Mend

biopipen 0.22.0__py3-none-any.whl → 0.22.2__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.

This version of biopipen might be problematic. Click here for more details.

Files changed (27) hide show

biopipen/__init__.py +1 -1
biopipen/core/config.toml +6 -0
biopipen/core/filters.py +12 -0
biopipen/ns/cellranger.py +101 -0
biopipen/ns/scrna.py +2 -0
biopipen/ns/tcr.py +30 -10
biopipen/reports/cellranger/CellRangerCount.svelte +16 -0
biopipen/reports/cellranger/CellRangerVdj.svelte +16 -0
biopipen/scripts/cellranger/CellRangerCount.py +79 -0
biopipen/scripts/cellranger/CellRangerVdj.py +79 -0
biopipen/scripts/scrna/CellTypeAnnotation-direct.R +31 -24
biopipen/scripts/scrna/CellsDistribution.R +9 -8
biopipen/scripts/scrna/MarkersFinder.R +106 -28
biopipen/scripts/scrna/SeuratClusterStats-features.R +2 -2
biopipen/scripts/scrna/SeuratMetadataMutater.R +13 -1
biopipen/scripts/tcr/Attach2Seurat.R +2 -1
biopipen/scripts/tcr/CDR3AAPhyschem.R +1 -1
biopipen/scripts/tcr/Immunarch.R +3 -0
biopipen/scripts/tcr/ImmunarchLoading.R +22 -23
biopipen/scripts/tcr/TCRClustering.R +8 -9
biopipen/scripts/tcr/TESSA.R +23 -30
biopipen/utils/common_docstrs.py +3 -0
biopipen/utils/mutate_helpers.R +110 -106
{biopipen-0.22.0.dist-info → biopipen-0.22.2.dist-info}/METADATA +1 -1
{biopipen-0.22.0.dist-info → biopipen-0.22.2.dist-info}/RECORD +27 -22
{biopipen-0.22.0.dist-info → biopipen-0.22.2.dist-info}/entry_points.txt +1 -0
{biopipen-0.22.0.dist-info → biopipen-0.22.2.dist-info}/WHEEL +0 -0

biopipen/scripts/scrna/MarkersFinder.R CHANGED Viewed

@@ -143,11 +143,13 @@ for (name in names(cases)) {
     } else if (is.null(case$each)) {
         # is.null(case$ident.1)
         sections <- c(sections, name)
-        idents <- srtobj@meta.data %>% pull(case$group.by) %>% unique() %>% na.omit()
-        for (ident in idents) {
-            newcases[[paste0(name, ":", ident)]] <- case
-            newcases[[paste0(name, ":", ident)]]$ident.1 <- ident
-        }
+        newcases[[name]] <- case
+        newcases[[name]]$findall <- TRUE
+        # idents <- srtobj@meta.data %>% pull(case$group.by) %>% unique() %>% na.omit()
+        # for (ident in idents) {
+        #     newcases[[paste0(name, ":", ident)]] <- case
+        #     newcases[[paste0(name, ":", ident)]]$ident.1 <- ident
+        # }
     } else {
         eachs <- srtobj@meta.data %>% pull(case$each) %>% unique() %>% na.omit()
         for (each in eachs) {
@@ -160,18 +162,22 @@ for (name in names(cases)) {
                 )
             )
             if (is.null(case$ident.1)) {
-                idents <- srtobj@meta.data %>% pull(case$group.by) %>% unique() %>% na.omit()
-                for (ident in idents) {
-                    kname <- if (name == "DEFAULT") "" else paste0(" - ", name)
-                    sections <- c(sections, paste0(each, kname))
-                    key <- paste0(each, kname, ":", ident)
-                    if (case$prefix_each) {
-                        key <- paste0(case$each, " - ", key)
-                    }
-                    newcases[[key]] <- case
-                    newcases[[key]]$ident.1 <- ident
-                    newcases[[key]]$group.by <- by
-                }
+                kname <- if (name == "DEFAULT") "" else paste0(" - ", name)
+                sections <- c(sections, paste0(each, kname))
+                key <- paste0(each, kname)
+                newcases[[key]] <- case
+                newcases[[key]]$group.by <- by
+                newcases[[key]]$findall <- TRUE
+                # idents <- srtobj@meta.data %>% pull(case$group.by) %>% unique() %>% na.omit()
+                # for (ident in idents) {
+                #     key <- paste0(each, kname, ":", ident)
+                #     if (case$prefix_each) {
+                #         key <- paste0(case$each, " - ", key)
+                #     }
+                #     newcases[[key]] <- case
+                #     newcases[[key]]$ident.1 <- ident
+                #     newcases[[key]]$group.by <- by
+                # }
             } else {
                 sections <- c(sections, case$each)
                 key <- paste0(case$each, ":", each)
@@ -312,11 +318,11 @@ do_enrich <- function(info, markers, sig, volgenes) {
 }
-do_dotplot <- function(info, siggenes, case, args) {
-    dotplot_devpars <- case$dotplot$devpars
+do_dotplot <- function(info, siggenes, dotplot, args) {
+    dotplot_devpars <- dotplot$devpars
     if (is.null(args$ident.2)) {
-        case$dotplot$object <- args$object
-        case$dotplot$object@meta.data <- case$dotplot$object@meta.data %>%
+        dotplot$object <- args$object
+        dotplot$object@meta.data <- dotplot$object@meta.data %>%
             mutate(
                 !!sym(args$group.by) := if_else(
                     !!sym(args$group.by) == args$ident.1,
@@ -329,17 +335,16 @@ do_dotplot <- function(info, siggenes, case, args) {
                 )
             )
     } else {
-        case$dotplot$object <- args$object %>%
+        dotplot$object <- args$object %>%
             filter(!!sym(args$group.by) %in% c(args$ident.1, args$ident.2)) %>%
             mutate(!!sym(args$group.by) := factor(
                 !!sym(args$group.by),
                 levels = c(args$ident.1, args$ident.2)
             ))
     }
-    case$dotplot$devpars <- NULL
-    case$dotplot$features <- siggenes
-    case$dotplot$group.by <- args$group.by
-    case$dotplot$assay <- case$assay
+    dotplot$devpars <- NULL
+    dotplot$features <- siggenes
+    dotplot$group.by <- args$group.by
     dotplot_width = ifelse(
         is.null(dotplot_devpars$width),
         if (length(siggenes) <= 20) length(siggenes) * 60 else length(siggenes) * 30,
@@ -351,7 +356,7 @@ do_dotplot <- function(info, siggenes, case, args) {
     png(dotplot_file, res = dotplot_res, width = dotplot_height, height = dotplot_width)
     # rotate x axis labels
     print(
-        do_call(DotPlot, case$dotplot) +
+        do_call(DotPlot, dotplot) +
         theme(axis.text.x = element_text(angle = 90, hjust = 1)) +
         coord_flip()
     )
@@ -456,9 +461,79 @@ add_case_report <- function(info, sigmarkers, siggenes) {
 }
+# Handle one case with a single FindAllMarkers call (every ident of
+# `case$group.by` at once), then run enrichment, dot plot and report
+# generation separately for each ident found in the `cluster` column
+# of the result.
+#
+# casename: key of the case in the global `cases` list.
+#
+# Called for its side effects; when the section of an ident is listed in
+# the global `overlap`, its significant genes are stored in the global
+# `overlaps` list via `<<-`.
+do_case_findall <- function(casename) {
+    log_info("- Using FindAllMarkers for case: {casename}...")
+    case <- cases[[casename]]
+    args <- case$rest
+    args$group.by <- case$group.by
+    # Permissive defaults, applied only when the case does not set them.
+    # The original assigned the misspelled `locfc.threshold`, so this
+    # default never took effect and a stray argument was forwarded.
+    if (is.null(args$logfc.threshold)) {
+        args$logfc.threshold <- 0
+    }
+    if (is.null(args$min.cells.group)) {
+        args$min.cells.group <- 1
+    }
+    if (is.null(args$min.cells.feature)) {
+        args$min.cells.feature <- 1
+    }
+    if (is.null(args$min.pct)) {
+        args$min.pct <- 0
+    }
+    # Always drop cells with NA in the grouping column. The original nested
+    # a second `filter()` call inside the predicate, which received a
+    # logical vector as its data argument instead of acting as a condition.
+    if (!is.null(case$subset)) {
+        args$object <- srtobj %>% filter(
+            (!!parse_expr(case$subset)) & !is.na(!!sym(case$group.by))
+        )
+    } else {
+        args$object <- srtobj %>% filter(!is.na(!!sym(case$group.by)))
+    }
+    Idents(args$object) <- case$group.by
+    markers <- tryCatch({
+        do_call(FindAllMarkers, args)
+        # gene, p_val, avg_log2FC, pct.1, pct.2, p_val_adj, cluster
+    }, error = function(e) {
+        # Best effort: log the failure and continue with an empty marker
+        # table, so the per-ident loop below simply does not run.
+        log_warn(e$message)
+        data.frame(
+            gene = character(),
+            p_val = numeric(),
+            avg_log2FC = numeric(),
+            pct.1 = numeric(),
+            pct.2 = numeric(),
+            p_val_adj = numeric(),
+            cluster = character()
+        )
+    })
+    # Fall back to the global `assay` when the dot plot config has none.
+    if (is.null(case$dotplot$assay)) {
+        case$dotplot$assay <- assay
+    }
+    idents <- unique(markers$cluster)
+    for (ident in idents) {
+        log_info("- Dealing with ident: {ident}...")
+        info <- casename_info(paste0(casename, ":", ident), create = TRUE)
+        siggenes <- do_enrich(
+            info,
+            markers %>% filter(cluster == ident),
+            case$sigmarkers,
+            case$volcano_genes
+        )
+        if (length(siggenes) > 0) {
+            # do_dotplot() reads the ident of interest from args$ident.1
+            args$ident.1 <- as.character(ident)
+            do_dotplot(info, siggenes, case$dotplot, args)
+        }
+        add_case_report(info, case$sigmarkers, siggenes)
+        if (info$section %in% overlap) {
+            if (is.null(overlaps[[info$section]])) {
+                overlaps[[info$section]] <<- list()
+            }
+            overlaps[[info$section]][[info$case]] <<- siggenes
+        }
+    }
+}
 do_case <- function(casename) {
     log_info("Dealing with case: {casename}...")
+    if (isTRUE(cases[[casename]]$findall)) {
+        do_case_findall(casename)
+        return()
+    }
     info <- casename_info(casename, create = TRUE)
     case <- cases[[casename]]
     # ident1
@@ -507,7 +582,10 @@ do_case <- function(casename) {
     siggenes <- do_enrich(info, markers, case$sigmarkers, case$volcano_genes)
     if (length(siggenes) > 0) {
-        do_dotplot(info, siggenes, case, args)
+        if (is.null(case$dotplot$assay)) {
+            case$dotplot$assay <- assay
+        }
+        do_dotplot(info, siggenes, case$dotplot, args)
     }
     if (info$section %in% overlap) {

biopipen/scripts/scrna/SeuratClusterStats-features.R CHANGED Viewed

@@ -173,8 +173,8 @@ do_one_features = function(name) {
             rownames_to_column("Feature") %>%
             select(Feature, everything())
-        exprfile = paste0(slugify(name), ".txt")
-        write.table(expr, file.path(odir, exprfile), sep="\t", quote=FALSE, row.names=FALSE)
+        exprfile = file.path(odir, paste0(slugify(name), ".txt"))
+        write.table(expr, exprfile, sep="\t", quote=FALSE, row.names=FALSE)
         add_report(
             list(

biopipen/scripts/scrna/SeuratMetadataMutater.R CHANGED Viewed

@@ -1,4 +1,6 @@
+source("{{biopipen_dir}}/utils/misc.R")
 source("{{biopipen_dir}}/utils/mutate_helpers.R")
 library(rlang)
 library(tibble)
 library(dplyr)
@@ -14,7 +16,17 @@ metadata = srt@meta.data
 if (!is.null(metafile)) {
     mdata = read.table(metafile, header=TRUE, row.names=1, sep="\t", check.names=FALSE)
-    metadata = cbind(metadata, mdata[rownames(metadata),,drop=FALSE])
+    ov_cols = intersect(colnames(metadata), colnames(mdata))
+    if (length(ov_cols) > 0) {
+        log_warn(paste0(
+            "The following columns are already present in Seurat object and will be ignored: ",
+            paste(ov_cols, collapse=', ')
+        ))
+    }
+    metadata = cbind(
+        metadata,
+        mdata[rownames(metadata), setdiff(colnames(mdata), ov_cols), drop=FALSE]
+    )
 }
 expr = list()

biopipen/scripts/tcr/Attach2Seurat.R CHANGED Viewed

@@ -11,6 +11,7 @@ immfile = {{in.immfile | r}}
 sobjfile = {{in.sobjfile | r}}
 outfile = {{out.outfile | r}}
 metacols = {{envs.metacols | r}}
+prefix = {{envs.prefix | r}}
 immdata = readRDS(immfile)
 sobj = readRDS(sobjfile)
@@ -31,7 +32,7 @@ metadf = do_call(rbind, lapply(seq_len(nrow(immdata$meta)), function(i) {
     cldata %>%
         separate_rows(Barcode, sep=";") %>%
-        mutate(Barcode = glue("{{envs.prefix}}{Barcode}"))
+        mutate(Barcode = glue(paste0(prefix, "{Barcode}")))
 }))

biopipen/scripts/tcr/CDR3AAPhyschem.R CHANGED Viewed

@@ -193,7 +193,7 @@ merge_data = function(sam) {
     if (!is.null(prefix) && nchar(prefix) > 0) {
         # Replace the placeholder like {Sample} with the data in other columns
         # in the same row
-        sdata = sdata %>% mutate(.prefix_len = nchar(glue("{{envs.prefix}}")))
+        sdata = sdata %>% mutate(.prefix_len = nchar(glue(prefix)))
         # Remove the prefix in the rownames of sdata
         rownames(sdata) = substring(rownames(sdata), sdata$.prefix_len + 1)
         sdata = sdata %>% select(-.prefix_len)

biopipen/scripts/tcr/Immunarch.R CHANGED Viewed

@@ -27,6 +27,9 @@ prefix = {{ envs.prefix | r }}
 log_info("Loading immdata ...")
 immdata = readRDS(immfile)
+if (is.null(prefix)) { prefix = immdata$prefix }
+if (is.null(prefix)) { prefix = "" }
 log_info("Expanding immdata ...")
 exdata = expand_immdata(immdata)

biopipen/scripts/tcr/ImmunarchLoading.R CHANGED Viewed

@@ -1,4 +1,5 @@
 source("{{biopipen_dir}}/utils/misc.R")
+source("{{biopipen_dir}}/utils/single_cell.R")
 # Loading 10x data into immunarch
 library(immunarch)
@@ -13,7 +14,8 @@ rdsfile = {{ out.rdsfile | quote }}
 metatxt = {{ out.metatxt | quote }}
 tmpdir = {{ envs.tmpdir | quote }}
 mode = {{ envs.mode | quote }}
-metacols = {{ envs.metacols | r}}
+extracols = {{ envs.extracols | r}}
+prefix = {{ envs.prefix | r }}
 metadata = read.table(
     metafile,
@@ -164,27 +166,24 @@ immdata$meta = left_join(
     by = "Sample"
 )
-saveRDS(immdata, file=rdsfile)
-metadf = do_call(rbind, lapply(seq_len(nrow(immdata$meta)), function(i) {
-    # Clones  Proportion   CDR3.aa                       Barcode
-    # 5      4 0.008583691 CAVRDTGNTPLVF;CASSEYSNQPQHF   GTTCGGGCACTTACGA-1;TCTCTAAGTACCAGTT-1
-    # 6      4 0.008583691 CALTQAAGNKLTF;CASRPEDLRGQPQHF GCTTGAAGTCGGCACT-1;TACTCGCTCCTAAGTG-1
-    cldata = immdata$data[[i]][, unique(c(metacols, "Barcode"))]
-    # # A tibble: 4 × 5
-    # Sample                  Patient     Timepoint Tissue
-    # <chr>                   <chr>       <chr>     <chr>
-    # 1 MC1685Pt011-Baseline-PB MC1685Pt011 Baseline  PB
-    mdata = as.list(immdata$meta[i, , drop=FALSE])
-    for (mname in names(mdata)) {
-        assign(mname, mdata[[mname]])
-    }
+immdata$prefix = prefix
-    cldata %>%
-        separate_rows(Barcode, sep=";") %>%
-        distinct(Barcode, .keep_all = TRUE) %>%
-        mutate(Barcode = glue("{{envs.prefix}}{Barcode}")) %>%
-        column_to_rownames("Barcode")
+saveRDS(immdata, file=rdsfile)
-}))
-write.table(metadf, metatxt, sep="\t", quote=FALSE, row.names=TRUE, col.names=TRUE)
+exdata <- expand_immdata(immdata, cell_id = "Barcode") %>%
+    distinct(Sample, Barcode, .keep_all = TRUE) %>%
+    mutate(Barcode = glue(paste0(prefix, "{Barcode}"))) %>%
+    select(any_of(c(
+        colnames(immdata$meta),
+        "Barcode",
+        "CDR3.aa",
+        "Clones",
+        "Proportion",
+        "V.name",
+        "D.name",
+        "J.name",
+        extracols
+    ))) %>%
+    column_to_rownames("Barcode")
+write.table(exdata, metatxt, sep="\t", quote=FALSE, row.names=TRUE, col.names=TRUE)

biopipen/scripts/tcr/TCRClustering.R CHANGED Viewed

@@ -3,11 +3,13 @@
 # python = Sys.which({{envs.python | r}})
 # Sys.setenv(RETICULATE_PYTHON = python)
 # library(reticulate)
+source("{{biopipen_dir}}/utils/single_cell.R")
 library(immunarch)
 library(dplyr)
 library(tidyr)
 library(tibble)
+library(glue)
 immfile = {{in.immfile | r}}
 outdir = normalizePath({{job.outdir | r}})
@@ -17,6 +19,7 @@ tool = {{envs.tool | r}}
 python = {{envs.python | r}}
 on_multi = {{envs.on_multi | r}}
 args = {{envs.args | r}}
+prefix = {{envs.prefix | r}}
 setwd(outdir)
@@ -26,17 +29,13 @@ if (on_multi) {
 } else {
     seqdata = immdata$data
 }
+if (is.null(prefix)) { prefix = immdata$prefix }
+if (is.null(prefix)) { prefix = "" }
 get_cdr3aa_df = function() {
-    out = NULL
-    for (sample in names(immdata$data)) {
-        tmpdf = immdata$data[[sample]] %>%
-            select(Barcode, CDR3.aa) %>%
-            separate_rows(Barcode, sep = ";") %>%
-            mutate(Barcode = paste0(sample, "_", Barcode))
-        out = bind_rows(out, tmpdf)
-    }
-    out
+    expand_immdata(immdata, cell_id = "Barcode") %>%
+        mutate(Barcode = glue(paste0(prefix, "{Barcode}"))) %>%
+        select(Barcode, CDR3.aa)
 }
 cdr3aa_df = get_cdr3aa_df()

biopipen/scripts/tcr/TESSA.R CHANGED Viewed

@@ -1,8 +1,10 @@
 source("{{biopipen_dir}}/utils/misc.R")
+source("{{biopipen_dir}}/utils/single_cell.R")
 library(glue)
 library(dplyr)
 library(tidyr)
+library(tibble)
 library(immunarch)
 library(Seurat)
 library(ggplot2)
@@ -13,6 +15,7 @@ exprfile <- {{in.srtobj | r}}
 outfile <- {{out.outfile | r}}
 joboutdir <- {{job.outdir | r}}
 python <- {{envs.python | r}}
+prefix <- {{envs.prefix | r}}
 within_sample <- {{envs.within_sample | r}}
 assay <- {{envs.assay | r}}
 predefined_b <- {{envs.predefined_b | r}}
@@ -29,34 +32,21 @@ if (!dir.exists(tessa_dir)) dir.create(tessa_dir)
 ### Start preparing input files for TESSA
 # Prepare input files
 log_info("Preparing TCR input file ...")
-immdata <- readRDS(immfile)
-has_VJ <- "V.name" %in% colnames(immdata$data[[1]]) && "J.name" %in% colnames(immdata$data[[1]])
-# Merge all samples
-tcrdata <- do_call(rbind, lapply(seq_len(nrow(immdata$meta)), function(i) {
-    # Clones  Proportion   CDR3.aa                       Barcode
-    # 5      4 0.008583691 CAVRDTGNTPLVF;CASSEYSNQPQHF   GTTCGGGCACTTACGA-1;TCTCTAAGTACCAGTT-1
-    # 6      4 0.008583691 CALTQAAGNKLTF;CASRPEDLRGQPQHF GCTTGAAGTCGGCACT-1;TACTCGCTCCTAAGTG-1
-    if (has_VJ) {
-        cldata = immdata$data[[i]][, c("Barcode", "CDR3.aa", "V.name", "J.name")]
-    } else {
-        cldata = immdata$data[[i]][, c("Barcode", "CDR3.aa")]
-    }
-    # # A tibble: 4 × 5
-    # Sample                  Patient     Timepoint Tissue
-    # <chr>                   <chr>       <chr>     <chr>
-    # 1 MC1685Pt011-Baseline-PB MC1685Pt011 Baseline  PB
-    mdata = as.list(immdata$meta[i, , drop=FALSE])
-    for (mname in names(mdata)) {
-        assign(mname, mdata[[mname]])
-    }
+# If immfile endswith .rds, then it is an immunarch object
+if (endsWith(tolower(immfile), ".rds")) {
+    immdata <- readRDS(immfile)
+    if (is.null(prefix)) { prefix = immdata$prefix }
+    if (is.null(prefix)) { prefix = "" }
+    tcrdata <- expand_immdata(immdata) %>%
+        mutate(Barcode = glue(paste0(prefix, "{Barcode}")))
+    rm(immdata)
+} else {
+    tcrdata <- read.table(immfile, sep="\t", header=TRUE, row.names=1) %>%
+        rownames_to_column("Barcode")
+}
+has_VJ <- "V.name" %in% colnames(tcrdata) && "J.name" %in% colnames(tcrdata)
-    cldata %>%
-        separate_rows(Barcode, sep=";") %>%
-        # Just in case there are duplicated barcodes
-        distinct(Barcode, .keep_all = TRUE) %>%
-        mutate(Barcode = glue("{{envs.prefix}}{Barcode}"), sample = Sample)
-}))
 if (has_VJ) {
     tcrdata <- tcrdata %>% dplyr::mutate(
         v_gene = sub("-\\d+$", "", V.name),
@@ -66,13 +56,13 @@ if (has_VJ) {
         cdr3 = CDR3.aa,
         v_gene,
         j_gene,
-        sample
+        sample = Sample
     )
 } else {
     tcrdata <- tcrdata %>% dplyr::select(
         contig_id = Barcode,
         cdr3 = CDR3.aa,
-        sample
+        sample = Sample
     )
 }
@@ -101,7 +91,10 @@ if (length(unused_expr_cells) > 0) {
     log_warn(glue("{length(unused_expr_cells)}/{ncol(expr)} expression cells are not used."))
 }
 if (length(cell_ids) == 0) {
-    stop("No common cells between TCR and expression data. Are you using the correct prefix?")
+    stop(paste0(
+        "No common cells between TCR and expression data. ",
+        "Are you using the correct `envs.prefix` here or in `ImmunarchLoading`?"
+    ))
 }
 tcrdata <- tcrdata[tcrdata$contig_id %in% cell_ids, , drop=FALSE]
 expr <- as.matrix(expr)[, tcrdata$contig_id, drop=FALSE]

biopipen/utils/common_docstrs.py CHANGED Viewed

@@ -46,11 +46,14 @@ Those functions take following arguments:
 * `group-by`: The column name in metadata to group the cells.
 * `idents`: The first group or both groups of cells to compare (value in `group-by` column). If only the first group is given, the rest of the cells (with non-NA in `group-by` column) will be used as the second group.
 * `subset`: An expression to subset the cells, will be passed to `dplyr::filter()`. Default is `TRUE` (no filtering).
+* `each`: A column name (without quotes) in metadata to split the cells.
+    Each comparison will be done for each value in this column.
 * `id`: The column name in metadata for the group ids (i.e. `CDR3.aa`).
 * `compare`: Either a (numeric) column name (i.e. `Clones`) in metadata to compare between groups, or `.n` to compare the number of cells in each group.
     If numeric column is given, the values should be the same for all cells in the same group.
     This will not be checked (only the first value is used).
 * `uniq`: Whether to return unique ids or not. Default is `TRUE`. If `FALSE`, you can mutate the meta data frame with the returned ids. For example, `df |> mutate(expanded = expanded(...))`.
+* `debug`: Return the data frame with intermediate columns instead of the ids. Default is `FALSE`.
 * `order`: The order of the returned ids. It could be `sum` or `diff`, which is the sum or diff of the `compare` between idents.
     Two kinds of modifiers can be added, including `desc` and `abs`.
     For example, `sum,desc` means the sum of `compare` between idents in descending order.

biopipen 0.22.0__py3-none-any.whl → 0.22.2__py3-none-any.whl

Potentially problematic release.

biopipen 0.22.0__py3-none-any.whl → 0.22.2__py3-none-any.whl