PyPI - biopipen - Versions diffs - 0.33.1__py3-none-any.whl → 0.34.0__py3-none-any.whl - Mend

biopipen 0.33.1__py3-none-any.whl → 0.34.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.

This version of biopipen might be problematic. Click here for more details.

Files changed (149) hide show

biopipen/__init__.py +1 -1
biopipen/core/filters.py +10 -183
biopipen/core/proc.py +5 -3
biopipen/core/testing.py +8 -1
biopipen/ns/bam.py +40 -4
biopipen/ns/cnv.py +1 -1
biopipen/ns/cnvkit.py +1 -1
biopipen/ns/delim.py +1 -1
biopipen/ns/gsea.py +63 -37
biopipen/ns/misc.py +38 -0
biopipen/ns/plot.py +8 -0
biopipen/ns/scrna.py +290 -288
biopipen/ns/scrna_metabolic_landscape.py +207 -366
biopipen/ns/tcr.py +165 -97
biopipen/reports/bam/CNVpytor.svelte +4 -9
biopipen/reports/cnvkit/CNVkitDiagram.svelte +1 -1
biopipen/reports/cnvkit/CNVkitHeatmap.svelte +1 -1
biopipen/reports/cnvkit/CNVkitScatter.svelte +1 -1
biopipen/reports/{delim/SampleInfo.svelte → common.svelte} +2 -3
biopipen/reports/scrna/DimPlots.svelte +1 -1
biopipen/reports/scrna_metabolic_landscape/MetabolicFeatures.svelte +51 -22
biopipen/reports/scrna_metabolic_landscape/MetabolicPathwayActivity.svelte +46 -42
biopipen/reports/scrna_metabolic_landscape/MetabolicPathwayHeterogeneity.svelte +63 -6
biopipen/reports/snp/PlinkCallRate.svelte +2 -2
biopipen/reports/snp/PlinkFreq.svelte +1 -1
biopipen/reports/snp/PlinkHWE.svelte +1 -1
biopipen/reports/snp/PlinkHet.svelte +1 -1
biopipen/reports/snp/PlinkIBD.svelte +1 -1
biopipen/reports/tcr/CDR3AAPhyschem.svelte +1 -1
biopipen/scripts/bam/CNAClinic.R +41 -6
biopipen/scripts/bam/CNVpytor.py +2 -1
biopipen/scripts/bam/ControlFREEC.py +2 -3
biopipen/scripts/bam/SamtoolsView.py +33 -0
biopipen/scripts/cnv/AneuploidyScore.R +25 -13
biopipen/scripts/cnv/AneuploidyScoreSummary.R +218 -163
biopipen/scripts/cnv/TMADScore.R +4 -4
biopipen/scripts/cnv/TMADScoreSummary.R +51 -84
biopipen/scripts/cnvkit/CNVkitGuessBaits.py +3 -3
biopipen/scripts/cnvkit/CNVkitHeatmap.py +3 -3
biopipen/scripts/cnvkit/CNVkitReference.py +3 -3
biopipen/scripts/delim/RowsBinder.R +1 -1
biopipen/scripts/delim/SampleInfo.R +4 -1
biopipen/scripts/gene/GeneNameConversion.R +14 -12
biopipen/scripts/gsea/Enrichr.R +2 -2
biopipen/scripts/gsea/FGSEA.R +184 -50
biopipen/scripts/gsea/PreRank.R +3 -3
biopipen/scripts/misc/Plot.R +80 -0
biopipen/scripts/plot/VennDiagram.R +2 -2
biopipen/scripts/protein/ProdigySummary.R +34 -27
biopipen/scripts/regulatory/MotifAffinityTest.R +11 -9
biopipen/scripts/regulatory/MotifAffinityTest_AtSNP.R +5 -5
biopipen/scripts/regulatory/MotifAffinityTest_MotifBreakR.R +4 -4
biopipen/scripts/regulatory/VariantMotifPlot.R +10 -8
biopipen/scripts/regulatory/motifs-common.R +10 -9
biopipen/scripts/rnaseq/Simulation-ESCO.R +14 -11
biopipen/scripts/rnaseq/Simulation-RUVcorr.R +7 -4
biopipen/scripts/rnaseq/Simulation.R +0 -2
biopipen/scripts/rnaseq/UnitConversion.R +6 -5
biopipen/scripts/scrna/AnnData2Seurat.R +25 -73
biopipen/scripts/scrna/CellCellCommunication.py +1 -1
biopipen/scripts/scrna/CellCellCommunicationPlots.R +51 -168
biopipen/scripts/scrna/CellTypeAnnotation-celltypist.R +99 -150
biopipen/scripts/scrna/CellTypeAnnotation-direct.R +11 -9
biopipen/scripts/scrna/CellTypeAnnotation-hitype.R +12 -9
biopipen/scripts/scrna/CellTypeAnnotation-sccatch.R +14 -11
biopipen/scripts/scrna/CellTypeAnnotation-sctype.R +19 -16
biopipen/scripts/scrna/CellTypeAnnotation.R +10 -2
biopipen/scripts/scrna/CellsDistribution.R +1 -1
biopipen/scripts/scrna/ExprImputation-alra.R +87 -11
biopipen/scripts/scrna/ExprImputation-rmagic.R +247 -21
biopipen/scripts/scrna/ExprImputation-scimpute.R +8 -5
biopipen/scripts/scrna/MarkersFinder.R +348 -217
biopipen/scripts/scrna/MetaMarkers.R +3 -3
biopipen/scripts/scrna/ModuleScoreCalculator.R +14 -13
biopipen/scripts/scrna/RadarPlots.R +1 -1
biopipen/scripts/scrna/ScFGSEA.R +157 -75
biopipen/scripts/scrna/ScSimulation.R +11 -10
biopipen/scripts/scrna/ScVelo.py +605 -0
biopipen/scripts/scrna/Seurat2AnnData.R +2 -3
biopipen/scripts/scrna/SeuratClusterStats-clustree.R +1 -1
biopipen/scripts/scrna/SeuratClusterStats-features.R +39 -30
biopipen/scripts/scrna/SeuratClusterStats-ngenes.R +56 -65
biopipen/scripts/scrna/SeuratClusterStats-stats.R +4 -4
biopipen/scripts/scrna/SeuratClusterStats.R +9 -6
biopipen/scripts/scrna/SeuratClustering.R +31 -48
biopipen/scripts/scrna/SeuratLoading.R +2 -2
biopipen/scripts/scrna/SeuratMap2Ref.R +66 -367
biopipen/scripts/scrna/SeuratMetadataMutater.R +5 -7
biopipen/scripts/scrna/SeuratPreparing.R +76 -24
biopipen/scripts/scrna/SeuratSubClustering.R +46 -185
biopipen/scripts/scrna/{SlingShot.R → Slingshot.R} +12 -16
biopipen/scripts/scrna/Subset10X.R +2 -2
biopipen/scripts/scrna/TopExpressingGenes.R +141 -184
biopipen/scripts/scrna/celltypist-wrapper.py +6 -4
biopipen/scripts/scrna/seurat_anndata_conversion.py +81 -0
biopipen/scripts/scrna_metabolic_landscape/MetabolicFeatures.R +429 -123
biopipen/scripts/scrna_metabolic_landscape/MetabolicPathwayActivity.R +346 -245
biopipen/scripts/scrna_metabolic_landscape/MetabolicPathwayHeterogeneity.R +182 -173
biopipen/scripts/snp/MatrixEQTL.R +39 -20
biopipen/scripts/snp/PlinkCallRate.R +43 -34
biopipen/scripts/snp/PlinkFreq.R +34 -41
biopipen/scripts/snp/PlinkHWE.R +23 -18
biopipen/scripts/snp/PlinkHet.R +26 -22
biopipen/scripts/snp/PlinkIBD.R +30 -34
biopipen/scripts/stats/ChowTest.R +9 -8
biopipen/scripts/stats/DiffCoexpr.R +13 -11
biopipen/scripts/stats/LiquidAssoc.R +7 -8
biopipen/scripts/stats/Mediation.R +8 -8
biopipen/scripts/stats/MetaPvalue.R +11 -13
biopipen/scripts/stats/MetaPvalue1.R +6 -5
biopipen/scripts/tcr/CDR3AAPhyschem.R +105 -164
biopipen/scripts/tcr/ClonalStats.R +5 -4
biopipen/scripts/tcr/CloneResidency.R +3 -3
biopipen/scripts/tcr/CloneSizeQQPlot.R +2 -2
biopipen/scripts/tcr/Immunarch2VDJtools.R +2 -2
biopipen/scripts/tcr/ImmunarchFilter.R +3 -3
biopipen/scripts/tcr/ImmunarchLoading.R +5 -5
biopipen/scripts/tcr/ScRepCombiningExpression.R +39 -0
biopipen/scripts/tcr/ScRepLoading.R +114 -92
biopipen/scripts/tcr/TCRClusterStats.R +2 -2
biopipen/scripts/tcr/TCRClustering.R +86 -97
biopipen/scripts/tcr/TESSA.R +65 -115
biopipen/scripts/tcr/VJUsage.R +5 -5
biopipen/scripts/vcf/TruvariBenchSummary.R +15 -11
biopipen/utils/common_docstrs.py +66 -63
biopipen/utils/reporter.py +177 -0
{biopipen-0.33.1.dist-info → biopipen-0.34.0.dist-info}/METADATA +2 -1
{biopipen-0.33.1.dist-info → biopipen-0.34.0.dist-info}/RECORD +130 -144
{biopipen-0.33.1.dist-info → biopipen-0.34.0.dist-info}/WHEEL +1 -1
biopipen/reports/scrna/CellCellCommunicationPlots.svelte +0 -14
biopipen/reports/scrna/SeuratClusterStats.svelte +0 -16
biopipen/reports/scrna/SeuratMap2Ref.svelte +0 -37
biopipen/reports/scrna/SeuratPreparing.svelte +0 -15
biopipen/reports/scrna_metabolic_landscape/MetabolicFeaturesIntraSubset.svelte +0 -28
biopipen/reports/utils/gsea.liq +0 -110
biopipen/scripts/scrna/CellTypeAnnotation-common.R +0 -10
biopipen/scripts/scrna/SeuratClustering-common.R +0 -213
biopipen/scripts/scrna_metabolic_landscape/MetabolicFeaturesIntraSubset.R +0 -193
biopipen/utils/caching.R +0 -44
biopipen/utils/gene.R +0 -95
biopipen/utils/gsea.R +0 -329
biopipen/utils/io.R +0 -20
biopipen/utils/misc.R +0 -602
biopipen/utils/mutate_helpers.R +0 -581
biopipen/utils/plot.R +0 -209
biopipen/utils/repr.R +0 -146
biopipen/utils/rnaseq.R +0 -48
biopipen/utils/single_cell.R +0 -207
{biopipen-0.33.1.dist-info → biopipen-0.34.0.dist-info}/entry_points.txt +0 -0

biopipen/scripts/tcr/TESSA.R CHANGED Viewed

@@ -1,27 +1,28 @@
-{{ biopipen_dir | joinpaths: "utils", "misc.R" | source_r }}
-{{ biopipen_dir | joinpaths: "utils", "single_cell.R" | source_r }}
 library(glue)
 library(dplyr)
 library(tidyr)
 library(tibble)
-library(immunarch)
 library(Seurat)
-library(ggplot2)
-library(ggprism)
+library(biopipen.utils)
-immfile <- {{in.immdata | r}}
-exprfile <- {{in.srtobj | r}}
+screpdata <- {{in.screpdata | r}}
 outfile <- {{out.outfile | r}}
 joboutdir <- {{job.outdir | r}}
 python <- {{envs.python | r}}
-prefix <- {{envs.prefix | r}}
 within_sample <- {{envs.within_sample | r}}
 assay <- {{envs.assay | r}}
 predefined_b <- {{envs.predefined_b | r}}
 max_iter <- {{envs.max_iter | int}}
 save_tessa <- {{envs.save_tessa | r}}
-tessa_srcdir <- "{{biopipen_dir}}/scripts/tcr/TESSA_source"
+log <- get_logger()
+reporter <- get_reporter()
+# This script may be running in the cloud, where <biopipen_dir> cannot be found.
+# Instead, we use the python command, which is associated with the cloud environment,
+# to get the biopipen directory
+biopipen_dir <- get_biopipen_dir(python)
+tessa_srcdir <- file.path(biopipen_dir, "scripts", "tcr", "TESSA_source")
 outdir <- dirname(outfile)
 result_dir <- file.path(outdir, "result")
@@ -31,88 +32,49 @@ if (!dir.exists(tessa_dir)) dir.create(tessa_dir)
 ### Start preparing input files for TESSA
 # Prepare input files
-log_info("Preparing TCR input file ...")
-# If immfile endswith .rds, then it is an immunarch object
-if (endsWith(tolower(immfile), ".rds")) {
-    immdata <- readRDS(immfile)
-    if (is.null(prefix)) { prefix = immdata$prefix }
-    if (is.null(prefix)) { prefix = "" }
-    tcrdata <- expand_immdata(immdata) %>%
-        mutate(Barcode = glue(paste0(prefix, "{Barcode}")))
-    rm(immdata)
-} else {
-    tcrdata <- read.table(immfile, sep="\t", header=TRUE, row.names=1) %>%
-        rownames_to_column("Barcode")
-}
-has_VJ <- "V.name" %in% colnames(tcrdata) && "J.name" %in% colnames(tcrdata)
-if (has_VJ) {
-    tcrdata <- tcrdata %>% dplyr::mutate(
-        v_gene = sub("-\\d+$", "", V.name),
-        j_gene = sub("-\\d+$", "", J.name)
-    ) %>% dplyr::select(
-        contig_id = Barcode,
-        cdr3 = CDR3.aa,
-        v_gene,
-        j_gene,
-        sample = Sample
-    )
-} else {
-    tcrdata <- tcrdata %>% dplyr::select(
-        contig_id = Barcode,
-        cdr3 = CDR3.aa,
-        sample = Sample
-    )
-}
-log_info("Preparing expression input file ...")
-is_seurat <- endsWith(tolower(exprfile), ".rds")
-is_gz <- endsWith(tolower(exprfile), ".gz")
-if (is_seurat) {
-    sobj <- readRDS(exprfile)
-    expr <- GetAssayData(sobj, layer = "data")
-} else if (is_gz) {
-    expr <- read.table(gzfile(exprfile), sep="\t", header=TRUE, row.names=1)
-} else {
-    expr <- read.table(exprfile, sep="\t", header=TRUE, row.names=1)
-}
+log$info("Reading input file ...")
+sobj <- read_obj(screpdata)
+log$info("Preparing TCR input file ...")
+# Extract the TCR annotations (CTaa/CTgene columns) from the metadata of the Seurat object
+tcrdata <- sobj@meta.data %>%
+    rownames_to_column("contig_id") %>%
+    filter(!is.na(CTaa) & !is.na(CTgene)) %>%
+    separate(CTaa, into = c(NA, "cdr3"), sep = "_", remove = FALSE) %>%
+    separate(CTgene, into = c(NA, "vjgene"), sep = "_", remove = FALSE) %>%
+    separate(vjgene, into = c("v_gene", NA, "j_gene", NA), sep = "\\.", remove = TRUE) %>%
+    mutate(v_gene = sub("-\\d+$", "", v_gene), j_gene = sub("-\\d+$", "", j_gene))
+log$info("Preparing expression input file ...")
+expr <- GetAssayData(sobj, layer = "data")
 cell_ids <- intersect(tcrdata$contig_id, colnames(expr))
 # Warning about unused cells
-unused_tcr_cells <- setdiff(tcrdata$contig_id, cell_ids)
 unused_expr_cells <- setdiff(colnames(expr), cell_ids)
-if (length(unused_tcr_cells) > 0) {
-    log_warn(glue("{length(unused_tcr_cells)}/{nrow(tcrdata)} TCR cells are not used."))
-}
 if (length(unused_expr_cells) > 0) {
-    log_warn(glue("{length(unused_expr_cells)}/{ncol(expr)} expression cells are not used."))
+    log$warn(glue("{length(unused_expr_cells)}/{ncol(expr)} cells without TCR data are not used."))
 }
 if (length(cell_ids) == 0) {
-    stop(paste0(
-        "No common cells between TCR and expression data. ",
-        "Are you using the correct `envs.prefix` here or in `ImmunarchLoading`?"
-    ))
+    stop(
+        "No TCR data found in the Seurat object. ",
+        "Please use scRepertiore::combineExpression() to generate the Seurat object with TCR data."
+    )
 }
-tcrdata <- tcrdata[tcrdata$contig_id %in% cell_ids, , drop=FALSE]
 expr <- as.matrix(expr)[, tcrdata$contig_id, drop=FALSE]
 # Write input files
-log_info("Writing input files ...")
+log$info("Writing input files ...")
 write.table(tcrdata, file.path(tessa_dir, "tcrdata.txt"), sep=",", quote=FALSE, row.names=FALSE)
 write.table(expr, file.path(tessa_dir, "exprdata.txt"), sep=",", quote=FALSE, row.names=TRUE, col.names=TRUE)
 ### End preparing input files for TESSA
 ### Start running TESSA
-log_info("Running TESSA ...")
+log$info("Running TESSA ...")
 # The original TESSA uses a python wrapper to run the encoder and tessa model
 # here we run those two steps directly here
-log_info("- Running encoder ...")
+log$info("- Running encoder ...")
 cmd_encoder <- paste(
     python,
     file.path(tessa_srcdir, "BriseisEncoder.py"),
@@ -127,23 +89,22 @@ cmd_encoder <- paste(
     "-output_log",
     file.path(tessa_dir, "tcr_encoder.log")
 )
-if (has_VJ) {
-    cmd_encoder <- paste(
-        cmd_encoder,
-        "-output_VJ",
-        file.path(tessa_dir, "tcr_vj.txt")
-    )
-}
+cmd_encoder <- paste(
+    cmd_encoder,
+    "-output_VJ",
+    file.path(tessa_dir, "tcr_vj.txt")
+)
 print("Running:")
 print(cmd_encoder)
-log_debug(paste("- ", cmd_encoder))
+log$debug(paste("- ", cmd_encoder))
 rc <- system(cmd_encoder)
 if (rc != 0) {
     stop("Error: Failed to run encoder.")
 }
-log_info("- Running TESSA model ...")
+log$info("- Running TESSA model ...")
 source(file.path(tessa_srcdir, "real_data.R"))
 tessa <- run_tessa(
@@ -158,51 +119,40 @@ tessa <- run_tessa(
 )
 # Save TESSA results
-log_info("Saving TESSA results ...")
-if (is_seurat) {
-    cells <- rownames(sobj@meta.data)
-    sobj@meta.data <- sobj@meta.data %>%
-        mutate(
-            TESSA_Cluster = tessa$meta[
-                match(cells, tessa$meta$barcode),
-                "cluster_number"
-            ]
-        ) %>%
-        add_count(TESSA_Cluster, name = "TESSA_Cluster_Size")
-    rownames(sobj@meta.data) <- cells
-    if (save_tessa) {
-        sobj@misc$tessa <- tessa
-    }
-    saveRDS(sobj, outfile)
-} else {
-    out <- tessa$meta %>%
-        dplyr::select(barcode, TESSA_Cluster = cluster_number) %>%
-        add_count(TESSA_Cluster, name = "TESSA_Cluster_Size")
-    write.table(out, outfile, sep="\t", quote=FALSE, row.names=FALSE, col.names=TRUE)
+log$info("Saving TESSA results ...")
+cells <- rownames(sobj@meta.data)
+sobj@meta.data <- sobj@meta.data %>%
+    mutate(
+        TESSA_Cluster = tessa$meta[
+            match(cells, tessa$meta$barcode),
+            "cluster_number"
+        ]
+    ) %>%
+    add_count(TESSA_Cluster, name = "TESSA_Cluster_Size")
+rownames(sobj@meta.data) <- cells
+if (save_tessa) {
+    sobj@misc$tessa <- tessa
 }
+save_obj(sobj, outfile)
 # Post analysis
-log_info("Post analysis ...")
+log$info("Post analysis ...")
 plot_tessa(tessa, result_dir)
 plot_Tessa_clusters(tessa, result_dir)
 p <- tessa$meta %>%
     dplyr::select(barcode, TESSA_Cluster = cluster_number) %>%
     add_count(TESSA_Cluster, name = "TESSA_Cluster_Size") %>%
-    ggplot(aes(x = TESSA_Cluster_Size)) +
-    geom_histogram(binwidth = 1) +
-    theme_prism()
-png(file.path(result_dir, "Cluster_size_dist.png"), width=8, height=8, units="in", res=100)
-print(p)
-dev.off()
+    plotthis::Histogram(x = "TESSA_Cluster_Size")
-pdf(file.path(result_dir, "Cluster_size_dist.pdf"), width=8, height=8)
-print(p)
-dev.off()
+res <- 100
+height <- attr(p, "height") * res
+width <- attr(p, "width") * res
+prefix <- file.path(result_dir, "Cluster_size_dist")
+save_plot(p, prefix, devpars = list(width = width, height = height, res = res))
-add_report(
+reporter$add(
     list(
         src = file.path(result_dir, "Cluster_size_dist.png"),
         descr = "Histogram of cluster size distribution",
@@ -232,4 +182,4 @@ add_report(
     ui = "table_of_images"
 )
-save_report(joboutdir)
+reporter$save(joboutdir)

biopipen/scripts/tcr/VJUsage.R CHANGED Viewed

@@ -1,9 +1,9 @@
-infile = {{in.infile | quote}}
-outprefix = {{out.outfile | prefix | replace: ".fancyvj.wt", "" | quote}}
-vdjtools = {{ envs.vdjtools | quote }}
-vdjtools_patch = {{ envs.vdjtools_patch | quote }}
-joboutdir = {{job.outdir | quote}}
+infile = {{in.infile | r}}
+outprefix = {{out.outfile | prefix | replace: ".fancyvj.wt", "" | r}}
+vdjtools = {{ envs.vdjtools | r }}
+vdjtools_patch = {{ envs.vdjtools_patch | r }}
+joboutdir = {{job.outdir | r}}
 command = sprintf(
     "cd %s && bash %s %s PlotFancyVJUsage --plot-type png %s %s",

biopipen/scripts/vcf/TruvariBenchSummary.R CHANGED Viewed

@@ -1,11 +1,7 @@
-{{ biopipen_dir | joinpaths: "utils", "misc.R" | source_r }}
-{{ biopipen_dir | joinpaths: "utils", "plot.R" | source_r }}
-library(ggprism)
 library(rjson)
+library(rlang)
 library(dplyr)
-theme_set(theme_prism(axis_text_angle = 90))
+library(plotthis)
 indirs = {{in.indirs | r}}
 outdir = {{out.outdir | r}}
@@ -39,13 +35,21 @@ get_devpars = function() {
 plot_summary = function(col) {
     outfile = file.path(outdir, paste0(col, ".png"))
-    plotGG(
+    p <- plotthis::BarPlot(
         summaries,
-        "col",
-        list(mapping = aes_string(x = "Sample", y = bQuote(col), fill = "Sample")),
-        devpars = get_devpars(),
-        outfile = outfile
+        x = "Sample",
+        y = col,
+        x_text_angle = 90
+    )
+    devpars <- get_devpars()
+    png(
+        filename = outfile,
+        width = devpars$width,
+        height = devpars$height,
+        res = devpars$res
     )
+    print(p)
+    dev.off()
 }
 main = function() {

biopipen/utils/common_docstrs.py CHANGED Viewed

@@ -27,74 +27,77 @@ def format_placeholder(**kwargs) -> Callable[[type], type]:
     """
     def decorator(klass: type) -> type:
+        if not klass.__doc__:
+            return klass
         klass.__doc__ = klass.__doc__ % kwargs
         return klass
     return decorator
-MUTATE_HELPERS_CLONESIZE = """
-There are also also 4 helper functions, `expanded`, `collapsed`, `emerged` and `vanished`,
-which can be used to identify the expanded/collpased/emerged/vanished groups (i.e. TCR clones).
-See also <https://pwwang.github.io/immunopipe/configurations/#mutater-helpers>.
-For example, you can use
-`{"Patient1_Tumor_Collapsed_Clones": "expanded(., Source, 'Tumor', subset = Patent == 'Patient1', uniq = FALSE)"}`
-to create a new column in metadata named `Patient1_Tumor_Collapsed_Clones`
-with the collapsed clones in the tumor sample (compared to the normal sample) of patient 1.
-The values in this columns for other clones will be `NA`.
-Those functions take following arguments:
-* `df`: The metadata data frame. You can use the `.` to refer to it.
-* `group.by`: The column name in metadata to group the cells.
-* `idents`: The first group or both groups of cells to compare (value in `group.by` column). If only the first group is given, the rest of the cells (with non-NA in `group.by` column) will be used as the second group.
-* `subset`: An expression to subset the cells, will be passed to `dplyr::filter()`. Default is `TRUE` (no filtering).
-* `each`: A column name (without quotes) in metadata to split the cells.
-    Each comparison will be done for each value in this column (typically each patient or subject).
-* `id`: The column name in metadata for the group ids (i.e. `CDR3.aa`).
-* `compare`: Either a (numeric) column name (i.e. `Clones`) in metadata to compare between groups, or `.n` to compare the number of cells in each group.
-    If numeric column is given, the values should be the same for all cells in the same group.
-    This will not be checked (only the first value is used).
-    It is helpful to use `Clones` to use the raw clone size from TCR data, in case the cells are not completely mapped to RNA data.
-    Also if you have `subset` set or `NA`s in `group.by` column, you should use `.n` to compare the number of cells in each group.
-* `uniq`: Whether to return unique ids or not. Default is `TRUE`. If `FALSE`, you can mutate the meta data frame with the returned ids. For example, `df |> mutate(expanded = expanded(...))`.
-* `debug`: Return the data frame with intermediate columns instead of the ids. Default is `FALSE`.
-* `order`: The expression passed to `dplyr::arrange()` to order intermediate dataframe and get the ids in order accordingly.
-    The intermediate dataframe includes the following columns:
-    * `<id>`: The ids of clones (i.e. `CDR3.aa`).
-    * `<each>`: The values in `each` column.
-    * `ident_1`: The size of clones in the first group.
-    * `ident_2`: The size of clones in the second group.
-    * `.diff`: The difference between the sizes of clones in the first and second groups.
-    * `.sum`: The sum of the sizes of clones in the first and second groups.
-    * `.predicate`: Showing whether the clone is expanded/collapsed/emerged/vanished.
-* `include_emerged`: Whether to include the emerged group for `expanded` (only works for `expanded`). Default is `FALSE`.
-* `include_vanished`: Whether to include the vanished group for `collapsed` (only works for `collapsed`). Default is `FALSE`.
+# MUTATE_HELPERS_CLONESIZE = """
+# There are also also 4 helper functions, `expanded`, `collapsed`, `emerged` and `vanished`,
+# which can be used to identify the expanded/collpased/emerged/vanished groups (i.e. TCR clones).
+# See also <https://pwwang.github.io/immunopipe/configurations/#mutater-helpers>.
+# For example, you can use
+# `{"Patient1_Tumor_Collapsed_Clones": "expanded(., Source, 'Tumor', subset = Patent == 'Patient1', uniq = FALSE)"}`
+# to create a new column in metadata named `Patient1_Tumor_Collapsed_Clones`
+# with the collapsed clones in the tumor sample (compared to the normal sample) of patient 1.
+# The values in this columns for other clones will be `NA`.
+# Those functions take following arguments:
+# * `df`: The metadata data frame. You can use the `.` to refer to it.
+# * `group.by`: The column name in metadata to group the cells.
+# * `idents`: The first group or both groups of cells to compare (value in `group.by` column). If only the first group is given, the rest of the cells (with non-NA in `group.by` column) will be used as the second group.
+# * `subset`: An expression to subset the cells, will be passed to `dplyr::filter()`. Default is `TRUE` (no filtering).
+# * `each`: A column name (without quotes) in metadata to split the cells.
+#     Each comparison will be done for each value in this column (typically each patient or subject).
+# * `id`: The column name in metadata for the group ids (i.e. `CDR3.aa`).
+# * `compare`: Either a (numeric) column name (i.e. `Clones`) in metadata to compare between groups, or `.n` to compare the number of cells in each group.
+#     If numeric column is given, the values should be the same for all cells in the same group.
+#     This will not be checked (only the first value is used).
+#     It is helpful to use `Clones` to use the raw clone size from TCR data, in case the cells are not completely mapped to RNA data.
+#     Also if you have `subset` set or `NA`s in `group.by` column, you should use `.n` to compare the number of cells in each group.
+# * `uniq`: Whether to return unique ids or not. Default is `TRUE`. If `FALSE`, you can mutate the meta data frame with the returned ids. For example, `df |> mutate(expanded = expanded(...))`.
+# * `debug`: Return the data frame with intermediate columns instead of the ids. Default is `FALSE`.
+# * `order`: The expression passed to `dplyr::arrange()` to order intermediate dataframe and get the ids in order accordingly.
+#     The intermediate dataframe includes the following columns:
+#     * `<id>`: The ids of clones (i.e. `CDR3.aa`).
+#     * `<each>`: The values in `each` column.
+#     * `ident_1`: The size of clones in the first group.
+#     * `ident_2`: The size of clones in the second group.
+#     * `.diff`: The difference between the sizes of clones in the first and second groups.
+#     * `.sum`: The sum of the sizes of clones in the first and second groups.
+#     * `.predicate`: Showing whether the clone is expanded/collapsed/emerged/vanished.
+# * `include_emerged`: Whether to include the emerged group for `expanded` (only works for `expanded`). Default is `FALSE`.
+# * `include_vanished`: Whether to include the vanished group for `collapsed` (only works for `collapsed`). Default is `FALSE`.
-You can also use `top()` to get the top clones (i.e. the clones with the largest size) in each group.
-For example, you can use
-`{"Patient1_Top10_Clones": "top(subset = Patent == 'Patient1', uniq = FALSE)"}`
-to create a new column in metadata named `Patient1_Top10_Clones`.
-The values in this columns for other clones will be `NA`.
-This function takes following arguments:
-* `df`: The metadata data frame. You can use the `.` to refer to it.
-* `id`: The column name in metadata for the group ids (i.e. `CDR3.aa`).
-* `n`: The number of top clones to return. Default is `10`.
-    If n < 1, it will be treated as the percentage of the size of the group.
-    Specify `0` to get all clones.
-* `compare`: Either a (numeric) column name (i.e. `Clones`) in metadata to compare between groups, or `.n` to compare the number of cells in each group.
-    If numeric column is given, the values should be the same for all cells in the same group.
-    This will not be checked (only the first value is used).
-    It is helpful to use `Clones` to use the raw clone size from TCR data, in case the cells are not completely mapped to RNA data.
-    Also if you have `subset` set or `NA`s in `group.by` column, you should use `.n` to compare the number of cells in each group.
-* `subset`: An expression to subset the cells, will be passed to `dplyr::filter()`. Default is `TRUE` (no filtering).
-* `each`: A column name (without quotes) in metadata to split the cells.
-    Each comparison will be done for each value in this column (typically each patient or subject).
-* `uniq`: Whether to return unique ids or not. Default is `TRUE`. If `FALSE`, you can mutate the meta data frame with the returned ids. For example, `df |> mutate(expanded = expanded(...))`.
-* `debug`: Return the data frame with intermediate columns instead of the ids. Default is `FALSE`.
-* `with_ties`: Whether to include ties (i.e. clones with the same size as the last clone) or not. Default is `FALSE`.
-"""
+# You can also use `top()` to get the top clones (i.e. the clones with the largest size) in each group.
+# For example, you can use
+# `{"Patient1_Top10_Clones": "top(subset = Patent == 'Patient1', uniq = FALSE)"}`
+# to create a new column in metadata named `Patient1_Top10_Clones`.
+# The values in this columns for other clones will be `NA`.
+# This function takes following arguments:
+# * `df`: The metadata data frame. You can use the `.` to refer to it.
+# * `id`: The column name in metadata for the group ids (i.e. `CDR3.aa`).
+# * `n`: The number of top clones to return. Default is `10`.
+#     If n < 1, it will be treated as the percentage of the size of the group.
+#     Specify `0` to get all clones.
+# * `compare`: Either a (numeric) column name (i.e. `Clones`) in metadata to compare between groups, or `.n` to compare the number of cells in each group.
+#     If numeric column is given, the values should be the same for all cells in the same group.
+#     This will not be checked (only the first value is used).
+#     It is helpful to use `Clones` to use the raw clone size from TCR data, in case the cells are not completely mapped to RNA data.
+#     Also if you have `subset` set or `NA`s in `group.by` column, you should use `.n` to compare the number of cells in each group.
+# * `subset`: An expression to subset the cells, will be passed to `dplyr::filter()`. Default is `TRUE` (no filtering).
+# * `each`: A column name (without quotes) in metadata to split the cells.
+#     Each comparison will be done for each value in this column (typically each patient or subject).
+# * `uniq`: Whether to return unique ids or not. Default is `TRUE`. If `FALSE`, you can mutate the meta data frame with the returned ids. For example, `df |> mutate(expanded = expanded(...))`.
+# * `debug`: Return the data frame with intermediate columns instead of the ids. Default is `FALSE`.
+# * `with_ties`: Whether to include ties (i.e. clones with the same size as the last clone) or not. Default is `FALSE`.
+# """
-ENVS_SECTION_EACH = """
-The `section` is used to collect cases and put the results under the same directory and the same section in report.
-When `each` for a case is specified, the `section` will be ignored and case name will be used as `section`.
-The cases will be the expanded values in `each` column. When `prefix_each` is True, the column name specified by `each` will be prefixed to each value as directory name and expanded case name.
-"""
+# ENVS_SECTION_EACH = """
+# The `section` is used to collect cases and put the results under the same directory and the same section in report.
+# When `each` for a case is specified, the `section` will be ignored and case name will be used as `section`.
+# The cases will be the expanded values in `each` column. When `prefix_each` is True, the column name specified by `each` will be prefixed to each value as directory name and expanded case name.
+# """

biopipen/utils/reporter.py ADDED Viewed

@@ -0,0 +1,177 @@
+from __future__ import annotations
+from typing import Sequence
+from os import PathLike
+from pathlib import Path
+"""An implementation of reporter in python
+https://pwwang.github.io/biopipen.utils.R/reference/Reporter.html
+to generate a json file for pipen-report to build a report for a process.
+"""
+import json
+class Reporter:
+    def __init__(self):
+        self.report = {}
+    def add(
+        self,
+        *args,
+        h1: str,
+        h2: str = "#",
+        h3: str = "#",
+        ui: str = "flat",
+    ) -> None:
+        """Add a content to the report
+        Args:
+            *args: The content of the report
+            h1 (str): The first level header
+            h2 (str): The second level header
+            h3 (str): The third level header
+            ui (str): The user interface of the report
+        """
+        self.report.setdefault(h1, {})
+        self.report[h1].setdefault(h2, {})
+        self.report[h1][h2].setdefault(h3, {})
+        self.report[h1][h2][h3][ui] = []
+        for arg in args:
+            self.report[h1][h2][h3][ui].append(arg)
+    def add2(
+        self,
+        *args,
+        hs: Sequence[str],
+        hs2: Sequence[str] = (),
+        ui: str = "flat",
+        collapse: str = ": ",
+    ) -> None:
+        """Add a content to the report
+        Args:
+            *args: The content of the report
+            hs: The headings of the case
+            hs2: The headings that must be shown.
+                When there are more items in `hs`, they will be concatenated.
+                For example, if `hs = ["Section1", "Case1"]`, and `hs2 = ["A", "B"]`,
+                then headings will be `h1 = "Section1: Case1"` and `h2 = "A"` and
+                `h3 = "B"`.
+            ui: The user interface of the report
+            collapse: The separator to concatenate the headings
+        """
+        if len(hs2) > 2:
+            raise ValueError("hs2 must have 2 or less items")
+        if len(hs2) == 2:
+            h1 = collapse.join(hs)
+            h2 = hs2[0]
+            h3 = hs2[1]
+        elif len(hs2) == 1:
+            h1 = hs[0]
+            hs = hs[1:]
+            if hs:
+                h2 = collapse.join(hs)
+                h3 = hs2[0]
+            else:
+                h2 = hs2[0]
+                h3 = "#"
+        else:
+            h1 = hs[0]
+            hs = hs[1:]
+            if hs:
+                h2 = hs[0]
+                hs = hs[1:]
+            else:
+                h2 = "#"
+            if hs:
+                h3 = collapse.join(hs)
+            else:
+                h3 = "#"
+        self.add(*args, h1=h1, h2=h2, h3=h3, ui=ui)
+    def image(
+        self,
+        prefix: str,
+        more_formats: str | Sequence[str],
+        save_code: bool,
+        kind: str = "image",
+        **kwargs,
+    ) -> dict:
+        """Generate a report for an image to be added.
+        Args:
+            prefix: The prefix of the image.
+            more_formats: More formats of the image available.
+            save_code: Whether to save the code to reproduce the plot.
+            kind: The kind of the report, default is "image".
+            **kwargs: Other arguments to add to the report.
+        Returns:
+            dict: The structured report for the image
+        Examples:
+            >>> reporter = Reporter()
+            >>> reporter.add(
+            >>>   {
+            >>>     "name": "Image 1",
+            >>>     "contents": [
+            >>>       reporter.image("/path/to/image1", ["pdf"], save_code=True)
+            >>>     ]
+            >>>   },
+            >>>   h1="Images",
+            >>>   h2="Image 1",
+            >>> )
+        """
+        out = {
+            "kind": kind,
+            "src": f"{prefix}.png",
+            **kwargs,
+        }
+        if more_formats or save_code:
+            out["download"] = []
+        if more_formats:
+            for mf in more_formats:
+                out["download"].append(f"{prefix}.{mf}")
+        if save_code:
+            out["download"].append(
+                {
+                    "src": f"{prefix}.code.zip",
+                    "tip": "Download the code to reproduce the plot",
+                    "icon": "Code",
+                }
+            )
+        return out
+    def clear(self):
+        """Clear the report"""
+        self.report = {}
+    def save(self, path: str | PathLike, clear: bool = True) -> None:
+        """Save the report to a file
+        Args:
+            path: The path to save the report
+                If the path is a directory, the report will be saved as `report.json`
+                in the directory. Otherwise, the report will be saved to the file.
+            clear: Whether to clear the report after saving.
+        """
+        path = Path(path)
+        if path.is_dir():
+            path = path / "report.json"
+        with open(path, "w") as f:
+            json.dump(self.report, f, indent=2)
+        if clear:
+            self.clear()

{biopipen-0.33.1.dist-info → biopipen-0.34.0.dist-info}/METADATA RENAMED Viewed

@@ -1,6 +1,6 @@
 Metadata-Version: 2.3
 Name: biopipen
-Version: 0.33.1
+Version: 0.34.0
 Summary: Bioinformatics processes/pipelines that can be run from `pipen run`
 License: MIT
 Author: pwwang
@@ -17,6 +17,7 @@ Provides-Extra: runinfo
 Requires-Dist: datar[pandas] (>=0.15.8,<0.16.0)
 Requires-Dist: pipen-board[report] (>=0.17,<0.18)
 Requires-Dist: pipen-cli-run (>=0.15,<0.16)
+Requires-Dist: pipen-deprecated (>=0.0,<0.1)
 Requires-Dist: pipen-filters (>=0.15,<0.16)
 Requires-Dist: pipen-poplog (>=0.3,<0.4)
 Requires-Dist: pipen-runinfo (>=0.9,<0.10) ; extra == "runinfo"

biopipen 0.33.1__py3-none-any.whl → 0.34.0__py3-none-any.whl

Potentially problematic release.

biopipen 0.33.1__py3-none-any.whl → 0.34.0__py3-none-any.whl