PyPI - biopipen - Versions diffs - 0.34.6__py3-none-any.whl → 0.34.26__py3-none-any.whl - Mend

biopipen 0.34.6py3-none-any.whl → 0.34.26py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (56) hide show

biopipen/__init__.py +1 -1
biopipen/core/config.toml +4 -0
biopipen/core/filters.py +1 -1
biopipen/core/testing.py +2 -1
biopipen/ns/cellranger.py +33 -3
biopipen/ns/regulatory.py +4 -0
biopipen/ns/scrna.py +548 -98
biopipen/ns/scrna_metabolic_landscape.py +4 -0
biopipen/ns/tcr.py +256 -16
biopipen/ns/web.py +5 -0
biopipen/reports/scrna_metabolic_landscape/MetabolicFeatures.svelte +9 -9
biopipen/reports/scrna_metabolic_landscape/MetabolicPathwayActivity.svelte +9 -8
biopipen/reports/scrna_metabolic_landscape/MetabolicPathwayHeterogeneity.svelte +9 -9
biopipen/reports/tcr/ClonalStats.svelte +1 -0
biopipen/scripts/cellranger/CellRangerCount.py +55 -11
biopipen/scripts/cellranger/CellRangerVdj.py +54 -8
biopipen/scripts/regulatory/MotifAffinityTest.R +21 -5
biopipen/scripts/regulatory/MotifAffinityTest_AtSNP.R +9 -2
biopipen/scripts/regulatory/MotifAffinityTest_MotifBreakR.R +15 -6
biopipen/scripts/regulatory/VariantMotifPlot.R +1 -1
biopipen/scripts/regulatory/motifs-common.R +3 -2
biopipen/scripts/scrna/AnnData2Seurat.R +2 -1
biopipen/scripts/scrna/CellCellCommunication.py +26 -14
biopipen/scripts/scrna/CellCellCommunicationPlots.R +23 -4
biopipen/scripts/scrna/CellSNPLite.py +30 -0
biopipen/scripts/scrna/CellTypeAnnotation-celltypist.R +27 -36
biopipen/scripts/scrna/CellTypeAnnotation-direct.R +42 -26
biopipen/scripts/scrna/CellTypeAnnotation-hitype.R +11 -13
biopipen/scripts/scrna/CellTypeAnnotation-sccatch.R +5 -8
biopipen/scripts/scrna/CellTypeAnnotation-sctype.R +5 -8
biopipen/scripts/scrna/CellTypeAnnotation.R +26 -3
biopipen/scripts/scrna/MQuad.py +25 -0
biopipen/scripts/scrna/MarkersFinder.R +128 -30
biopipen/scripts/scrna/ModuleScoreCalculator.R +9 -1
biopipen/scripts/scrna/PseudoBulkDEG.R +113 -27
biopipen/scripts/scrna/ScFGSEA.R +23 -26
biopipen/scripts/scrna/ScVelo.py +20 -8
biopipen/scripts/scrna/SeuratClusterStats-clustree.R +1 -1
biopipen/scripts/scrna/SeuratClusterStats-features.R +6 -1
biopipen/scripts/scrna/SeuratClustering.R +5 -1
biopipen/scripts/scrna/SeuratMap2Ref.R +1 -2
biopipen/scripts/scrna/SeuratPreparing.R +19 -11
biopipen/scripts/scrna/SeuratSubClustering.R +1 -1
biopipen/scripts/scrna/Slingshot.R +2 -4
biopipen/scripts/scrna/TopExpressingGenes.R +1 -4
biopipen/scripts/scrna/celltypist-wrapper.py +140 -4
biopipen/scripts/scrna/scvelo_paga.py +313 -0
biopipen/scripts/scrna/seurat_anndata_conversion.py +18 -1
biopipen/scripts/tcr/{TCRClustering.R → CDR3Clustering.R} +63 -23
biopipen/scripts/tcr/ClonalStats.R +76 -35
biopipen/utils/misc.py +104 -9
{biopipen-0.34.6.dist-info → biopipen-0.34.26.dist-info}/METADATA +5 -2
{biopipen-0.34.6.dist-info → biopipen-0.34.26.dist-info}/RECORD +55 -53
{biopipen-0.34.6.dist-info → biopipen-0.34.26.dist-info}/WHEEL +1 -1
biopipen/utils/common_docstrs.py +0 -103
{biopipen-0.34.6.dist-info → biopipen-0.34.26.dist-info}/entry_points.txt +0 -0

biopipen/ns/scrna.py CHANGED Viewed

@@ -3,15 +3,6 @@
 from pipen.utils import mark
 from ..core.proc import Proc
 from ..core.config import config
-# from ..utils.common_docstrs import (
-#     indent_docstr,
-#     format_placeholder,
-#     MUTATE_HELPERS_CLONESIZE,
-#     ENVS_SECTION_EACH,
-# )
-# MUTATE_HELPERS_CLONESIZE_INDENTED = indent_docstr(MUTATE_HELPERS_CLONESIZE, "    " * 3)
-# ENVS_SECTION_EACH_INDENTED = indent_docstr(ENVS_SECTION_EACH, "    " * 3)
 class SeuratLoading(Proc):
@@ -96,6 +87,8 @@ class SeuratPreparing(Proc):
             `RNAData` to assign the path of the data to the samples
             The path will be read by `Read10X()` from `Seurat`, or the path
             to the h5 file that can be read by `Read10X_h5()` from `Seurat`.
+            It can also be an RDS or qs2 file containing a `Seurat` object.
+            Note that it must have a column named `Sample` in the meta.data to specify the sample names.
     Output:
         outfile: The qs2 file with the Seurat object with all samples integrated.
@@ -111,13 +104,17 @@ class SeuratPreparing(Proc):
         min_cells (type=int): The minimum number of cells that a gene must be
             expressed in to be kept. This is used in `Seurat::CreateSeuratObject()`.
             Further QC (`envs.cell_qc`, `envs.gene_qc`) will be performed after this.
-            It doesn't work when data is loaded from loom files.
+            It doesn't work when data is loaded from loom files or RDS/qs2 files.
         min_features (type=int): The minimum number of features that a cell must
             express to be kept. This is used in `Seurat::CreateSeuratObject()`.
             Further QC (`envs.cell_qc`, `envs.gene_qc`) will be performed after this.
-            It doesn't work when data is loaded from loom files.
+            It doesn't work when data is loaded from loom files or RDS/qs2 files.
         cell_qc: Filter expression to filter cells, using
             `tidyrseurat::filter()`.
+            It can also be a dictionary of expressions, where the keys of the dictionary are
+            sample names.
+            You can have a default expression in the dictionary with the key "DEFAULT" for
+            the samples that are not listed.
             Available QC keys include `nFeature_RNA`, `nCount_RNA`,
             `percent.mt`, `percent.ribo`, `percent.hb`, and `percent.plat`.
@@ -128,6 +125,7 @@ class SeuratPreparing(Proc):
             ```toml
             [SeuratPreparing.envs]
             cell_qc = "nFeature_RNA > 200 & percent.mt < 5"
             ```
             will keep cells with more than 200 genes and less than 5%% mitochondrial
@@ -144,6 +142,7 @@ class SeuratPreparing(Proc):
             /// Tip | Example
             ```toml
             [SeuratPreparing.envs]
             gene_qc = { min_cells = 3 }
             ```
             will keep genes that are expressed in at least 3 cells.
@@ -331,13 +330,16 @@ class SeuratClustering(Proc):
         srtobj: The seurat object loaded by SeuratPreparing
     Output:
-        outfile: The seurat object with cluster information at `seurat_clusters`.
+        outfile: The seurat object with cluster information at `seurat_clusters` or
+            the name specified by `envs.ident`
     Envs:
         ncores (type=int;order=-100): Number of cores to use.
             Used in `future::plan(strategy = "multicore", workers = <ncores>)`
             to parallelize some Seurat procedures.
             See also: <https://satijalab.org/seurat/articles/future_vignette.html>
+        ident: The name in the metadata to save the cluster labels.
+            A shortcut for `envs["FindClusters"]["cluster.name"]`.
         RunUMAP (ns): Arguments for [`RunUMAP()`](https://satijalab.org/seurat/reference/runumap).
             `object` is specified internally, and `-` in the key will be replaced with `.`.
             `dims=N` will be expanded to `dims=1:N`; The maximal value of `N` will be the minimum of `N` and the number of columns - 1 for each sample.
@@ -353,12 +355,12 @@ class SeuratClustering(Proc):
             - <more>: See <https://satijalab.org/seurat/reference/findneighbors>
         FindClusters (ns): Arguments for [`FindClusters()`](https://satijalab.org/seurat/reference/findclusters).
             `object` is specified internally, and `-` in the key will be replaced with `.`.
-            The cluster labels will be saved in `seurat_clusters` and prefixed with "c".
+            The cluster labels will be saved in the metadata column named by `envs.ident` and prefixed with "c".
             The first cluster will be "c1", instead of "c0".
             - resolution (type=auto): The resolution of the clustering. You can have multiple resolutions as a list or as a string separated by comma.
                 Ranges are also supported, for example: `0.1:0.5:0.1` will generate `0.1, 0.2, 0.3, 0.4, 0.5`. The step can be omitted, defaulting to 0.1.
-                The results will be saved in `seurat_clusters_<resolution>`.
-                The final resolution will be used to define the clusters at `seurat_clusters`.
+                The results will be saved in `<ident>_<resolution>`.
+                The final resolution will be used to define the clusters at `<ident>`.
             - <more>: See <https://satijalab.org/seurat/reference/findclusters>
         cache (type=auto): Where to cache the information at different steps.
             If `True`, the seurat object will be cached in the job output directory, which will not be cleaned up when the job is rerun.
@@ -378,6 +380,7 @@ class SeuratClustering(Proc):
     lang = config.lang.rscript
     envs = {
         "ncores": config.misc.ncores,
+        "ident": "seurat_clusters",
         "RunPCA": {},
         "RunUMAP": {},
         "FindNeighbors": {},
@@ -476,48 +479,248 @@ class SeuratClusterStats(Proc):
     TCR clones/clusters or other metadata for each T-cell cluster.
     Examples:
-        ### Number of cells in each cluster
+        ### Clustree Plot
+        ```toml
+        [SeuratClusterStats.envs.clustrees."Clustree Plot"]
+        prefix = "seurat_clusters"
+        devpars = {height = 500}
+        ```
+        ![Clustree Plot](https://raw.githubusercontent.com/pwwang/immunopipe/tests-output/seuratclusterstats/SeuratClusterStats/sampleinfo.scRep.cluster_stats/clustrees/seurat_clusters.clustree.png){: width="80%" }
+        ### Number of cells in each cluster (Bar Chart)
+        ```toml
+        [SeuratClusterStats.envs.stats."Number of cells in each cluster (Bar Chart)"]
+        plot_type = "bar"
+        x_text_angle = 90
+        ```
+        ![Number of cells in each cluster (Bar Chart)](https://raw.githubusercontent.com/pwwang/immunopipe/tests-output/seuratclusterstats/SeuratClusterStats/sampleinfo.scRep.cluster_stats/stats/Number-of-cells-in-each-cluster-Bar-Chart-.png){: width="80%" }
+        ### Number of cells in each cluster by Sample (Bar Chart)
+        ```toml
+        [SeuratClusterStats.envs.stats."Number of cells in each cluster by Sample (Bar Chart)"]
+        plot_type = "bar"
+        group_by = "Sample"
+        x_text_angle = 90
+        ```
+        ![Number of cells in each cluster by Sample (Bar Chart)](https://raw.githubusercontent.com/pwwang/immunopipe/tests-output/seuratclusterstats/SeuratClusterStats/sampleinfo.scRep.cluster_stats/stats/Number-of-cells-in-each-cluster-by-Sample-Bar-Chart-.png){: width="80%" }
+        ### Number of cells in each cluster by Diagnosis
+        ```toml
+        [SeuratClusterStats.envs.stats."Number of cells in each cluster by Diagnosis"]
+        plot_type = "bar"
+        group_by = "Diagnosis"
+        frac = "group"
+        x_text_angle = 90
+        swap = true
+        position = "stack"
+        ```
+        ![Number of cells in each cluster by Diagnosis](https://raw.githubusercontent.com/pwwang/immunopipe/tests-output/seuratclusterstats/SeuratClusterStats/sampleinfo.scRep.cluster_stats/stats/Number-of-cells-in-each-cluster-by-Diagnosis.png){: width="80%" }
+        ### Number of cells in each cluster by Diagnosis (Circos Plot)
+        ```toml
+        [SeuratClusterStats.envs.stats."Number of cells in each cluster by Diagnosis (Circos Plot)"]
+        plot_type = "circos"
+        group_by = "Diagnosis"
+        ```
+        ![Number of cells in each cluster by Diagnosis (Circos Plot)](https://raw.githubusercontent.com/pwwang/immunopipe/tests-output/seuratclusterstats/SeuratClusterStats/sampleinfo.scRep.cluster_stats/stats/Number-of-cells-in-each-cluster-by-Diagnosis-Circos-Plot-.png){: width="80%" }
+        ### Number of cells in each cluster by Diagnosis (Sankey Plot)
+        ```toml
+        [SeuratClusterStats.envs.stats."Number of cells in each cluster by Diagnosis (Sankey Plot)"]
+        plot_type = "sankey"
+        group_by = ["seurat_clusters", "Diagnosis"]
+        links_alpha = 0.6
+        devpars = {width = 800}
+        ```
+        ![Number of cells in each cluster by Diagnosis (Sankey Plot)](https://raw.githubusercontent.com/pwwang/immunopipe/tests-output/seuratclusterstats/SeuratClusterStats/sampleinfo.scRep.cluster_stats/stats/Number-of-cells-in-each-cluster-by-Diagnosis-Sankey-Plot-.png){: width="80%" }
+        ### Number of cells in each cluster by Sample (Spider Plot)
+        ```toml
+        [SeuratClusterStats.envs.stats."Number of cells in each cluster by Sample (Spider Plot)"]
+        plot_type = "spider"
+        group_by = "Diagnosis"
+        palette = "Set1"
+        ```
+        ![Number of cells in each cluster by Sample (Spider Plot)](https://raw.githubusercontent.com/pwwang/immunopipe/tests-output/seuratclusterstats/SeuratClusterStats/sampleinfo.scRep.cluster_stats/stats/Number-of-cells-in-each-cluster-by-Sample-Spider-Plot-.png){: width="80%" }
+        ### Number of genes detected in each cluster
+        ```toml
+        [SeuratClusterStats.envs.ngenes."Number of genes detected in each cluster"]
+        plot_type = "violin"
+        add_box = true
+        add_point = true
+        ```
+        ![Number of genes detected in each cluster](https://raw.githubusercontent.com/pwwang/immunopipe/tests-output/seuratclusterstats/SeuratClusterStats/sampleinfo.scRep.cluster_stats/ngenes/Number-of-genes-detected-in-each-cluster.png){: width="80%" }
+        ### Feature Expression in Clusters (Violin Plots)
+        ```toml
+        [SeuratClusterStats.envs.features_defaults]
+        features = ["CD3D", "CD4", "CD8A", "MS4A1", "CD14", "LYZ", "FCGR3A", "NCAM1", "KLRD1"]
+        [SeuratClusterStats.envs.features."Feature Expression in Clusters (Violin Plots)"]
+        plot_type = "violin"
+        ident = "seurat_clusters"
+        ```
+        ![Feature Expression in Clusters (Violin Plots)](https://raw.githubusercontent.com/pwwang/immunopipe/tests-output/seuratclusterstats/SeuratClusterStats/sampleinfo.scRep.cluster_stats/features/Feature-Expression-in-Clusters-Violin-Plots-.png){: width="80%" }
+        ### Feature Expression in Clusters (Ridge Plots)
+        ```toml
+        # Using the same features as above
+        [SeuratClusterStats.envs.features."Feature Expression in Clusters (Ridge Plots)"]
+        plot_type = "ridge"
+        ident = "seurat_clusters"
+        flip = true
+        ```
+        ![Feature Expression in Clusters (Ridge Plots)](https://raw.githubusercontent.com/pwwang/immunopipe/tests-output/seuratclusterstats/SeuratClusterStats/sampleinfo.scRep.cluster_stats/features/Feature-Expression-in-Clusters-Ridge-Plots-.png){: width="80%" }
+        ### Feature Expression in Clusters by Diagnosis
         ```toml
-        [SeuratClusterStats.envs.stats]
-        # suppose you have nothing set in `envs.stats_defaults`
-        # otherwise, the settings will be inherited here
-        nCells_All = { }
+        # Using the same features as above
+        [SeuratClusterStats.envs.features."Feature Expression in Clusters by Diagnosis"]
+        plot_type = "violin"
+        group_by = "Diagnosis"
+        ident = "seurat_clusters"
+        comparisons = true
+        sig_label = "p.signif"
         ```
-        ![nCells_All](https://pwwang.github.io/immunopipe/latest/processes/images/SeuratClusterStats_nCells_All.png){: width="80%" }
+        ![Feature Expression in Clusters by Diagnosis](https://raw.githubusercontent.com/pwwang/immunopipe/tests-output/seuratclusterstats/SeuratClusterStats/sampleinfo.scRep.cluster_stats/features/Feature-Expression-in-Clusters-by-Diagnosis.png){: width="80%" }
-        ### Number of cells in each cluster by groups
+        ### Feature Expression in Clusters (stacked)
         ```toml
-        [SeuratClusterStats.envs.stats]
-        nCells_Sample = { group_by = "Sample" }
+        # Using the same features as above
+        [SeuratClusterStats.envs.features."Feature Expression in Clusters (stacked)"]
+        plot_type = "violin"
+        ident = "seurat_clusters"
+        add_bg = true
+        stack = true
+        add_box = true
         ```
-        ![nCells_Sample](https://pwwang.github.io/immunopipe/latest/processes/images/SeuratClusterStats_nCells_Sample.png){: width="80%" }
+        ![Feature Expression in Clusters (stacked)](https://raw.githubusercontent.com/pwwang/immunopipe/tests-output/seuratclusterstats/SeuratClusterStats/sampleinfo.scRep.cluster_stats/features/Feature-Expression-in-Clusters-stacked-.png){: width="80%" }
-        ### Violin plots for the gene expressions
+        ### CD4 Expression on UMAP
         ```toml
-        [SeuratClusterStats.envs.features]
-        features = "CD4,CD8A"
-        # Remove the dots in the violin plots
-        vlnplots = { pt-size = 0, kind = "vln" }
-        # Don't use the default genes
-        vlnplots_1 = { features = ["FOXP3", "IL2RA"], pt-size = 0, kind = "vln" }
+        [SeuratClusterStats.envs.features."CD4 Expression on UMAP"]
+        plot_type = "dim"
+        feature = "CD4"
+        highlight = "seurat_clusters == 'c1'"
         ```
-        ![vlnplots](https://pwwang.github.io/immunopipe/latest/processes/images/SeuratClusterStats_vlnplots.png){: width="80%" }
-        ![vlnplots_1](https://pwwang.github.io/immunopipe/latest/processes/images/SeuratClusterStats_vlnplots_1.png){: width="80%" }
+        ![CD4 Expression on UMAP](https://raw.githubusercontent.com/pwwang/immunopipe/tests-output/seuratclusterstats/SeuratClusterStats/sampleinfo.scRep.cluster_stats/features/CD4-Expression-on-UMAP.png){: width="80%" }
-        ### Dimension reduction plot with labels
+        ### Feature Expression in Clusters by Diagnosis (Heatmap)
         ```toml
-        [SeuratClusterStats.envs.dimplots.Idents]
+        [SeuratClusterStats.envs.features."Feature Expression in Clusters by Diagnosis (Heatmap)"]
+        # Grouped features
+        features = {"T cell markers" = ["CD3D", "CD4", "CD8A"], "B cell markers" = ["MS4A1"], "Monocyte markers" = ["CD14", "LYZ", "FCGR3A"], "NK cell markers" = ["NCAM1", "KLRD1"]}
+        plot_type = "heatmap"
+        ident = "Diagnosis"
+        columns_split_by = "seurat_clusters"
+        name = "Expression"
+        devpars = {height = 560}
+        ```
+        ![Feature Expression in Clusters by Diagnosis (Heatmap)](https://raw.githubusercontent.com/pwwang/immunopipe/tests-output/seuratclusterstats/SeuratClusterStats/sampleinfo.scRep.cluster_stats/features/Feature-Expression-in-Clusters-by-Diagnosis-Heatmap-.png){: width="80%" }
+        ### Feature Expression in Clusters by Diagnosis (Heatmap with annotations)
+        ```toml
+        # Using the default features
+        [SeuratClusterStats.envs.features."Feature Expression in Clusters by Diagnosis (Heatmap with annotations)"]
+        ident = "seurat_clusters"
+        cell_type = "dot"
+        plot_type = "heatmap"
+        name = "Expression Level"
+        dot_size = "nanmean"
+        dot_size_name = "Percent Expressed"
+        add_bg = true
+        rows_split_by = "Diagnosis"
+        cluster_rows = false
+        flip = true
+        palette = "YlOrRd"
+        column_annotation = ["percent.mt", "VDJ_Presence"]
+        column_annotation_type = {"percent.mt" = "violin", VDJ_Presence = "pie"}
+        column_annotation_params = {"percent.mt" = {show_legend = false}}
+        devpars = {width = 1400, height = 900}
+        ```
+        ![Feature Expression in Clusters by Diagnosis (Heatmap with annotations)](https://raw.githubusercontent.com/pwwang/immunopipe/tests-output/seuratclusterstats/SeuratClusterStats/sampleinfo.scRep.cluster_stats/features/Feature-Expression-in-Clusters-by-Diagnosis-Heatmap-with-annotations-.png){: width="80%" }
+        ### Dimensional reduction plot
+        ```toml
+        [SeuratClusterStats.envs.dimplots."Dimensional reduction plot"]
         label = true
         ```
-        ![dimplots](https://pwwang.github.io/immunopipe/latest/processes/images/SeuratClusterStats_dimplots.png){: width="80%" }
+        ![Dimensional reduction plot](https://raw.githubusercontent.com/pwwang/immunopipe/tests-output/seuratclusterstats/SeuratClusterStats/sampleinfo.scRep.cluster_stats/dimplots/Dimensional-reduction-plot.dim.png){: width="80%" }
+        ### Dimensional reduction plot (with marks)
+        ```toml
+        [SeuratClusterStats.envs.dimplots."Dimensional reduction plot (with marks)"]
+        add_mark = true
+        mark_linetype = 2
+        ```
+        ![Dimensional reduction plot (with marks)](https://raw.githubusercontent.com/pwwang/immunopipe/tests-output/seuratclusterstats/SeuratClusterStats/sampleinfo.scRep.cluster_stats/dimplots/Dimensional-reduction-plot-with-marks-.dim.png){: width="80%" }
+        ### Dimensional reduction plot (with hex bins)
+        ```toml
+        [SeuratClusterStats.envs.dimplots."Dimensional reduction plot (with hex bins)"]
+        hex = true
+        hex_bins = 50
+        ```
+        ![Dimensional reduction plot (with hex bins)](https://raw.githubusercontent.com/pwwang/immunopipe/tests-output/seuratclusterstats/SeuratClusterStats/sampleinfo.scRep.cluster_stats/dimplots/Dimensional-reduction-plot-with-hex-bins-.dim.png){: width="80%" }
+        ### Dimensional reduction plot (with Diagnosis stats)
+        ```toml
+        [SeuratClusterStats.envs.dimplots."Dimensional reduction plot (with Diagnosis stats)"]
+        stat_by = "Diagnosis"
+        stat_plot_type = "ring"
+        stat_plot_size = 0.15
+        ```
+        ![Dimensional reduction plot (with Diagnosis stats)](https://raw.githubusercontent.com/pwwang/immunopipe/tests-output/seuratclusterstats/SeuratClusterStats/sampleinfo.scRep.cluster_stats/dimplots/Dimensional-reduction-plot-with-Diagnosis-stats-.dim.png){: width="80%" }
+        ### Dimensional reduction plot by Diagnosis
+        ```toml
+        [SeuratClusterStats.envs.dimplots."Dimensional reduction plot by Diagnosis"]
+        facet_by = "Diagnosis"
+        highlight = true
+        theme = "theme_blank"
+        ```
+        ![Dimensional reduction plot by Diagnosis](https://raw.githubusercontent.com/pwwang/immunopipe/tests-output/seuratclusterstats/SeuratClusterStats/sampleinfo.scRep.cluster_stats/dimplots/Dimensional-reduction-plot-by-Diagnosis.dim.png){: width="80%" }
     Input:
         srtobj: The seurat object loaded by `SeuratClustering`
@@ -574,12 +777,6 @@ class SeuratClusterStats(Proc):
                 See <https://pwwang.github.io/scplotter/reference/CellStatPlot.html>.
         stats (type=json): The number/fraction of cells to plot.
             Keys are the names of the plots and values are the dicts inherited from `envs.stats_defaults`.
-            Here are some examples -
-            >>> {
-            >>>     "nCells_All": {},
-            >>>     "nCells_Sample": {"group_by": "Sample"},
-            >>>     "fracCells_Sample": {"scale_y": True, "group_by": "Sample", plot_type = "pie"},
-            >>> }
         ngenes_defaults (ns): The default parameters for `ngenes`.
             The default parameters to plot the number of genes expressed in each cell.
             - more_formats (type=list): The formats to save the plots other than `png`.
@@ -603,7 +800,7 @@ class SeuratClusterStats(Proc):
                 `ActivationScore` in the metadata.
                 You may also specify the literal order of the clusters by a list of strings (at least two).
             - subset: An expression to subset the cells, will be passed to `tidyrseurat::filter()`.
-            - devpars (ns): The device parameters for the plots. Does not work for `table`.
+            - devpars (ns): The device parameters for the plots.
                 - res (type=int): The resolution of the plots.
                 - height (type=int): The height of the plots.
                 - width (type=int): The width of the plots.
@@ -695,7 +892,7 @@ class SeuratClusterStats(Proc):
         },
         "features": {},
         "dimplots_defaults": {
-            "group_by": "seurat_clusters",
+            "group_by": None,  # use default ident
             "split_by": None,
             "subset": None,
             "reduction": "dim",
@@ -782,11 +979,16 @@ class ModuleScoreCalculator(Proc):
             will perform diffusion map as a reduction and add the first 2
             components as `DC_1` and `DC_2` to the metadata. `diffmap` is a shortcut
             for `diffusion_map`. Other key-value pairs will pass to
-            [`destiny::DiffusionMap()`](https://www.rdocumentation.org/packages/destiny/versions/2.0.4/topics/DiffusionMap%20class).
+            [`destiny::DiffusionMap()`](https://www.rdocumentation.org/packages/destiny/versions/2.0.4/topics/DiffusionMap class).
             You can later plot the diffusion map by using
             `reduction = "DC"` in `envs.dimplots` in `SeuratClusterStats`.
             This requires [`SingleCellExperiment`](https://bioconductor.org/packages/release/bioc/html/SingleCellExperiment.html)
             and [`destiny`](https://bioconductor.org/packages/release/bioc/html/destiny.html) R packages.
+        post_mutaters (type=json): The mutaters to mutate the metadata after
+            calculating the module scores.
+            The mutaters will be applied in the order specified.
+            This is useful when you want to create new scores based on the
+            calculated module scores.
     """  # noqa: E501
     input = "srtobj:file"
@@ -810,6 +1012,7 @@ class ModuleScoreCalculator(Proc):
             # "Activation": {"features": "IFNG"},
             # "Proliferation": {"features": "STMN1,TUBB"},
         },
+        "post_mutaters": {},
     }
     script = "file://../scripts/scrna/ModuleScoreCalculator.R"
@@ -1010,7 +1213,7 @@ class DimPlots(Proc):
 class MarkersFinder(Proc):
     """Find markers between different groups of cells
-    When only `group_by` is specified as `"seurat_clusters"` in
+    When only `group_by` is specified as the identity column in
     `envs.cases`, the markers will be found for all the clusters.
     You can also find the differentially expressed genes between
@@ -1034,7 +1237,7 @@ class MarkersFinder(Proc):
             You can also use the clone selectors to select the TCR clones/clusters.
             See <https://pwwang.github.io/scplotter/reference/clone_selectors.html>.
         group_by: The column name in metadata to group the cells.
-            If only `group_by` is specified, and `ident-1` and `ident-2` are
+            If only `group_by` is specified, and `ident_1` and `ident_2` are
             not specified, markers will be found for all groups in this column
             in the manner of "group vs rest" comparison.
             `NA` group will be ignored.
@@ -1043,7 +1246,7 @@ class MarkersFinder(Proc):
         ident_1: The first group of cells to compare
             When this is empty, the comparisons will be expanded to each group v.s. the rest of the cells in `group_by`.
         ident_2: The second group of cells to compare
-            If not provided, the rest of the cells are used for `ident-2`.
+            If not provided, the rest of the cells are used for `ident_2`.
         each: The column name in metadata to separate the cells into different
             cases.
             When this is specified, the case will be expanded for each value of
@@ -1051,9 +1254,19 @@ class MarkersFinder(Proc):
             then the case will be expanded as `envs.cases."Cluster Markers - Sample1"`, `envs.cases."Cluster Markers - Sample2"`, etc.
             You can specify `allmarker_plots` and `overlaps` to plot the markers for all cases in the same plot and plot the overlaps of the markers
             between different cases by values in this column.
-        dbs (list): The dbs to do enrichment analysis for significant
-            markers See below for all libraries.
-            <https://maayanlab.cloud/Enrichr/#libraries>
+        dbs (list): The dbs to do enrichment analysis for significant markers.
+            You can use built-in dbs in `enrichit`, or provide your own gmt files.
+            See also <https://pwwang.github.io/enrichit/reference/FetchGMT.html>.
+            The built-in dbs include:
+            * "BioCarta" or "BioCarta_2016"
+            * "GO_Biological_Process" or "GO_Biological_Process_2025"
+            * "GO_Cellular_Component" or "GO_Cellular_Component_2025"
+            * "GO_Molecular_Function" or "GO_Molecular_Function_2025"
+            * "KEGG", "KEGG_Human", "KEGG_2021", or "KEGG_2021_Human"
+            * "Hallmark", "MSigDB_Hallmark", or "MSigDB_Hallmark_2020"
+            * "Reactome", "Reactome_Pathways", or "Reactome_Pathways_2024"
+            * "WikiPathways", "WikiPathways_2024", "WikiPathways_Human", or "WikiPathways_2024_Human"
+            You can also fetch more dbs from <https://maayanlab.cloud/Enrichr/#libraries>.
         sigmarkers: An expression passed to `dplyr::filter()` to filter the
             significant markers for enrichment analysis.
             Available variables are `p_val`, `avg_log2FC`, `pct.1`, `pct.2` and
@@ -1077,9 +1290,9 @@ class MarkersFinder(Proc):
             Use `-` to replace `.` in the argument name. For example,
             use `min-pct` instead of `min.pct`.
             - <more>: See <https://satijalab.org/seurat/reference/findmarkers>
-        allmarker_plots_defaults (ns): Default options for the plots for all markers when `ident-1` is not specified.
+        allmarker_plots_defaults (ns): Default options for the plots for all markers when `ident_1` is not specified.
             - plot_type: The type of the plot.
-                See <https://pwwang.github.io/scplotter/reference/FeatureStatPlot.html>.
+                See <https://pwwang.github.io/biopipen.utils.R/reference/VizDEGs.html>.
                 Available types are `violin`, `box`, `bar`, `ridge`, `dim`, `heatmap` and `dot`.
             - more_formats (type=list): The extra formats to save the plot in.
             - save_code (flag): Whether to save the code to generate the plot.
@@ -1087,9 +1300,7 @@ class MarkersFinder(Proc):
                 - res (type=int): The resolution of the plots.
                 - height (type=int): The height of the plots.
                 - width (type=int): The width of the plots.
-            - order_by: an expression to order the markers, passed by `dplyr::arrange()`.
-            - genes: The number of top genes to show or an expression passed to `dplyr::filter()` to filter the genes.
-            - <more>: Other arguments passed to [`scplotter::FeatureStatPlot()`](https://pwwang.github.io/scplotter/reference/FeatureStatPlot.html).
+            - <more>: Other arguments passed to [`biopipen.utils::VizDEGs()`](https://pwwang.github.io/biopipen.utils.R/reference/VizDEGs.html).
         allmarker_plots (type=json): All marker plot cases.
             The keys are the names of the cases and the values are the dicts inherited from `allmarker_plots_defaults`.
         allenrich_plots_defaults (ns): Default options for the plots to generate for the enrichment analysis.
@@ -1104,7 +1315,7 @@ class MarkersFinder(Proc):
             The cases under `envs.cases` can inherit these options.
         marker_plots_defaults (ns): Default options for the plots to generate for the markers.
             - plot_type: The type of the plot.
-                See <https://pwwang.github.io/scplotter/reference/FeatureStatPlot.html>.
+                See <https://pwwang.github.io/biopipen.utils.R/reference/VizDEGs.html>.
                 Available types are `violin`, `box`, `bar`, `ridge`, `dim`, `heatmap` and `dot`.
                 There are two additional types available - `volcano_pct` and `volcano_log2fc`.
             - more_formats (type=list): The extra formats to save the plot in.
@@ -1113,9 +1324,7 @@ class MarkersFinder(Proc):
                 - res (type=int): The resolution of the plots.
                 - height (type=int): The height of the plots.
                 - width (type=int): The width of the plots.
-            - order_by: an expression to order the markers, passed by `dplyr::arrange()`.
-            - genes: The number of top genes to show or an expression passed to `dplyr::filter()` to filter the genes.
-            - <more>: Other arguments passed to [`scplotter::FeatureStatPlot()`](https://pwwang.github.io/scplotter/reference/FeatureStatPlot.html).
+            - <more>: Other arguments passed to [`biopipen.utils::VizDEGs()`](https://pwwang.github.io/biopipen.utils.R/reference/VizDEGs.html).
                 If `plot_type` is `volcano_pct` or `volcano_log2fc`, they will be passed to
                 [`scplotter::VolcanoPlot()`](https://pwwang.github.io/plotthis/reference/VolcanoPlot.html).
         marker_plots (type=json): Cases of the plots to generate for the markers.
@@ -1131,12 +1340,12 @@ class MarkersFinder(Proc):
                 - res (type=int): The resolution of the plots.
                 - height (type=int): The height of the plots.
                 - width (type=int): The width of the plots.
-            - <more>: See <https://pwwang.github.io/scplotter/reference/EnrichmentPlot.htmll>.
+            - <more>: See <https://pwwang.github.io/scplotter/reference/EnrichmentPlot.html>.
         enrich_plots (type=json): Cases of the plots to generate for the enrichment analysis.
             The keys are the names of the cases and the values are the dicts inherited from `enrich_plots_defaults`.
             The cases under `envs.cases` can inherit these options.
         overlaps_defaults (ns): Default options for investigating the overlapping of significant markers between different cases or comparisons.
-            This means either `ident-1` should be empty, so that they can be expanded to multiple comparisons.
+            This means either `ident_1` should be empty, so that they can be expanded to multiple comparisons.
             - sigmarkers: The expression to filter the significant markers for each case.
                 If not provided, `envs.sigmarkers` will be used.
             - plot_type (choice): The type of the plot to generate for the overlaps.
@@ -1155,8 +1364,8 @@ class MarkersFinder(Proc):
         overlaps (type=json): Cases for investigating the overlapping of significant markers between different cases or comparisons.
             The keys are the names of the cases and the values are the dicts inherited from `overlaps_defaults`.
             There are two situations that we can perform overlaps:
-            1. If `ident-1` is not specified, the overlaps can be performed between different comparisons.
-            2. If `each` is specified, the overlaps can be performed between different cases, where in each case, `ident-1` must be specified.
+            1. If `ident_1` is not specified, the overlaps can be performed between different comparisons.
+            2. If `each` is specified, the overlaps can be performed between different cases, where in each case, `ident_1` must be specified.
         cases (type=json): If you have multiple cases for marker discovery, you can specify them
             here. The keys are the names of the cases and the values are the above options. If some options are
             not specified, the default values specified above (under `envs`) will be used.
@@ -1186,8 +1395,6 @@ class MarkersFinder(Proc):
             "more_formats": [],
             "save_code": False,
             "devpars": {"res": 100},
-            "order_by": "desc(abs(avg_log2FC))",
-            "genes": 10,
         },
         "allmarker_plots": {},
         "allenrich_plots_defaults": {
@@ -1200,8 +1407,6 @@ class MarkersFinder(Proc):
             "more_formats": [],
             "save_code": False,
             "devpars": {"res": 100},
-            "order_by": "desc(abs(avg_log2FC))",
-            "genes": 10,
         },
         "marker_plots": {
             "Volcano Plot (diff_pct)": {"plot_type": "volcano_pct"},
@@ -1255,9 +1460,19 @@ class TopExpressingGenes(Proc):
         group_by: The column name in metadata to group the cells.
         each: The column name in metadata to separate the cells into different
             cases.
-        dbs (list): The dbs to do enrichment analysis for significant
-            markers See below for all libraries.
-            <https://maayanlab.cloud/Enrichr/#libraries>
+        dbs (list): The dbs to do enrichment analysis for significant markers.
+            You can use built-in dbs in `enrichit`, or provide your own gmt files.
+            See also <https://pwwang.github.io/enrichit/reference/FetchGMT.html>.
+            The built-in dbs include:
+            * "BioCarta" or "BioCarta_2016"
+            * "GO_Biological_Process" or "GO_Biological_Process_2025"
+            * "GO_Cellular_Component" or "GO_Cellular_Component_2025"
+            * "GO_Molecular_Function" or "GO_Molecular_Function_2025"
+            * "KEGG", "KEGG_Human", "KEGG_2021", or "KEGG_2021_Human"
+            * "Hallmark", "MSigDB_Hallmark", or "MSigDB_Hallmark_2020"
+            * "Reactome", "Reactome_Pathways", or "Reactome_Pathways_2024"
+            * "WikiPathways", "WikiPathways_2024", "WikiPathways_Human", or "WikiPathways_2024_Human"
+            You can also fetch more dbs from <https://maayanlab.cloud/Enrichr/#libraries>.
         n (type=int): The number of top expressing genes to find.
         enrich_style (choice): The style of the enrichment analysis.
             The enrichment analysis will be done by `EnrichIt()` from [`enrichit`](https://pwwang.github.io/enrichit/).
@@ -1604,6 +1819,32 @@ class ScFGSEA(Proc):
     For each case, the process will generate a table with the enrichment scores for
     each gene set, and GSEA plots for the top gene sets.
+    Examples:
+        ### The summary and GSEA plots
+        ![GSEA summary](https://raw.githubusercontent.com/pwwang/immunopipe/tests-output/scfgsea/ScFGSEA/sampleinfo.fgsea/seurat_clusters/c1/summary.png){: width="80%"}
+        ![GSEA plot](https://raw.githubusercontent.com/pwwang/immunopipe/tests-output/scfgsea/ScFGSEA/sampleinfo.fgsea/seurat_clusters/c1/pathways.png){: width="80%"}
+        ### Summary plot for all subsets or idents
+        If you use `each` to separate the cells into different subsets, this is useful to
+        make a summary plot for all subsets. Or if you don't specify `ident_1`, the summary plot for all idents in `group_by` will be generated.
+        ```toml
+        [ScFGSEA.envs]
+        group_by = "Diagnosis"
+        ident_1 = "Colitis"
+        ident_2 = "Control"
+        each = "seurat_clusters"
+        [ScFGSEA.envs.alleach_plots.Heatmap]
+        plot_type = "heatmap"
+        group_by = "Diagnosis"
+        ```
+        ![GSEA summary for all subsets](https://raw.githubusercontent.com/pwwang/immunopipe/tests-output/scfgsea/ScFGSEA/sampleinfo.fgsea/GSEA-all-seurat_clusters-/all.Heatmap.png){: width="80%"}
     Input:
         srtobj: The seurat object in RDS format
@@ -1620,11 +1861,23 @@ class ScFGSEA(Proc):
         group_by: The column name in metadata to group the cells.
         ident_1: The first group of cells to compare
-        ident_2: The second group of cells to compare, if not provided, the rest of the cells that are not `NA`s in `group_by` column are used for `ident-2`.
+        ident_2: The second group of cells to compare, if not provided, the rest of the cells that are not `NA`s in `group_by` column are used for `ident_2`.
+        assay: The assay to use. If not provided, the default assay will be used.
         each: The column name in metadata to separate the cells into different subsets to do the analysis.
         subset: An expression to subset the cells.
         gmtfile: The pathways in GMT format, with the gene names/ids in the same format as the seurat object.
-            One could also use a URL to a GMT file. For example, from <https://download.baderlab.org/EM_Genesets/current_release/Human/symbol/Pathways/>.
+            You can use built-in dbs in `enrichit`, or provide your own gmt files.
+            See also <https://pwwang.github.io/enrichit/reference/FetchGMT.html>.
+            The built-in dbs include:
+            * "BioCarta" or "BioCarta_2016"
+            * "GO_Biological_Process" or "GO_Biological_Process_2025"
+            * "GO_Cellular_Component" or "GO_Cellular_Component_2025"
+            * "GO_Molecular_Function" or "GO_Molecular_Function_2025"
+            * "KEGG", "KEGG_Human", "KEGG_2021", or "KEGG_2021_Human"
+            * "Hallmark", "MSigDB_Hallmark", or "MSigDB_Hallmark_2020"
+            * "Reactome", "Reactome_Pathways", or "Reactome_Pathways_2024"
+            * "WikiPathways", "WikiPathways_2024", "WikiPathways_Human", or "WikiPathways_2024_Human"
+            You can also fetch more dbs from <https://maayanlab.cloud/Enrichr/#libraries>.
         method (choice): The method to do the preranking.
             - signal_to_noise: Signal to noise.
                 The larger the differences of the means (scaled by the standard deviations);
@@ -1677,6 +1930,7 @@ class ScFGSEA(Proc):
     envs = {
         "mutaters": {},
         "ncores": config.misc.ncores,
+        "assay": None,
         "group_by": None,
         "ident_1": None,
         "ident_2": None,
@@ -1711,13 +1965,18 @@ class CellTypeAnnotation(Proc):
     3. Use [`scCATCH`](https://github.com/ZJUFanLab/scCATCH)
     4. Use [`hitype`](https://github.com/pwwang/hitype)
-    The annotated cell types will replace the original `seurat_clusters` column in the metadata,
+    The annotated cell types will replace the original identity column in the metadata,
     so that the downstream processes will use the annotated cell types.
-    The old `seurat_clusters` column will be renamed to `seurat_clusters_id`.
+    /// Note
+    When cell types are annotated, the original identity column (e.g. `seurat_clusters`) will be renamed
+    to `envs.backup_col` (e.g. `seurat_clusters_id`), and the new identity column will be added.
+    ///
     If you are using `ScType`, `scCATCH`, or `hitype`, a text file containing the mapping from
-    the old `seurat_clusters` to the new cell types will be generated and saved to
+    the original identity to the new cell types will be generated and saved to
     `cluster2celltype.tsv` under `<workdir>/<pipeline_name>/CellTypeAnnotation/0/output/`.
     Examples:
@@ -1741,8 +2000,10 @@ class CellTypeAnnotation(Proc):
     Output:
         outfile: The rds/qs/qs2/h5ad file of seurat object with cell type annotated.
-            A text file containing the mapping from the old `seurat_clusters` to the new cell types
+            A text file containing the mapping from the old identity to the new cell types
             will be generated and saved to `cluster2celltype.tsv` under the job output directory.
+            Note that if `envs.ident` is specified, the output Seurat object will have
+            the identity set to the specified column in metadata.
     Envs:
         tool (choice): The tool to use for cell type annotation.
@@ -1760,6 +2021,13 @@ class CellTypeAnnotation(Proc):
             If not specified, all rows in `sctype_db` will be used.
         sctype_db: The database to use for sctype.
             Check examples at <https://github.com/IanevskiAleksandr/sc-type/blob/master/ScTypeDB_full.xlsx>
+        ident: The column name in metadata to use as the clusters.
+            If not specified, the identity column will be used when input is rds/qs/qs2 (supposing we have a Seurat object).
+            If input data is h5ad, this is required to run cluster-based annotation tools.
+            For `celltypist`, this is a shortcut to set `over_clustering` in `celltypist_args`.
+        backup_col: The backup column name to store the original identities.
+            If not specified, the original identity column will not be stored.
+            If `envs.newcol` is specified, this will be ignored.
         hitype_tissue: The tissue to use for `hitype`.
             Available tissues should be the first column (`tissueType`) of `hitype_db`.
             If not specified, all rows in `hitype_db` will be used.
@@ -1769,7 +2037,7 @@ class CellTypeAnnotation(Proc):
             You can also use built-in databases, including `hitypedb_short`, `hitypedb_full`, and `hitypedb_pbmc3k`.
         cell_types (list): The cell types to use for direct annotation.
             You can use `"-"` or `""` as the placeholder for the clusters that
-            you want to keep the original cell types (`seurat_clusters`).
+            you want to keep the original cell types.
             If the length of `cell_types` is shorter than the number of
             clusters, the remaining clusters will be kept as the original cell
             types.
@@ -1781,6 +2049,11 @@ class CellTypeAnnotation(Proc):
             the original cell types will be kept and nothing will be changed.
             ///
+        more_cell_types (type=json): The additional cell type annotations to add to the metadata.
+            The keys are the new column names and the values are the cell types lists.
+            The cell type lists work the same as `cell_types` above.
+            This is useful when you want to keep multiple annotations of cell types.
         sccatch_args (ns): The arguments for `scCATCH::findmarkergene()` if `tool` is `sccatch`.
             - species: The species of cells.
             - cancer: If the sample is from cancer tissue, then the cancer type may be defined.
@@ -1805,8 +2078,8 @@ class CellTypeAnnotation(Proc):
         merge (flag): Whether to merge the clusters with the same cell types.
             Otherwise, a suffix will be added to the cell types (ie. `.1`, `.2`, etc).
         newcol: The new column name to store the cell types.
-            If not specified, the `seurat_clusters` column will be overwritten.
-            If specified, the original `seurat_clusters` column will be kept and `Idents` will be kept as the original `seurat_clusters`.
+            If not specified, the identity column will be overwritten.
+            If specified, the original identity column will be kept and `Idents` will be kept as the original identity.
         outtype (choice): The output file type. Currently only works for `celltypist`.
             An RDS file will be generated for other tools.
             - input: Use the same file type as the input.
@@ -1841,7 +2114,10 @@ class CellTypeAnnotation(Proc):
         "tool": "hitype",
         "sctype_tissue": None,
         "sctype_db": config.ref.sctype_db,
+        "ident": None,
+        "backup_col": "seurat_clusters_id",
         "cell_types": [],
+        "more_cell_types": None,
         "sccatch_args": {
             "species": None,
             "cancer": "Normal",
@@ -2217,9 +2493,19 @@ class MetaMarkers(Proc):
         idents: The groups of cells to compare, values should be in the `group-by` column.
         each: The column name in metadata to separate the cells into different cases.
         prefix_each (flag): Whether to add the `each` value as prefix to the case name.
-        dbs (list): The dbs to do enrichment analysis for significant
-            markers See below for all libraries.
-            <https://maayanlab.cloud/Enrichr/#libraries>
+        dbs (list): The dbs to do enrichment analysis for significant markers.
+            You can use built-in dbs in `enrichit`, or provide your own gmt files.
+            See also <https://pwwang.github.io/enrichit/reference/FetchGMT.html>.
+            The built-in dbs include:
+            * "BioCarta" or "BioCarta_2016"
+            * "GO_Biological_Process" or "GO_Biological_Process_2025"
+            * "GO_Cellular_Component" or "GO_Cellular_Component_2025"
+            * "GO_Molecular_Function" or "GO_Molecular_Function_2025"
+            * "KEGG", "KEGG_Human", "KEGG_2021", or "KEGG_2021_Human"
+            * "Hallmark", "MSigDB_Hallmark", or "MSigDB_Hallmark_2020"
+            * "Reactome", "Reactome_Pathways", or "Reactome_Pathways_2024"
+            * "WikiPathways", "WikiPathways_2024", "WikiPathways_Human", or "WikiPathways_2024_Human"
+            You can also fetch more dbs from <https://maayanlab.cloud/Enrichr/#libraries>.
         subset: The subset of the cells to do the analysis.
             An expression passed to `dplyr::filter()`.
         p_adjust (choice): The method to adjust the p values, which can be used to filter the significant markers.
@@ -2310,6 +2596,9 @@ class AnnData2Seurat(Proc):
     Envs:
         assay: The assay to use to convert to seurat object.
+        ident: The column name in `adfile.obs` to use as the identity
+            for the seurat object.
+            If not specified, no identity will be set.
         dotplot_check (type=auto): Whether to do a check with a dot plot.
             (`scplotter::FeatureStatPlot(plot_type = "dot", ..)` will be used)
             to see if the conversion is successful.
@@ -2322,7 +2611,7 @@ class AnnData2Seurat(Proc):
     input = "adfile:file"
     output = "outfile:file:{{in.adfile | stem}}.qs"
     lang = config.lang.rscript
-    envs = {"assay": "RNA", "dotplot_check": True}
+    envs = {"assay": "RNA", "ident": None, "dotplot_check": True}
     script = "file://../scripts/scrna/AnnData2Seurat.R"
@@ -2415,6 +2704,18 @@ class CellCellCommunication(Proc):
             * `lr_means`: mean ligand-receptor expression, as a measure of ligand-receptor interaction magnitude.
             * `cellphone_pvals`: permutation-based p-values, as a measure of interaction specificity.
+            A typical output will look like this:
+            | ligand | ligand_complex | ligand_props | ligand_trimean | mat_max | receptor | receptor_complex | receptor_props | receptor_trimean | source | target | lr_probs | cellchat_pvals | mag_score | spec_score |
+            |--------|---------------|--------------|----------------|---------|----------|------------------|----------------|------------------|--------|--------|----------|----------------|-----------|------------|
+            | VIM | VIM | 1.00 | 0.36 | 8.73 | CD44 | CD44 | 0.77 | 0.16 | c7 | c3 | 0.10 | 0.00 | 0.10 | 0.00 |
+            | MIF | MIF | 0.97 | 0.22 | 8.73 | CXCR4 | CD74_CXCR4 | 0.87 | 0.26 | c5 | c6 | 0.10 | 0.00 | 0.10 | 0.00 |
+            | HLA-B | HLA-B | 1.00 | 0.44 | 8.73 | KLRD1 | KLRD1 | 0.73 | 0.13 | c9 | c2 | 0.10 | 0.00 | 0.10 | 0.00 |
+            | HMGB1 | HMGB1 | 0.99 | 0.26 | 8.73 | CXCR4 | CXCR4 | 0.81 | 0.21 | c2 | c7 | 0.10 | 0.00 | 0.10 | 0.00 |
+            | CD48 | CD48 | 0.94 | 0.20 | 8.73 | CD2 | CD2 | 0.99 | 0.28 | c7 | c8 | 0.10 | 0.00 | 0.10 | 0.00 |
+            | HLA-C | HLA-C | 1.00 | 0.38 | 8.73 | CD8B | CD8B | 0.73 | 0.15 | c1 | c9 | 0.10 | 0.00 | 0.10 | 0.00 |
+            | LGALS1 | LGALS1 | 0.95 | 0.17 | 8.73 | CD69 | CD69 | 0.99 | 0.34 | c10 | c5 | 0.10 | 0.00 | 0.10 | 0.00 |
     Envs:
         method (choice): The method to use for cell-cell communication inference.
             - CellPhoneDB: Use CellPhoneDB method.
@@ -2457,6 +2758,11 @@ class CellCellCommunication(Proc):
         ncores (type=int): The number of cores to use.
         groupby: The column name in metadata to group the cells.
             Typically, this column should be the cluster id.
+            If the input is a Seurat object, its default identity will be used.
+            Otherwise, it is recommended to provide this parameter.
+            "seurat_clusters" will be used with a warning if the input is in AnnData format and
+            this parameter is not provided.
+        group_by: alias for `groupby`
         species (choice): The species of the cells.
             - human: Human cells, the 'consensus' resource will be used.
             - mouse: Mouse cells, the 'mouseconsensus' resource will be used.
@@ -2488,7 +2794,8 @@ class CellCellCommunication(Proc):
         "subset_using": "auto",
         "split_by": None,
         "ncores": config.misc.ncores,
-        "groupby": "seurat_clusters",
+        "groupby": None,
+        "group_by": None,
         "species": "human",
         "expr_prop": 0.1,
         "min_cells": 5,
@@ -2501,6 +2808,38 @@ class CellCellCommunication(Proc):
 class CellCellCommunicationPlots(Proc):
     """Visualization for cell-cell communication inference.
+    Examples:
+        ### Network Plot
+        ```toml
+        [CellCellCommunicationPlots.envs.cases."Cell-Cell Communication Network"]
+        plot_type = "network"
+        legend-position = "none"
+        theme = "theme_blank"
+        theme_args = {add_coord = false}
+        ```
+        ![Network Plot](https://raw.githubusercontent.com/pwwang/immunopipe/tests-output/cccplots/CellCellCommunicationPlots/sampleinfo.scRep-ccc_plots/Cell-Cell-Communication-Network.png){: width="80%"}
+        ### Circos Plot
+        ![Circos Plot](https://raw.githubusercontent.com/pwwang/immunopipe/tests-output/cccplots/CellCellCommunicationPlots/sampleinfo.scRep-ccc_plots/Cell-Cell-Communication-Circos-Plot.png){: width="80%"}
+        ### Heatmap Plot
+        ![Heatmap Plot](https://raw.githubusercontent.com/pwwang/immunopipe/tests-output/cccplots/CellCellCommunicationPlots/sampleinfo.scRep-ccc_plots/Cell-Cell-Communication-Heatmap.png){: width="80%"}
+        ### Cell-Cell Communication Interaction (Box Plot)
+        ```toml
+        [CellCellCommunicationPlots.envs.cases."Cell-Cell Communication Interaction (Box Plot)"]
+        plot_type = "box"
+        x_text_angle = 90
+        method = "interaction"
+        ```
+        ![Box Plot](https://raw.githubusercontent.com/pwwang/immunopipe/tests-output/cccplots/CellCellCommunicationPlots/sampleinfo.scRep-ccc_plots/Cell-Cell-Communication-Interaction-Box-Plot-.png){: width="80%"}
     Input:
         cccfile: The output file from `CellCellCommunication`
@@ -2524,6 +2863,10 @@ class CellCellCommunicationPlots(Proc):
         cases (type=json): The cases for the plots.
             The keys are the names of the cases and the values are the arguments for
             the plots. The arguments include the ones inherited from `envs`.
+            You can have a special `plot_type` `"table"` to generate a table for the
+            ccc data to save as a text file and show in the report.
+            If no cases are given, a default case will be used, with the
+            key `Cell-Cell Communication`.
         <more>: Other arguments passed to
             [scplotter::CCCPlot](https://pwwang.github.io/scplotter/reference/CCCPlot.html)
     """  # noqa: E501
@@ -2569,6 +2912,10 @@ class ScVelo(Proc):
         ncores (type=int): Number of cores to use.
         group_by: The column name in metadata to group the cells.
             Typically, this column should be the cluster id.
+            If the input is a Seurat object, its default identity will be used.
+            Otherwise, it is recommended to provide this parameter.
+            "seurat_clusters" will be used with a warning if the input is in AnnData
+            format and this parameter is not provided.
         mode (type=list): The mode to use for the velocity analysis.
             It should be a subset of `['deterministic', 'stochastic', 'dynamical']`,
             meaning that we can perform the velocity analysis in multiple modes.
@@ -2607,7 +2954,7 @@ class ScVelo(Proc):
     lang = config.lang.python
     envs = {
         "ncores": config.misc.ncores,
-        "group_by": "seurat_clusters",
+        "group_by": None,
         "mode": ["deterministic", "stochastic", "dynamical"],
         "fitting_by": "stochastic",
         "min_shared_counts": 30,
@@ -2645,6 +2992,7 @@ class Slingshot(Proc):
     Envs:
         group_by: The column name in metadata to group the cells.
             Typically, this column should be the cluster id.
+            Default is the default identity of the seurat object.
         reduction: The nonlinear reduction to use for the trajectory analysis.
         dims (type=auto): The dimensions to use for the analysis.
             A list or a string with comma separated values.
@@ -2661,7 +3009,7 @@ class Slingshot(Proc):
     output = "outfile:file:{{in.sobjfile | stem}}.qs"
     lang = config.lang.rscript
     envs = {
-        "group_by": "seurat_clusters",
+        "group_by": None,
         "reduction": None,
         "dims": [1, 2],
         "start": None,
@@ -2706,6 +3054,7 @@ class PseudoBulkDEG(Proc):
             analysis.
     Envs:
+        ncores (type=int): Number of cores to use for parallelization.
         mutaters (type=json): Mutaters to mutate the metadata of the
             seurat object. Keys are the new column names and values are the
             expressions to mutate the columns. These new columns can be
@@ -2715,6 +3064,9 @@ class PseudoBulkDEG(Proc):
         each: The column name in metadata to separate the cells into different cases.
             When specified, the case will be expanded to multiple cases for
             each value in the column.
+        cache (type=auto): Where to cache the results.
+            If `True`, cache to `outdir` of the job. If `False`, don't cache.
+            Otherwise, specify the directory to cache to.
         subset: An expression in string to subset the cells.
         aggregate_by: The column names in metadata to aggregate the cells.
         layer: The layer to pull and aggregate the data.
@@ -2728,11 +3080,18 @@ class PseudoBulkDEG(Proc):
         paired_by: The column name in metadata to mark the paired samples.
             For example, subject. If specified, the paired test will be performed.
         dbs (list): The databases to use for enrichment analysis.
-            The databases are passed to `biopipen.utils::Enrichr()` to do the
-            enrichment analysis. The default databases are `KEGG_2021_Human` and
-            `MSigDB_Hallmark_2020`.
-            See <https://maayanlab.cloud/Enrichr/#libraries> for the available
-            libraries.
+            You can use built-in dbs in `enrichit`, or provide your own gmt files.
+            See also <https://pwwang.github.io/enrichit/reference/FetchGMT.html>.
+            The built-in dbs include:
+            * "BioCarta" or "BioCarta_2016"
+            * "GO_Biological_Process" or "GO_Biological_Process_2025"
+            * "GO_Cellular_Component" or "GO_Cellular_Component_2025"
+            * "GO_Molecular_Function" or "GO_Molecular_Function_2025"
+            * "KEGG", "KEGG_Human", "KEGG_2021", or "KEGG_2021_Human"
+            * "Hallmark", "MSigDB_Hallmark", or "MSigDB_Hallmark_2020"
+            * "Reactome", "Reactome_Pathways", or "Reactome_Pathways_2024"
+            * "WikiPathways", "WikiPathways_2024", "WikiPathways_Human", or "WikiPathways_2024_Human"
+            You can also fetch more dbs from <https://maayanlab.cloud/Enrichr/#libraries>.
         sigmarkers: An expression passed to `dplyr::filter()` to filter the
             significant markers for enrichment analysis.
             The default is `p_val_adj < 0.05`.
@@ -2743,7 +3102,7 @@ class PseudoBulkDEG(Proc):
         enrich_style (choice): The style of the enrichment analysis.
             - enrichr: Use `enrichr`-style for the enrichment analysis.
             - clusterProfiler: Use `clusterProfiler`-style for the enrichment analysis.
-        allmarker_plots_defaults (ns): Default options for the plots for all markers when `ident-1` is not specified.
+        allmarker_plots_defaults (ns): Default options for the plots for all markers when `ident_1` is not specified.
             - plot_type: The type of the plot.
                 See <https://pwwang.github.io/scplotter/reference/FeatureStatPlot.html>.
                 Available types are `violin`, `box`, `bar`, `ridge`, `dim`, `heatmap` and `dot`.
@@ -2802,7 +3161,7 @@ class PseudoBulkDEG(Proc):
             The keys are the names of the cases and the values are the dicts inherited from `enrich_plots_defaults`.
             The cases under `envs.cases` can inherit these options.
         overlaps_defaults (ns): Default options for investigating the overlapping of significant markers between different cases or comparisons.
-            This means either `ident-1` should be empty, so that they can be expanded to multiple comparisons.
+            This means either `ident_1` should be empty, so that they can be expanded to multiple comparisons.
             - sigmarkers: The expression to filter the significant markers for each case.
                 If not provided, `envs.sigmarkers` will be used.
             - plot_type (choice): The type of the plot to generate for the overlaps.
@@ -2821,8 +3180,8 @@ class PseudoBulkDEG(Proc):
         overlaps (type=json): Cases for investigating the overlapping of significant markers between different cases or comparisons.
             The keys are the names of the cases and the values are the dicts inherited from `overlaps_defaults`.
             There are two situations that we can perform overlaps:
-            1. If `ident-1` is not specified, the overlaps can be performed between different comparisons.
-            2. If `each` is specified, the overlaps can be performed between different cases, where in each case, `ident-1` must be specified.
+            1. If `ident_1` is not specified, the overlaps can be performed between different comparisons.
+            2. If `each` is specified, the overlaps can be performed between different cases, where in each case, `ident_1` must be specified.
         tool (choice): The method to use for the differential expression analysis.
             - DESeq2: Use DESeq2 for the analysis.
             - edgeR: Use edgeR for the analysis.
@@ -2844,12 +3203,14 @@ class PseudoBulkDEG(Proc):
     lang = config.lang.rscript
     script = "file://../scripts/scrna/PseudoBulkDEG.R"
     envs = {
+        "ncores": config.misc.ncores,
         "mutaters": {},
+        "cache": config.path.tmpdir,
         "each": None,
         "subset": None,
         "aggregate_by": None,
         "layer": "counts",
-        "assay": "RNA",
+        "assay": None,
         "error": False,
         "group_by": None,
         "ident_1": None,
@@ -2906,3 +3267,92 @@ class PseudoBulkDEG(Proc):
         "report": "file://../reports/common.svelte",
         "report_paging": 8,
     }
+class CellSNPLite(Proc):
+    """Genotyping bi-allelic SNPs on single cells using cellsnp-lite.
+    The output from cellsnp-lite can be directly used for downstream analysis such as:
+    * Donor deconvolution in multiplexed single-cell RNA-seq data (e.g., with vireo).
+    * Allele-specific CNV analysis in single-cell or spatial transcriptomics data (e.g., with Numbat, XClone, or CalicoST).
+    * Clonal substructure discovery using single cell mitochondrial variants (e.g., with MQuad).
+    Here we only support mode `1a`/`2a` in cellsnp-lite, which is designed for a single bam file as input.
+    For mode `1b`/`2b`, which is designed for multiple bam files as input (e.g., one per cell), you can still
+    run with this process, but only one bam file is allowed.
+    See <https://github.com/single-cell-genetics/cellsnp-lite> for more details about cellsnp-lite.
+    Input:
+        crdir: The cellranger output directory or the directory containing
+            the bam file and barcode file.
+            It should contain the `outs/possorted_genome_bam.bam` file and
+            the `outs/filtered_feature_bc_matrix/barcodes.tsv.gz` file.
+    Output:
+        outdir: The output directory for cellsnp-lite results.
+    Envs:
+        ncores (type=int): The number of cores to use.
+            Will pass to `-p` option in cellsnp-lite.
+        regionsVCF: A VCF file listing all candidate SNPs, used to fetch each variant.
+        genotype (flag): Whether to perform genotyping.
+            If `False`, only the allele counts will be computed.
+        gzip (flag): Whether to gzip the output files.
+        <more>: Other arguments passed to cellsnp-lite.
+            See <https://cellsnp-lite.readthedocs.io/en/latest/main/manual.html#full-parameters> for more details.
+    """  # noqa: E501
+    input = "crdir:dir"
+    output = """
+        outdir:dir:
+        {%- if basename(in.crdir) == 'outs' -%}
+            {{in.crdir | dirname | basename}}
+        {%- else -%}
+            {{in.crdir | basename}}
+        {%- endif -%}
+        .cellsnp
+    """  # noqa: E501
+    lang = config.lang.python
+    envs = {
+        "cellsnp_lite": config.exe.cellsnp_lite,
+        "ncores": config.misc.ncores,
+        "regionsVCF": None,
+        "genotype": False,
+        "gzip": True,
+    }
+    script = "file://../scripts/scrna/CellSNPLite.py"
+class MQuad(Proc):
+    """Clonal substructure discovery using single cell mitochondrial variants with MQuad.
+    MQuad uses a Mixture Model for Mitochondrial Mutation detection in single-cell omics data.
+    MQuad is a tool that detects mitochondrial mutations that are informative for clonal substructure inference. It uses a binomial mixture model to assess the heteroplasmy of mtDNA variants among background noise.
+    Input:
+        cellsnpout: The output directory from `CellSNPLite` process, which should contain
+            AD and DP sparse matrices (.mtx) or the vcf file.
+    Output:
+        outdir: The output directory for MQuad results.
+    Envs:
+        ncores (type=int): The number of cores to use.
+            It will be passed to `--nproc` option in MQuad.
+        seed (type=int): The seed for the random number generator.
+            It will be passed to `--randSeed` option in MQuad.
+        <more>: Other arguments passed to MQuad.
+            See <https://github.com/single-cell-genetics/MQuad/blob/main/mquad/mquad_CLI.py> for more details.
+    """  # noqa: E501
+    input = "cellsnpout:dir"
+    output = "outdir:dir:{{in.cellsnpout | stem}}.mquad"
+    lang = config.lang.python
+    envs = {
+        "mquad": config.exe.mquad,
+        "ncores": config.misc.ncores,
+        "seed": 8525,
+    }
+    script = "file://../scripts/scrna/MQuad.py"

biopipen 0.34.6__py3-none-any.whl → 0.34.26__py3-none-any.whl

biopipen 0.34.6py3-none-any.whl → 0.34.26py3-none-any.whl