PyPI - biopipen - Versions diffs - 0.34.6__py3-none-any.whl → 0.34.26__py3-none-any.whl - Mend

biopipen 0.34.6__py3-none-any.whl → 0.34.26__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (56) hide show

biopipen/__init__.py +1 -1
biopipen/core/config.toml +4 -0
biopipen/core/filters.py +1 -1
biopipen/core/testing.py +2 -1
biopipen/ns/cellranger.py +33 -3
biopipen/ns/regulatory.py +4 -0
biopipen/ns/scrna.py +548 -98
biopipen/ns/scrna_metabolic_landscape.py +4 -0
biopipen/ns/tcr.py +256 -16
biopipen/ns/web.py +5 -0
biopipen/reports/scrna_metabolic_landscape/MetabolicFeatures.svelte +9 -9
biopipen/reports/scrna_metabolic_landscape/MetabolicPathwayActivity.svelte +9 -8
biopipen/reports/scrna_metabolic_landscape/MetabolicPathwayHeterogeneity.svelte +9 -9
biopipen/reports/tcr/ClonalStats.svelte +1 -0
biopipen/scripts/cellranger/CellRangerCount.py +55 -11
biopipen/scripts/cellranger/CellRangerVdj.py +54 -8
biopipen/scripts/regulatory/MotifAffinityTest.R +21 -5
biopipen/scripts/regulatory/MotifAffinityTest_AtSNP.R +9 -2
biopipen/scripts/regulatory/MotifAffinityTest_MotifBreakR.R +15 -6
biopipen/scripts/regulatory/VariantMotifPlot.R +1 -1
biopipen/scripts/regulatory/motifs-common.R +3 -2
biopipen/scripts/scrna/AnnData2Seurat.R +2 -1
biopipen/scripts/scrna/CellCellCommunication.py +26 -14
biopipen/scripts/scrna/CellCellCommunicationPlots.R +23 -4
biopipen/scripts/scrna/CellSNPLite.py +30 -0
biopipen/scripts/scrna/CellTypeAnnotation-celltypist.R +27 -36
biopipen/scripts/scrna/CellTypeAnnotation-direct.R +42 -26
biopipen/scripts/scrna/CellTypeAnnotation-hitype.R +11 -13
biopipen/scripts/scrna/CellTypeAnnotation-sccatch.R +5 -8
biopipen/scripts/scrna/CellTypeAnnotation-sctype.R +5 -8
biopipen/scripts/scrna/CellTypeAnnotation.R +26 -3
biopipen/scripts/scrna/MQuad.py +25 -0
biopipen/scripts/scrna/MarkersFinder.R +128 -30
biopipen/scripts/scrna/ModuleScoreCalculator.R +9 -1
biopipen/scripts/scrna/PseudoBulkDEG.R +113 -27
biopipen/scripts/scrna/ScFGSEA.R +23 -26
biopipen/scripts/scrna/ScVelo.py +20 -8
biopipen/scripts/scrna/SeuratClusterStats-clustree.R +1 -1
biopipen/scripts/scrna/SeuratClusterStats-features.R +6 -1
biopipen/scripts/scrna/SeuratClustering.R +5 -1
biopipen/scripts/scrna/SeuratMap2Ref.R +1 -2
biopipen/scripts/scrna/SeuratPreparing.R +19 -11
biopipen/scripts/scrna/SeuratSubClustering.R +1 -1
biopipen/scripts/scrna/Slingshot.R +2 -4
biopipen/scripts/scrna/TopExpressingGenes.R +1 -4
biopipen/scripts/scrna/celltypist-wrapper.py +140 -4
biopipen/scripts/scrna/scvelo_paga.py +313 -0
biopipen/scripts/scrna/seurat_anndata_conversion.py +18 -1
biopipen/scripts/tcr/{TCRClustering.R → CDR3Clustering.R} +63 -23
biopipen/scripts/tcr/ClonalStats.R +76 -35
biopipen/utils/misc.py +104 -9
{biopipen-0.34.6.dist-info → biopipen-0.34.26.dist-info}/METADATA +5 -2
{biopipen-0.34.6.dist-info → biopipen-0.34.26.dist-info}/RECORD +55 -53
{biopipen-0.34.6.dist-info → biopipen-0.34.26.dist-info}/WHEEL +1 -1
biopipen/utils/common_docstrs.py +0 -103
{biopipen-0.34.6.dist-info → biopipen-0.34.26.dist-info}/entry_points.txt +0 -0

biopipen/scripts/scrna/celltypist-wrapper.py CHANGED Viewed

@@ -1,4 +1,10 @@
 from argparse import ArgumentParser
+from typing import Union
+import numpy as np
+import pandas as pd
+import scanpy as sc
+import celltypist
+from celltypist.classifier import logger, AnnData, Model, Classifier
 parser = ArgumentParser(description="Run CellTypist")
 parser.add_argument(
@@ -18,9 +24,139 @@ parser.add_argument(
 )
+def classifier_init(
+    self, filename="", model="", transpose=False, gene_file=None, cell_file=None
+):
+    """Celltypist check if adata is in the range of log1p normalized data to 10000
+    counts per cell. Otherwise it will use the raw data if available. However, in
+    some cases, the raw data has invalid feature names (var_names) which causes errors.
+    Here we check if the feature names of raw data is valid with intersection with
+    model features, if not, we will use the adata.X instead of adata.raw.X
+    """
+    if isinstance(model, str):
+        model = Model.load(model)
+    self.model = model
+    if not filename:
+        logger.warn("📭 No input file provided to the classifier")
+        return
+    if isinstance(filename, str):
+        self.filename = filename
+        logger.info(f"📁 Input file is '{self.filename}'")
+        logger.info("⏳ Loading data")
+    if isinstance(filename, str) and filename.endswith(
+        (".csv", ".txt", ".tsv", ".tab", ".mtx", ".mtx.gz")
+    ):
+        self.adata = sc.read(self.filename)
+        if transpose:
+            self.adata = self.adata.transpose()
+        if self.filename.endswith((".mtx", ".mtx.gz")):
+            if (gene_file is None) or (cell_file is None):
+                raise FileNotFoundError(
+                    "🛑 Missing `gene_file` and/or `cell_file`. Please provide both "
+                    "arguments together with the input mtx file"
+                )
+            genes_mtx = pd.read_csv(gene_file, header=None)[0].values
+            cells_mtx = pd.read_csv(cell_file, header=None)[0].values
+            if len(genes_mtx) != self.adata.n_vars:
+                raise ValueError(
+                    f"🛑 The number of genes in {gene_file} does not match the number "
+                    f"of genes in {self.filename}"
+                )
+            if len(cells_mtx) != self.adata.n_obs:
+                raise ValueError(
+                    f"🛑 The number of cells in {cell_file} does not match the number "
+                    f"of cells in {self.filename}"
+                )
+            self.adata.var_names = genes_mtx
+            self.adata.obs_names = cells_mtx
+        if not float(self.adata.X[:1000].max()).is_integer():
+            logger.warn(
+                "⚠️ Warning: the input file seems not a raw count matrix. The "
+                "prediction result may not be accurate"
+            )
+        if (
+            (self.adata.n_vars >= 100000)
+            or (len(self.adata.var_names[0]) >= 30)
+            or (
+                len(
+                    self.adata.obs_names.intersection(
+                        ["GAPDH", "ACTB", "CALM1", "PTPRC", "MALAT1"]
+                    )
+                )
+                >= 1
+            )
+        ):
+            logger.warn(
+                "⚠️ The input matrix is detected to be a gene-by-cell matrix, will "
+                "transpose it"
+            )
+            self.adata = self.adata.transpose()
+        self.adata.var_names_make_unique()
+        sc.pp.normalize_total(self.adata, target_sum=1e4)
+        sc.pp.log1p(self.adata)
+        self.indata = self.adata.X
+        self.indata_genes = self.adata.var_names
+        self.indata_names = self.adata.obs_names
+    elif isinstance(filename, AnnData) or (
+        isinstance(filename, str) and filename.endswith(".h5ad")
+    ):
+        self.adata = sc.read(filename) if isinstance(filename, str) else filename
+        self.adata.var_names_make_unique()
+        # When to use raw.X?
+        # 1. if adata.raw exists
+        # 2. if adata.raw.var_names has intersection with model genes
+        # 3. if adata.X is not in the expected range
+        use_raw = self.adata.raw and (
+            self.adata.X[:1000].min() < 0 or self.adata.X[:1000].max() > 9.22
+        ) and np.isin(
+            self.adata.raw.var_names, self.model.classifier.features
+        ).sum() > 0
+        if use_raw:
+            if not self.adata.raw:
+                raise ValueError(
+                    "🛑 Invalid expression matrix in `.X`, expect log1p normalized "
+                    "expression to 10000 counts per cell"
+                )
+            elif (self.adata.raw.X[:1000].min() < 0) or (
+                self.adata.raw.X[:1000].max() > 9.22
+            ):
+                raise ValueError(
+                    "🛑 Invalid expression matrix in both `.X` and `.raw.X`, expect "
+                    "log1p normalized expression to 10000 counts per cell"
+                )
+            else:
+                logger.info(
+                    "👀 Invalid expression matrix in `.X`, expect log1p normalized "
+                    "expression to 10000 counts per cell; will use `.raw.X` instead"
+                )
+                self.indata = self.adata.raw.X
+                self.indata_genes = self.adata.raw.var_names
+                self.indata_names = self.adata.raw.obs_names
+        else:
+            self.indata = self.adata.X
+            self.indata_genes = self.adata.var_names
+            self.indata_names = self.adata.obs_names
+        if np.abs(np.expm1(self.indata[0]).sum() - 10000) > 1:
+            logger.warn(
+                "⚠️ Warning: invalid expression matrix, expect ALL genes and log1p "
+                "normalized expression to 10000 counts per cell. The prediction result "
+                "may not be accurate"
+            )
+    else:
+        raise ValueError(
+            "🛑 Invalid input. Supported types: .csv, .txt, .tsv, .tab, .mtx, .mtx.gz "
+            "and .h5ad, or AnnData loaded in memory"
+        )
+    logger.info(
+        f"🔬 Input data has {self.indata.shape[0]} cells and {len(self.indata_genes)} "
+        "genes"
+    )
 if __name__ == "__main__":
-    import scanpy as sc
-    import celltypist
+    Classifier.__init__ = classifier_init  # type: ignore
     args = parser.parse_args()
     adata = sc.read_h5ad(args.input)
@@ -29,8 +165,8 @@ if __name__ == "__main__":
         raise ValueError(
             f"Over clustering column '{over_clustering}' not found in AnnData object."
         )
-    if 'neighbors' in adata.uns and 'params' in adata.uns['neighbors']:
-        adata.uns['neighbors']['params'].setdefault('n_neighbors', 15)
+    if "neighbors" in adata.uns and "params" in adata.uns["neighbors"]:
+        adata.uns["neighbors"]["params"].setdefault("n_neighbors", 15)
     annotated = celltypist.annotate(
         adata,

biopipen/scripts/scrna/scvelo_paga.py ADDED Viewed

@@ -0,0 +1,313 @@
+"""This file is used to patch scvelo's paga to fix
+https://github.com/theislab/scvelo/issues/1241
+This is from pull request
+https://github.com/theislab/scvelo/pull/1308
+which has not been merged yet as of 2025-11-07.
+"""
+import numpy as np
+import pandas as pd
+from scipy.sparse import csr_matrix
+from scanpy.tools._paga import PAGA
+import scvelo
+# This is adapted from https://github.com/theislab/paga
+from scvelo import logging as logg
+from scvelo import settings
+from scvelo.tools.rank_velocity_genes import velocity_clusters
+from scvelo.tools.utils import strings_to_categoricals
+from scvelo.tools.velocity_graph import vals_to_csr
+from scvelo.tools.velocity_pseudotime import velocity_pseudotime
+# TODO: Finish docstrings
+def get_igraph_from_adjacency(adjacency, directed=None):
+    """Get igraph graph from adjacency matrix."""
+    import igraph as ig
+    sources, targets = adjacency.nonzero()
+    weights = adjacency[sources, targets]
+    if isinstance(weights, np.matrix):
+        weights = weights.A1
+    g = ig.Graph(directed=directed)
+    g.add_vertices(adjacency.shape[0])  # this adds adjacency.shap[0] vertices
+    g.add_edges(list(zip(sources, targets)))
+    g.es["weight"] = weights
+    if g.vcount() != adjacency.shape[0]:
+        logg.warn(
+            f"The constructed graph has only {g.vcount()} nodes. "
+            "Your adjacency matrix contained redundant nodes."
+        )
+    return g
+# TODO: Add docstrings
+def get_sparse_from_igraph(graph, weight_attr=None):
+    """TODO."""
+    edges = graph.get_edgelist()
+    if weight_attr is None:
+        weights = [1] * len(edges)
+    else:
+        weights = graph.es[weight_attr]
+    if not graph.is_directed():
+        edges.extend([(v, u) for u, v in edges])
+        weights.extend(weights)
+    shape = graph.vcount()
+    shape = (shape, shape)
+    if len(edges) > 0:
+        rows, cols = zip(*edges)
+        return csr_matrix((weights, (rows, cols)), shape=shape)
+    else:
+        return csr_matrix(shape)
+# TODO: Finish docstrings
+def set_row_csr(csr, rows, value=0):
+    """Set all nonzero elements to the given value. Useful to set to 0 mostly."""
+    for row in rows:
+        start = csr.indptr[row]
+        end = csr.indptr[row + 1]
+        csr.data[start:end] = value
+    if value == 0:
+        csr.eliminate_zeros()
+# TODO: Add docstrings
+class PAGA_tree(PAGA):
+    """TODO."""
+    def __init__(
+        self,
+        adata,
+        groups=None,
+        vkey=None,
+        use_time_prior=None,
+        root_key=None,
+        end_key=None,
+        threshold_root_end_prior=None,
+        minimum_spanning_tree=None,
+    ):
+        super().__init__(adata=adata, groups=groups, model="v1.2")
+        self.groups = groups
+        self.vkey = vkey
+        self.use_time_prior = use_time_prior
+        self.root_key = root_key
+        self.end_key = end_key
+        self.threshold_root_end_prior = threshold_root_end_prior
+        if self.threshold_root_end_prior is None:
+            self.threshold_root_end_prior = 0.9
+        self.minimum_spanning_tree = minimum_spanning_tree
+    # TODO: Add docstrings
+    def compute_transitions(self):
+        """TODO."""
+        try:
+            import igraph
+        except ImportError:
+            raise ImportError("To run paga, you need to install `pip install igraph`")
+        vkey = f"{self.vkey}_graph"
+        if vkey not in self._adata.uns:
+            raise ValueError(
+                "The passed AnnData needs to have an `uns` annotation "
+                "with key 'velocity_graph' - a sparse matrix from RNA velocity."
+            )
+        if self._adata.uns[vkey].shape != (self._adata.n_obs, self._adata.n_obs):
+            raise ValueError(
+                f"The passed 'velocity_graph' has shape {self._adata.uns[vkey].shape} "
+                f"but shoud have shape {(self._adata.n_obs, self._adata.n_obs)}"
+            )
+        clusters = self._adata.obs[self.groups]
+        cats = clusters.cat.categories
+        vgraph = self._adata.uns[vkey] > 0.1
+        time_prior = self.use_time_prior
+        if isinstance(time_prior, str) and time_prior in self._adata.obs.keys():
+            vpt = self._adata.obs[time_prior].values
+            vpt_mean = self._adata.obs.groupby(self.groups)[time_prior].mean()
+            vpt_means = np.array([vpt_mean[cat] for cat in clusters])
+            rows, cols, vals = [], [], []
+            for i in range(vgraph.shape[0]):
+                indices = vgraph[i].indices
+                idx_bool = vpt[i] < vpt[indices]
+                idx_bool &= vpt_means[indices] > vpt_means[i] - 0.1
+                cols.extend(indices[idx_bool])
+                vals.extend(vgraph[i].data[idx_bool])
+                rows.extend([i] * np.sum(idx_bool))
+            vgraph = vals_to_csr(vals, rows, cols, shape=vgraph.shape)
+        lb = self.threshold_root_end_prior  # cells to be consider as terminal states
+        if isinstance(self.end_key, str) and self.end_key in self._adata.obs.keys():
+            set_row_csr(vgraph, rows=np.where(self._adata.obs[self.end_key] > lb)[0])
+        if isinstance(self.root_key, str) and self.root_key in self._adata.obs.keys():
+            vgraph[:, np.where(self._adata.obs[self.root_key] > lb)[0]] = 0
+            vgraph.eliminate_zeros()
+        membership = self._adata.obs[self.groups].cat.codes.values
+        g = get_igraph_from_adjacency(vgraph, directed=True)
+        vc = igraph.VertexClustering(g, membership=membership)
+        cg_full = vc.cluster_graph(combine_edges="sum")
+        transitions = get_sparse_from_igraph(cg_full, weight_attr="weight")
+        transitions = transitions - transitions.T
+        transitions_conf = transitions.copy()
+        transitions = transitions.tocoo()
+        total_n = self._neighbors.n_neighbors * np.array(vc.sizes())
+        for i, j, v in zip(transitions.row, transitions.col, transitions.data):
+            reference = np.sqrt(total_n[i] * total_n[j])
+            transitions_conf[i, j] = 0 if v < 0 else v / reference
+        transitions_conf.eliminate_zeros()
+        # remove non-confident direct paths if more confident indirect path is found.
+        T = transitions_conf.toarray()
+        threshold = max(np.nanmin(np.nanmax(T / (T > 0), axis=0)) - 1e-6, 0.01)
+        T *= T > threshold
+        for i in range(len(T)):
+            idx = T[i] > 0
+            if np.any(idx):
+                indirect = np.clip(T[idx], None, T[i][idx][:, None]).max(0)
+                T[i, T[i] < indirect] = 0
+        if self.minimum_spanning_tree:
+            T_tmp = T.copy()
+            T_num = T > 0
+            T_sum = np.sum(T_num, 0)
+            T_max = np.max(T_tmp)
+            for i in range(len(T_tmp)):
+                if T_sum[i] == 1:
+                    T_tmp[np.where(T_num[:, i])[0][0], i] = T_max
+            from scipy.sparse.csgraph import minimum_spanning_tree
+            T_tmp = np.abs(minimum_spanning_tree(-T_tmp).toarray()) > 0
+            T = T_tmp * T
+        transitions_conf = csr_matrix(T)
+        self.transitions_confidence = transitions_conf.T
+        # set threshold for minimal spanning tree.
+        df = pd.DataFrame(T, index=cats, columns=cats)
+        self.threshold = np.nanmin(np.nanmax(df.values / (df.values > 0), axis=0))
+        self.threshold = max(self.threshold - 1e-6, 0.01)
+def paga(
+    adata,
+    groups=None,
+    vkey="velocity",
+    use_time_prior=True,
+    root_key=None,
+    end_key=None,
+    threshold_root_end_prior=None,
+    minimum_spanning_tree=True,
+    copy=False,
+):
+    """PAGA graph with velocity-directed edges.
+    Mapping out the coarse-grained connectivity structures of complex manifolds
+    :cite:p:`Wolf19`. By quantifying the connectivity of partitions (groups, clusters) of the
+    single-cell graph, partition-based graph abstraction (PAGA) generates a much
+    simpler abstracted graph (*PAGA graph*) of partitions, in which edge weights
+    represent confidence in the presence of connections.
+    Parameters
+    ----------
+    adata : :class:`~anndata.AnnData`
+        An annotated data matrix. `adata.uns['neighbors']` must exist.
+    groups : key for categorical in `adata.obs`, optional (default: None)
+        You can pass your predefined groups by choosing any categorical
+        annotation of observations (`adata.obs`). If None, 'clusters' is used
+        when present in `adata.obs`, otherwise 'louvain' when present.
+        If 'velocity_clusters' is passed but missing, it is computed first.
+    vkey: `str` (default: 'velocity')
+        Name of the velocity estimates; the transitions are computed from
+        `adata.uns[f'{vkey}_graph']`.
+    use_time_prior : `str` or bool, optional (default: True)
+        Obs key for pseudo-time values.
+        If True, 'velocity_pseudotime' is used, and computed if not available.
+    root_key : `str` or bool, optional (default: None)
+        Obs key for root states.
+    end_key : `str` or bool, optional (default: None)
+        Obs key for end states.
+    threshold_root_end_prior : `float` (default: None, treated as 0.9)
+        Threshold for root and final states priors, to be in the range of [0,1].
+        Values above the threshold will be considered as terminal and included as prior.
+    minimum_spanning_tree : bool, optional (default: True)
+        Whether to prune the tree such that a path from A-to-B
+        is removed if another more confident path exists.
+    copy : `bool`, optional (default: `False`)
+        Copy `adata` before computation and return a copy.
+        Otherwise, perform computation inplace and return `None`.
+    Returns
+    -------
+    connectivities: `.uns`
+        The full adjacency matrix of the abstracted graph, weights correspond to
+        confidence in the connectivities of partitions.
+    connectivities_tree: `.uns`
+        The adjacency matrix of the tree-like subgraph that best explains the topology.
+    transitions_confidence: `.uns`
+        The adjacency matrix of the abstracted directed graph, weights correspond to
+        confidence in the transitions between partitions.
+    """
+    if "neighbors" not in adata.uns:
+        raise ValueError(
+            "You need to run `pp.neighbors` first to compute a neighborhood graph."
+        )
+    adata = adata.copy() if copy else adata
+    strings_to_categoricals(adata)
+    # Resolve the grouping key: prefer 'clusters', then 'louvain'.
+    # NOTE(review): `groups` stays None when neither column exists — confirm
+    # how `PAGA_tree` handles that case.
+    if groups is None:
+        groups = (
+            "clusters"
+            if "clusters" in adata.obs.keys()
+            else "louvain"
+            if "louvain" in adata.obs.keys()
+            else None
+        )
+    elif groups == "velocity_clusters" and "velocity_clusters" not in adata.obs.keys():
+        velocity_clusters(adata)
+    # A truthy non-string prior means "use velocity pseudotime", computed on demand.
+    if use_time_prior and not isinstance(use_time_prior, str):
+        use_time_prior = "velocity_pseudotime"
+        if use_time_prior not in adata.obs.keys():
+            velocity_pseudotime(adata, vkey=vkey, root_key=root_key, end_key=end_key)
+    # Only priors that are actual `adata.obs` columns are reported in the log.
+    priors = [p for p in [use_time_prior, root_key, end_key] if p in adata.obs.keys()]
+    logg.info(
+        "running PAGA",
+        f"using priors: {priors}" if len(priors) > 0 else "",
+        r=True,
+    )
+    paga = PAGA_tree(
+        adata,
+        groups,
+        vkey=vkey,
+        use_time_prior=use_time_prior,
+        root_key=root_key,
+        end_key=end_key,
+        threshold_root_end_prior=threshold_root_end_prior,
+        minimum_spanning_tree=minimum_spanning_tree,
+    )
+    if "paga" not in adata.uns:
+        adata.uns["paga"] = {}
+    # Connectivities are computed first and stored, then the directed transitions.
+    paga.compute_connectivities()
+    adata.uns["paga"]["connectivities"] = paga.connectivities
+    adata.uns["paga"]["connectivities_tree"] = paga.connectivities_tree
+    adata.uns[f"{groups}_sizes"] = np.array(paga.ns)
+    paga.compute_transitions()
+    adata.uns["paga"]["transitions_confidence"] = paga.transitions_confidence
+    adata.uns["paga"]["threshold"] = paga.threshold
+    adata.uns["paga"]["groups"] = groups
+    logg.info("    finished", time=True, end=" " if settings.verbosity > 2 else "\n")
+    logg.hint(
+        "added\n" + "    'paga/connectivities', connectivities adjacency (adata.uns)\n"
+        "    'paga/connectivities_tree', connectivities subtree (adata.uns)\n"
+        "    'paga/transitions_confidence', velocity transitions (adata.uns)"
+    )
+    return adata if copy else None
+scvelo.tl.paga = paga

biopipen/scripts/scrna/seurat_anndata_conversion.py CHANGED Viewed

@@ -2,6 +2,7 @@
 Need R and R packages Seurat, SeuratDisk and biopipen.utils.R installed.
 """
+from __future__ import annotations
 def convert_seurat_to_anndata(
@@ -10,7 +11,8 @@ def convert_seurat_to_anndata(
     assay=None,
     subset=None,
     rscript="Rscript",
-):
+    return_ident_col=False,
+) -> None | str:
     """Convert Seurat object to AnnData format.
     Args:
@@ -43,6 +45,21 @@ def convert_seurat_to_anndata(
     cmd = [rscript, temp_script_path]
     run_command(cmd, fg=True)
+    if return_ident_col:
+        ident_col_script = f"""
+            library(biopipen.utils)
+            obj <- read_obj("{input_file}")
+            cat(GetIdentityColumn(obj))
+        """
+        with NamedTemporaryFile(suffix=".R", delete=False) as temp_script:
+            temp_script.write(ident_col_script.encode('utf-8'))
+            temp_script_path = temp_script.name
+        cmd = [rscript, temp_script_path]
+        ident_col = run_command(cmd, stdout="RETURN").strip()
+        return ident_col
 def convert_anndata_to_seurat(
     input_file,

biopipen/scripts/tcr/{TCRClustering.R → CDR3Clustering.R} RENAMED Viewed

@@ -13,6 +13,7 @@ python <- {{envs.python | r}}
 within_sample <- {{envs.within_sample | r}}
 args <- {{envs.args | r}}
 chain <- {{envs.chain | r}}
+type <- {{envs.type | r}}
 setwd(outdir)
@@ -22,7 +23,36 @@ log$info("Reading input file ...")
 obj <- read_obj(screpfile)
 is_seurat <- inherits(obj, "Seurat")
-get_cdr3aa_df = function() {
+get_type <- function() {
+    # Infer whether the loaded object holds TCR or BCR data.
+    # The `CTgene` values are scanned in order (per sample for a list of data
+    # frames, or from the meta data for a Seurat object) and the first value
+    # starting with a TCR gene prefix (TRA/TRB/TRG/TRD) or a BCR gene prefix
+    # (IGH/IGK/IGL) decides the type. Stops with an error if nothing matches.
+    #
+    # Classify one vector of gene strings; NULL when no element is recognized.
+    detect_type <- function(genes) {
+        is_tcr <- grepl("^TR[ABGD]", genes)
+        is_bcr <- grepl("^IG[HKL]", genes)
+        # Index of the first recognizable element, NA if there is none
+        first_hit <- which(is_tcr | is_bcr)[1]
+        if (is.na(first_hit)) {
+            return(NULL)
+        }
+        if (is_tcr[first_hit]) "TCR" else "BCR"
+    }
+    gene_sets <- if (is_seurat) {
+        list(obj@meta.data$CTgene)
+    } else {
+        lapply(names(obj), function(sample) obj[[sample]]$CTgene)
+    }
+    for (genes in gene_sets) {
+        detected <- detect_type(genes)
+        if (!is.null(detected)) {
+            return(detected)
+        }
+    }
+    stop("Cannot determine the type of the data (TCR or BCR). Please set envs.type to 'TCR' or 'BCR'.")
+}
+if (type == "auto") {
+    type <- get_type()
+    log$info("Auto-detected data type: {type}")
+}
+get_cdr3aa_df <- function() {
     if (!is_seurat) {
         out <- NULL
         for (sample in names(obj)) {
@@ -32,10 +62,12 @@ get_cdr3aa_df = function() {
             )
             if (chain == "both") {
                 df$CDR3.aa <- obj[[sample]]$CTaa
-            } else if (chain == "alpha") {
+            } else if ((type == "BCR" && chain == "heavy") || (type == "TCR" && chain == "light")) {
                 df$CDR3.aa <- obj[[sample]]$cdr3_aa1
-            } else if (chain == "beta") {
+            } else if ((type == "BCR" && chain == "light") || (type == "TCR" && chain == "heavy")) {
                 df$CDR3.aa <- obj[[sample]]$cdr3_aa2
+            } else {
+                stop(paste("Unknown chain:", chain, "for", type))
             }
             out <- rbind(out, df)
         }
@@ -47,11 +79,13 @@ get_cdr3aa_df = function() {
             if (chain == "both") {
                 out$CDR3.aa <- out$CTaa
             } else {
-                out <- separate(out, CTaa, into = c("alpha.aa", "beta.aa"), sep = "_")
-                if (chain == "alpha") {
-                    out$CDR3.aa <- out$alpha.aa
-                } else if (chain == "beta") {
-                    out$CDR3.aa <- out$beta.aa
+                out <- separate(out, CTaa, into = c("first", "second"), sep = "_")
+                if ((type == "BCR" && chain == "heavy") || (type == "TCR" && chain == "light")) {
+                    out$CDR3.aa <- out$first
+                } else if ((type == "BCR" && chain == "light") || (type == "TCR" && chain == "heavy")) {
+                    out$CDR3.aa <- out$second
+                } else {
+                    stop(paste("Unknown chain:", chain, "for", type))
                 }
             }
         } else {
@@ -132,21 +166,24 @@ output.clusters_df.to_csv(clustcr_dir + "/clusters.txt", sep="\t", index=False)
 clean_clustcr_output = function(clustcr_outfile) {
     clustcr_out = read.delim2(clustcr_outfile, header=TRUE, row.names = NULL)
-    colnames(clustcr_out) = c("CDR3.aa", "TCR_Cluster")
+    colnames(clustcr_out) = c("CDR3.aa", "CDR3_Cluster")
     out = left_join(cdr3aa_df, distinct(clustcr_out), by=c(cdr3seq4clustering = "CDR3.aa")) %>%
         mutate(
-            TCR_Cluster = if_else(
-                is.na(TCR_Cluster),
+            CDR3_Cluster = if_else(
+                is.na(CDR3_Cluster),
                 paste0("S_", row_number()),
-                paste0("M_", as.character(TCR_Cluster))
+                paste0("M_", as.character(CDR3_Cluster))
             )
         )
     if (within_sample) {
-        out <- mutate(out, TCR_Cluster = paste0(Sample, ".", TCR_Cluster))
+        out <- mutate(out, CDR3_Cluster = paste0(Sample, ".", CDR3_Cluster))
     }
-    left_join(cdr3aa_df, out, by = "CDR3.aa")
+    # This join would result in more rows than dplyr can handle
+    # left_join(cdr3aa_df, out, by = "CDR3.aa")
+    out <- out[match(cdr3aa_df$CDR3.aa, out$CDR3.aa), , drop=FALSE]
+    cbind(cdr3aa_df, out[, setdiff(colnames(out), "CDR3.aa"), drop=FALSE])
 }
 run_clustcr = function() {
@@ -208,25 +245,28 @@ prepare_input = function() {
 clean_giana_output = function(giana_outfile) {
     # generate an output file with columns:
-    # CDR3.aa, TCR_Cluster, V.name, Sample
+    # CDR3.aa, CDR3_Cluster, V.name, Sample
     # If sequence doesn't exist in the input file,
     # Then a unique cluster id is assigned to it.
     giana_out = read.delim2(giana_outfile, header=FALSE, comment.char = "#", row.names = NULL)[, 1:2, drop=FALSE]
-    colnames(giana_out) = c("CDR3.aa", "TCR_Cluster")
+    colnames(giana_out) = c("CDR3.aa", "CDR3_Cluster")
     out = left_join(cdr3aa_df, distinct(giana_out), by=c(cdr3seq4clustering = "CDR3.aa")) %>%
         mutate(
-            TCR_Cluster = if_else(
-                is.na(TCR_Cluster),
+            CDR3_Cluster = if_else(
+                is.na(CDR3_Cluster),
                 paste0("S_", row_number()),
-                paste0("M_", as.character(TCR_Cluster))
+                paste0("M_", as.character(CDR3_Cluster))
             )
         )
     if (within_sample) {
-        out <- mutate(out, TCR_Cluster = paste0(Sample, ".", TCR_Cluster))
+        out <- mutate(out, CDR3_Cluster = paste0(Sample, ".", CDR3_Cluster))
     }
-    left_join(cdr3aa_df, out, by = "CDR3.aa")
+    # This join would result in more rows than dplyr can handle
+    # left_join(cdr3aa_df, out, by = "CDR3.aa")
+    out <- out[match(cdr3aa_df$CDR3.aa, out$CDR3.aa), , drop=FALSE]
+    cbind(cdr3aa_df, out[, setdiff(colnames(out), "CDR3.aa"), drop=FALSE])
 }
 run_giana = function() {
@@ -276,12 +316,12 @@ attach_to_obj = function(obj, out) {
     rownames(out) <- out$Barcode
     if (is_seurat) {
         # Attach results to Seurat object
-        obj@meta.data$TCR_Cluster <- out[rownames(obj@meta.data), "TCR_Cluster"]
+        obj@meta.data$CDR3_Cluster <- out[rownames(obj@meta.data), "CDR3_Cluster"]
     } else {
         # Attach results to the list of data frames
         for (sample in names(obj)) {
             sout <- filter(out, Sample == sample)
-            obj[[sample]]$TCR_Cluster <- sout[obj[[sample]]$barcode, "TCR_Cluster"]
+            obj[[sample]]$CDR3_Cluster <- sout[obj[[sample]]$barcode, "CDR3_Cluster"]
         }
     }
     obj

biopipen 0.34.6__py3-none-any.whl → 0.34.26__py3-none-any.whl

biopipen 0.34.6__py3-none-any.whl → 0.34.26__py3-none-any.whl