PyPI - sclab - Versions diffs - 0.2.5__py3-none-any.whl → 0.3.0__py3-none-any.whl - Mend

sclab 0.2.5 (py3-none-any.whl) → 0.3.0 (py3-none-any.whl)

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.

This version of sclab might be problematic. Click here for more details.

Files changed (50)

sclab/__init__.py +1 -1
sclab/dataset/_dataset.py +1 -1
sclab/examples/processor_steps/__init__.py +2 -0
sclab/examples/processor_steps/_doublet_detection.py +68 -0
sclab/examples/processor_steps/_integration.py +37 -4
sclab/examples/processor_steps/_neighbors.py +24 -4
sclab/examples/processor_steps/_pca.py +5 -5
sclab/examples/processor_steps/_preprocess.py +14 -1
sclab/examples/processor_steps/_qc.py +22 -6
sclab/gui/__init__.py +0 -0
sclab/gui/components/__init__.py +5 -0
sclab/gui/components/_guided_pseudotime.py +482 -0
sclab/methods/__init__.py +25 -1
sclab/preprocess/__init__.py +18 -0
sclab/preprocess/_cca.py +154 -0
sclab/preprocess/_cca_integrate.py +77 -0
sclab/preprocess/_filter_obs.py +42 -0
sclab/preprocess/_harmony.py +421 -0
sclab/preprocess/_harmony_integrate.py +50 -0
sclab/preprocess/_normalize_weighted.py +61 -0
sclab/preprocess/_subset.py +208 -0
sclab/preprocess/_transfer_metadata.py +137 -0
sclab/preprocess/_transform.py +82 -0
sclab/preprocess/_utils.py +96 -0
sclab/tools/__init__.py +0 -0
sclab/tools/cellflow/__init__.py +0 -0
sclab/tools/cellflow/density_dynamics/__init__.py +0 -0
sclab/tools/cellflow/density_dynamics/_density_dynamics.py +349 -0
sclab/tools/cellflow/pseudotime/__init__.py +0 -0
sclab/tools/cellflow/pseudotime/_pseudotime.py +332 -0
sclab/tools/cellflow/pseudotime/timeseries.py +226 -0
sclab/tools/cellflow/utils/__init__.py +0 -0
sclab/tools/cellflow/utils/density_nd.py +136 -0
sclab/tools/cellflow/utils/interpolate.py +334 -0
sclab/tools/cellflow/utils/smoothen.py +124 -0
sclab/tools/cellflow/utils/times.py +55 -0
sclab/tools/differential_expression/__init__.py +5 -0
sclab/tools/differential_expression/_pseudobulk_edger.py +304 -0
sclab/tools/differential_expression/_pseudobulk_helpers.py +277 -0
sclab/tools/doublet_detection/__init__.py +5 -0
sclab/tools/doublet_detection/_scrublet.py +64 -0
sclab/tools/labeling/__init__.py +6 -0
sclab/tools/labeling/sctype.py +233 -0
sclab/utils/__init__.py +5 -0
sclab/utils/_write_excel.py +510 -0
{sclab-0.2.5.dist-info → sclab-0.3.0.dist-info}/METADATA +6 -2
sclab-0.3.0.dist-info/RECORD +81 -0
sclab-0.2.5.dist-info/RECORD +0 -45
{sclab-0.2.5.dist-info → sclab-0.3.0.dist-info}/WHEEL +0 -0
{sclab-0.2.5.dist-info → sclab-0.3.0.dist-info}/licenses/LICENSE +0 -0

sclab/tools/cellflow/utils/smoothen.py ADDED Viewed

@@ -0,0 +1,124 @@
+import logging
+from typing import Callable
+import numpy as np
+from numpy import bool_, floating, integer
+from numpy.typing import NDArray
+from tqdm.auto import tqdm
+# Module-level logger, named after this module.
+logger = logging.getLogger(__name__)
+# Two times pi, i.e. one full turn expressed in radians.
+PIX2 = 2 * np.pi
+def count_empty_intervals(t: NDArray[floating], t_grid: NDArray[floating]) -> int:
+    """Return how many intervals of ``t_grid`` contain no value from ``t``.
+    The per-interval occupancy comes from ``count_data_in_intervals``; an
+    interval is empty when its count is zero.
+    """
+    occupancy = count_data_in_intervals(t, t_grid)
+    return np.sum(occupancy == 0)
+def count_data_in_intervals(
+    t: NDArray[floating], t_grid: NDArray[floating]
+) -> NDArray[integer]:
+    """Count the values of ``t`` lying in each interval of ``t_grid``.
+    Interval ``k`` is the closed range ``[t_grid[k], t_grid[k + 1]]``. Both
+    ends are inclusive, so a value equal to an interior grid point is counted
+    in the two intervals it borders.
+    Returns one count per pair of consecutive grid points.
+    """
+    # column vector, so the comparisons broadcast samples against grid edges
+    samples = t.reshape(-1, 1)
+    left_edges, right_edges = t_grid[:-1], t_grid[1:]
+    inside = (left_edges <= samples) & (samples <= right_edges)
+    return inside.sum(axis=0)
+def choose_grid_size(t: NDArray[floating], t_range: tuple[float, float]) -> int:
+    """Pick the largest power-of-two grid size that leaves no interval empty.
+    Candidate sizes ``2**10, 2**9, ..., 2**1`` are tried in that order. For
+    each one, ``t_range`` is split into that many equal intervals, and the
+    first size for which every interval holds at least one value of ``t`` is
+    returned.
+    Raises
+    ------
+    ValueError
+        If even the smallest candidate (two intervals) has an empty interval.
+    """
+    for exponent in range(10, 0, -1):
+        grid_size = 2**exponent
+        t_grid: NDArray[floating] = np.linspace(*t_range, grid_size + 1)
+        if count_empty_intervals(t, t_grid) == 0:
+            return grid_size
+    raise ValueError("Could not find a suitable grid size")
+def smoothen_data(
+    t: NDArray[floating],
+    X: NDArray[floating],
+    t_range: tuple[float, float] | None = None,
+    t_grid: NDArray[floating] | None = None,
+    fn: Callable[[NDArray[floating]], NDArray[floating]] = np.average,
+    window_width: float | None = None,
+    weights: NDArray[floating] | None = None,
+    zero_weight: float = 1,
+    periodic: bool = False,
+    quiet: bool = False,
+    progress: bool = False,
+) -> NDArray[floating]:
+    """Smooth ``X`` along ``t`` with a sliding window.
+    One output row is produced per evaluation point: the points of ``t_grid``
+    when given, otherwise every value of ``t``. Each row aggregates the rows
+    of ``X`` whose ``t`` lies within ``window_width / 2`` of that point.
+    Parameters
+    ----------
+    t : sample positions, used to build the window masks over ``X``.
+    X : values to smooth; rows are selected with masks computed from ``t``.
+    t_range : ``(tmin, tmax)`` defining the span. Defaults to the min/max of
+        the evaluation points.
+    t_grid : evaluation points. A warning is logged when the grid has empty
+        intervals, unless ``quiet`` is set.
+    fn : aggregation, called as ``fn(x, axis=0)``. ``np.average`` gets special
+        handling for ``weights`` and ``zero_weight``.
+    window_width : full window width. Defaults to 5% of the span without a
+        grid, and to two grid steps with a grid.
+    weights : per-sample weights, used only when ``fn`` is ``np.average``.
+    zero_weight : when not 1 (and ``fn`` is ``np.average`` without
+        ``weights``), entries that equal ``eps`` after the shift below, i.e.
+        zeros in ``X``, are weighted ``zero_weight + eps`` instead of 1.
+    periodic : also include samples lying one span above or below the window.
+    quiet : suppress the empty-interval warning.
+    progress : show a tqdm progress bar.
+    Returns
+    -------
+    Array of shape ``(t_grid.size,) + X.shape[1:]``. Rows whose window holds
+    no sample are NaN.
+    """
+    is_grid = t_grid is not None
+    if is_grid:
+        n_empty = count_empty_intervals(t, t_grid)
+        if n_empty > 0 and not quiet:
+            logger.warning(f"Provided grid has {n_empty} empty intervals")
+    else:
+        # no grid provided: one output point per input point
+        t_grid = t
+    # span of the time axis, from the explicit range or the evaluation points
+    tmin, tmax = t_range if t_range is not None else (t_grid.min(), t_grid.max())
+    tspan = tmax - tmin
+    if window_width is None:
+        if is_grid:
+            window_width = tspan / (t_grid.size - 1) * 2
+        else:
+            window_width = tspan * 0.05
+    # rows that never receive a value stay NaN
+    X_smooth: NDArray[floating] = np.full((t_grid.size,) + X.shape[1:], np.nan)
+    centers = enumerate(t_grid)
+    if progress:
+        centers = tqdm(
+            centers,
+            total=t_grid.size,
+            bar_format="{desc} {percentage:3.0f}%|{bar}|",
+            desc="Smoothing data",
+        )
+    X = X.astype(float)
+    # eps is added to the windowed values and subtracted from the result
+    eps = np.finfo(float).eps
+    half_width = window_width / 2
+    uses_average = fn == np.average
+    for row, center in centers:
+        lower = center - half_width
+        upper = center + half_width
+        selected: NDArray[bool_] = (t >= lower) & (t <= upper)
+        if periodic:
+            # include points beyond the periodic boundaries
+            selected |= (t >= lower + tspan) & (t <= upper + tspan)
+            selected |= (t >= lower - tspan) & (t <= upper - tspan)
+        if not selected.any():
+            continue
+        window = X[selected] + eps
+        if not uses_average:
+            X_smooth[row] = fn(window, axis=0)
+        elif weights is not None:
+            X_smooth[row] = np.average(window, axis=0, weights=weights[selected])
+        elif zero_weight == 1:
+            X_smooth[row] = np.mean(window, axis=0)
+        else:
+            w = np.ones_like(window)
+            w[window == eps] = zero_weight + eps
+            X_smooth[row] = fn(window, axis=0, weights=w)
+    return X_smooth - eps

sclab/tools/cellflow/utils/times.py ADDED Viewed

@@ -0,0 +1,55 @@
+import itertools
+import numpy as np
+from numpy import floating
+from numpy.typing import NDArray
+def guess_trange(
+    times: NDArray[floating], verbose: bool = False
+) -> tuple[float, float]:
+    """Guess a "round" time range that encloses ``times``.
+    The observed span is rounded up to two significant steps, and the minimum
+    and maximum are then rounded outward (floor/ceil) to a multiple of
+    ``10 ** ceil(log10(span)) / 100``.
+    Parameters
+    ----------
+    times : observed time values; must be non-empty.
+    verbose : print the observed and guessed ranges.
+    Returns
+    -------
+    ``(g_tmin, g_tmax)`` with ``g_tmin <= times.min()`` and
+    ``g_tmax >= times.max()``. When all values are identical the span is zero
+    and the observed value is returned for both bounds, since no scale can be
+    derived from a zero span.
+    """
+    tmin, tmax = times.min(), times.max()
+    tspan = tmax - tmin
+    if tspan == 0:
+        # log10(0) is -inf, which would turn the scale into 0 and both bounds
+        # into NaN; keep the degenerate range as observed instead
+        g_tmin, g_tmax = tmin, tmax
+    else:
+        scale = 10.0 ** np.ceil(np.log10(tspan)) / 100
+        # rounding the span up may push it across a power of ten, so the
+        # scale is recomputed from the rounded span
+        tspan = np.ceil(tspan / scale) * scale
+        scale = 10.0 ** np.ceil(np.log10(tspan)) / 100
+        g_tmin = np.floor(tmin / scale) * scale
+        g_tmax = np.ceil(tmax / scale) * scale
+    # normalise negative zero so it prints and compares as plain 0.0
+    g_tmin = 0.0 if g_tmin == -0.0 else g_tmin
+    g_tmax = 0.0 if g_tmax == -0.0 else g_tmax
+    if verbose:
+        print(
+            f"tspan: {tspan:10.4f}    min-max: {tmin:10.4f} - {tmax:10.4f} | {g_tmin:>8} - {g_tmax:>8}"
+        )
+    return g_tmin, g_tmax
+def test_guess_trange(N: int = 1000, verbose: bool = False) -> None:
+    """Randomised check that ``guess_trange`` recovers known ranges.
+    For each combination of scale and unit range, ``N`` uniform samples are
+    drawn 500 times and the guessed bounds are compared with the true ones. A
+    trial passes when both bounds are within 1% of the span. With ``verbose``
+    the pass rate is printed; otherwise it is asserted to exceed 95%.
+    """
+    def _within_tolerance(trange: tuple[float, float]) -> bool:
+        lo, hi = trange
+        width = hi - lo
+        guess_lo, guess_hi = guess_trange(np.random.uniform(*trange, N))
+        rel_err_lo = np.abs(guess_lo - lo) / width
+        rel_err_hi = np.abs(guess_hi - hi) / width
+        return rel_err_lo <= 0.01 and rel_err_hi <= 0.01
+    magnitudes = [0.001, 0.01, 0.1, 1, 10, 100, 1000]
+    multipliers = [1, 2, 3, 5, 7]
+    # ranges below zero, straddling zero, and above zero
+    unit_ranges = [(-2, -1), (-1 / 2, 1 / 2), (1, 2)]
+    for s1, s2 in itertools.product(magnitudes, multipliers):
+        scale = s1 * s2
+        for lw, hg in unit_ranges:
+            trange = lw * scale, hg * scale
+            acc1 = np.mean([_within_tolerance(trange) for _ in range(500)])
+            report = f"scale: {scale: 9.3f} | lw-hg: {lw: 5.1f} - {hg: 5.1f} | {acc1: 8.2%}"
+            if verbose:
+                print(report)
+            else:
+                assert acc1 > 0.95, report

sclab/tools/differential_expression/__init__.py ADDED Viewed

@@ -0,0 +1,5 @@
+from ._pseudobulk_edger import pseudobulk_edger
+# Names exported by this package.
+__all__ = [
+    "pseudobulk_edger",
+]

sclab/tools/differential_expression/_pseudobulk_edger.py ADDED Viewed

@@ -0,0 +1,304 @@
+import pandas as pd
+from anndata import AnnData
+from ._pseudobulk_helpers import aggregate_and_filter
+def pseudobulk_edger(
+    adata_: AnnData,
+    group_key: str,
+    condition_group: str | list[str] | None = None,
+    reference_group: str | None = None,
+    cell_identity_key: str | None = None,
+    batch_key: str | None = None,
+    layer: str | None = None,
+    replicas_per_group: int = 10,
+    min_cells_per_group: int = 30,
+    bootstrap_sampling: bool = True,
+    use_cells: dict[str, list[str]] | None = None,
+    aggregate: bool = True,
+    verbosity: int = 0,
+) -> dict[str, pd.DataFrame]:
+    """
+    Fits a model using edgeR and computes top tags for a given condition vs
+    reference group.
+    The (optionally aggregated) data is handed to an embedded R session through
+    rpy2/anndata2ri, a quasi-likelihood model is fitted once per condition group,
+    and one contrast is tested per cell identity. Models that fail to fit in R
+    are reported on stdout and skipped.
+    Parameters
+    ----------
+    adata_ : AnnData
+        Annotated data matrix.
+    group_key : str
+        Key in AnnData object to use to group cells.
+    condition_group : str | list[str] | None, optional
+        Condition group(s) to compare to the reference group. If None, every
+        unique value of ``group_key`` in the (aggregated) data is used as a
+        condition group in turn.
+    reference_group : str | None, optional
+        Reference group to compare condition group(s) to. If None, the condition group
+        is compared to the rest of the cells.
+    cell_identity_key : str | None, optional
+        If provided, separate contrasts will be computed for each identity. Defaults to None.
+    batch_key : str | None, optional
+        If provided, this column is added as an additive ``batch`` term to the
+        design matrix built by the R ``fit_model`` function. Defaults to None.
+    layer : str | None, optional
+        Layer in AnnData object to use. EdgeR requires raw counts. Defaults to None.
+    replicas_per_group : int, optional
+        Number of replicas to create for each group. Defaults to 10.
+    min_cells_per_group : int, optional
+        Minimum number of cells required for a group to be included. Defaults to 30.
+    bootstrap_sampling : bool, optional
+        Whether to use bootstrap sampling to create replicas. Defaults to True.
+    use_cells : dict[str, list[str]] | None, optional
+        If not None, only use the specified cells. Defaults to None. Dictionary key
+        is a categorical variable in the obs dataframe and the dictionary value is a
+        list of categories to include.
+    aggregate : bool, optional
+        Whether to aggregate cells before fitting the model. EdgeR requires a small
+        number of samples, so if adata_ is a single-cell experiment, the cells should
+        be aggregated. Defaults to True. When False, a copy of ``adata_`` is used
+        as is.
+    verbosity : int, optional
+        Verbosity level. Defaults to 0.
+    Returns
+    -------
+    dict[str, pd.DataFrame]
+        Dictionary of dataframes, one for each contrast. Keys have the form
+        ``<condition>_vs_<reference>`` (``<condition>_vs_rest`` when no reference
+        group is given), prefixed with ``<cell_identity_key>:<identity>|`` when
+        ``cell_identity_key`` is set. Each dataframe has the following columns:
+        * gene_ids : str
+            Gene IDs (index name).
+        * logFC : float
+            Log2 fold change.
+        * logCPM : float
+            Log2 CPM.
+        * F: float
+            F-statistic.
+        * PValue : float
+            p-value.
+        * FDR : float
+            False discovery rate.
+        * pct_expr_cnd : float
+            Percentage of cells in condition group expressing the gene.
+        * pct_expr_ref : float
+            Percentage of cells in reference group expressing the gene.
+        * num_expr_cnd, num_expr_ref
+            Copied from the ``num_expr_<group>`` columns of ``.var``
+            (presumably the number of expressing cells; verify against
+            ``aggregate_and_filter``).
+        * tot_expr_cnd, tot_expr_ref
+            Copied from the ``tot_expr_<group>`` columns of ``.var``
+            (presumably total expression; verify against ``aggregate_and_filter``).
+        * mean_cnd, mean_ref : float
+            ``tot_expr_*`` divided by ``num_expr_*``.
+    """
+    # raises with installation hints when the Python or R dependencies are missing
+    _try_imports()
+    # imported inside the function, presumably so the module can be imported
+    # without the optional R stack
+    import anndata2ri  # noqa: F401
+    import rpy2.robjects as robjects
+    from rpy2.rinterface_lib.embedded import RRuntimeError  # noqa: F401
+    from rpy2.robjects import pandas2ri  # noqa: F401
+    from rpy2.robjects.conversion import localconverter  # noqa: F401
+    R = robjects.r
+    if aggregate:
+        aggr_adata = aggregate_and_filter(
+            adata_,
+            group_key,
+            cell_identity_key,
+            layer,
+            replicas_per_group,
+            min_cells_per_group,
+            bootstrap_sampling,
+            use_cells,
+        )
+    else:
+        # NOTE(review): the lookups of `pct_expr_*`, `num_expr_*` and `tot_expr_*`
+        # further down then require those columns to exist in `adata_.var` already
+        aggr_adata = adata_.copy()
+    # expose the data to the R session as the variable `aggr_adata`, converted
+    # with the anndata2ri converter
+    with localconverter(anndata2ri.converter):
+        R.assign("aggr_adata", aggr_adata)
+    # defines the R function for fitting the model with edgeR
+    R(_fit_model_r_script)
+    if condition_group is None:
+        condition_group_list = aggr_adata.obs[group_key].unique()
+    elif isinstance(condition_group, str):
+        condition_group_list = [condition_group]
+    else:
+        condition_group_list = condition_group
+    if cell_identity_key is not None:
+        cids = aggr_adata.obs[cell_identity_key].unique()
+    else:
+        # single empty identity so the per-identity loops below run exactly once
+        cids = [""]
+    tt_dict = {}
+    # the loop variable reuses (shadows) the `condition_group` parameter
+    for condition_group in condition_group_list:
+        if reference_group is not None and condition_group == reference_group:
+            continue
+        if verbosity > 0:
+            print(f"Fitting model for {condition_group}...")
+        if reference_group is not None:
+            gk = group_key
+        else:
+            # condition-vs-rest: the model is fitted on the obs column
+            # `<group_key>_<condition_group>` (presumably created by
+            # aggregate_and_filter — verify)
+            gk = f"{group_key}_{condition_group}"
+        try:
+            # a Python None is interpolated as the string "None", which the R
+            # fit_model function treats as "not provided"
+            R(f"""
+                outs <- fit_model(aggr_adata, "{gk}", "{cell_identity_key}", "{batch_key}", verbosity = {verbosity})
+                fit <- outs$fit
+                y <- outs$y
+            """)
+        except RRuntimeError as e:
+            print("Error fitting model for", condition_group)
+            print("Error:", e)
+            print("Skipping...", flush=True)
+            continue
+        if reference_group is None:
+            new_contrasts_tuples = [
+                (
+                    condition_group,  # common prefix
+                    "",  # condition group
+                    "not",  # reference group
+                    cid,  # cell identity
+                )
+                for cid in cids
+            ]
+        else:
+            new_contrasts_tuples = [
+                (
+                    "",  # common prefix
+                    condition_group,  # condition group
+                    reference_group,  # reference group
+                    cid,  # cell identity
+                )
+                for cid in cids
+            ]
+        # contrast strings are "group<cnd level>-group<ref level>"; strip("_")
+        # drops the trailing "_" left when there is no cell identity
+        # NOTE(review): the "group" prefix presumably matches the design-matrix
+        # column names model.matrix derives from the `group` factor — confirm
+        new_contrasts = [
+            f"group{cnd}{prefix}_{cid}".strip("_")
+            + "-"
+            + f"group{ref}{prefix}_{cid}".strip("_")
+            for prefix, cnd, ref, cid in new_contrasts_tuples
+        ]
+        for contrast, contrast_tuple in zip(new_contrasts, new_contrasts_tuples):
+            prefix, cnd, ref, cid = contrast_tuple
+            if ref == "not":
+                # condition-vs-rest: report the key as "<condition>_vs_rest"
+                cnd, ref = "", "rest"
+            contrast_key = f"{prefix}{cnd}_vs_{ref}"
+            if cid:
+                contrast_key = f"{cell_identity_key}:{cid}|{contrast_key}"
+            if verbosity > 0:
+                print(f"Computing contrast: {contrast_key}... ({contrast})")
+            R(f"myContrast <- makeContrasts('{contrast}', levels = y$design)")
+            R("qlf <- glmQLFTest(fit, contrast=myContrast)")
+            R("tt <- topTags(qlf, n = Inf)$table")
+            tt: pd.DataFrame = pandas2ri.rpy2py(R("tt"))
+            tt.index.name = "gene_ids"
+            genes = tt.index
+            # recover both level names by dropping the 5-character "group" prefix
+            # NOTE(review): splitting on "-" assumes group and identity labels
+            # contain no hyphen
+            cnd, ref = [c[5:] for c in contrast.split("-")]
+            tt["pct_expr_cnd"] = aggr_adata.var[f"pct_expr_{cnd}"].loc[genes]
+            tt["pct_expr_ref"] = aggr_adata.var[f"pct_expr_{ref}"].loc[genes]
+            tt["num_expr_cnd"] = aggr_adata.var[f"num_expr_{cnd}"].loc[genes]
+            tt["num_expr_ref"] = aggr_adata.var[f"num_expr_{ref}"].loc[genes]
+            tt["tot_expr_cnd"] = aggr_adata.var[f"tot_expr_{cnd}"].loc[genes]
+            tt["tot_expr_ref"] = aggr_adata.var[f"tot_expr_{ref}"].loc[genes]
+            tt["mean_cnd"] = tt["tot_expr_cnd"] / tt["num_expr_cnd"]
+            tt["mean_ref"] = tt["tot_expr_ref"] / tt["num_expr_ref"]
+            tt_dict[contrast_key] = tt
+    return tt_dict
+# R source evaluated in the embedded R session by pseudobulk_edger. It loads
+# edgeR and MAST and defines `fit_model(adata_, group_key, cell_identity_key,
+# batch_key, verbosity)`, which builds a DGEList from assay "X", filters genes
+# with filterByExpr, normalises with calcNormFactors, builds a design matrix
+# `~ 0 + group + replica` (plus `batch` when batch_key is given), estimates
+# dispersion and fits with glmQLFit. It returns list(fit, design, y).
+# The string "None" is the sentinel for "key not provided".
+# The literal is not raw, so each "\n" below reaches R as a real newline
+# inside the R string.
+_fit_model_r_script = """
+suppressPackageStartupMessages({
+    library(edgeR)
+    library(MAST)
+})
+fit_model <- function(adata_, group_key, cell_identity_key = "None", batch_key = "None", verbosity = 0){
+    if (verbosity > 0){
+        cat("Group key:", group_key, "\n")
+        cat("Cell identity key:", cell_identity_key, "\n")
+    }
+    # create an edgeR object with counts and grouping factor
+    y <- DGEList(assay(adata_, "X"), group = colData(adata_)[[group_key]])
+    # filter out genes with low counts
+    if (verbosity > 1){
+        cat("Dimensions before subsetting:", dim(y), "\n")
+    }
+    keep <- filterByExpr(y)
+    y <- y[keep, , keep.lib.sizes=FALSE]
+    if (verbosity > 1){
+        cat("Dimensions after subsetting:", dim(y), "\n")
+    }
+    # normalize
+    y <- calcNormFactors(y)
+    # create a vector that is concatentation of condition and cell type that we will later use with contrasts
+    if (cell_identity_key == "None"){
+        group <- colData(adata_)[[group_key]]
+    } else {
+        group <- paste0(colData(adata_)[[group_key]], "_", colData(adata_)[[cell_identity_key]])
+    }
+    if (verbosity > 1){
+        cat("Group(s):", group, "\n")
+    }
+    replica <- colData(adata_)$replica
+    # create a design matrix
+    if (batch_key == "None"){
+        design <- model.matrix(~ 0 + group + replica)
+    } else {
+        batch <- colData(adata_)[[batch_key]]
+        design <- model.matrix(~ 0 + group + replica + batch)
+    }
+    # estimate dispersion
+    y <- estimateDisp(y, design = design)
+    # fit the model
+    fit <- glmQLFit(y, design)
+    return(list("fit"=fit, "design"=design, "y"=y))
+}
+"""
+def _try_imports():
+    """Check that the optional R stack needed by ``pseudobulk_edger`` is usable.
+    First imports rpy2 and anndata2ri (and silences R warnings with
+    ``options(warn=-1)``), then loads the R packages edgeR, MAST and
+    SingleCellExperiment through ``importr``. On failure the installation
+    hint is printed and raised.
+    The two checks run in separate ``try`` blocks on purpose. The name
+    ``PackageNotInstalledError`` only exists once the rpy2 import has
+    succeeded, so an ``except PackageNotInstalledError`` clause attached to the
+    import step would raise ``UnboundLocalError`` and mask any import failure
+    that is not a ``ModuleNotFoundError`` (for example R itself missing).
+    Raises
+    ------
+    ModuleNotFoundError
+        If rpy2 or anndata2ri cannot be imported.
+    ImportError
+        If one of the required R packages is not installed.
+    """
+    try:
+        import rpy2.robjects as robjects
+        from rpy2.robjects.packages import PackageNotInstalledError, importr
+        robjects.r("options(warn=-1)")
+        import anndata2ri  # noqa: F401
+        from rpy2.rinterface_lib.embedded import RRuntimeError  # noqa: F401
+        from rpy2.robjects import numpy2ri, pandas2ri  # noqa: F401
+        from rpy2.robjects.conversion import localconverter  # noqa: F401
+    except ModuleNotFoundError as err:
+        message = (
+            "pseudobulk_edger requires rpy2 and anndata2ri to be installed.\n"
+            "Install them with one of:\n"
+            "$ pip install rpy2 sclab-tools[r]\n"
+            "or\n"
+            "$ pip install rpy2 anndata2ri\n"
+            "or\n"
+            "$ conda install -c conda-forge rpy2 anndata2ri\n"
+        )
+        print(message)
+        raise ModuleNotFoundError(message) from err
+    try:
+        for r_package in ("edgeR", "MAST", "SingleCellExperiment"):
+            importr(r_package)
+    except PackageNotInstalledError as err:
+        message = (
+            "pseudobulk_edger requires the following R packages to be installed: edgeR, MAST, and SingleCellExperiment.\n"
+            "> \n"
+            "> if (!require('BiocManager', quietly = TRUE)) install.packages('BiocManager');\n"
+            "> BiocManager::install(c('edgeR', 'MAST', 'SingleCellExperiment'));\n"
+            "> \n"
+        )
+        print(message)
+        raise ImportError(message) from err

sclab 0.2.5__py3-none-any.whl → 0.3.0__py3-none-any.whl

Potentially problematic release.

sclab 0.2.5 (py3-none-any.whl) → 0.3.0 (py3-none-any.whl)