PyPI - sclab - Versions diffs - 0.2.5__py3-none-any.whl → 0.3.1__py3-none-any.whl - Mend

sclab 0.2.5__py3-none-any.whl → 0.3.1__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.

This version of sclab might be problematic. Click here for more details.

Files changed (53) hide show

sclab/__init__.py +1 -1
sclab/_sclab.py +7 -3
sclab/dataset/_dataset.py +1 -1
sclab/dataset/processor/_processor.py +19 -4
sclab/examples/processor_steps/__init__.py +2 -0
sclab/examples/processor_steps/_doublet_detection.py +68 -0
sclab/examples/processor_steps/_integration.py +47 -20
sclab/examples/processor_steps/_neighbors.py +24 -4
sclab/examples/processor_steps/_pca.py +11 -6
sclab/examples/processor_steps/_preprocess.py +14 -1
sclab/examples/processor_steps/_qc.py +22 -6
sclab/gui/__init__.py +0 -0
sclab/gui/components/__init__.py +7 -0
sclab/gui/components/_guided_pseudotime.py +482 -0
sclab/gui/components/_transfer_metadata.py +186 -0
sclab/methods/__init__.py +16 -0
sclab/preprocess/__init__.py +19 -0
sclab/preprocess/_cca.py +154 -0
sclab/preprocess/_cca_integrate.py +109 -0
sclab/preprocess/_filter_obs.py +42 -0
sclab/preprocess/_harmony.py +421 -0
sclab/preprocess/_harmony_integrate.py +53 -0
sclab/preprocess/_normalize_weighted.py +61 -0
sclab/preprocess/_subset.py +208 -0
sclab/preprocess/_transfer_metadata.py +137 -0
sclab/preprocess/_transform.py +82 -0
sclab/preprocess/_utils.py +96 -0
sclab/tools/__init__.py +0 -0
sclab/tools/cellflow/__init__.py +0 -0
sclab/tools/cellflow/density_dynamics/__init__.py +0 -0
sclab/tools/cellflow/density_dynamics/_density_dynamics.py +349 -0
sclab/tools/cellflow/pseudotime/__init__.py +0 -0
sclab/tools/cellflow/pseudotime/_pseudotime.py +332 -0
sclab/tools/cellflow/pseudotime/timeseries.py +226 -0
sclab/tools/cellflow/utils/__init__.py +0 -0
sclab/tools/cellflow/utils/density_nd.py +215 -0
sclab/tools/cellflow/utils/interpolate.py +334 -0
sclab/tools/cellflow/utils/smoothen.py +124 -0
sclab/tools/cellflow/utils/times.py +55 -0
sclab/tools/differential_expression/__init__.py +5 -0
sclab/tools/differential_expression/_pseudobulk_edger.py +304 -0
sclab/tools/differential_expression/_pseudobulk_helpers.py +277 -0
sclab/tools/doublet_detection/__init__.py +5 -0
sclab/tools/doublet_detection/_scrublet.py +64 -0
sclab/tools/labeling/__init__.py +6 -0
sclab/tools/labeling/sctype.py +233 -0
sclab/utils/__init__.py +5 -0
sclab/utils/_write_excel.py +510 -0
{sclab-0.2.5.dist-info → sclab-0.3.1.dist-info}/METADATA +6 -2
sclab-0.3.1.dist-info/RECORD +82 -0
sclab-0.2.5.dist-info/RECORD +0 -45
{sclab-0.2.5.dist-info → sclab-0.3.1.dist-info}/WHEEL +0 -0
{sclab-0.2.5.dist-info → sclab-0.3.1.dist-info}/licenses/LICENSE +0 -0

sclab/tools/cellflow/pseudotime/timeseries.py ADDED Viewed

@@ -0,0 +1,226 @@
+from typing import Callable, NamedTuple
+import numpy as np
+from numpy.lib.stride_tricks import sliding_window_view
+from numpy.typing import NDArray
+from scipy.signal import find_peaks
+from scipy.sparse import csr_matrix, issparse
+from tqdm.auto import tqdm
+from ..utils.interpolate import NDBSpline
+def periodic_sliding_window(
+    data: NDArray, t: NDArray, window_size: int, fn: Callable[[NDArray], NDArray]
+) -> NDArray:
+    ws = window_size + ((window_size - 1) % 2)
+    window_shape = (ws,) + (1,) * (data.ndim - 1)
+    o = np.argsort(t)
+    oo = np.argsort(o)
+    d = data[o]
+    dd = [*d[-ws // 2 + 1 :], *d, *d[: ws // 2]]
+    windows = sliding_window_view(dd, window_shape=window_shape).squeeze()
+    return fn(windows, axis=-1)[oo]
+def equalization(
+    times: NDArray,
+    t_range: tuple[float, float],
+    max_bins: int = 200,
+    iterations: int = 1e4,
+    tolerance: float = 0.02,
+) -> NDArray:
+    if not isinstance(times, np.ndarray):
+        raise TypeError("times must be a numpy array")
+    if times.ndim != 1:
+        raise ValueError("times must be a 1D array")
+    t_min, t_max = t_range
+    t_span = t_max - t_min
+    # for sorting the values
+    o = np.argsort(times)
+    # and recovering the original order
+    oo = np.argsort(o)
+    alpha = 0.1
+    scale_offset = 1
+    rng = np.random.default_rng()
+    scaled_times = times.copy()
+    for n_bins in tqdm(np.arange(25, max_bins + 1, 25)):
+        for it in range(int(iterations)):
+            bins = np.linspace(t_min, t_max, n_bins + 1)
+            bins[1:-1] += rng.normal(0, t_span / n_bins / 100, bins[1:-1].size)
+            counts, _ = np.histogram(scaled_times, bins=bins)
+            tmp: NDArray = counts / counts.max()
+            rms = np.sqrt(np.mean((tmp - tmp.mean()) ** 2))
+            if rms < tolerance:
+                break
+            scales = counts / counts.max() * alpha + scale_offset
+            t = scaled_times[o]
+            tt = []
+            i = 0
+            timepoint = 0.0
+            for start, end, scale in zip(bins[:-1], bins[1:], scales):
+                bin_size = end - start
+                new_size = bin_size * scale
+                while i < t.size and t[i] < end:
+                    new_t = (t[i] - start) * scale + timepoint
+                    tt.append(new_t)
+                    i += 1
+                timepoint += new_size
+            tt = np.array(tt)
+            scaled_times = tt[oo] / timepoint * t_span + t_min
+        else:
+            cnts_mean, cnts_max, cnts_min = counts.mean(), counts.max(), counts.min()
+            print(
+                f"Failed to converge. RMS: {rms}. "
+                + f"({cnts_mean=:.2f}, {cnts_max=:.2f}, {cnts_min=:.2f})"
+            )
+    return scaled_times
+def fit_trends(
+    X: NDArray | csr_matrix,
+    times: NDArray,
+    t_range: tuple[float, float],
+    periodic: bool,
+    grid_size: int = 128,
+    roughness: float | None = None,
+    zero_weight: float = 0.5,
+    window_width: float | None = None,
+    n_timesteps: int | None = None,
+    timestep_delta: float | None = None,
+    progress: bool = True,
+) -> None:
+    if issparse(X):
+        X = np.ascontiguousarray(X.todense())
+    tmin, tmax = t_range
+    mask = ~np.isnan(times)
+    t = times[mask]
+    X = X[mask]
+    F = NDBSpline(
+        grid_size=grid_size,
+        t_range=t_range,
+        periodic=periodic,
+        zero_weight=zero_weight,
+        roughness=roughness,
+        window_width=window_width,
+    )
+    F.fit(t, X, progress=progress)
+    eps = np.finfo(float).eps
+    SNR: NDArray
+    SNR = F(t).var(axis=0) / (X.var(axis=0) + eps)
+    SNR = SNR / SNR.max()
+    # x = np.linspace(*t_range, 10001)[:-1]
+    # peak_time = x[np.argmax(F(x), axis=0)]
+    if n_timesteps is not None and timestep_delta is not None:
+        raise ValueError("Cannot specify both n_timesteps and timestep_delta")
+    elif n_timesteps is None and timestep_delta is None:
+        # default
+        x = np.linspace(*t_range, 101)
+    elif n_timesteps is not None:
+        x = np.linspace(*t_range, n_timesteps)
+    elif timestep_delta is not None:
+        x = np.arange(tmin, tmax + timestep_delta, timestep_delta)
+    Y = F(x)
+    return x, Y
+class SinglePeakResult(NamedTuple):
+    times: NDArray
+    heights: NDArray
+    scores: NDArray
+    info: NDArray
+def find_single_peaks(
+    X: NDArray,
+    t: NDArray,
+    t_range: tuple[float, float] = (0, 1),
+    grid_size: int = 512,
+    periodic: bool = True,
+    zero_weight: float = 0.2,
+    roughness: float = 2,
+    n_timesteps: int = 201,
+    width_range: tuple[float, float] = (0, 100),
+    score_threshold: float = 2.5,
+    progress: bool = True,
+) -> tuple[NDArray, NDArray]:
+    X = X / np.percentile(X + 1, 99, axis=0, keepdims=True)
+    x, Y = fit_trends(
+        X,
+        t,
+        t_range=t_range,
+        periodic=periodic,
+        grid_size=grid_size,
+        zero_weight=zero_weight,
+        roughness=roughness,
+        n_timesteps=n_timesteps,
+        progress=progress,
+    )
+    peak_times = np.full(X.shape[1], np.nan)
+    peak_heights = np.full(X.shape[1], np.nan)
+    peak_scores = np.full(X.shape[1], np.nan)
+    peak_info_data = [{}] * X.shape[1]
+    idx_sequence = range(X.shape[1])
+    if progress:
+        idx_sequence = tqdm(idx_sequence)
+    for i in idx_sequence:
+        y = Y[:, i]
+        k, info = find_peaks(y, prominence=0.05, width=width_range, height=0)
+        m = np.median(y)
+        s = y[k] / m
+        k = k[s > score_threshold]
+        if len(k) == 1:
+            peak_times[i] = x[k]
+            peak_heights[i] = y[k]
+            peak_scores[i] = np.log2(s[0])
+            peak_info_data[i] = info
+    return SinglePeakResult(peak_times, peak_heights, peak_scores, peak_info_data)
+def piecewise_scaling(
+    times: NDArray,
+    t_range: tuple[float, float],
+    start: float,
+    end: float,
+    new_end: float,
+) -> NDArray:
+    tmin, tmax = t_range
+    times_pws = np.full(times.shape, np.nan)
+    mask = (times >= tmin) & (times < start)
+    times_pws[mask] = times[mask]
+    mask = (times >= start) & (times < end)
+    times_pws[mask] = (times[mask] - start) / (end - start) * (new_end - start) + start
+    mask = (times >= end) & (times < tmax)
+    times_pws[mask] = (times[mask] - end) / (tmax - end) * (tmax - new_end) + new_end
+    return times_pws

sclab/tools/cellflow/utils/__init__.py ADDED Viewed

File without changes

sclab/tools/cellflow/utils/density_nd.py ADDED Viewed

@@ -0,0 +1,215 @@
+from itertools import product
+from typing import Literal, NamedTuple
+import matplotlib.pyplot as plt
+import numpy as np
+from numpy.typing import NDArray
+from scipy.integrate import trapezoid
+from scipy.interpolate import BSpline
+from sklearn.neighbors import KernelDensity
+from .interpolate import fit_smoothing_spline
+class DensityResult(NamedTuple):
+    kde: KernelDensity
+    grid_size: int
+    bounds: tuple[tuple[float, float], ...]
+    grid: NDArray
+    density: NDArray
+    scale: float
+    periodic: bool
+def density_nd(
+    data: NDArray,
+    bandwidth: float | Literal["scott", "silverman"] | None = None,
+    algorithm: Literal["kd_tree", "ball_tree", "auto"] = "auto",
+    kernel: str = "gaussian",
+    metric: str = "euclidean",
+    grid_size: tuple | None = None,
+    max_grid_size: int = 2**5 + 1,
+    periodic: bool = False,
+    bounds: tuple[tuple[float, float], ...] | None = None,
+    normalize: bool = False,
+) -> DensityResult:
+    if data.ndim == 1:
+        data = data.reshape(-1, 1)
+    nsamples, ndims = data.shape
+    if bounds is None:
+        assert not periodic, "bounds must be specified if periodic=True"
+        lower, upper = data.min(axis=0), data.max(axis=0)
+        span = upper - lower
+        margins = span / 10
+        bounds = tuple(zip(lower - margins, upper + margins))
+    assert len(bounds) == ndims, "must provide bounds for each dimension"
+    if periodic:
+        offsets = np.array(list(product([-1, 0, 1], repeat=ndims)))
+        offsets = offsets * np.diff(bounds).T
+        dat = np.empty((nsamples * 3**ndims, ndims))
+        for i, offset in enumerate(offsets):
+            dat[i * nsamples : (i + 1) * nsamples] = data + offset[None, :]
+    else:
+        dat = data
+    if bandwidth is None:
+        bandwidth = np.diff(bounds).max() / 64
+    kde = KernelDensity(
+        bandwidth=bandwidth,
+        algorithm=algorithm,
+        kernel=kernel,
+        metric=metric,
+    )
+    kde.fit(dat)
+    if grid_size is None:
+        max_span = np.diff(bounds).max()
+        rel_span = np.diff(bounds).flatten() / max_span
+        grid_size = tuple((rel_span * max_grid_size).astype(int))
+    grid = np.meshgrid(
+        *[np.linspace(*b, n) for b, n in zip(bounds, grid_size)], indexing="ij"
+    )
+    grid = np.vstack([x.ravel() for x in grid]).T
+    d = np.exp(kde.score_samples(grid))
+    if normalize and ndims == 1:
+        scale = trapezoid(d, grid.reshape(-1))
+    elif normalize:
+        # perform simple Riemmann sum for higher dimensions
+        deltas = np.diff(bounds).T / (np.array(grid_size) - 1)
+        tmp = d.reshape(grid_size).copy()
+        for i, s in enumerate(grid_size):
+            # take left corners for the sum
+            tmp = tmp.take(np.arange(s - 1), axis=i)
+        scale = tmp.sum() * np.prod(deltas)
+    else:
+        scale = 1
+    d /= scale
+    return DensityResult(kde, grid_size, bounds, grid, d, scale, periodic)
+def fit_density_1d(
+    times: NDArray[np.floating],
+    t_range: tuple[float, float],
+    periodic: bool,
+    bandwidth: float | None = None,
+    algorithm: str = "auto",
+    kernel: str = "gaussian",
+    metric: str = "euclidean",
+    max_grid_size: int = 2**8 + 1,
+    lam: float = 1e-5,
+) -> tuple[DensityResult, BSpline]:
+    tmin, tmax = t_range
+    tspan = tmax - tmin
+    times_mask = (tmin <= times) * (times <= tmax)
+    times = times[times_mask]
+    if bandwidth is None:
+        bandwidth = tspan / 64
+    rslt = density_nd(
+        times.reshape(-1, 1),
+        bandwidth=bandwidth,
+        algorithm=algorithm,
+        kernel=kernel,
+        metric=metric,
+        max_grid_size=max_grid_size,
+        periodic=periodic,
+        bounds=(t_range,),
+        normalize=True,
+    )
+    bspl = fit_smoothing_spline(
+        rslt.grid[:, 0],
+        rslt.density,
+        t_range,
+        lam=lam,
+        periodic=periodic,
+    )
+    return rslt, bspl
+def density_result_1d(
+    rslt: DensityResult,
+    data: NDArray | None = None,
+    density_fit_lam: float = 1e-6,
+    plot_density: bool = False,
+    plot_density_fit: bool = True,
+    plot_density_fit_derivative: bool = False,
+    plot_histogram: bool = False,
+    histogram_nbins: int = 50,
+    ax: plt.Axes | None = None,
+    show: bool = True,
+):
+    if plot_density | plot_density_fit | plot_density_fit_derivative | plot_histogram:
+        pass
+    else:
+        raise ValueError("At least one of the plotting options must be True")
+    tmin, tmax = rslt.grid.min(), rslt.grid.max()
+    bspl = fit_smoothing_spline(
+        rslt.grid[:, 0],
+        rslt.density,
+        t_range=(tmin, tmax),
+        lam=density_fit_lam,
+        periodic=rslt.periodic,
+    )
+    if ax is None:
+        plt.figure(figsize=(10, 3))
+    else:
+        plt.sca(ax)
+    ax = plt.gca()
+    if plot_density:
+        ax.plot(rslt.grid.flatten(), rslt.density, color="black", linewidth=0.5)
+    if plot_histogram:
+        assert data is not None, "data must be provided if plot_histogram=True"
+        # we expand the time vector to make sure that the first and last point
+        # are not cut by the boundary. This also helps to avoid the problem of
+        # the first and last point having different values (should be periodic).
+        tt = np.concatenate([data - tmax, data, data + tmax])
+        bins = np.linspace(-tmax, 2 * tmax, histogram_nbins * 3 + 1)
+        dd = np.histogram(tt, bins=bins, density=True)[0]
+        # we take the middle points of the bins
+        xx = bins[:-1] + np.diff(bins) / 2
+        # we recover the original time vector and corresponding density
+        x = xx[histogram_nbins : 2 * histogram_nbins]
+        d = dd[histogram_nbins : 2 * histogram_nbins] * 3  # correct the density
+        ax.bar(x, d, width=1 / histogram_nbins, fill=False, linewidth=0.5)
+    x = np.linspace(tmin, tmax, 2**10 + 1)
+    if plot_density_fit:
+        plt.plot(x, bspl(x), color="blue")
+    ax.set_ylabel("Density", color="blue")
+    ax.set_yticks([])
+    ymin, ymax = plt.ylim()
+    # add a bit of padding in the y axis (about 10% of current range)
+    plt.ylim(ymin, ymax + 0.10 * (ymax - ymin))
+    if plot_density_fit_derivative:
+        if plot_density or histogram_nbins or plot_density_fit:
+            plt.twinx()
+        plt.plot(x, bspl.derivative()(x), color="red")
+        plt.hlines(0, tmin, tmax, linestyles="dashed", linewidth=0.5, color="black")
+        plt.ylabel("Derivative", color="red")
+        plt.gca().set_yticks([])
+        # add padding in the y axis to make zero be in the middle
+        ymax = np.abs(plt.ylim()).max() * 1.05
+        plt.ylim(-ymax, ymax)
+    if show:
+        plt.show()
+    else:
+        return plt.gca()

sclab 0.2.5__py3-none-any.whl → 0.3.1__py3-none-any.whl

Potentially problematic release.

sclab 0.2.5__py3-none-any.whl → 0.3.1__py3-none-any.whl