sclab 0.2.5__py3-none-any.whl → 0.3.1__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release: this version of sclab might be problematic.

Files changed (53)
  1. sclab/__init__.py +1 -1
  2. sclab/_sclab.py +7 -3
  3. sclab/dataset/_dataset.py +1 -1
  4. sclab/dataset/processor/_processor.py +19 -4
  5. sclab/examples/processor_steps/__init__.py +2 -0
  6. sclab/examples/processor_steps/_doublet_detection.py +68 -0
  7. sclab/examples/processor_steps/_integration.py +47 -20
  8. sclab/examples/processor_steps/_neighbors.py +24 -4
  9. sclab/examples/processor_steps/_pca.py +11 -6
  10. sclab/examples/processor_steps/_preprocess.py +14 -1
  11. sclab/examples/processor_steps/_qc.py +22 -6
  12. sclab/gui/__init__.py +0 -0
  13. sclab/gui/components/__init__.py +7 -0
  14. sclab/gui/components/_guided_pseudotime.py +482 -0
  15. sclab/gui/components/_transfer_metadata.py +186 -0
  16. sclab/methods/__init__.py +16 -0
  17. sclab/preprocess/__init__.py +19 -0
  18. sclab/preprocess/_cca.py +154 -0
  19. sclab/preprocess/_cca_integrate.py +109 -0
  20. sclab/preprocess/_filter_obs.py +42 -0
  21. sclab/preprocess/_harmony.py +421 -0
  22. sclab/preprocess/_harmony_integrate.py +53 -0
  23. sclab/preprocess/_normalize_weighted.py +61 -0
  24. sclab/preprocess/_subset.py +208 -0
  25. sclab/preprocess/_transfer_metadata.py +137 -0
  26. sclab/preprocess/_transform.py +82 -0
  27. sclab/preprocess/_utils.py +96 -0
  28. sclab/tools/__init__.py +0 -0
  29. sclab/tools/cellflow/__init__.py +0 -0
  30. sclab/tools/cellflow/density_dynamics/__init__.py +0 -0
  31. sclab/tools/cellflow/density_dynamics/_density_dynamics.py +349 -0
  32. sclab/tools/cellflow/pseudotime/__init__.py +0 -0
  33. sclab/tools/cellflow/pseudotime/_pseudotime.py +332 -0
  34. sclab/tools/cellflow/pseudotime/timeseries.py +226 -0
  35. sclab/tools/cellflow/utils/__init__.py +0 -0
  36. sclab/tools/cellflow/utils/density_nd.py +215 -0
  37. sclab/tools/cellflow/utils/interpolate.py +334 -0
  38. sclab/tools/cellflow/utils/smoothen.py +124 -0
  39. sclab/tools/cellflow/utils/times.py +55 -0
  40. sclab/tools/differential_expression/__init__.py +5 -0
  41. sclab/tools/differential_expression/_pseudobulk_edger.py +304 -0
  42. sclab/tools/differential_expression/_pseudobulk_helpers.py +277 -0
  43. sclab/tools/doublet_detection/__init__.py +5 -0
  44. sclab/tools/doublet_detection/_scrublet.py +64 -0
  45. sclab/tools/labeling/__init__.py +6 -0
  46. sclab/tools/labeling/sctype.py +233 -0
  47. sclab/utils/__init__.py +5 -0
  48. sclab/utils/_write_excel.py +510 -0
  49. {sclab-0.2.5.dist-info → sclab-0.3.1.dist-info}/METADATA +6 -2
  50. sclab-0.3.1.dist-info/RECORD +82 -0
  51. sclab-0.2.5.dist-info/RECORD +0 -45
  52. {sclab-0.2.5.dist-info → sclab-0.3.1.dist-info}/WHEEL +0 -0
  53. {sclab-0.2.5.dist-info → sclab-0.3.1.dist-info}/licenses/LICENSE +0 -0
sclab/methods/__init__.py CHANGED
@@ -1,3 +1,5 @@
+from importlib.util import find_spec
+
 from .._methods_registry import register_sclab_method
 from ..examples.processor_steps import (
     PCA,
@@ -5,11 +7,13 @@ from ..examples.processor_steps import (
     UMAP,
     Cluster,
     DifferentialExpression,
+    DoubletDetection,
     GeneExpression,
     Integration,
     Neighbors,
     Preprocess,
 )
+from ..gui.components import GuidedPseudotime, TransferMetadata

 __all__ = [
     "QC",
@@ -19,8 +23,10 @@ __all__ = [
     "Neighbors",
     "UMAP",
     "Cluster",
+    "DoubletDetection",
     "GeneExpression",
     "DifferentialExpression",
+    "GuidedPseudotime",
 ]

 register_sclab_method("Processing")(QC)
@@ -28,7 +34,17 @@ register_sclab_method("Processing")(Preprocess)
 register_sclab_method("Processing")(PCA)
 register_sclab_method("Processing")(Integration)
 register_sclab_method("Processing")(Neighbors)
+register_sclab_method("Processing")(TransferMetadata)
 register_sclab_method("Processing")(UMAP)
 register_sclab_method("Processing")(Cluster)
+
+if any(
+    [
+        find_spec("scrublet"),
+    ]
+):
+    register_sclab_method("Processing")(DoubletDetection)
+
 register_sclab_method("Analysis")(GeneExpression)
 register_sclab_method("Analysis")(DifferentialExpression)
+register_sclab_method("Analysis")(GuidedPseudotime)
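Note: DoubletDetection is only registered when its optional backend is installed; importlib.util.find_spec probes availability without importing the module, so a missing dependency cannot raise at registration time. A minimal sketch of the same guard (the print messages are illustrative, not part of the package):

from importlib.util import find_spec

# find_spec returns None when the module is not installed and does not
# execute the module, so this check is safe even without scrublet.
if find_spec("scrublet") is not None:
    print("scrublet found: DoubletDetection will appear under 'Processing'")
else:
    print("scrublet missing: DoubletDetection is skipped")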
sclab/preprocess/__init__.py ADDED
@@ -0,0 +1,19 @@
+from ._cca_integrate import cca_integrate, cca_integrate_pair
+from ._filter_obs import filter_obs
+from ._harmony_integrate import harmony_integrate
+from ._normalize_weighted import normalize_weighted
+from ._subset import subset_obs, subset_var
+from ._transfer_metadata import transfer_metadata
+from ._transform import pool_neighbors
+
+__all__ = [
+    "cca_integrate",
+    "cca_integrate_pair",
+    "filter_obs",
+    "harmony_integrate",
+    "normalize_weighted",
+    "pool_neighbors",
+    "subset_obs",
+    "subset_var",
+    "transfer_metadata",
+]
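Note: the new __init__ re-exports each helper from its private module, so callers import from the subpackage directly, e.g.:

from sclab.preprocess import (
    cca_integrate,
    filter_obs,
    normalize_weighted,
    transfer_metadata,
)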
sclab/preprocess/_cca.py ADDED
@@ -0,0 +1,154 @@
+import logging
+from typing import Literal
+
+import numpy as np
+from numpy import matrix
+from numpy.typing import NDArray
+from scipy.linalg import svd
+from scipy.sparse import csc_matrix, csr_matrix, issparse
+from scipy.sparse.linalg import svds
+from sklearn.utils.extmath import randomized_svd
+
+logger = logging.getLogger(__name__)
+
+
+def cca(
+    X: NDArray | csr_matrix | csc_matrix,
+    Y: NDArray | csr_matrix | csc_matrix,
+    n_components=None,
+    svd_solver: Literal["full", "partial", "randomized"] = "partial",
+    normalize: bool = False,
+    random_state=42,
+) -> tuple[NDArray, NDArray, NDArray]:
+    """
+    CCA-style integration for two single-cell matrices with unequal numbers of cells.
+
+    Parameters
+    ----------
+    X, Y : array-like, shape (n_cells, n_features)
+        Cell-by-feature matrices sharing the same feature space (variable
+        genes/PCs) in the same order.
+    n_components : int or None
+        Dimensionality of the canonical space (default = all that the smaller
+        dataset allows).
+    svd_solver : {'full', 'partial', 'randomized'}
+        'randomized' uses the Halko et al. algorithm
+        (`sklearn.utils.extmath.randomized_svd`) and is strongly recommended
+        when only the leading few components are needed.
+    normalize : bool
+        If True, scale each cell's canonical variables to unit L2 norm.
+    random_state : int or None
+        Passed through to the randomized SVD for reproducibility.
+
+    Returns
+    -------
+    U : (n_cells(X), k) ndarray
+        Cell-level canonical variables for X.
+    s : (k,) ndarray
+        Singular values of the cross-covariance matrix.
+    V : (n_cells(Y), k) ndarray
+        Cell-level canonical variables for Y.
+    """
+    n1, p1 = X.shape
+    n2, p2 = Y.shape
+    if p1 != p2:
+        raise ValueError("The two matrices must have the same number of features.")
+
+    k = n_components or min(n1, n2)
+
+    if issparse(X):
+        C = _cross_covariance_sparse(X, Y)
+    else:
+        C = _cross_covariance_dense(X, Y)
+
+    logger.info(f"Cross-covariance computed. Shape: {C.shape}")
+
+    Uc, s, Vct = _svd_decomposition(C, k, svd_solver, random_state)
+
+    # Canonical variables: the left and right singular vectors of the
+    # cell-by-cell cross-covariance are the cell embeddings.
+    U = Uc  # (n1 x k)
+    V = Vct.T  # (n2 x k)
+
+    if normalize:
+        logger.info("Normalizing canonical variables...")
+        U = U / np.linalg.norm(U, axis=1, keepdims=True)
+        V = V / np.linalg.norm(V, axis=1, keepdims=True)
+
+    logger.info("Done.")
+
+    return U, s, V
+
+
+def _svd_decomposition(
+    C: NDArray,
+    k: int,
+    svd_solver: Literal["full", "partial", "randomized"],
+    random_state: int | None,
+) -> tuple[NDArray, NDArray, NDArray]:
+    if svd_solver == "full":
+        logger.info("SVD decomposition with full SVD...")
+        Uc, s, Vct = svd(C, full_matrices=False)
+        Uc, s, Vct = Uc[:, :k], s[:k], Vct[:k, :]
+
+    elif svd_solver == "partial":
+        logger.info("SVD decomposition with partial SVD...")
+        Uc, s, Vct = svds(C, k=k)
+
+    elif svd_solver == "randomized":
+        logger.info("SVD decomposition with randomized SVD...")
+        Uc, s, Vct = randomized_svd(C, n_components=k, random_state=random_state)
+
+    else:
+        raise ValueError("svd_solver must be 'full', 'partial', or 'randomized'.")
+
+    # svds returns components in ascending order; sort every solver's output
+    # by decreasing singular value for a consistent component order.
+    order = np.argsort(-s)
+    s = s[order]
+    Uc = Uc[:, order]
+    Vct = Vct[order, :]
+
+    return Uc, s, Vct
+
+
+def _cross_covariance_sparse(X: csr_matrix, Y: csr_matrix) -> NDArray:
+    _, p1 = X.shape
+    _, p2 = Y.shape
+    if p1 != p2:
+        raise ValueError("The two matrices must have the same number of features.")
+
+    p = p1
+
+    # TODO: incorporate sparse scaling
+
+    logger.info("Computing cross-covariance on sparse matrices...")
+
+    mux: matrix = X.mean(axis=0)
+    muy: matrix = Y.mean(axis=0)
+
+    # Expand (X - mux)(Y - muy)^T into four terms so X and Y stay sparse.
+    XYt: csr_matrix = X.dot(Y.T)
+    Xmuyt: matrix = X.dot(muy.T)
+    muxYt: matrix = Y.dot(mux.T).T
+    muxmuyt: float = (mux @ muy.T)[0, 0]
+
+    C = (XYt - Xmuyt - muxYt + muxmuyt) / (p - 1)
+
+    return np.asarray(C)
+
+
+def _cross_covariance_dense(X: NDArray, Y: NDArray) -> NDArray:
+    _, p1 = X.shape
+    _, p2 = Y.shape
+    if p1 != p2:
+        raise ValueError("The two matrices must have the same number of features.")
+
+    p = p1
+
+    logger.info("Computing cross-covariance on dense matrices...")
+    X = _dense_scale(X)
+    Y = _dense_scale(Y)
+
+    X = X - X.mean(axis=0, keepdims=True)
+    Y = Y - Y.mean(axis=0, keepdims=True)
+
+    C: NDArray = (X @ Y.T) / (p - 1)
+
+    return C
+
+
+def _dense_scale(A: NDArray) -> NDArray:
+    A = np.asarray(A)
+    eps = np.finfo(A.dtype).eps
+    return A / (A.std(axis=0, ddof=1, keepdims=True) + eps)
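Note: a quick shape check of cca on synthetic dense data (hypothetical sizes, importing from the private module path added above; not from the package's tests):

import numpy as np
from sclab.preprocess._cca import cca

rng = np.random.default_rng(0)
X = rng.normal(size=(200, 50))  # 200 cells x 50 shared features
Y = rng.normal(size=(120, 50))  # 120 cells x the same 50 features

U, s, V = cca(X, Y, n_components=10, svd_solver="randomized", random_state=0)
assert U.shape == (200, 10) and V.shape == (120, 10) and s.shape == (10,)
assert np.all(np.diff(s) <= 0)  # singular values come back in decreasing order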
sclab/preprocess/_cca_integrate.py ADDED
@@ -0,0 +1,109 @@
+import numpy as np
+from anndata import AnnData
+
+from ._cca import cca
+
+
+def cca_integrate(
+    adata: AnnData,
+    key: str,
+    *,
+    basis: str = "X",
+    adjusted_basis: str | None = None,
+    reference_batch: str | list[str] | None = None,
+    mask_var: str | None = None,
+    n_components: int = 30,
+    svd_solver: str = "partial",
+    normalize: bool = False,
+    random_state: int | None = None,
+):
+    # Only the two-batch case is implemented; multi-batch integration
+    # (and the reference_batch option) is not supported yet.
+    n_groups = adata.obs[key].nunique()
+    if n_groups == 2:
+        cca_integrate_pair(
+            adata,
+            key,
+            adata.obs[key].unique()[0],
+            adata.obs[key].unique()[1],
+            basis=basis,
+            adjusted_basis=adjusted_basis,
+            mask_var=mask_var,
+            n_components=n_components,
+            svd_solver=svd_solver,
+            normalize=normalize,
+            random_state=random_state,
+        )
+    else:
+        raise NotImplementedError
+
+
+def cca_integrate_pair(
+    adata: AnnData,
+    key: str,
+    group1: str,
+    group2: str,
+    *,
+    basis: str | None = None,
+    adjusted_basis: str | None = None,
+    mask_var: str | None = None,
+    n_components: int = 30,
+    svd_solver: str = "partial",
+    normalize: bool = False,
+    random_state: int | None = None,
+):
+    if basis is None:
+        basis = "X"
+
+    if adjusted_basis is None:
+        adjusted_basis = basis + "_cca"
+
+    if mask_var is not None:
+        mask = adata.var[mask_var].values
+    else:
+        mask = np.ones(adata.n_vars, dtype=bool)
+
+    Xs = {}
+    groups = adata.obs.groupby(key, observed=True).groups
+    for gr, idx in groups.items():
+        Xs[gr] = _get_basis(adata[idx, mask], basis)
+
+    Ys = {}
+    Ys[group1], sigma, Ys[group2] = cca(
+        Xs[group1],
+        Xs[group2],
+        n_components=n_components,
+        svd_solver=svd_solver,
+        normalize=normalize,
+        random_state=random_state,
+    )
+
+    if (
+        adjusted_basis not in adata.obsm
+        or adata.obsm[adjusted_basis].shape[1] != n_components
+    ):
+        adata.obsm[adjusted_basis] = np.full((adata.n_obs, n_components), np.nan)
+
+    if adjusted_basis not in adata.uns:
+        adata.uns[adjusted_basis] = {}
+
+    # Scatter each group's canonical variables back into the shared obsm
+    # array, and keep per-pair singular values in uns.
+    uns = adata.uns[adjusted_basis]
+    uns[f"{group1}-{group2}"] = {"sigma": sigma}
+    for gr, obs_names in groups.items():
+        idx = adata.obs_names.get_indexer(obs_names)
+        adata.obsm[adjusted_basis][idx] = Ys[gr]
+        uns[gr] = Ys[gr]
+
+
+def _get_basis(adata: AnnData, basis: str):
+    if basis == "X":
+        X = adata.X
+
+    elif basis in adata.layers:
+        X = adata.layers[basis]
+
+    elif basis in adata.obsm:
+        X = adata.obsm[basis]
+
+    else:
+        raise ValueError(f"Unknown basis {basis}")
+
+    return X
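Note: a hypothetical call, assuming adata.obs["batch"] has exactly two levels (the only case currently implemented) and a boolean adata.var["highly_variable"] column:

from sclab.preprocess import cca_integrate

cca_integrate(
    adata,
    "batch",
    mask_var="highly_variable",  # restrict CCA to the shared variable genes
    n_components=30,
)
# With the default basis "X", embeddings land in adata.obsm["X_cca"] and the
# singular values in adata.uns["X_cca"]["<group1>-<group2>"]["sigma"].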
sclab/preprocess/_filter_obs.py ADDED
@@ -0,0 +1,42 @@
+import numpy as np
+from anndata import AnnData
+from scipy.stats import rankdata
+
+
+def filter_obs(
+    adata: AnnData,
+    *,
+    layer: str | None = None,
+    min_counts: int | None = None,
+    min_genes: int | None = None,
+    max_counts: int | None = None,
+    max_cells: int | None = None,
+) -> None:
+    if layer is not None:
+        X = adata.layers[layer]
+    else:
+        X = adata.X
+
+    remove_mask = np.zeros(X.shape[0], dtype=bool)
+
+    if min_genes is not None:
+        M = X > 0
+        rowsums = np.asarray(M.sum(axis=1)).squeeze()
+        remove_mask[rowsums < min_genes] = True
+
+    if min_counts is not None or max_counts is not None or max_cells is not None:
+        rowsums = np.asarray(X.sum(axis=1)).squeeze()
+
+        if min_counts is not None:
+            remove_mask[rowsums < min_counts] = True
+
+        if max_counts is not None:
+            remove_mask[rowsums > max_counts] = True
+
+        if max_cells is not None:
+            # rank cells by decreasing total counts; keep the top max_cells
+            ranks = rankdata(-rowsums, method="min")
+            remove_mask[ranks > max_cells] = True
+
+    if remove_mask.any():
+        obs_idx = adata.obs_names[~remove_mask]
+        adata._inplace_subset_obs(obs_idx)
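Note: filtering happens in place via AnnData's _inplace_subset_obs. For example (hypothetical thresholds), to drop cells with fewer than 500 total counts or fewer than 200 detected genes while keeping at most the 5000 highest-count cells:

from sclab.preprocess import filter_obs

filter_obs(adata, min_counts=500, min_genes=200, max_cells=5000)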