scib-metrics 0.5.3__tar.gz → 0.5.5__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (80) hide show
  1. {scib_metrics-0.5.3 → scib_metrics-0.5.5}/.pre-commit-config.yaml +1 -1
  2. {scib_metrics-0.5.3 → scib_metrics-0.5.5}/CHANGELOG.md +24 -0
  3. {scib_metrics-0.5.3 → scib_metrics-0.5.5}/PKG-INFO +1 -1
  4. {scib_metrics-0.5.3 → scib_metrics-0.5.5}/docs/api.md +1 -0
  5. {scib_metrics-0.5.3 → scib_metrics-0.5.5}/docs/references.bib +10 -0
  6. {scib_metrics-0.5.3 → scib_metrics-0.5.5}/pyproject.toml +1 -1
  7. {scib_metrics-0.5.3 → scib_metrics-0.5.5}/src/scib_metrics/__init__.py +2 -0
  8. {scib_metrics-0.5.3 → scib_metrics-0.5.5}/src/scib_metrics/benchmark/_core.py +2 -2
  9. {scib_metrics-0.5.3 → scib_metrics-0.5.5}/src/scib_metrics/metrics/__init__.py +2 -1
  10. {scib_metrics-0.5.3 → scib_metrics-0.5.5}/src/scib_metrics/metrics/_kbet.py +47 -46
  11. scib_metrics-0.5.5/src/scib_metrics/metrics/_silhouette.py +162 -0
  12. {scib_metrics-0.5.3 → scib_metrics-0.5.5}/src/scib_metrics/utils/_dist.py +23 -3
  13. {scib_metrics-0.5.3 → scib_metrics-0.5.5}/src/scib_metrics/utils/_silhouette.py +62 -13
  14. scib_metrics-0.5.5/tests/test_BRAS_metric.py +347 -0
  15. {scib_metrics-0.5.3 → scib_metrics-0.5.5}/tests/test_benchmarker.py +14 -0
  16. {scib_metrics-0.5.3 → scib_metrics-0.5.5}/tests/test_metrics.py +6 -0
  17. scib_metrics-0.5.3/src/scib_metrics/metrics/_silhouette.py +0 -86
  18. {scib_metrics-0.5.3 → scib_metrics-0.5.5}/.codecov.yaml +0 -0
  19. {scib_metrics-0.5.3 → scib_metrics-0.5.5}/.cruft.json +0 -0
  20. {scib_metrics-0.5.3 → scib_metrics-0.5.5}/.editorconfig +0 -0
  21. {scib_metrics-0.5.3 → scib_metrics-0.5.5}/.github/ISSUE_TEMPLATE/bug_report.yml +0 -0
  22. {scib_metrics-0.5.3 → scib_metrics-0.5.5}/.github/ISSUE_TEMPLATE/config.yml +0 -0
  23. {scib_metrics-0.5.3 → scib_metrics-0.5.5}/.github/ISSUE_TEMPLATE/feature_request.yml +0 -0
  24. {scib_metrics-0.5.3 → scib_metrics-0.5.5}/.github/workflows/build.yaml +0 -0
  25. {scib_metrics-0.5.3 → scib_metrics-0.5.5}/.github/workflows/release.yaml +0 -0
  26. {scib_metrics-0.5.3 → scib_metrics-0.5.5}/.github/workflows/test_linux.yaml +0 -0
  27. {scib_metrics-0.5.3 → scib_metrics-0.5.5}/.github/workflows/test_linux_cuda.yaml +0 -0
  28. {scib_metrics-0.5.3 → scib_metrics-0.5.5}/.github/workflows/test_linux_pre.yaml +0 -0
  29. {scib_metrics-0.5.3 → scib_metrics-0.5.5}/.github/workflows/test_macos.yaml +0 -0
  30. {scib_metrics-0.5.3 → scib_metrics-0.5.5}/.github/workflows/test_macos_m1.yaml +0 -0
  31. {scib_metrics-0.5.3 → scib_metrics-0.5.5}/.github/workflows/test_windows.yaml +0 -0
  32. {scib_metrics-0.5.3 → scib_metrics-0.5.5}/.gitignore +0 -0
  33. {scib_metrics-0.5.3 → scib_metrics-0.5.5}/.readthedocs.yaml +0 -0
  34. {scib_metrics-0.5.3 → scib_metrics-0.5.5}/LICENSE +0 -0
  35. {scib_metrics-0.5.3 → scib_metrics-0.5.5}/README.md +0 -0
  36. {scib_metrics-0.5.3 → scib_metrics-0.5.5}/docs/Makefile +0 -0
  37. {scib_metrics-0.5.3 → scib_metrics-0.5.5}/docs/_static/.gitkeep +0 -0
  38. {scib_metrics-0.5.3 → scib_metrics-0.5.5}/docs/_static/css/custom.css +0 -0
  39. {scib_metrics-0.5.3 → scib_metrics-0.5.5}/docs/_templates/.gitkeep +0 -0
  40. {scib_metrics-0.5.3 → scib_metrics-0.5.5}/docs/_templates/autosummary/class.rst +0 -0
  41. {scib_metrics-0.5.3 → scib_metrics-0.5.5}/docs/_templates/class_no_inherited.rst +0 -0
  42. {scib_metrics-0.5.3 → scib_metrics-0.5.5}/docs/changelog.md +0 -0
  43. {scib_metrics-0.5.3 → scib_metrics-0.5.5}/docs/conf.py +0 -0
  44. {scib_metrics-0.5.3 → scib_metrics-0.5.5}/docs/contributing.md +0 -0
  45. {scib_metrics-0.5.3 → scib_metrics-0.5.5}/docs/extensions/.gitkeep +0 -0
  46. {scib_metrics-0.5.3 → scib_metrics-0.5.5}/docs/extensions/typed_returns.py +0 -0
  47. {scib_metrics-0.5.3 → scib_metrics-0.5.5}/docs/index.md +0 -0
  48. {scib_metrics-0.5.3 → scib_metrics-0.5.5}/docs/notebooks/large_scale.ipynb +0 -0
  49. {scib_metrics-0.5.3 → scib_metrics-0.5.5}/docs/notebooks/lung_example.ipynb +0 -0
  50. {scib_metrics-0.5.3 → scib_metrics-0.5.5}/docs/references.md +0 -0
  51. {scib_metrics-0.5.3 → scib_metrics-0.5.5}/docs/template_usage.md +0 -0
  52. {scib_metrics-0.5.3 → scib_metrics-0.5.5}/docs/tutorials.md +0 -0
  53. {scib_metrics-0.5.3 → scib_metrics-0.5.5}/setup.py +0 -0
  54. {scib_metrics-0.5.3 → scib_metrics-0.5.5}/src/scib_metrics/_settings.py +0 -0
  55. {scib_metrics-0.5.3 → scib_metrics-0.5.5}/src/scib_metrics/_types.py +0 -0
  56. {scib_metrics-0.5.3 → scib_metrics-0.5.5}/src/scib_metrics/benchmark/__init__.py +0 -0
  57. {scib_metrics-0.5.3 → scib_metrics-0.5.5}/src/scib_metrics/metrics/_graph_connectivity.py +0 -0
  58. {scib_metrics-0.5.3 → scib_metrics-0.5.5}/src/scib_metrics/metrics/_isolated_labels.py +0 -0
  59. {scib_metrics-0.5.3 → scib_metrics-0.5.5}/src/scib_metrics/metrics/_lisi.py +0 -0
  60. {scib_metrics-0.5.3 → scib_metrics-0.5.5}/src/scib_metrics/metrics/_nmi_ari.py +0 -0
  61. {scib_metrics-0.5.3 → scib_metrics-0.5.5}/src/scib_metrics/metrics/_pcr_comparison.py +0 -0
  62. {scib_metrics-0.5.3 → scib_metrics-0.5.5}/src/scib_metrics/nearest_neighbors/__init__.py +0 -0
  63. {scib_metrics-0.5.3 → scib_metrics-0.5.5}/src/scib_metrics/nearest_neighbors/_dataclass.py +0 -0
  64. {scib_metrics-0.5.3 → scib_metrics-0.5.5}/src/scib_metrics/nearest_neighbors/_jax.py +0 -0
  65. {scib_metrics-0.5.3 → scib_metrics-0.5.5}/src/scib_metrics/nearest_neighbors/_pynndescent.py +0 -0
  66. {scib_metrics-0.5.3 → scib_metrics-0.5.5}/src/scib_metrics/utils/__init__.py +0 -0
  67. {scib_metrics-0.5.3 → scib_metrics-0.5.5}/src/scib_metrics/utils/_diffusion_nn.py +0 -0
  68. {scib_metrics-0.5.3 → scib_metrics-0.5.5}/src/scib_metrics/utils/_kmeans.py +0 -0
  69. {scib_metrics-0.5.3 → scib_metrics-0.5.5}/src/scib_metrics/utils/_lisi.py +0 -0
  70. {scib_metrics-0.5.3 → scib_metrics-0.5.5}/src/scib_metrics/utils/_pca.py +0 -0
  71. {scib_metrics-0.5.3 → scib_metrics-0.5.5}/src/scib_metrics/utils/_pcr.py +0 -0
  72. {scib_metrics-0.5.3 → scib_metrics-0.5.5}/src/scib_metrics/utils/_utils.py +0 -0
  73. {scib_metrics-0.5.3 → scib_metrics-0.5.5}/tests/__init__.py +0 -0
  74. {scib_metrics-0.5.3 → scib_metrics-0.5.5}/tests/test_neighbors.py +0 -0
  75. {scib_metrics-0.5.3 → scib_metrics-0.5.5}/tests/test_pcr_comparison.py +0 -0
  76. {scib_metrics-0.5.3 → scib_metrics-0.5.5}/tests/utils/__init__.py +0 -0
  77. {scib_metrics-0.5.3 → scib_metrics-0.5.5}/tests/utils/data.py +0 -0
  78. {scib_metrics-0.5.3 → scib_metrics-0.5.5}/tests/utils/sampling.py +0 -0
  79. {scib_metrics-0.5.3 → scib_metrics-0.5.5}/tests/utils/test_pca.py +0 -0
  80. {scib_metrics-0.5.3 → scib_metrics-0.5.5}/tests/utils/test_pcr.py +0 -0
@@ -11,7 +11,7 @@ repos:
11
11
  hooks:
12
12
  - id: prettier
13
13
  - repo: https://github.com/astral-sh/ruff-pre-commit
14
- rev: v0.9.6
14
+ rev: v0.11.12
15
15
  hooks:
16
16
  - id: ruff
17
17
  types_or: [python, pyi, jupyter]
@@ -10,6 +10,30 @@ and this project adheres to [Semantic Versioning][].
10
10
 
11
11
  ## 0.6.0 (unreleased)
12
12
 
13
+ ## 0.5.5 (2025-06-03)
14
+
15
+ ### Added
16
+
17
+ - Add batch removal adapted silhouette (BRAS) metric ({func}`scib_metrics.metrics.bras`) {pr}`197`, which addresses limitations of silhouette for scoring batch effect removal.
18
+ - Add cosine distance implementation required for BRAS.
19
+
20
+ ### Changed
21
+
22
+ - Changed {func}`scib_metrics.utils.cdist` to support cosine distance.
23
+ - Changed silhouette-related functions to be compatible with adaptations required for BRAS.
24
+
25
+ ## 0.5.4 (2025-04-23)
26
+
27
+ ### Fixed
28
+
29
+ - Apply default values for benchmarker metrics {pr}`203`.
30
+
31
+ ## 0.5.3 (2025-02-17)
32
+
33
+ ### Removed
34
+
35
+ - Reverted a change that was needed for scib-autotune in scvi-tools {pr}`189`.
36
+
13
37
  ## 0.5.2 (2025-02-13)
14
38
 
15
39
  ### Added
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.4
2
2
  Name: scib-metrics
3
- Version: 0.5.3
3
+ Version: 0.5.5
4
4
  Summary: Accelerated and Python-only scIB metrics
5
5
  Project-URL: Documentation, https://scib-metrics.readthedocs.io/
6
6
  Project-URL: Source, https://github.com/yoseflab/scib-metrics
@@ -42,6 +42,7 @@ scib_metrics.ilisi_knn(...)
42
42
  pcr_comparison
43
43
  silhouette_label
44
44
  silhouette_batch
45
+ bras
45
46
  ilisi_knn
46
47
  clisi_knn
47
48
  kbet
@@ -36,3 +36,13 @@
36
36
  pages = {43--49},
37
37
  publisher = {Springer Science and Business Media {LLC}}
38
38
  }
39
+
40
+ @article{rautenstrauch2025,
41
+ title = {Metrics Matter: Why We Need to Stop Using Silhouette in Single-Cell Benchmarking},
42
+ author = {Pia Rautenstrauch and Uwe Ohler},
43
+ doi = {10.1101/2025.01.21.634098},
44
+ year = {2025},
45
+ month = jan,
46
+ journal = {bioRxiv},
47
+ publisher = {Cold Spring Harbor Laboratory}
48
+ }
@@ -5,7 +5,7 @@ requires = ["hatchling"]
5
5
 
6
6
  [project]
7
7
  name = "scib-metrics"
8
- version = "0.5.3"
8
+ version = "0.5.5"
9
9
  description = "Accelerated and Python-only scIB metrics"
10
10
  readme = "README.md"
11
11
  requires-python = ">=3.10"
@@ -15,6 +15,7 @@ from .metrics import (
15
15
  pcr_comparison,
16
16
  silhouette_batch,
17
17
  silhouette_label,
18
+ bras,
18
19
  )
19
20
  from ._settings import settings
20
21
 
@@ -25,6 +26,7 @@ __all__ = [
25
26
  "pcr_comparison",
26
27
  "silhouette_label",
27
28
  "silhouette_batch",
29
+ "bras",
28
30
  "ilisi_knn",
29
31
  "clisi_knn",
30
32
  "lisi_knn",
@@ -136,8 +136,8 @@ class Benchmarker:
136
136
  batch_key: str,
137
137
  label_key: str,
138
138
  embedding_obsm_keys: list[str],
139
- bio_conservation_metrics: BioConservation | None,
140
- batch_correction_metrics: BatchCorrection | None,
139
+ bio_conservation_metrics: BioConservation | None = BioConservation(),
140
+ batch_correction_metrics: BatchCorrection | None = BatchCorrection(),
141
141
  pre_integrated_embedding_obsm_key: str | None = None,
142
142
  n_jobs: int = 1,
143
143
  progress_bar: bool = True,
@@ -4,13 +4,14 @@ from ._kbet import kbet, kbet_per_label
4
4
  from ._lisi import clisi_knn, ilisi_knn, lisi_knn
5
5
  from ._nmi_ari import nmi_ari_cluster_labels_kmeans, nmi_ari_cluster_labels_leiden
6
6
  from ._pcr_comparison import pcr_comparison
7
- from ._silhouette import silhouette_batch, silhouette_label
7
+ from ._silhouette import bras, silhouette_batch, silhouette_label
8
8
 
9
9
  __all__ = [
10
10
  "isolated_labels",
11
11
  "pcr_comparison",
12
12
  "silhouette_label",
13
13
  "silhouette_batch",
14
+ "bras",
14
15
  "ilisi_knn",
15
16
  "clisi_knn",
16
17
  "lisi_knn",
@@ -138,8 +138,14 @@ def kbet_per_label(
138
138
  conn_graph = X.knn_graph_connectivities
139
139
 
140
140
  # prepare call of kBET per cluster
141
- kbet_scores = {"cluster": [], "kBET": []}
142
- for clus in np.unique(labels):
141
+ clusters = []
142
+ clusters, counts = np.unique(labels, return_counts=True)
143
+ skipped = clusters[counts <= 10]
144
+ clusters = clusters[counts > 10]
145
+ kbet_scores = {"cluster": list(skipped), "kBET": [np.nan] * len(skipped)}
146
+ logger.info(f"{len(skipped)} clusters consist of a single batch or are too small. Skip.")
147
+
148
+ for clus in clusters:
143
149
  # subset by label
144
150
  mask = labels == clus
145
151
  conn_graph_sub = conn_graph[mask, :][:, mask]
@@ -147,60 +153,55 @@ def kbet_per_label(
147
153
  n_obs = conn_graph_sub.shape[0]
148
154
  batches_sub = batches[mask]
149
155
 
150
- # check if neighborhood size too small or only one batch in subset
151
- if np.logical_or(n_obs < 10, len(np.unique(batches_sub)) == 1):
152
- logger.info(f"{clus} consists of a single batch or is too small. Skip.")
153
- score = np.nan
154
- else:
155
- quarter_mean = np.floor(np.mean(pd.Series(batches_sub).value_counts()) / 4).astype("int")
156
- k0 = np.min([70, np.max([10, quarter_mean])])
157
- # check k0 for reasonability
158
- if k0 * n_obs >= size_max:
159
- k0 = np.floor(size_max / n_obs).astype("int")
156
+ quarter_mean = np.floor(np.mean(pd.Series(batches_sub).value_counts()) / 4).astype("int")
157
+ k0 = np.min([70, np.max([10, quarter_mean])])
158
+ # check k0 for reasonability
159
+ if k0 * n_obs >= size_max:
160
+ k0 = np.floor(size_max / n_obs).astype("int")
161
+
162
+ n_comp, labs = scipy.sparse.csgraph.connected_components(conn_graph_sub, connection="strong")
163
+
164
+ if n_comp == 1: # a single component to compute kBET on
165
+ try:
166
+ diffusion_n_comps = np.min([diffusion_n_comps, n_obs - 1])
167
+ nn_graph_sub = diffusion_nn(conn_graph_sub, k=k0, n_comps=diffusion_n_comps)
168
+ # call kBET
169
+ score, _, _ = kbet(
170
+ nn_graph_sub,
171
+ batches=batches_sub,
172
+ alpha=alpha,
173
+ )
174
+ except ValueError:
175
+ logger.info("Diffusion distance failed. Skip.")
176
+ score = 0 # i.e. 100% rejection
160
177
 
161
- n_comp, labs = scipy.sparse.csgraph.connected_components(conn_graph_sub, connection="strong")
178
+ else:
179
+ # check the number of components where kBET can be computed upon
180
+ comp_size = pd.Series(labs).value_counts()
181
+ # check which components are small
182
+ comp_size_thresh = 3 * k0
183
+ idx_nonan = np.flatnonzero(np.in1d(labs, comp_size[comp_size >= comp_size_thresh].index))
184
+
185
+ # check if 75% of all cells can be used for kBET run
186
+ if len(idx_nonan) / len(labs) >= 0.75:
187
+ # create another subset of components, assume they are not visited in a diffusion process
188
+ conn_graph_sub_sub = conn_graph_sub[idx_nonan, :][:, idx_nonan]
189
+ conn_graph_sub_sub.sort_indices()
162
190
 
163
- if n_comp == 1: # a single component to compute kBET on
164
191
  try:
165
- diffusion_n_comps = np.min([diffusion_n_comps, n_obs - 1])
166
- nn_graph_sub = diffusion_nn(conn_graph_sub, k=k0, n_comps=diffusion_n_comps)
192
+ diffusion_n_comps = np.min([diffusion_n_comps, conn_graph_sub_sub.shape[0] - 1])
193
+ nn_results_sub_sub = diffusion_nn(conn_graph_sub_sub, k=k0, n_comps=diffusion_n_comps)
167
194
  # call kBET
168
195
  score, _, _ = kbet(
169
- nn_graph_sub,
170
- batches=batches_sub,
196
+ nn_results_sub_sub,
197
+ batches=batches_sub[idx_nonan],
171
198
  alpha=alpha,
172
199
  )
173
200
  except ValueError:
174
201
  logger.info("Diffusion distance failed. Skip.")
175
202
  score = 0 # i.e. 100% rejection
176
-
177
- else:
178
- # check the number of components where kBET can be computed upon
179
- comp_size = pd.Series(labs).value_counts()
180
- # check which components are small
181
- comp_size_thresh = 3 * k0
182
- idx_nonan = np.flatnonzero(np.in1d(labs, comp_size[comp_size >= comp_size_thresh].index))
183
-
184
- # check if 75% of all cells can be used for kBET run
185
- if len(idx_nonan) / len(labs) >= 0.75:
186
- # create another subset of components, assume they are not visited in a diffusion process
187
- conn_graph_sub_sub = conn_graph_sub[idx_nonan, :][:, idx_nonan]
188
- conn_graph_sub_sub.sort_indices()
189
-
190
- try:
191
- diffusion_n_comps = np.min([diffusion_n_comps, conn_graph_sub_sub.shape[0] - 1])
192
- nn_results_sub_sub = diffusion_nn(conn_graph_sub_sub, k=k0, n_comps=diffusion_n_comps)
193
- # call kBET
194
- score, _, _ = kbet(
195
- nn_results_sub_sub,
196
- batches=batches_sub[idx_nonan],
197
- alpha=alpha,
198
- )
199
- except ValueError:
200
- logger.info("Diffusion distance failed. Skip.")
201
- score = 0 # i.e. 100% rejection
202
- else: # if there are too many too small connected components, set kBET score to 0
203
- score = 0 # i.e. 100% rejection
203
+ else: # if there are too many too small connected components, set kBET score to 0
204
+ score = 0 # i.e. 100% rejection
204
205
 
205
206
  kbet_scores["cluster"].append(clus)
206
207
  kbet_scores["kBET"].append(score)
@@ -0,0 +1,162 @@
1
+ from typing import Literal
2
+
3
+ import numpy as np
4
+ import pandas as pd
5
+
6
+ from scib_metrics.utils import silhouette_samples
7
+
8
+
9
+ def silhouette_label(
10
+ X: np.ndarray,
11
+ labels: np.ndarray,
12
+ rescale: bool = True,
13
+ chunk_size: int = 256,
14
+ metric: Literal["euclidean", "cosine"] = "euclidean",
15
+ ) -> float:
16
+ """Average silhouette width (ASW) :cite:p:`luecken2022benchmarking`.
17
+
18
+ Default parameters ('euclidean') match scIB implementation.
19
+
20
+ Parameters
21
+ ----------
22
+ X
23
+ Array of shape (n_cells, n_features).
24
+ labels
25
+ Array of shape (n_cells,) representing label values
26
+ rescale
27
+ Scale asw into the range [0, 1].
28
+ chunk_size
29
+ Size of chunks to process at a time for distance computation
30
+ metric
31
+ The distance metric to use. The distance function can be 'euclidean' (default) or 'cosine'.
32
+
33
+ Returns
34
+ -------
35
+ silhouette score
36
+ """
37
+ asw = np.mean(silhouette_samples(X, labels, chunk_size=chunk_size, metric=metric))
38
+ if rescale:
39
+ asw = (asw + 1) / 2
40
+ return np.mean(asw)
41
+
42
+
43
+ def silhouette_batch(
44
+ X: np.ndarray,
45
+ labels: np.ndarray,
46
+ batch: np.ndarray,
47
+ rescale: bool = True,
48
+ chunk_size: int = 256,
49
+ metric: Literal["euclidean", "cosine"] = "euclidean",
50
+ between_cluster_distances: Literal["nearest", "mean_other", "furthest"] = "nearest",
51
+ ) -> float:
52
+ """Average silhouette width (ASW) with respect to batch ids within each label :cite:p:`luecken2022benchmarking`.
53
+
54
+ Default parameters ('euclidean', 'nearest') match scIB implementation.
55
+
56
+ Additional options enable BRAS compatible usage (see :func:`~scib_metrics.metrics.bras` documentation).
57
+
58
+ Parameters
59
+ ----------
60
+ X
61
+ Array of shape (n_cells, n_features).
62
+ labels
63
+ Array of shape (n_cells,) representing label values
64
+ batch
65
+ Array of shape (n_cells,) representing batch values
66
+ rescale
67
+ Scale asw into the range [0, 1]. If True, higher values are better.
68
+ chunk_size
69
+ Size of chunks to process at a time for distance computation.
70
+ metric
71
+ The distance metric to use. The distance function can be 'euclidean' (default) or 'cosine'.
72
+ between_cluster_distances
73
+ Method for computing inter-cluster distances.
74
+ - 'nearest': Standard silhouette (distance to nearest cluster)
75
+ - 'mean_other': BRAS-specific (mean distance to all other clusters)
76
+ - 'furthest': BRAS-specific (distance to furthest cluster)
77
+
78
+ Returns
79
+ -------
80
+ silhouette score
81
+ """
82
+ sil_dfs = []
83
+ unique_labels = np.unique(labels)
84
+ for group in unique_labels:
85
+ labels_mask = labels == group
86
+ X_subset = X[labels_mask]
87
+ batch_subset = batch[labels_mask]
88
+ n_batches = len(np.unique(batch_subset))
89
+
90
+ if (n_batches == 1) or (n_batches == X_subset.shape[0]):
91
+ continue
92
+
93
+ sil_per_group = silhouette_samples(
94
+ X_subset,
95
+ batch_subset,
96
+ chunk_size=chunk_size,
97
+ metric=metric,
98
+ between_cluster_distances=between_cluster_distances,
99
+ )
100
+
101
+ # take only absolute value
102
+ sil_per_group = np.abs(sil_per_group)
103
+
104
+ if rescale:
105
+ # scale s.t. highest number is optimal
106
+ sil_per_group = 1 - sil_per_group
107
+
108
+ sil_dfs.append(
109
+ pd.DataFrame(
110
+ {
111
+ "group": [group] * len(sil_per_group),
112
+ "silhouette_score": sil_per_group,
113
+ }
114
+ )
115
+ )
116
+
117
+ sil_df = pd.concat(sil_dfs).reset_index(drop=True)
118
+ sil_means = sil_df.groupby("group").mean()
119
+ asw = sil_means["silhouette_score"].mean()
120
+
121
+ return asw
122
+
123
+
124
+ def bras(
125
+ X: np.ndarray,
126
+ labels: np.ndarray,
127
+ batch: np.ndarray,
128
+ chunk_size: int = 256,
129
+ metric: Literal["euclidean", "cosine"] = "cosine",
130
+ between_cluster_distances: Literal["mean_other", "furthest"] = "mean_other",
131
+ ) -> float:
132
+ """Batch removal adapted silhouette (BRAS) for single-cell data integration assessment :cite:p:`rautenstrauch2025`.
133
+
134
+ BRAS evaluates batch effect removal with respect to batch ids within each label (cell type cluster),
135
+ using a modified silhouette score that accounts for nested batch effects. Unlike standard silhouette,
136
+ BRAS computes between-cluster distances using the `between_cluster_distances` method rather than
137
+ the nearest-cluster approach. A higher score indicates better batch mixing.
138
+
139
+ Parameters
140
+ ----------
141
+ X
142
+ Array of shape (n_cells, n_features).
143
+ labels
144
+ Array of shape (n_cells,) representing label values
145
+ batch
146
+ Array of shape (n_cells,) representing batch values
147
+ rescale
148
+ Not a parameter of this function: scores are always rescaled into the range [0, 1], so higher values are better.
149
+ chunk_size
150
+ Size of chunks to process at a time for distance computation.
151
+ metric
152
+ The distance metric to use. The distance function can be 'euclidean' or 'cosine' (default).
153
+ between_cluster_distances
154
+ Method for computing inter-cluster distances.
155
+ - 'mean_other': Mean distance to all cells in other clusters (default)
156
+ - 'furthest': Distance to furthest cluster (conservative estimate)
157
+
158
+ Returns
159
+ -------
160
+ BRAS score
161
+ """
162
+ return silhouette_batch(X, labels, batch, True, chunk_size, metric, between_cluster_distances)
@@ -1,3 +1,6 @@
1
+ from functools import partial
2
+ from typing import Literal
3
+
1
4
  import jax
2
5
  import jax.numpy as jnp
3
6
  import numpy as np
@@ -10,10 +13,20 @@ def _euclidean_distance(x: np.array, y: np.array) -> float:
10
13
 
11
14
 
12
15
  @jax.jit
13
- def cdist(x: np.ndarray, y: np.ndarray) -> jnp.ndarray:
16
+ def _cosine_distance(x: np.array, y: np.array) -> float:
17
+ xy = jnp.dot(x, y)
18
+ xx = jnp.dot(x, x)
19
+ yy = jnp.dot(y, y)
20
+ dist = 1.0 - xy / jnp.sqrt(xx * yy)
21
+ # Clip the result to avoid rounding error
22
+ return jnp.clip(dist, 0.0, 2.0)
23
+
24
+
25
+ @partial(jax.jit, static_argnames=["metric"])
26
+ def cdist(x: np.ndarray, y: np.ndarray, metric: Literal["euclidean", "cosine"] = "euclidean") -> jnp.ndarray:
14
27
  """Jax implementation of :func:`scipy.spatial.distance.cdist`.
15
28
 
16
- Uses euclidean distance.
29
+ Uses euclidean distance by default, cosine distance is also available.
17
30
 
18
31
  Parameters
19
32
  ----------
@@ -21,13 +34,20 @@ def cdist(x: np.ndarray, y: np.ndarray) -> jnp.ndarray:
21
34
  Array of shape (n_cells_a, n_features)
22
35
  y
23
36
  Array of shape (n_cells_b, n_features)
37
+ metric
38
+ The distance metric to use. The distance function can be 'euclidean' (default) or 'cosine'.
24
39
 
25
40
  Returns
26
41
  -------
27
42
  dist
28
43
  Array of shape (n_cells_a, n_cells_b)
29
44
  """
30
- return jax.vmap(lambda x1: jax.vmap(lambda y1: _euclidean_distance(x1, y1))(y))(x)
45
+ if metric not in ["euclidean", "cosine"]:
46
+ raise ValueError("Invalid metric choice, must be one of ['euclidean' or 'cosine'].")
47
+ if metric == "cosine":
48
+ return jax.vmap(lambda x1: jax.vmap(lambda y1: _cosine_distance(x1, y1))(y))(x)
49
+ else:
50
+ return jax.vmap(lambda x1: jax.vmap(lambda y1: _euclidean_distance(x1, y1))(y))(x)
31
51
 
32
52
 
33
53
  @jax.jit
@@ -1,4 +1,5 @@
1
1
  from functools import partial
2
+ from typing import Literal
2
3
 
3
4
  import jax
4
5
  import jax.numpy as jnp
@@ -9,13 +10,21 @@ from ._dist import cdist
9
10
  from ._utils import get_ndarray
10
11
 
11
12
 
12
- @jax.jit
13
+ @partial(jax.jit, static_argnames=["between_cluster_distances"])
13
14
  def _silhouette_reduce(
14
- D_chunk: jnp.ndarray, start: int, labels: jnp.ndarray, label_freqs: jnp.ndarray
15
+ D_chunk: jnp.ndarray,
16
+ start: int,
17
+ labels: jnp.ndarray,
18
+ label_freqs: jnp.ndarray,
19
+ between_cluster_distances: Literal["nearest", "mean_other", "furthest"] = "nearest",
15
20
  ) -> tuple[jnp.ndarray, jnp.ndarray]:
16
21
  """Accumulate silhouette statistics for vertical chunk of X.
17
22
 
18
- Follows scikit-learn implementation.
23
+ Follows scikit-learn implementation with default parameter usage ('nearest').
24
+
25
+ Additional options enable BRAS compatible usage, addressing specific limitations of using silhouette in the context
26
+ of evaluating data integration (see :func:`~scib_metrics.metrics.bras` documentation).
27
+
19
28
 
20
29
  Parameters
21
30
  ----------
@@ -29,6 +38,12 @@ def _silhouette_reduce(
29
38
  Corresponding cluster labels, encoded as {0, ..., n_clusters-1}.
30
39
  label_freqs
31
40
  Distribution of cluster labels in ``labels``.
41
+ between_cluster_distances
42
+ Method for computing inter-cluster distances.
43
+ - 'nearest': Standard silhouette (distance to nearest cluster)
44
+ - 'mean_other': BRAS-specific (mean distance to all other clusters)
45
+ - 'furthest': BRAS-specific (distance to furthest cluster)
46
+
32
47
  """
33
48
  # accumulate distances from each sample to each cluster
34
49
  D_chunk_len = D_chunk.shape[0]
@@ -43,21 +58,36 @@ def _silhouette_reduce(
43
58
  # clust_dists = jax.lax.fori_loop(
44
59
  # 0, D_chunk_len, lambda i, _data: _bincount(i, _data), (clust_dists, D_chunk, labels, label_freqs)
45
60
  # )[0]
46
-
47
61
  clust_dists = jax.vmap(partial(jnp.bincount, length=label_freqs.shape[0]), in_axes=(None, 0))(labels, D_chunk)
48
62
 
49
63
  # intra_index selects intra-cluster distances within clust_dists
50
64
  intra_index = (jnp.arange(D_chunk_len), jax.lax.dynamic_slice(labels, (start,), (D_chunk_len,)))
51
65
  # intra_clust_dists are averaged over cluster size outside this function
52
66
  intra_clust_dists = clust_dists[intra_index]
53
- # of the remaining distances we normalise and extract the minimum
54
- clust_dists = clust_dists.at[intra_index].set(jnp.inf)
55
- clust_dists /= label_freqs
56
- inter_clust_dists = clust_dists.min(axis=1)
67
+
68
+ if between_cluster_distances == "furthest":
69
+ # of the remaining distances we normalise and extract the maximum
70
+ clust_dists = clust_dists.at[intra_index].set(-jnp.inf)
71
+ clust_dists /= label_freqs
72
+ inter_clust_dists = clust_dists.max(axis=1)
73
+ elif between_cluster_distances == "mean_other":
74
+ clust_dists = clust_dists.at[intra_index].set(jnp.nan)
75
+ total_other_dists = jnp.nansum(clust_dists, axis=1)
76
+ total_other_count = jnp.sum(label_freqs) - label_freqs[jax.lax.dynamic_slice(labels, (start,), (D_chunk_len,))]
77
+ inter_clust_dists = total_other_dists / total_other_count
78
+ elif between_cluster_distances == "nearest":
79
+ # of the remaining distances we normalise and extract the minimum
80
+ clust_dists = clust_dists.at[intra_index].set(jnp.inf)
81
+ clust_dists /= label_freqs
82
+ inter_clust_dists = clust_dists.min(axis=1)
83
+ else:
84
+ raise ValueError("Parameter 'between_cluster_distances' must be one of ['nearest', 'mean_other', 'furthest'].")
57
85
  return intra_clust_dists, inter_clust_dists
58
86
 
59
87
 
60
- def _pairwise_distances_chunked(X: jnp.ndarray, chunk_size: int, reduce_fn: callable) -> jnp.ndarray:
88
+ def _pairwise_distances_chunked(
89
+ X: jnp.ndarray, chunk_size: int, reduce_fn: callable, metric: Literal["euclidean", "cosine"] = "euclidean"
90
+ ) -> jnp.ndarray:
61
91
  """Compute pairwise distances in chunks to reduce memory usage."""
62
92
  n_samples = X.shape[0]
63
93
  n_chunks = jnp.ceil(n_samples / chunk_size).astype(int)
@@ -66,17 +96,27 @@ def _pairwise_distances_chunked(X: jnp.ndarray, chunk_size: int, reduce_fn: call
66
96
  for i in range(n_chunks):
67
97
  start = i * chunk_size
68
98
  end = min((i + 1) * chunk_size, n_samples)
69
- intra_cluster_dists, inter_cluster_dists = reduce_fn(cdist(X[start:end], X), start=start)
99
+ intra_cluster_dists, inter_cluster_dists = reduce_fn(cdist(X[start:end], X, metric=metric), start=start)
70
100
  intra_dists_all.append(intra_cluster_dists)
71
101
  inter_dists_all.append(inter_cluster_dists)
72
102
  return jnp.concatenate(intra_dists_all), jnp.concatenate(inter_dists_all)
73
103
 
74
104
 
75
- def silhouette_samples(X: np.ndarray, labels: np.ndarray, chunk_size: int = 256) -> np.ndarray:
105
+ def silhouette_samples(
106
+ X: np.ndarray,
107
+ labels: np.ndarray,
108
+ chunk_size: int = 256,
109
+ metric: Literal["euclidean", "cosine"] = "euclidean",
110
+ between_cluster_distances: Literal["nearest", "mean_other", "furthest"] = "nearest",
111
+ ) -> np.ndarray:
76
112
  """Compute the Silhouette Coefficient for each observation.
77
113
 
78
114
  Implements :func:`sklearn.metrics.silhouette_samples`.
79
115
 
116
+ Default parameters ('euclidean', 'nearest') match scIB implementation.
117
+
118
+ Additional options enable BRAS compatible usage (see `bras()` documentation).
119
+
80
120
  Parameters
81
121
  ----------
82
122
  X
@@ -87,6 +127,13 @@ def silhouette_samples(X: np.ndarray, labels: np.ndarray, chunk_size: int = 256)
87
127
  for each observation.
88
128
  chunk_size
89
129
  Number of samples to process at a time for distance computation.
130
+ metric
131
+ The distance metric to use. The distance function can be 'euclidean' (default) or 'cosine'.
132
+ between_cluster_distances
133
+ Method for computing inter-cluster distances.
134
+ - 'nearest': Standard silhouette (distance to nearest cluster)
135
+ - 'mean_other': BRAS-specific (mean distance to all other clusters)
136
+ - 'furthest': BRAS-specific (distance to furthest cluster)
90
137
 
91
138
  Returns
92
139
  -------
@@ -97,8 +144,10 @@ def silhouette_samples(X: np.ndarray, labels: np.ndarray, chunk_size: int = 256)
97
144
  labels = pd.Categorical(labels).codes
98
145
  labels = jnp.asarray(labels)
99
146
  label_freqs = jnp.bincount(labels)
100
- reduce_fn = partial(_silhouette_reduce, labels=labels, label_freqs=label_freqs)
101
- results = _pairwise_distances_chunked(X, chunk_size=chunk_size, reduce_fn=reduce_fn)
147
+ reduce_fn = partial(
148
+ _silhouette_reduce, labels=labels, label_freqs=label_freqs, between_cluster_distances=between_cluster_distances
149
+ )
150
+ results = _pairwise_distances_chunked(X, chunk_size=chunk_size, reduce_fn=reduce_fn, metric=metric)
102
151
  intra_clust_dists, inter_clust_dists = results
103
152
 
104
153
  denom = jnp.take(label_freqs - 1, labels, mode="clip")