slide2vec 4.2.0__tar.gz → 4.4.0__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (86) hide show
  1. {slide2vec-4.2.0 → slide2vec-4.4.0}/PKG-INFO +28 -5
  2. {slide2vec-4.2.0 → slide2vec-4.4.0}/README.md +19 -2
  3. {slide2vec-4.2.0 → slide2vec-4.4.0}/pyproject.toml +11 -4
  4. {slide2vec-4.2.0 → slide2vec-4.4.0}/slide2vec/__init__.py +13 -2
  5. {slide2vec-4.2.0 → slide2vec-4.4.0}/slide2vec/api.py +120 -25
  6. slide2vec-4.4.0/slide2vec/configs/__init__.py +4 -0
  7. {slide2vec-4.2.0 → slide2vec-4.4.0}/slide2vec/configs/default.yaml +10 -7
  8. {slide2vec-4.2.0/slide2vec → slide2vec-4.4.0/slide2vec/configs}/resources.py +3 -6
  9. {slide2vec-4.2.0 → slide2vec-4.4.0}/slide2vec/distributed/direct_embed_worker.py +14 -10
  10. {slide2vec-4.2.0 → slide2vec-4.4.0}/slide2vec/distributed/pipeline_worker.py +16 -15
  11. {slide2vec-4.2.0 → slide2vec-4.4.0}/slide2vec/encoders/registry.py +1 -1
  12. {slide2vec-4.2.0 → slide2vec-4.4.0}/slide2vec/encoders/validation.py +14 -10
  13. {slide2vec-4.2.0 → slide2vec-4.4.0}/slide2vec/inference.py +250 -1340
  14. {slide2vec-4.2.0 → slide2vec-4.4.0}/slide2vec/progress.py +123 -5
  15. slide2vec-4.4.0/slide2vec/runtime/__init__.py +2 -0
  16. slide2vec-4.4.0/slide2vec/runtime/batching.py +472 -0
  17. slide2vec-4.4.0/slide2vec/runtime/distributed.py +195 -0
  18. slide2vec-4.4.0/slide2vec/runtime/embedding.py +157 -0
  19. slide2vec-4.4.0/slide2vec/runtime/hierarchical.py +105 -0
  20. {slide2vec-4.2.0/slide2vec → slide2vec-4.4.0/slide2vec/runtime}/model_settings.py +1 -0
  21. slide2vec-4.4.0/slide2vec/runtime/persistence.py +165 -0
  22. slide2vec-4.4.0/slide2vec/runtime/progress_bridge.py +52 -0
  23. {slide2vec-4.2.0/slide2vec → slide2vec-4.4.0/slide2vec/runtime}/registry.py +1 -0
  24. slide2vec-4.4.0/slide2vec/runtime/serialization.py +122 -0
  25. slide2vec-4.4.0/slide2vec/runtime/tiling.py +97 -0
  26. slide2vec-4.4.0/slide2vec/runtime/types.py +48 -0
  27. {slide2vec-4.2.0 → slide2vec-4.4.0}/slide2vec/utils/config.py +1 -1
  28. {slide2vec-4.2.0 → slide2vec-4.4.0}/slide2vec/utils/tiling_io.py +5 -0
  29. {slide2vec-4.2.0 → slide2vec-4.4.0}/slide2vec.egg-info/PKG-INFO +28 -5
  30. {slide2vec-4.2.0 → slide2vec-4.4.0}/slide2vec.egg-info/SOURCES.txt +15 -6
  31. {slide2vec-4.2.0 → slide2vec-4.4.0}/slide2vec.egg-info/requires.txt +9 -2
  32. slide2vec-4.4.0/tests/test_architecture_runtime_split.py +60 -0
  33. {slide2vec-4.2.0 → slide2vec-4.4.0}/tests/test_hs2p_package_cutover.py +6 -4
  34. {slide2vec-4.2.0 → slide2vec-4.4.0}/tests/test_output_consistency.py +8 -3
  35. {slide2vec-4.2.0 → slide2vec-4.4.0}/tests/test_progress.py +378 -10
  36. {slide2vec-4.2.0 → slide2vec-4.4.0}/tests/test_regression_core.py +113 -22
  37. {slide2vec-4.2.0 → slide2vec-4.4.0}/tests/test_regression_inference.py +878 -154
  38. {slide2vec-4.2.0 → slide2vec-4.4.0}/tests/test_regression_models.py +1 -1
  39. slide2vec-4.4.0/tests/test_runtime_batching.py +33 -0
  40. slide2vec-4.2.0/slide2vec/configs/__init__.py +0 -4
  41. slide2vec-4.2.0/slide2vec/runtime_types.py +0 -14
  42. slide2vec-4.2.0/tests/test_batch_collator_timing.py +0 -161
  43. slide2vec-4.2.0/tests/test_packaging_metadata.py +0 -23
  44. {slide2vec-4.2.0 → slide2vec-4.4.0}/LICENSE +0 -0
  45. {slide2vec-4.2.0 → slide2vec-4.4.0}/setup.cfg +0 -0
  46. {slide2vec-4.2.0 → slide2vec-4.4.0}/slide2vec/__main__.py +0 -0
  47. {slide2vec-4.2.0 → slide2vec-4.4.0}/slide2vec/artifacts.py +0 -0
  48. {slide2vec-4.2.0 → slide2vec-4.4.0}/slide2vec/cli.py +0 -0
  49. {slide2vec-4.2.0 → slide2vec-4.4.0}/slide2vec/data/__init__.py +0 -0
  50. {slide2vec-4.2.0 → slide2vec-4.4.0}/slide2vec/data/dataset.py +0 -0
  51. {slide2vec-4.2.0 → slide2vec-4.4.0}/slide2vec/data/tile_reader.py +0 -0
  52. {slide2vec-4.2.0 → slide2vec-4.4.0}/slide2vec/data/tile_store.py +0 -0
  53. {slide2vec-4.2.0 → slide2vec-4.4.0}/slide2vec/distributed/__init__.py +0 -0
  54. {slide2vec-4.2.0 → slide2vec-4.4.0}/slide2vec/encoders/__init__.py +0 -0
  55. {slide2vec-4.2.0 → slide2vec-4.4.0}/slide2vec/encoders/base.py +0 -0
  56. {slide2vec-4.2.0 → slide2vec-4.4.0}/slide2vec/encoders/models/__init__.py +0 -0
  57. {slide2vec-4.2.0 → slide2vec-4.4.0}/slide2vec/encoders/models/conch.py +0 -0
  58. {slide2vec-4.2.0 → slide2vec-4.4.0}/slide2vec/encoders/models/gigapath.py +0 -0
  59. {slide2vec-4.2.0 → slide2vec-4.4.0}/slide2vec/encoders/models/hibou.py +0 -0
  60. {slide2vec-4.2.0 → slide2vec-4.4.0}/slide2vec/encoders/models/hoptimus.py +0 -0
  61. {slide2vec-4.2.0 → slide2vec-4.4.0}/slide2vec/encoders/models/lunit.py +0 -0
  62. {slide2vec-4.2.0 → slide2vec-4.4.0}/slide2vec/encoders/models/midnight.py +0 -0
  63. {slide2vec-4.2.0 → slide2vec-4.4.0}/slide2vec/encoders/models/moozy/__init__.py +0 -0
  64. {slide2vec-4.2.0 → slide2vec-4.4.0}/slide2vec/encoders/models/moozy/blocks.py +0 -0
  65. {slide2vec-4.2.0 → slide2vec-4.4.0}/slide2vec/encoders/models/moozy/case.py +0 -0
  66. {slide2vec-4.2.0 → slide2vec-4.4.0}/slide2vec/encoders/models/moozy/loading.py +0 -0
  67. {slide2vec-4.2.0 → slide2vec-4.4.0}/slide2vec/encoders/models/moozy/slide.py +0 -0
  68. {slide2vec-4.2.0 → slide2vec-4.4.0}/slide2vec/encoders/models/moozy/types.py +0 -0
  69. {slide2vec-4.2.0 → slide2vec-4.4.0}/slide2vec/encoders/models/musk.py +0 -0
  70. {slide2vec-4.2.0 → slide2vec-4.4.0}/slide2vec/encoders/models/phikon.py +0 -0
  71. {slide2vec-4.2.0 → slide2vec-4.4.0}/slide2vec/encoders/models/prism.py +0 -0
  72. {slide2vec-4.2.0 → slide2vec-4.4.0}/slide2vec/encoders/models/prost40m.py +0 -0
  73. {slide2vec-4.2.0 → slide2vec-4.4.0}/slide2vec/encoders/models/titan.py +0 -0
  74. {slide2vec-4.2.0 → slide2vec-4.4.0}/slide2vec/encoders/models/uni.py +0 -0
  75. {slide2vec-4.2.0 → slide2vec-4.4.0}/slide2vec/encoders/models/virchow.py +0 -0
  76. {slide2vec-4.2.0 → slide2vec-4.4.0}/slide2vec/main.py +0 -0
  77. {slide2vec-4.2.0 → slide2vec-4.4.0}/slide2vec/utils/__init__.py +0 -0
  78. {slide2vec-4.2.0 → slide2vec-4.4.0}/slide2vec/utils/coordinates.py +0 -0
  79. {slide2vec-4.2.0 → slide2vec-4.4.0}/slide2vec/utils/log_utils.py +0 -0
  80. {slide2vec-4.2.0 → slide2vec-4.4.0}/slide2vec/utils/utils.py +0 -0
  81. {slide2vec-4.2.0 → slide2vec-4.4.0}/slide2vec.egg-info/dependency_links.txt +0 -0
  82. {slide2vec-4.2.0 → slide2vec-4.4.0}/slide2vec.egg-info/entry_points.txt +0 -0
  83. {slide2vec-4.2.0 → slide2vec-4.4.0}/slide2vec.egg-info/not-zip-safe +0 -0
  84. {slide2vec-4.2.0 → slide2vec-4.4.0}/slide2vec.egg-info/top_level.txt +0 -0
  85. {slide2vec-4.2.0 → slide2vec-4.4.0}/tests/test_encoder_registry.py +0 -0
  86. {slide2vec-4.2.0 → slide2vec-4.4.0}/tests/test_tile_store.py +0 -0
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.4
2
2
  Name: slide2vec
3
- Version: 4.2.0
3
+ Version: 4.4.0
4
4
  Summary: Embedding of whole slide images with Foundation Models
5
5
  Author-email: Clément Grisi <clement.grisi@radboudumc.nl>
6
6
  License-Expression: Apache-2.0
@@ -15,7 +15,7 @@ Classifier: Programming Language :: Python :: 3.13
15
15
  Requires-Python: >=3.10
16
16
  Description-Content-Type: text/markdown
17
17
  License-File: LICENSE
18
- Requires-Dist: hs2p[asap,cucim,openslide,vips]>=3.2.1
18
+ Requires-Dist: hs2p[asap,cucim,openslide,sam2,vips]>=4.0.1
19
19
  Requires-Dist: omegaconf
20
20
  Requires-Dist: matplotlib
21
21
  Requires-Dist: numpy<2
@@ -65,7 +65,7 @@ Requires-Dist: numpy<2; extra == "fm"
65
65
  Requires-Dist: pandas; extra == "fm"
66
66
  Requires-Dist: pillow; extra == "fm"
67
67
  Requires-Dist: rich; extra == "fm"
68
- Requires-Dist: hs2p[asap,cucim,openslide,vips]>=3.2.1; extra == "fm"
68
+ Requires-Dist: hs2p[asap,cucim,openslide,sam2,vips]>=4.0.1; extra == "fm"
69
69
  Requires-Dist: wandb; extra == "fm"
70
70
  Requires-Dist: torch<2.8,>=2.3; extra == "fm"
71
71
  Requires-Dist: torchvision>=0.18.0; extra == "fm"
@@ -89,6 +89,12 @@ Requires-Dist: fairscale; extra == "fm"
89
89
  Requires-Dist: packaging==23.2; extra == "fm"
90
90
  Requires-Dist: ninja==1.11.1.1; extra == "fm"
91
91
  Requires-Dist: psutil<6; extra == "fm"
92
+ Provides-Extra: docs
93
+ Requires-Dist: sphinx>=8.1; extra == "docs"
94
+ Requires-Dist: furo; extra == "docs"
95
+ Requires-Dist: myst-parser; extra == "docs"
96
+ Requires-Dist: sphinx-copybutton; extra == "docs"
97
+ Requires-Dist: sphinx-autodoc-typehints; extra == "docs"
92
98
  Provides-Extra: testing
93
99
  Requires-Dist: pytest>=6.0; extra == "testing"
94
100
  Requires-Dist: pytest-cov>=2.0; extra == "testing"
@@ -101,9 +107,12 @@ Dynamic: license-file
101
107
  # slide2vec
102
108
 
103
109
  [![PyPI version](https://img.shields.io/pypi/v/slide2vec?label=pypi&logo=pypi&color=3776AB)](https://pypi.org/project/slide2vec/)
110
+ [![Docs](https://img.shields.io/badge/docs-website-blue)](https://clemsgrs.github.io/slide2vec/)
104
111
 
105
112
  `slide2vec` is a Python package for efficient encoding of whole-slide images using publicly available foundation models. It builds on [`hs2p`](https://pypi.org/project/hs2p/) for fast preprocessing and exposes a focused surface around `Model`, `Pipeline`, and `ExecutionOptions`.
106
113
 
114
+ Documentation site: [https://clemsgrs.github.io/slide2vec/](https://clemsgrs.github.io/slide2vec/)
115
+
107
116
  ## Installation
108
117
 
109
118
  ```shell
@@ -121,6 +130,8 @@ pip install git+https://github.com/Mahmoodlab/CONCH.git
121
130
  pip install git+https://github.com/prov-gigapath/prov-gigapath.git
122
131
  ```
123
132
 
133
+ AtlasPatch-backed tissue segmentation is available through hs2p's `sam2` path in the bundled install.
134
+
124
135
  ## Python API
125
136
 
126
137
  ```python
@@ -137,6 +148,17 @@ x = embedded.x
137
148
  y = embedded.y
138
149
  ```
139
150
 
151
+ Use `list_models()` when you want to inspect the shipped presets programmatically:
152
+
153
+ ```python
154
+ from slide2vec import list_models
155
+
156
+ all_models = list_models()
157
+ tile_models = list_models("tile")
158
+ slide_models = list_models("slide")
159
+ patient_models = list_models("patient")
160
+ ```
161
+
140
162
  Use `Pipeline(...)` for manifest-driven batch processing when you want artifacts written to disk instead of only in-memory outputs:
141
163
 
142
164
  ```python
@@ -235,7 +257,8 @@ docker run --rm -it \
235
257
 
236
258
  ## Documentation
237
259
 
238
- - [`docs/cli.md`](docs/cli.md) for the config-driven CLI guide
260
+ - [Documentation website](https://clemsgrs.github.io/slide2vec/) for the polished docs site
239
261
  - [`docs/python-api.md`](docs/python-api.md) for the detailed API reference
240
- - [`tutorials/api_walkthrough.ipynb`](tutorials/api_walkthrough.ipynb) for a notebook walkthrough of the API
262
+ - [`docs/cli.md`](docs/cli.md) for the config-driven CLI guide
241
263
  - [`docs/models.md`](docs/models.md) for the full supported-model catalog
264
+ - [`tutorials/api_walkthrough.ipynb`](tutorials/api_walkthrough.ipynb) for a notebook walkthrough of the API
@@ -1,9 +1,12 @@
1
1
  # slide2vec
2
2
 
3
3
  [![PyPI version](https://img.shields.io/pypi/v/slide2vec?label=pypi&logo=pypi&color=3776AB)](https://pypi.org/project/slide2vec/)
4
+ [![Docs](https://img.shields.io/badge/docs-website-blue)](https://clemsgrs.github.io/slide2vec/)
4
5
 
5
6
  `slide2vec` is a Python package for efficient encoding of whole-slide images using publicly available foundation models. It builds on [`hs2p`](https://pypi.org/project/hs2p/) for fast preprocessing and exposes a focused surface around `Model`, `Pipeline`, and `ExecutionOptions`.
6
7
 
8
+ Documentation site: [https://clemsgrs.github.io/slide2vec/](https://clemsgrs.github.io/slide2vec/)
9
+
7
10
  ## Installation
8
11
 
9
12
  ```shell
@@ -21,6 +24,8 @@ pip install git+https://github.com/Mahmoodlab/CONCH.git
21
24
  pip install git+https://github.com/prov-gigapath/prov-gigapath.git
22
25
  ```
23
26
 
27
+ AtlasPatch-backed tissue segmentation is available through hs2p's `sam2` path in the bundled install.
28
+
24
29
  ## Python API
25
30
 
26
31
  ```python
@@ -37,6 +42,17 @@ x = embedded.x
37
42
  y = embedded.y
38
43
  ```
39
44
 
45
+ Use `list_models()` when you want to inspect the shipped presets programmatically:
46
+
47
+ ```python
48
+ from slide2vec import list_models
49
+
50
+ all_models = list_models()
51
+ tile_models = list_models("tile")
52
+ slide_models = list_models("slide")
53
+ patient_models = list_models("patient")
54
+ ```
55
+
40
56
  Use `Pipeline(...)` for manifest-driven batch processing when you want artifacts written to disk instead of only in-memory outputs:
41
57
 
42
58
  ```python
@@ -135,7 +151,8 @@ docker run --rm -it \
135
151
 
136
152
  ## Documentation
137
153
 
138
- - [`docs/cli.md`](docs/cli.md) for the config-driven CLI guide
154
+ - [Documentation website](https://clemsgrs.github.io/slide2vec/) for the polished docs site
139
155
  - [`docs/python-api.md`](docs/python-api.md) for the detailed API reference
140
- - [`tutorials/api_walkthrough.ipynb`](tutorials/api_walkthrough.ipynb) for a notebook walkthrough of the API
156
+ - [`docs/cli.md`](docs/cli.md) for the config-driven CLI guide
141
157
  - [`docs/models.md`](docs/models.md) for the full supported-model catalog
158
+ - [`tutorials/api_walkthrough.ipynb`](tutorials/api_walkthrough.ipynb) for a notebook walkthrough of the API
@@ -4,7 +4,7 @@ build-backend = "setuptools.build_meta"
4
4
 
5
5
  [project]
6
6
  name = "slide2vec"
7
- version = "4.2.0"
7
+ version = "4.4.0"
8
8
  description = "Embedding of whole slide images with Foundation Models"
9
9
  readme = "README.md"
10
10
  requires-python = ">=3.10"
@@ -21,7 +21,7 @@ classifiers = [
21
21
  "Programming Language :: Python :: 3.13",
22
22
  ]
23
23
  dependencies = [
24
- "hs2p[asap,cucim,openslide,vips]>=3.2.1",
24
+ "hs2p[asap,cucim,openslide,sam2,vips]>=4.0.1",
25
25
  "omegaconf",
26
26
  "matplotlib",
27
27
  "numpy<2",
@@ -88,7 +88,7 @@ fm = [
88
88
  "pandas",
89
89
  "pillow",
90
90
  "rich",
91
- "hs2p[asap,cucim,openslide,vips]>=3.2.1",
91
+ "hs2p[asap,cucim,openslide,sam2,vips]>=4.0.1",
92
92
  "wandb",
93
93
  "torch>=2.3,<2.8",
94
94
  "torchvision>=0.18.0",
@@ -113,6 +113,13 @@ fm = [
113
113
  "ninja==1.11.1.1",
114
114
  "psutil<6",
115
115
  ]
116
+ docs = [
117
+ "sphinx>=8.1",
118
+ "furo",
119
+ "myst-parser",
120
+ "sphinx-copybutton",
121
+ "sphinx-autodoc-typehints",
122
+ ]
116
123
  testing = [
117
124
  "pytest>=6.0",
118
125
  "pytest-cov>=2.0",
@@ -157,7 +164,7 @@ no_implicit_reexport = true
157
164
  max-line-length = 160
158
165
 
159
166
  [tool.bumpver]
160
- current_version = "4.2.0"
167
+ current_version = "4.4.0"
161
168
  version_pattern = "MAJOR.MINOR.PATCH"
162
169
  commit = false # We do version bumping in CI, not as a commit
163
170
  tag = false # Git tag already exists — we don't auto-tag
@@ -1,15 +1,26 @@
1
- from slide2vec.api import EmbeddedSlide, ExecutionOptions, Model, Pipeline, PreprocessingConfig, RunResult
1
+ from slide2vec.api import (
2
+ EmbeddedPatient,
3
+ EmbeddedSlide,
4
+ ExecutionOptions,
5
+ Model,
6
+ Pipeline,
7
+ PreprocessingConfig,
8
+ RunResult,
9
+ list_models,
10
+ )
2
11
  from slide2vec.artifacts import HierarchicalEmbeddingArtifact, SlideEmbeddingArtifact, TileEmbeddingArtifact
3
12
 
4
13
 
5
- __version__ = "4.2.0"
14
+ __version__ = "4.4.0"
6
15
 
7
16
  __all__ = [
8
17
  "Model",
18
+ "list_models",
9
19
  "Pipeline",
10
20
  "PreprocessingConfig",
11
21
  "ExecutionOptions",
12
22
  "RunResult",
23
+ "EmbeddedPatient",
13
24
  "EmbeddedSlide",
14
25
  "SlideEmbeddingArtifact",
15
26
  "HierarchicalEmbeddingArtifact",
@@ -20,9 +20,9 @@ from slide2vec.encoders.registry import (
20
20
  resolve_preprocessing_defaults,
21
21
  )
22
22
  from slide2vec.encoders.validation import validate_encoder_config
23
- from slide2vec.model_settings import canonicalize_model_name, normalize_precision_name
23
+ from slide2vec.runtime.model_settings import canonicalize_model_name, normalize_precision_name
24
24
  from slide2vec.progress import emit_progress
25
- from slide2vec.runtime_types import LoadedModel
25
+ from slide2vec.runtime.types import LoadedModel
26
26
  from slide2vec.utils.utils import cpu_worker_limit, slurm_cpu_limit
27
27
 
28
28
  PathLike = str | Path
@@ -42,25 +42,53 @@ TilingResultsInput = Sequence[Any] | Mapping[str, Any]
42
42
 
43
43
  @dataclass(frozen=True, kw_only=True)
44
44
  class PreprocessingConfig:
45
+ """Configuration for slide tiling and preprocessing."""
46
+
47
+ #: Slide reading backend. ``"auto"`` tries cucim → openslide → vips in order.
48
+ #: Explicit choices: ``"cucim"``, ``"openslide"``, ``"vips"``, ``"asap"``.
45
49
  backend: str = "auto"
50
+ #: Target spacing in µm/px. Resolved from the model preset when ``None``.
46
51
  requested_spacing_um: float | None = None
52
+ #: Tile side length in pixels at *requested_spacing_um*.
53
+ #: Resolved from the model preset when ``None``.
47
54
  requested_tile_size_px: int | None = None
55
+ #: Parent region side length in pixels (hierarchical mode).
56
+ #: Auto-derived as ``requested_tile_size_px × region_tile_multiple`` when ``None``.
48
57
  requested_region_size_px: int | None = None
58
+ #: Region grid width/height in tiles (e.g. ``6`` → 6×6 = 36 tiles per region).
59
+ #: Enables hierarchical extraction when set; must be ≥ 2.
49
60
  region_tile_multiple: int | None = None
61
+ #: Relative spacing tolerance for pyramid level selection (default ``0.05``).
50
62
  tolerance: float = 0.05
63
+ #: Fractional tile overlap (``0.0`` = no overlap).
51
64
  overlap: float = 0.0
65
+ #: Minimum tissue fraction required to keep a tile (default ``0.01``).
52
66
  tissue_threshold: float = 0.01
67
+ #: Directory containing pre-extracted tile coordinates to reuse, skipping tiling.
53
68
  read_coordinates_from: Path | None = None
69
+ #: Directory containing pre-extracted tile images to skip the tiling step entirely.
54
70
  read_tiles_from: Path | None = None
71
+ #: Read and decode tiles on demand rather than pre-loading into memory.
55
72
  on_the_fly: bool = True
73
+ #: Decode tiles on the GPU via CuCIM / nvImageCodec when ``True``.
56
74
  gpu_decode: bool = False
75
+ #: Dynamically adjust batch size based on tile count.
57
76
  adaptive_batching: bool = False
77
+ #: Group adjacent tiles into supertile batches for faster I/O.
58
78
  use_supertiles: bool = True
79
+ #: JPEG decode library — ``"turbojpeg"`` (default) or ``"pillow"``.
59
80
  jpeg_backend: str = "turbojpeg"
81
+ #: Number of CuCIM reader threads.
60
82
  num_cucim_workers: int = 4
83
+ #: Skip slides already present in the output directory when ``True``.
61
84
  resume: bool = False
85
+ #: Forwarded to hs2p segmentation config. Supported keys: ``method``,
86
+ #: ``downsample``, ``sam2_device``. See :doc:`preprocessing` for details.
62
87
  segmentation: dict[str, Any] = field(default_factory=dict)
88
+ #: Forwarded to hs2p tile-filtering config.
63
89
  filtering: dict[str, Any] = field(default_factory=dict)
90
+ #: Controls whether hs2p writes mask and tiling preview images.
91
+ #: Keys: ``save_mask_preview``, ``save_tiling_preview``, ``downsample``.
64
92
  preview: dict[str, Any] = field(default_factory=dict)
65
93
 
66
94
  @classmethod
@@ -72,8 +100,17 @@ class PreprocessingConfig:
72
100
  gpu_decode = bool(tiling.gpu_decode)
73
101
  adaptive_batching = bool(tiling.adaptive_batching)
74
102
  preview_cfg = tiling.preview
75
- preview_save = bool(preview_cfg.save)
76
- preview_downsample = int(preview_cfg.downsample)
103
+ preview_save = bool(preview_cfg.save_mask_preview)
104
+ preview_tiling_save = bool(preview_cfg.save_tiling_preview)
105
+ preview_kwargs: dict[str, Any] = {
106
+ "save_mask_preview": preview_save,
107
+ "save_tiling_preview": preview_tiling_save,
108
+ "downsample": int(preview_cfg.downsample),
109
+ }
110
+ preview_kwargs["tissue_contour_color"] = tuple(
111
+ int(channel) for channel in preview_cfg.tissue_contour_color
112
+ )
113
+ preview_kwargs["mask_overlay_alpha"] = float(preview_cfg.mask_overlay_alpha)
77
114
  return cls(
78
115
  backend=tiling.backend,
79
116
  requested_spacing_um=float(tiling.params.requested_spacing_um),
@@ -104,11 +141,7 @@ class PreprocessingConfig:
104
141
  resume=bool(cfg.resume),
105
142
  segmentation=dict(tiling.seg_params),
106
143
  filtering=dict(tiling.filter_params),
107
- preview={
108
- "save_mask_preview": preview_save,
109
- "save_tiling_preview": preview_save,
110
- "downsample": preview_downsample,
111
- },
144
+ preview=preview_kwargs,
112
145
  )
113
146
 
114
147
  def with_backend(self, backend: str) -> "PreprocessingConfig":
@@ -118,31 +151,44 @@ class PreprocessingConfig:
118
151
 
119
152
  @dataclass(frozen=True, kw_only=True)
120
153
  class ExecutionOptions:
154
+ """Runtime execution and output settings."""
155
+
156
+ #: Directory where artifacts are written. Required for :class:`Pipeline` runs.
121
157
  output_dir: Path | None = None
158
+ #: Tensor serialization format — ``"pt"`` (PyTorch, default) or ``"npz"`` (NumPy).
122
159
  output_format: str = "pt"
123
- batch_size: int = 1
124
- num_workers: int | None = None
160
+ #: Number of tiles per forward pass.
161
+ batch_size: int = 32
162
+ #: DataLoader worker count per GPU rank. ``None`` means auto
163
+ #: (capped by CPU / SLURM limit, then split across the resolved GPU count).
164
+ num_workers_per_gpu: int | None = None
165
+ #: Tiling worker count. ``None`` means auto (capped by CPU / SLURM limit).
125
166
  num_preprocessing_workers: int | None = None
167
+ #: Number of GPUs to use. ``None`` defaults to all available GPUs.
126
168
  num_gpus: int | None = None
169
+ #: Forward-pass dtype — ``"fp16"``, ``"bf16"``, ``"fp32"``,
170
+ #: or ``None`` (auto-determined from the model preset).
127
171
  precision: str | None = None
172
+ #: DataLoader prefetch queue depth per worker (default ``4``).
128
173
  prefetch_factor: int = 4
129
- persistent_workers: bool = True
174
+ #: Persist tile embeddings to disk when running a slide-level model.
130
175
  save_tile_embeddings: bool = False
176
+ #: Persist slide embeddings to disk when running a patient-level model.
131
177
  save_slide_embeddings: bool = False
178
+ #: Persist encoder latent representations when available.
132
179
  save_latents: bool = False
133
180
 
134
181
  @classmethod
135
182
  def from_config(cls, cfg: Any, *, run_on_cpu: bool = False) -> "ExecutionOptions":
136
183
  configured_num_gpus = cfg.speed.num_gpus
137
184
  requested_precision = normalize_precision_name(cfg.speed.precision)
138
- num_workers = cfg.speed.num_dataloader_workers
185
+ num_workers_per_gpu = cfg.speed.num_dataloader_workers
139
186
  prefetch_factor = int(cfg.speed.prefetch_factor_embedding)
140
- persistent_workers = bool(cfg.speed.persistent_workers_embedding)
141
187
  return cls(
142
188
  output_dir=Path(cfg.output_dir),
143
189
  output_format="pt",
144
190
  batch_size=int(cfg.model.batch_size),
145
- num_workers=int(num_workers) if num_workers is not None else None,
191
+ num_workers_per_gpu=int(num_workers_per_gpu) if num_workers_per_gpu is not None else None,
146
192
  num_preprocessing_workers=(
147
193
  int(cfg.speed.num_preprocessing_workers)
148
194
  if cfg.speed.num_preprocessing_workers is not None
@@ -151,7 +197,6 @@ class ExecutionOptions:
151
197
  num_gpus=1 if run_on_cpu else (int(configured_num_gpus) if configured_num_gpus is not None else None),
152
198
  precision="fp32" if run_on_cpu else requested_precision,
153
199
  prefetch_factor=prefetch_factor,
154
- persistent_workers=persistent_workers,
155
200
  save_tile_embeddings=bool(cfg.model.save_tile_embeddings),
156
201
  save_slide_embeddings=bool(cfg.model.save_slide_embeddings),
157
202
  save_latents=bool(cfg.model.save_latents),
@@ -174,23 +219,25 @@ class ExecutionOptions:
174
219
  object.__setattr__(self, "num_preprocessing_workers", capped_num_preprocessing_workers)
175
220
  logger = logging.getLogger(__name__)
176
221
  cap_source = f"slurm_cpu_limit={slurm_limit}" if slurm_limit is not None else f"cpu_count={cpu_count}"
177
- resolved_num_workers = self.resolved_num_workers()
178
- num_workers_label = (
222
+ resolved_num_workers = self.resolved_num_workers_per_gpu()
223
+ num_workers_per_gpu_label = (
179
224
  f"{resolved_num_workers} (requested=auto)"
180
- if self.num_workers is None
225
+ if self.num_workers_per_gpu is None
181
226
  else str(resolved_num_workers)
182
227
  )
183
228
  logger.info(
184
- "ExecutionOptions: num_workers=%s, num_preprocessing_workers=%d "
229
+ "ExecutionOptions: num_workers_per_gpu=%s, num_preprocessing_workers=%d "
185
230
  "(preprocessing cap=%d via %s)",
186
- num_workers_label,
231
+ num_workers_per_gpu_label,
187
232
  capped_num_preprocessing_workers,
188
233
  cap,
189
234
  cap_source,
190
235
  )
191
236
 
192
- def resolved_num_workers(self) -> int:
193
- return cpu_worker_limit() if self.num_workers is None else int(self.num_workers)
237
+ def resolved_num_workers_per_gpu(self) -> int:
238
+ if self.num_workers_per_gpu is not None:
239
+ return self.num_workers_per_gpu
240
+ return max(1, cpu_worker_limit() // self.num_gpus)
194
241
 
195
242
  def with_output_dir(self, output_dir: PathLike | None) -> "ExecutionOptions":
196
243
  if output_dir is None:
@@ -200,33 +247,60 @@ class ExecutionOptions:
200
247
 
201
248
  @dataclass(frozen=True, kw_only=True)
202
249
  class RunResult:
250
+ """Return value of :meth:`Pipeline.run`."""
251
+
252
+ #: Tile embedding artifacts written to disk.
203
253
  tile_artifacts: list[TileEmbeddingArtifact]
254
+ #: Hierarchical embedding artifacts; empty when hierarchical mode is disabled.
204
255
  hierarchical_artifacts: list[HierarchicalEmbeddingArtifact]
256
+ #: Slide embedding artifacts written to disk.
205
257
  slide_artifacts: list[SlideEmbeddingArtifact]
258
+ #: Patient embedding artifacts; empty when no patient-level model is used.
206
259
  patient_artifacts: list[PatientEmbeddingArtifact] = field(default_factory=list)
260
+ #: Path to ``process_list.csv``, which tracks processing status per sample.
207
261
  process_list_path: Path | None = None
208
262
 
209
263
 
210
264
  @dataclass(frozen=True, kw_only=True)
211
265
  class EmbeddedPatient:
266
+ """In-memory result of embedding a single patient."""
267
+
268
+ #: Unique patient identifier.
212
269
  patient_id: str
213
- patient_embedding: Any # torch.Tensor [D]
214
- slide_embeddings: dict[str, Any] # {sample_id: torch.Tensor [D]}
270
+ #: Aggregated patient embedding — :class:`torch.Tensor` of shape ``(D,)``.
271
+ patient_embedding: Any
272
+ #: Slide-level embeddings keyed by ``sample_id`` — each a :class:`torch.Tensor` of shape ``(D,)``.
273
+ slide_embeddings: dict[str, Any]
215
274
 
216
275
 
217
276
  @dataclass(frozen=True, kw_only=True)
218
277
  class EmbeddedSlide:
278
+ """In-memory result of embedding a single slide."""
279
+
280
+ #: Unique slide identifier.
219
281
  sample_id: str
282
+ #: Tile embeddings — :class:`torch.Tensor` of shape ``(N, D)``.
220
283
  tile_embeddings: Any
284
+ #: Slide-level embedding — :class:`torch.Tensor` of shape ``(D,)`` for
285
+ #: slide-level encoders; ``None`` for tile-only encoders.
221
286
  slide_embedding: Any | None
287
+ #: x coordinate (pixels at level 0) of each tile's top-left corner — array of shape ``(N,)``.
222
288
  x: Any
289
+ #: y coordinate (pixels at level 0) of each tile's top-left corner — array of shape ``(N,)``.
223
290
  y: Any
291
+ #: Tile side length in pixels at level 0.
224
292
  tile_size_lv0: int
293
+ #: Path to the source slide file.
225
294
  image_path: Path
295
+ #: Path to the tissue mask used for tiling, if any.
226
296
  mask_path: Path | None = None
297
+ #: Number of tiles extracted from the slide.
227
298
  num_tiles: int | None = None
299
+ #: Path to the mask preview image, if generated.
228
300
  mask_preview_path: Path | None = None
301
+ #: Path to the tiling preview image, if generated.
229
302
  tiling_preview_path: Path | None = None
303
+ #: Encoder latent representations when available; ``None`` otherwise.
230
304
  latents: Any | None = None
231
305
 
232
306
 
@@ -444,6 +518,27 @@ class Model:
444
518
  return self._backend
445
519
 
446
520
 
521
+ def list_models(level: str | None = None) -> list[str]:
522
+ """Return the available preset model names in a stable order.
523
+
524
+ Args:
525
+ level: Optional model level filter. Supported values are ``"tile"``,
526
+ ``"slide"``, and ``"patient"``.
527
+ """
528
+ if level is None:
529
+ return sorted(encoder_registry.names())
530
+
531
+ normalized_level = str(level).strip().lower()
532
+ if normalized_level not in {"tile", "slide", "patient"}:
533
+ raise ValueError("list_models(level=...) must be one of: tile, slide, patient")
534
+
535
+ return sorted(
536
+ name
537
+ for name in encoder_registry.names()
538
+ if encoder_registry.info(name)["level"] == normalized_level
539
+ )
540
+
541
+
447
542
  class Pipeline:
448
543
  def __init__(
449
544
  self,
@@ -0,0 +1,4 @@
1
+ from slide2vec.configs.resources import load_config
2
+
3
+
4
+ default_config = load_config("default")
@@ -38,12 +38,15 @@ tiling:
38
38
  # downsample controls which pyramid level is read for tissue segmentation.
39
39
  # Larger values are faster and use less memory; smaller values can improve mask precision.
40
40
  downsample: 64 # find the closest downsample in the slide for tissue segmentation
41
- sthresh: 8 # segmentation threshold (positive integer, using a higher threshold leads to less foreground and more background detection) (not used when use_otsu=True)
41
+ sthresh: 8 # segmentation threshold (positive integer, using a higher threshold leads to less foreground and more background detection) (not used when method="otsu")
42
42
  sthresh_up: 255 # upper threshold value for scaling the binary mask
43
43
  mthresh: 7 # median filter size (positive, odd integer)
44
44
  close: 4 # additional morphological closing to apply following initial thresholding (positive integer)
45
- use_otsu: false # use otsu's method instead of simple binary thresholding
46
- use_hsv: true # use HSV thresholding instead of simple binary thresholding
45
+ method: # tissue segmentation method: "hsv", "otsu", "threshold", or "sam2"; ignored when precomputed tissue masks are provided
46
+ sam2_checkpoint_path: # optional when method="sam2"; if empty, hs2p downloads the default AtlasPatch checkpoint from Hugging Face
47
+ sam2_config_path: # optional local override for the SAM2 model config; if empty, hs2p downloads the default AtlasPatch config from Hugging Face
48
+ sam2_device: "cpu" # device for SAM2 inference, e.g. "cpu", "cuda", or "cuda:0"
49
+ sam2_num_workers: # optional cap on concurrent SAM2 mask-resolution workers; set to 1 to serialize GPU inference and avoid CUDA OOMs
47
50
  filter_params:
48
51
  ref_tile_size: ${tiling.params.requested_tile_size_px} # reference tile size at the target spacing
49
52
  a_t: 4 # area filter threshold for tissue (positive integer, the minimum size of detected foreground contours to consider, relative to the reference tile size ref_tile_size, e.g. a value 10 means only detected foreground contours of size greater than 10 [ref_tile_size, ref_tile_size] tiles at spacing tiling.params.requested_spacing_um will be kept)
@@ -60,19 +63,19 @@ tiling:
60
63
  blur_threshold: 50.0 # minimum blur score (higher is sharper)
61
64
  qc_spacing_um: 2.0 # spacing at which pixel-based QC is evaluated
62
65
  preview:
63
- save: true # save preview images of slide tiling and mask overlays
66
+ save_mask_preview: true # save preview images of mask overlays
67
+ save_tiling_preview: true # save preview images of tile layouts
64
68
  downsample: 32 # downsample to use for preview rendering
65
- mask_overlay_color: [157, 219, 129] # RGB color used for tissue overlays in batch mask previews
69
+ tissue_contour_color: [157, 219, 129] # RGB color used for tissue contours in batch mask previews
66
70
  mask_overlay_alpha: 0.5 # alpha used for tissue overlays in batch mask previews
67
71
 
68
72
  speed:
69
73
  precision: # model inference precision ["fp32", "fp16", "bf16"]; if not set, determined automatically based on model recommendations
70
- num_dataloader_workers: # number of DataLoader worker processes for reading tiles during embedding; defaults to auto (job CPU budget, except cuCIM on-the-fly uses cpu_budget // speed.num_cucim_workers)
74
+ num_dataloader_workers: # number of DataLoader worker processes per GPU rank for reading tiles during embedding; defaults to auto (job CPU budget split across GPUs, except cuCIM on-the-fly uses per-GPU budget // speed.num_cucim_workers)
71
75
  num_gpus: # number of GPUs to use for feature extraction; defaults to all available GPUs
72
76
  num_preprocessing_workers: # number of workers for hs2p tiling (WSI reading, JPEG encoding, tar writing); defaults to the runtime CPU budget capped at 64
73
77
  num_cucim_workers: 4 # number of internal cucim threads per read_region call (embedding path, on-the-fly only); DataLoader workers are auto-set to the per-GPU CPU budget // num_cucim_workers
74
78
  prefetch_factor_embedding: 4 # prefetch factor for tile embedding dataloaders
75
- persistent_workers_embedding: true # keep DataLoader workers alive across epochs/batches
76
79
 
77
80
  wandb:
78
81
  enable: false
@@ -1,11 +1,10 @@
1
- from importlib.resources import as_file, files
1
+ from contextlib import contextmanager
2
2
  from pathlib import Path
3
3
  from typing import Iterator
4
- from contextlib import contextmanager
5
4
 
6
5
 
7
6
  def config_resource(*parts: str):
8
- path = files("slide2vec").joinpath("configs")
7
+ path = Path(__file__).resolve().parent
9
8
  for part in parts:
10
9
  path = path.joinpath(part)
11
10
  return path.with_suffix(".yaml")
@@ -21,6 +20,4 @@ def load_config(*parts: str):
21
20
 
22
21
  @contextmanager
23
22
  def config_path(*parts: str) -> Iterator[Path]:
24
- resource = config_resource(*parts)
25
- with as_file(resource) as resolved:
26
- yield resolved
23
+ yield config_resource(*parts)
@@ -26,11 +26,10 @@ def main(argv=None) -> int:
26
26
  _compute_tile_embeddings_for_slide,
27
27
  _is_hierarchical_preprocessing,
28
28
  _resolve_hierarchical_geometry,
29
- deserialize_execution,
30
- deserialize_preprocessing,
31
29
  load_successful_tiled_slides,
32
30
  )
33
31
  from slide2vec.progress import JsonlProgressReporter, activate_progress_reporter
32
+ from slide2vec.runtime.serialization import deserialize_execution, deserialize_preprocessing
34
33
 
35
34
  parser = get_args_parser(add_help=True)
36
35
  args = parser.parse_args(argv)
@@ -49,6 +48,7 @@ def main(argv=None) -> int:
49
48
  model_spec["name"],
50
49
  device=f"cuda:{local_rank}",
51
50
  output_variant=model_spec.get("output_variant"),
51
+ allow_non_recommended_settings=bool(model_spec["allow_non_recommended_settings"]),
52
52
  )
53
53
  preprocessing = deserialize_preprocessing(request["preprocessing"])
54
54
  execution = deserialize_execution(request["execution"])
@@ -119,20 +119,24 @@ def main(argv=None) -> int:
119
119
  return 0
120
120
  assigned_slides = [paired_by_sample[sample_id][0] for sample_id in assigned_ids]
121
121
  assigned_tiling_results = [paired_by_sample[sample_id][1] for sample_id in assigned_ids]
122
- embedded_slides = _compute_embedded_slides(
123
- model,
124
- assigned_slides,
125
- assigned_tiling_results,
126
- preprocessing=preprocessing,
127
- execution=execution,
128
- )
129
- for embedded_slide in embedded_slides:
122
+
123
+ def _persist_embedded_slide(slide, tiling_result, embedded_slide) -> None:
130
124
  payload = {
131
125
  "tile_embeddings": _to_cpu_payload(embedded_slide.tile_embeddings),
132
126
  "slide_embedding": _to_cpu_payload(embedded_slide.slide_embedding),
133
127
  "latents": _to_cpu_payload(embedded_slide.latents),
134
128
  }
135
129
  torch.save(payload, coordination_dir / f"{embedded_slide.sample_id}.embedded.pt")
130
+
131
+ _compute_embedded_slides(
132
+ model,
133
+ assigned_slides,
134
+ assigned_tiling_results,
135
+ preprocessing=preprocessing,
136
+ execution=execution,
137
+ on_embedded_slide=_persist_embedded_slide,
138
+ collect_results=False,
139
+ )
136
140
  return 0
137
141
  finally:
138
142
  if dist.is_available() and dist.is_initialized():