slide2vec 4.3.0__tar.gz → 4.5.0__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (95)
  1. {slide2vec-4.3.0 → slide2vec-4.5.0}/PKG-INFO +3 -3
  2. {slide2vec-4.3.0 → slide2vec-4.5.0}/pyproject.toml +4 -4
  3. {slide2vec-4.3.0 → slide2vec-4.5.0}/slide2vec/__init__.py +3 -1
  4. {slide2vec-4.3.0 → slide2vec-4.5.0}/slide2vec/api.py +92 -31
  5. {slide2vec-4.3.0 → slide2vec-4.5.0}/slide2vec/cli.py +5 -7
  6. {slide2vec-4.3.0 → slide2vec-4.5.0}/slide2vec/configs/default.yaml +3 -3
  7. {slide2vec-4.3.0 → slide2vec-4.5.0}/slide2vec/configs/resources.py +2 -6
  8. {slide2vec-4.3.0 → slide2vec-4.5.0}/slide2vec/distributed/direct_embed_worker.py +47 -22
  9. {slide2vec-4.3.0 → slide2vec-4.5.0}/slide2vec/distributed/pipeline_worker.py +30 -15
  10. {slide2vec-4.3.0 → slide2vec-4.5.0}/slide2vec/encoders/__init__.py +1 -1
  11. {slide2vec-4.3.0 → slide2vec-4.5.0}/slide2vec/encoders/validation.py +13 -9
  12. slide2vec-4.5.0/slide2vec/inference.py +919 -0
  13. {slide2vec-4.3.0 → slide2vec-4.5.0}/slide2vec/progress.py +56 -10
  14. slide2vec-4.5.0/slide2vec/runtime/artifacts_collect.py +155 -0
  15. {slide2vec-4.3.0 → slide2vec-4.5.0}/slide2vec/runtime/batching.py +58 -27
  16. slide2vec-4.5.0/slide2vec/runtime/cpu_budget.py +67 -0
  17. {slide2vec-4.3.0 → slide2vec-4.5.0}/slide2vec/runtime/distributed.py +8 -5
  18. slide2vec-4.5.0/slide2vec/runtime/distributed_stage.py +278 -0
  19. slide2vec-4.5.0/slide2vec/runtime/embedding_persist.py +131 -0
  20. slide2vec-4.5.0/slide2vec/runtime/embedding_pipeline.py +416 -0
  21. slide2vec-4.5.0/slide2vec/runtime/manifest.py +134 -0
  22. slide2vec-4.5.0/slide2vec/runtime/patient_pipeline.py +130 -0
  23. slide2vec-4.5.0/slide2vec/runtime/persist_callbacks.py +170 -0
  24. {slide2vec-4.3.0 → slide2vec-4.5.0}/slide2vec/runtime/persistence.py +33 -10
  25. slide2vec-4.5.0/slide2vec/runtime/process_list.py +301 -0
  26. {slide2vec-4.3.0 → slide2vec-4.5.0}/slide2vec/runtime/serialization.py +23 -20
  27. slide2vec-4.5.0/slide2vec/runtime/slide_encode.py +52 -0
  28. slide2vec-4.5.0/slide2vec/runtime/tiling_pipeline.py +196 -0
  29. slide2vec-4.5.0/slide2vec/runtime/worker_io.py +51 -0
  30. {slide2vec-4.3.0 → slide2vec-4.5.0}/slide2vec/utils/config.py +15 -24
  31. {slide2vec-4.3.0 → slide2vec-4.5.0}/slide2vec/utils/log_utils.py +1 -1
  32. {slide2vec-4.3.0 → slide2vec-4.5.0}/slide2vec/utils/tiling_io.py +34 -0
  33. {slide2vec-4.3.0 → slide2vec-4.5.0}/slide2vec.egg-info/PKG-INFO +3 -3
  34. {slide2vec-4.3.0 → slide2vec-4.5.0}/slide2vec.egg-info/SOURCES.txt +14 -1
  35. {slide2vec-4.3.0 → slide2vec-4.5.0}/slide2vec.egg-info/requires.txt +2 -2
  36. {slide2vec-4.3.0 → slide2vec-4.5.0}/tests/test_architecture_runtime_split.py +0 -1
  37. {slide2vec-4.3.0 → slide2vec-4.5.0}/tests/test_hs2p_package_cutover.py +69 -0
  38. slide2vec-4.5.0/tests/test_output_consistency.py +192 -0
  39. {slide2vec-4.3.0 → slide2vec-4.5.0}/tests/test_progress.py +212 -88
  40. {slide2vec-4.3.0 → slide2vec-4.5.0}/tests/test_regression_core.py +66 -58
  41. {slide2vec-4.3.0 → slide2vec-4.5.0}/tests/test_regression_inference.py +1064 -281
  42. slide2vec-4.5.0/tests/test_runtime_batching.py +33 -0
  43. slide2vec-4.3.0/slide2vec/inference.py +0 -2537
  44. slide2vec-4.3.0/slide2vec/main.py +0 -8
  45. {slide2vec-4.3.0 → slide2vec-4.5.0}/LICENSE +0 -0
  46. {slide2vec-4.3.0 → slide2vec-4.5.0}/README.md +0 -0
  47. {slide2vec-4.3.0 → slide2vec-4.5.0}/setup.cfg +0 -0
  48. {slide2vec-4.3.0 → slide2vec-4.5.0}/slide2vec/__main__.py +0 -0
  49. {slide2vec-4.3.0 → slide2vec-4.5.0}/slide2vec/artifacts.py +0 -0
  50. {slide2vec-4.3.0 → slide2vec-4.5.0}/slide2vec/configs/__init__.py +0 -0
  51. {slide2vec-4.3.0 → slide2vec-4.5.0}/slide2vec/data/__init__.py +0 -0
  52. {slide2vec-4.3.0 → slide2vec-4.5.0}/slide2vec/data/dataset.py +0 -0
  53. {slide2vec-4.3.0 → slide2vec-4.5.0}/slide2vec/data/tile_reader.py +0 -0
  54. {slide2vec-4.3.0 → slide2vec-4.5.0}/slide2vec/data/tile_store.py +0 -0
  55. {slide2vec-4.3.0 → slide2vec-4.5.0}/slide2vec/distributed/__init__.py +0 -0
  56. {slide2vec-4.3.0 → slide2vec-4.5.0}/slide2vec/encoders/base.py +0 -0
  57. {slide2vec-4.3.0 → slide2vec-4.5.0}/slide2vec/encoders/models/__init__.py +0 -0
  58. {slide2vec-4.3.0 → slide2vec-4.5.0}/slide2vec/encoders/models/conch.py +0 -0
  59. {slide2vec-4.3.0 → slide2vec-4.5.0}/slide2vec/encoders/models/gigapath.py +0 -0
  60. {slide2vec-4.3.0 → slide2vec-4.5.0}/slide2vec/encoders/models/hibou.py +0 -0
  61. {slide2vec-4.3.0 → slide2vec-4.5.0}/slide2vec/encoders/models/hoptimus.py +0 -0
  62. {slide2vec-4.3.0 → slide2vec-4.5.0}/slide2vec/encoders/models/lunit.py +0 -0
  63. {slide2vec-4.3.0 → slide2vec-4.5.0}/slide2vec/encoders/models/midnight.py +0 -0
  64. {slide2vec-4.3.0 → slide2vec-4.5.0}/slide2vec/encoders/models/moozy/__init__.py +0 -0
  65. {slide2vec-4.3.0 → slide2vec-4.5.0}/slide2vec/encoders/models/moozy/blocks.py +0 -0
  66. {slide2vec-4.3.0 → slide2vec-4.5.0}/slide2vec/encoders/models/moozy/case.py +0 -0
  67. {slide2vec-4.3.0 → slide2vec-4.5.0}/slide2vec/encoders/models/moozy/loading.py +0 -0
  68. {slide2vec-4.3.0 → slide2vec-4.5.0}/slide2vec/encoders/models/moozy/slide.py +0 -0
  69. {slide2vec-4.3.0 → slide2vec-4.5.0}/slide2vec/encoders/models/moozy/types.py +0 -0
  70. {slide2vec-4.3.0 → slide2vec-4.5.0}/slide2vec/encoders/models/musk.py +0 -0
  71. {slide2vec-4.3.0 → slide2vec-4.5.0}/slide2vec/encoders/models/phikon.py +0 -0
  72. {slide2vec-4.3.0 → slide2vec-4.5.0}/slide2vec/encoders/models/prism.py +0 -0
  73. {slide2vec-4.3.0 → slide2vec-4.5.0}/slide2vec/encoders/models/prost40m.py +0 -0
  74. {slide2vec-4.3.0 → slide2vec-4.5.0}/slide2vec/encoders/models/titan.py +0 -0
  75. {slide2vec-4.3.0 → slide2vec-4.5.0}/slide2vec/encoders/models/uni.py +0 -0
  76. {slide2vec-4.3.0 → slide2vec-4.5.0}/slide2vec/encoders/models/virchow.py +0 -0
  77. {slide2vec-4.3.0 → slide2vec-4.5.0}/slide2vec/encoders/registry.py +0 -0
  78. {slide2vec-4.3.0 → slide2vec-4.5.0}/slide2vec/runtime/__init__.py +0 -0
  79. {slide2vec-4.3.0 → slide2vec-4.5.0}/slide2vec/runtime/embedding.py +0 -0
  80. {slide2vec-4.3.0 → slide2vec-4.5.0}/slide2vec/runtime/hierarchical.py +0 -0
  81. {slide2vec-4.3.0 → slide2vec-4.5.0}/slide2vec/runtime/model_settings.py +0 -0
  82. {slide2vec-4.3.0 → slide2vec-4.5.0}/slide2vec/runtime/progress_bridge.py +0 -0
  83. {slide2vec-4.3.0 → slide2vec-4.5.0}/slide2vec/runtime/registry.py +0 -0
  84. {slide2vec-4.3.0 → slide2vec-4.5.0}/slide2vec/runtime/tiling.py +1 -1
  85. {slide2vec-4.3.0 → slide2vec-4.5.0}/slide2vec/runtime/types.py +0 -0
  86. {slide2vec-4.3.0 → slide2vec-4.5.0}/slide2vec/utils/__init__.py +0 -0
  87. {slide2vec-4.3.0 → slide2vec-4.5.0}/slide2vec/utils/coordinates.py +0 -0
  88. {slide2vec-4.3.0 → slide2vec-4.5.0}/slide2vec/utils/utils.py +0 -0
  89. {slide2vec-4.3.0 → slide2vec-4.5.0}/slide2vec.egg-info/dependency_links.txt +0 -0
  90. {slide2vec-4.3.0 → slide2vec-4.5.0}/slide2vec.egg-info/entry_points.txt +0 -0
  91. {slide2vec-4.3.0 → slide2vec-4.5.0}/slide2vec.egg-info/not-zip-safe +0 -0
  92. {slide2vec-4.3.0 → slide2vec-4.5.0}/slide2vec.egg-info/top_level.txt +0 -0
  93. {slide2vec-4.3.0 → slide2vec-4.5.0}/tests/test_encoder_registry.py +0 -0
  94. {slide2vec-4.3.0 → slide2vec-4.5.0}/tests/test_regression_models.py +0 -0
  95. {slide2vec-4.3.0 → slide2vec-4.5.0}/tests/test_tile_store.py +0 -0
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.4
2
2
  Name: slide2vec
3
- Version: 4.3.0
3
+ Version: 4.5.0
4
4
  Summary: Embedding of whole slide images with Foundation Models
5
5
  Author-email: Clément Grisi <clement.grisi@radboudumc.nl>
6
6
  License-Expression: Apache-2.0
@@ -15,7 +15,7 @@ Classifier: Programming Language :: Python :: 3.13
15
15
  Requires-Python: >=3.10
16
16
  Description-Content-Type: text/markdown
17
17
  License-File: LICENSE
18
- Requires-Dist: hs2p[asap,cucim,openslide,sam2,vips]>=4.0.0
18
+ Requires-Dist: hs2p[asap,cucim,openslide,sam2,vips]>=4.0.5
19
19
  Requires-Dist: omegaconf
20
20
  Requires-Dist: matplotlib
21
21
  Requires-Dist: numpy<2
@@ -65,7 +65,7 @@ Requires-Dist: numpy<2; extra == "fm"
65
65
  Requires-Dist: pandas; extra == "fm"
66
66
  Requires-Dist: pillow; extra == "fm"
67
67
  Requires-Dist: rich; extra == "fm"
68
- Requires-Dist: hs2p[asap,cucim,openslide,sam2,vips]>=4.0.0; extra == "fm"
68
+ Requires-Dist: hs2p[asap,cucim,openslide,sam2,vips]>=4.0.5; extra == "fm"
69
69
  Requires-Dist: wandb; extra == "fm"
70
70
  Requires-Dist: torch<2.8,>=2.3; extra == "fm"
71
71
  Requires-Dist: torchvision>=0.18.0; extra == "fm"
@@ -4,7 +4,7 @@ build-backend = "setuptools.build_meta"
4
4
 
5
5
  [project]
6
6
  name = "slide2vec"
7
- version = "4.3.0"
7
+ version = "4.5.0"
8
8
  description = "Embedding of whole slide images with Foundation Models"
9
9
  readme = "README.md"
10
10
  requires-python = ">=3.10"
@@ -21,7 +21,7 @@ classifiers = [
21
21
  "Programming Language :: Python :: 3.13",
22
22
  ]
23
23
  dependencies = [
24
- "hs2p[asap,cucim,openslide,sam2,vips]>=4.0.0",
24
+ "hs2p[asap,cucim,openslide,sam2,vips]>=4.0.5",
25
25
  "omegaconf",
26
26
  "matplotlib",
27
27
  "numpy<2",
@@ -88,7 +88,7 @@ fm = [
88
88
  "pandas",
89
89
  "pillow",
90
90
  "rich",
91
- "hs2p[asap,cucim,openslide,sam2,vips]>=4.0.0",
91
+ "hs2p[asap,cucim,openslide,sam2,vips]>=4.0.5",
92
92
  "wandb",
93
93
  "torch>=2.3,<2.8",
94
94
  "torchvision>=0.18.0",
@@ -164,7 +164,7 @@ no_implicit_reexport = true
164
164
  max-line-length = 160
165
165
 
166
166
  [tool.bumpver]
167
- current_version = "4.3.0"
167
+ current_version = "4.5.0"
168
168
  version_pattern = "MAJOR.MINOR.PATCH"
169
169
  commit = false # We do version bumping in CI, not as a commit
170
170
  tag = false # Git tag already exists — we don't auto-tag
@@ -1,4 +1,5 @@
1
1
  from slide2vec.api import (
2
+ EmbeddedPatient,
2
3
  EmbeddedSlide,
3
4
  ExecutionOptions,
4
5
  Model,
@@ -10,7 +11,7 @@ from slide2vec.api import (
10
11
  from slide2vec.artifacts import HierarchicalEmbeddingArtifact, SlideEmbeddingArtifact, TileEmbeddingArtifact
11
12
 
12
13
 
13
- __version__ = "4.3.0"
14
+ __version__ = "4.5.0"
14
15
 
15
16
  __all__ = [
16
17
  "Model",
@@ -19,6 +20,7 @@ __all__ = [
19
20
  "PreprocessingConfig",
20
21
  "ExecutionOptions",
21
22
  "RunResult",
23
+ "EmbeddedPatient",
22
24
  "EmbeddedSlide",
23
25
  "SlideEmbeddingArtifact",
24
26
  "HierarchicalEmbeddingArtifact",
@@ -42,25 +42,53 @@ TilingResultsInput = Sequence[Any] | Mapping[str, Any]
42
42
 
43
43
  @dataclass(frozen=True, kw_only=True)
44
44
  class PreprocessingConfig:
45
+ """Configuration for slide tiling and preprocessing."""
46
+
47
+ #: Slide reading backend. ``"auto"`` tries cucim → openslide → vips in order.
48
+ #: Explicit choices: ``"cucim"``, ``"openslide"``, ``"vips"``, ``"asap"``.
45
49
  backend: str = "auto"
50
+ #: Target spacing in µm/px. Resolved from the model preset when ``None``.
46
51
  requested_spacing_um: float | None = None
52
+ #: Tile side length in pixels at *requested_spacing_um*.
53
+ #: Resolved from the model preset when ``None``.
47
54
  requested_tile_size_px: int | None = None
55
+ #: Parent region side length in pixels (hierarchical mode).
56
+ #: Auto-derived as ``requested_tile_size_px × region_tile_multiple`` when ``None``.
48
57
  requested_region_size_px: int | None = None
58
+ #: Region grid width/height in tiles (e.g. ``6`` → 6×6 = 36 tiles per region).
59
+ #: Enables hierarchical extraction when set; must be ≥ 2.
49
60
  region_tile_multiple: int | None = None
61
+ #: Relative spacing tolerance for pyramid level selection (default ``0.05``).
50
62
  tolerance: float = 0.05
63
+ #: Fractional tile overlap (``0.0`` = no overlap).
51
64
  overlap: float = 0.0
65
+ #: Minimum tissue fraction required to keep a tile (default ``0.01``).
52
66
  tissue_threshold: float = 0.01
67
+ #: Directory containing pre-extracted tile coordinates to reuse, skipping tiling.
53
68
  read_coordinates_from: Path | None = None
69
+ #: Directory containing pre-extracted tile images to skip the tiling step entirely.
54
70
  read_tiles_from: Path | None = None
71
+ #: Read and decode tiles on demand rather than pre-loading into memory.
55
72
  on_the_fly: bool = True
73
+ #: Decode tiles on the GPU via CuCIM / nvImageCodec when ``True``.
56
74
  gpu_decode: bool = False
75
+ #: Dynamically adjust batch size based on tile count.
57
76
  adaptive_batching: bool = False
77
+ #: Group adjacent tiles into supertile batches for faster I/O.
58
78
  use_supertiles: bool = True
79
+ #: JPEG decode library — ``"turbojpeg"`` (default) or ``"pillow"``.
59
80
  jpeg_backend: str = "turbojpeg"
81
+ #: Number of CuCIM reader threads.
60
82
  num_cucim_workers: int = 4
83
+ #: Skip slides already present in the output directory when ``True``.
61
84
  resume: bool = False
85
+ #: Forwarded to hs2p segmentation config. Supported keys: ``method``,
86
+ #: ``downsample``, ``sam2_device``. See :doc:`preprocessing` for details.
62
87
  segmentation: dict[str, Any] = field(default_factory=dict)
88
+ #: Forwarded to hs2p tile-filtering config.
63
89
  filtering: dict[str, Any] = field(default_factory=dict)
90
+ #: Controls whether hs2p writes mask and tiling preview images.
91
+ #: Keys: ``save_mask_preview``, ``save_tiling_preview``, ``downsample``.
64
92
  preview: dict[str, Any] = field(default_factory=dict)
65
93
 
66
94
  @classmethod
@@ -83,20 +111,14 @@ class PreprocessingConfig:
83
111
  int(channel) for channel in preview_cfg.tissue_contour_color
84
112
  )
85
113
  preview_kwargs["mask_overlay_alpha"] = float(preview_cfg.mask_overlay_alpha)
114
+ region_size_px = getattr(tiling.params, "requested_region_size_px", None)
115
+ region_tile_multiple = getattr(tiling.params, "region_tile_multiple", None)
86
116
  return cls(
87
117
  backend=tiling.backend,
88
118
  requested_spacing_um=float(tiling.params.requested_spacing_um),
89
119
  requested_tile_size_px=int(tiling.params.requested_tile_size_px),
90
- requested_region_size_px=(
91
- int(v)
92
- if (v := getattr(tiling.params, "requested_region_size_px", None)) is not None
93
- else None
94
- ),
95
- region_tile_multiple=(
96
- int(v)
97
- if (v := getattr(tiling.params, "region_tile_multiple", None)) is not None
98
- else None
99
- ),
120
+ requested_region_size_px=int(region_size_px) if region_size_px is not None else None,
121
+ region_tile_multiple=int(region_tile_multiple) if region_tile_multiple is not None else None,
100
122
  tolerance=float(tiling.params.tolerance),
101
123
  overlap=float(tiling.params.overlap),
102
124
  tissue_threshold=float(tiling.params.tissue_threshold),
@@ -123,31 +145,44 @@ class PreprocessingConfig:
123
145
 
124
146
  @dataclass(frozen=True, kw_only=True)
125
147
  class ExecutionOptions:
148
+ """Runtime execution and output settings."""
149
+
150
+ #: Directory where artifacts are written. Required for :class:`Pipeline` runs.
126
151
  output_dir: Path | None = None
152
+ #: Tensor serialization format — ``"pt"`` (PyTorch, default) or ``"npz"`` (NumPy).
127
153
  output_format: str = "pt"
128
- batch_size: int = 1
129
- num_workers: int | None = None
154
+ #: Number of tiles per forward pass.
155
+ batch_size: int = 32
156
+ #: DataLoader worker count per GPU rank. ``None`` means auto
157
+ #: (capped by CPU / SLURM limit, then split across the resolved GPU count).
158
+ num_workers_per_gpu: int | None = None
159
+ #: Tiling worker count. ``None`` means auto (capped by CPU / SLURM limit).
130
160
  num_preprocessing_workers: int | None = None
161
+ #: Number of GPUs to use. ``None`` defaults to all available GPUs.
131
162
  num_gpus: int | None = None
163
+ #: Forward-pass dtype — ``"fp16"``, ``"bf16"``, ``"fp32"``,
164
+ #: or ``None`` (auto-determined from the model preset).
132
165
  precision: str | None = None
166
+ #: DataLoader prefetch queue depth per worker (default ``4``).
133
167
  prefetch_factor: int = 4
134
- persistent_workers: bool = True
168
+ #: Persist tile embeddings to disk when running a slide-level model.
135
169
  save_tile_embeddings: bool = False
170
+ #: Persist slide embeddings to disk when running a patient-level model.
136
171
  save_slide_embeddings: bool = False
172
+ #: Persist encoder latent representations when available.
137
173
  save_latents: bool = False
138
174
 
139
175
  @classmethod
140
176
  def from_config(cls, cfg: Any, *, run_on_cpu: bool = False) -> "ExecutionOptions":
141
177
  configured_num_gpus = cfg.speed.num_gpus
142
178
  requested_precision = normalize_precision_name(cfg.speed.precision)
143
- num_workers = cfg.speed.num_dataloader_workers
179
+ num_workers_per_gpu = cfg.speed.num_dataloader_workers
144
180
  prefetch_factor = int(cfg.speed.prefetch_factor_embedding)
145
- persistent_workers = bool(cfg.speed.persistent_workers_embedding)
146
181
  return cls(
147
182
  output_dir=Path(cfg.output_dir),
148
183
  output_format="pt",
149
184
  batch_size=int(cfg.model.batch_size),
150
- num_workers=int(num_workers) if num_workers is not None else None,
185
+ num_workers_per_gpu=int(num_workers_per_gpu) if num_workers_per_gpu is not None else None,
151
186
  num_preprocessing_workers=(
152
187
  int(cfg.speed.num_preprocessing_workers)
153
188
  if cfg.speed.num_preprocessing_workers is not None
@@ -156,7 +191,6 @@ class ExecutionOptions:
156
191
  num_gpus=1 if run_on_cpu else (int(configured_num_gpus) if configured_num_gpus is not None else None),
157
192
  precision="fp32" if run_on_cpu else requested_precision,
158
193
  prefetch_factor=prefetch_factor,
159
- persistent_workers=persistent_workers,
160
194
  save_tile_embeddings=bool(cfg.model.save_tile_embeddings),
161
195
  save_slide_embeddings=bool(cfg.model.save_slide_embeddings),
162
196
  save_latents=bool(cfg.model.save_latents),
@@ -179,23 +213,25 @@ class ExecutionOptions:
179
213
  object.__setattr__(self, "num_preprocessing_workers", capped_num_preprocessing_workers)
180
214
  logger = logging.getLogger(__name__)
181
215
  cap_source = f"slurm_cpu_limit={slurm_limit}" if slurm_limit is not None else f"cpu_count={cpu_count}"
182
- resolved_num_workers = self.resolved_num_workers()
183
- num_workers_label = (
216
+ resolved_num_workers = self.resolved_num_workers_per_gpu()
217
+ num_workers_per_gpu_label = (
184
218
  f"{resolved_num_workers} (requested=auto)"
185
- if self.num_workers is None
219
+ if self.num_workers_per_gpu is None
186
220
  else str(resolved_num_workers)
187
221
  )
188
222
  logger.info(
189
- "ExecutionOptions: num_workers=%s, num_preprocessing_workers=%d "
223
+ "ExecutionOptions: num_workers_per_gpu=%s, num_preprocessing_workers=%d "
190
224
  "(preprocessing cap=%d via %s)",
191
- num_workers_label,
225
+ num_workers_per_gpu_label,
192
226
  capped_num_preprocessing_workers,
193
227
  cap,
194
228
  cap_source,
195
229
  )
196
230
 
197
- def resolved_num_workers(self) -> int:
198
- return cpu_worker_limit() if self.num_workers is None else int(self.num_workers)
231
+ def resolved_num_workers_per_gpu(self) -> int:
232
+ if self.num_workers_per_gpu is not None:
233
+ return self.num_workers_per_gpu
234
+ return max(1, cpu_worker_limit() // self.num_gpus)
199
235
 
200
236
  def with_output_dir(self, output_dir: PathLike | None) -> "ExecutionOptions":
201
237
  if output_dir is None:
@@ -205,33 +241,60 @@ class ExecutionOptions:
205
241
 
206
242
  @dataclass(frozen=True, kw_only=True)
207
243
  class RunResult:
244
+ """Return value of :meth:`Pipeline.run`."""
245
+
246
+ #: Tile embedding artifacts written to disk.
208
247
  tile_artifacts: list[TileEmbeddingArtifact]
248
+ #: Hierarchical embedding artifacts; empty when hierarchical mode is disabled.
209
249
  hierarchical_artifacts: list[HierarchicalEmbeddingArtifact]
250
+ #: Slide embedding artifacts written to disk.
210
251
  slide_artifacts: list[SlideEmbeddingArtifact]
252
+ #: Patient embedding artifacts; empty when no patient-level model is used.
211
253
  patient_artifacts: list[PatientEmbeddingArtifact] = field(default_factory=list)
254
+ #: Path to ``process_list.csv``, which tracks processing status per sample.
212
255
  process_list_path: Path | None = None
213
256
 
214
257
 
215
258
  @dataclass(frozen=True, kw_only=True)
216
259
  class EmbeddedPatient:
260
+ """In-memory result of embedding a single patient."""
261
+
262
+ #: Unique patient identifier.
217
263
  patient_id: str
218
- patient_embedding: Any # torch.Tensor [D]
219
- slide_embeddings: dict[str, Any] # {sample_id: torch.Tensor [D]}
264
+ #: Aggregated patient embedding — :class:`torch.Tensor` of shape ``(D,)``.
265
+ patient_embedding: Any
266
+ #: Slide-level embeddings keyed by ``sample_id`` — each a :class:`torch.Tensor` of shape ``(D,)``.
267
+ slide_embeddings: dict[str, Any]
220
268
 
221
269
 
222
270
  @dataclass(frozen=True, kw_only=True)
223
271
  class EmbeddedSlide:
272
+ """In-memory result of embedding a single slide."""
273
+
274
+ #: Unique slide identifier.
224
275
  sample_id: str
276
+ #: Tile embeddings — :class:`torch.Tensor` of shape ``(N, D)``.
225
277
  tile_embeddings: Any
278
+ #: Slide-level embedding — :class:`torch.Tensor` of shape ``(D,)`` for
279
+ #: slide-level encoders; ``None`` for tile-only encoders.
226
280
  slide_embedding: Any | None
281
+ #: x coordinate (pixels at level 0) of each tile's top-left corner — array of shape ``(N,)``.
227
282
  x: Any
283
+ #: y coordinate (pixels at level 0) of each tile's top-left corner — array of shape ``(N,)``.
228
284
  y: Any
285
+ #: Tile side length in pixels at level 0.
229
286
  tile_size_lv0: int
287
+ #: Path to the source slide file.
230
288
  image_path: Path
289
+ #: Path to the tissue mask used for tiling, if any.
231
290
  mask_path: Path | None = None
291
+ #: Number of tiles extracted from the slide.
232
292
  num_tiles: int | None = None
293
+ #: Path to the mask preview image, if generated.
233
294
  mask_preview_path: Path | None = None
295
+ #: Path to the tiling preview image, if generated.
234
296
  tiling_preview_path: Path | None = None
297
+ #: Encoder latent representations when available; ``None`` otherwise.
235
298
  latents: Any | None = None
236
299
 
237
300
 
@@ -546,11 +609,9 @@ def _require_output_dir_for_persistence(execution: ExecutionOptions, *, method_n
546
609
 
547
610
 
548
611
  def _recommended_execution_precision(model: Model | None) -> str:
549
- name = None if model is None else model.name
550
- if name and name in encoder_registry:
551
- info = encoder_registry.info(name)
552
- return info["precision"] if "precision" in info and info["precision"] is not None else "fp32"
553
- return "fp32"
612
+ if model is None or model.name not in encoder_registry:
613
+ return "fp32"
614
+ return encoder_registry.info(model.name).get("precision") or "fp32"
554
615
 
555
616
 
556
617
  def _resolve_direct_api_preprocessing(
@@ -1,8 +1,8 @@
1
1
  import argparse
2
2
 
3
3
  from slide2vec.api import ExecutionOptions, Model, Pipeline, PreprocessingConfig
4
+ from slide2vec.progress import activate_progress_reporter, create_cli_progress_reporter
4
5
  from slide2vec.utils.config import setup, hf_login
5
- import slide2vec.progress as progress
6
6
 
7
7
 
8
8
  def get_args_parser(add_help: bool = True):
@@ -27,10 +27,8 @@ def build_model_and_pipeline(args):
27
27
  hf_login()
28
28
  model = Model.from_preset(
29
29
  cfg.model.name,
30
- output_variant=getattr(cfg.model, "output_variant", None),
31
- allow_non_recommended_settings=bool(
32
- getattr(cfg.model, "allow_non_recommended_settings", False)
33
- ),
30
+ output_variant=cfg.model.output_variant,
31
+ allow_non_recommended_settings=bool(cfg.model.allow_non_recommended_settings),
34
32
  device="cpu" if args.run_on_cpu else "auto",
35
33
  )
36
34
  preprocessing = PreprocessingConfig.from_config(cfg)
@@ -42,8 +40,8 @@ def build_model_and_pipeline(args):
42
40
  def main(argv=None):
43
41
  args = parse_args(argv)
44
42
  pipeline, cfg = build_model_and_pipeline(args)
45
- reporter = progress.create_cli_progress_reporter(output_dir=getattr(cfg, "output_dir", None))
46
- with progress.activate_progress_reporter(reporter):
43
+ reporter = create_cli_progress_reporter(output_dir=cfg.output_dir)
44
+ with activate_progress_reporter(reporter):
47
45
  return pipeline.run(
48
46
  manifest_path=cfg.csv,
49
47
  tiling_only=args.tiling_only,
@@ -42,10 +42,11 @@ tiling:
42
42
  sthresh_up: 255 # upper threshold value for scaling the binary mask
43
43
  mthresh: 7 # median filter size (positive, odd integer)
44
44
  close: 4 # additional morphological closing to apply following initial thresholding (positive integer)
45
- method: "hsv" # tissue segmentation method: "hsv", "otsu", "threshold", or "sam2"
45
+ method: # tissue segmentation method: "hsv", "otsu", "threshold", or "sam2"; ignored when precomputed tissue masks are provided
46
46
  sam2_checkpoint_path: # optional when method="sam2"; if empty, hs2p downloads the default AtlasPatch checkpoint from Hugging Face
47
47
  sam2_config_path: # optional local override for the SAM2 model config; if empty, hs2p downloads the default AtlasPatch config from Hugging Face
48
48
  sam2_device: "cpu" # device for SAM2 inference, e.g. "cpu", "cuda", or "cuda:0"
49
+ sam2_num_workers: # optional cap on concurrent SAM2 mask-resolution workers; set to 1 to serialize GPU inference and avoid CUDA OOMs
49
50
  filter_params:
50
51
  ref_tile_size: ${tiling.params.requested_tile_size_px} # reference tile size at the target spacing
51
52
  a_t: 4 # area filter threshold for tissue (positive integer, the minimum size of detected foreground contours to consider, relative to the reference tile size ref_tile_size, e.g. a value 10 means only detected foreground contours of size greater than 10 [ref_tile_size, ref_tile_size] tiles at spacing tiling.params.requested_spacing_um will be kept)
@@ -70,12 +71,11 @@ tiling:
70
71
 
71
72
  speed:
72
73
  precision: # model inference precision ["fp32", "fp16", "bf16"]; if not set, determined automatically based on model recommendations
73
- num_dataloader_workers: # number of DataLoader worker processes for reading tiles during embedding; defaults to auto (job CPU budget, except cuCIM on-the-fly uses cpu_budget // speed.num_cucim_workers)
74
+ num_dataloader_workers: # number of DataLoader worker processes per GPU rank for reading tiles during embedding; defaults to auto (job CPU budget split across GPUs, except cuCIM on-the-fly uses per-GPU budget // speed.num_cucim_workers)
74
75
  num_gpus: # number of GPUs to use for feature extraction; defaults to all available GPUs
75
76
  num_preprocessing_workers: # number of workers for hs2p tiling (WSI reading, JPEG encoding, tar writing); defaults to the runtime CPU budget capped at 64
76
77
  num_cucim_workers: 4 # number of internal cucim threads per read_region call (embedding path, on-the-fly only); DataLoader workers are auto-set to cpu_count // num_cucim_workers
77
78
  prefetch_factor_embedding: 4 # prefetch factor for tile embedding dataloaders
78
- persistent_workers_embedding: true # keep DataLoader workers alive across epochs/batches
79
79
 
80
80
  wandb:
81
81
  enable: false
@@ -1,11 +1,10 @@
1
1
  from contextlib import contextmanager
2
- from importlib.resources import as_file, files
3
2
  from pathlib import Path
4
3
  from typing import Iterator
5
4
 
6
5
 
7
6
  def config_resource(*parts: str):
8
- path = files("slide2vec").joinpath("configs")
7
+ path = Path(__file__).resolve().parent
9
8
  for part in parts:
10
9
  path = path.joinpath(part)
11
10
  return path.with_suffix(".yaml")
@@ -21,7 +20,4 @@ def load_config(*parts: str):
21
20
 
22
21
  @contextmanager
23
22
  def config_path(*parts: str) -> Iterator[Path]:
24
- resource = config_resource(*parts)
25
- with as_file(resource) as resolved:
26
- yield resolved
27
-
23
+ yield config_resource(*parts)
@@ -18,15 +18,12 @@ def main(argv=None) -> int:
18
18
  import torch.distributed as dist
19
19
 
20
20
  import slide2vec.distributed as distributed
21
+ import slide2vec.inference as inference
21
22
  from slide2vec.api import Model
22
- from slide2vec.inference import (
23
- _build_hierarchical_index,
24
- _compute_embedded_slides,
25
- _compute_hierarchical_embedding_shard_for_slide,
26
- _compute_tile_embeddings_for_slide,
27
- _is_hierarchical_preprocessing,
28
- _resolve_hierarchical_geometry,
29
- load_successful_tiled_slides,
23
+ from slide2vec.runtime.hierarchical import (
24
+ build_hierarchical_index,
25
+ is_hierarchical_preprocessing,
26
+ resolve_hierarchical_geometry,
30
27
  )
31
28
  from slide2vec.progress import JsonlProgressReporter, activate_progress_reporter
32
29
  from slide2vec.runtime.serialization import deserialize_execution, deserialize_preprocessing
@@ -52,7 +49,10 @@ def main(argv=None) -> int:
52
49
  )
53
50
  preprocessing = deserialize_preprocessing(request["preprocessing"])
54
51
  execution = deserialize_execution(request["execution"])
55
- slide_records, tiling_results = load_successful_tiled_slides(output_dir)
52
+ load_successful_tiled_slides_fn = getattr(inference, "load_successful_tiled_slides", None)
53
+ if not callable(load_successful_tiled_slides_fn):
54
+ from slide2vec.runtime.manifest import load_successful_tiled_slides as load_successful_tiled_slides_fn
55
+ slide_records, tiling_results = load_successful_tiled_slides_fn(output_dir)
56
56
  paired_by_sample = {
57
57
  slide.sample_id: (slide, tiling_result)
58
58
  for slide, tiling_result in zip(slide_records, tiling_results)
@@ -74,15 +74,24 @@ def main(argv=None) -> int:
74
74
  sample_id = request["sample_id"]
75
75
  slide, tiling_result = paired_by_sample[sample_id]
76
76
  loaded = model._load_backend()
77
- if _is_hierarchical_preprocessing(preprocessing):
78
- geometry = _resolve_hierarchical_geometry(preprocessing, tiling_result)
79
- index = _build_hierarchical_index(
77
+ if is_hierarchical_preprocessing(preprocessing):
78
+ geometry = resolve_hierarchical_geometry(preprocessing, tiling_result)
79
+ index = build_hierarchical_index(
80
80
  tiling_result,
81
81
  region_tile_multiple=int(preprocessing.region_tile_multiple),
82
82
  tile_size_lv0=int(geometry["tile_size_lv0"]),
83
83
  )
84
84
  flat_indices = np.array_split(index.flat_index, world_size)[global_rank]
85
- shard_indices, tile_embeddings = _compute_hierarchical_embedding_shard_for_slide(
85
+ compute_hierarchical_embedding_shard_for_slide_fn = getattr(
86
+ inference,
87
+ "_compute_hierarchical_embedding_shard_for_slide",
88
+ None,
89
+ )
90
+ if not callable(compute_hierarchical_embedding_shard_for_slide_fn):
91
+ from slide2vec.runtime.embedding_pipeline import (
92
+ compute_hierarchical_embedding_shard_for_slide as compute_hierarchical_embedding_shard_for_slide_fn,
93
+ )
94
+ shard_indices, tile_embeddings = compute_hierarchical_embedding_shard_for_slide_fn(
86
95
  loaded,
87
96
  slide,
88
97
  tiling_result,
@@ -98,7 +107,16 @@ def main(argv=None) -> int:
98
107
  else:
99
108
  num_tiles = len(tiling_result.x)
100
109
  tile_indices = np.array_split(np.arange(num_tiles, dtype=np.int64), world_size)[global_rank]
101
- tile_embeddings = _compute_tile_embeddings_for_slide(
110
+ compute_tile_embeddings_for_slide_fn = getattr(
111
+ inference,
112
+ "_compute_tile_embeddings_for_slide",
113
+ None,
114
+ )
115
+ if not callable(compute_tile_embeddings_for_slide_fn):
116
+ from slide2vec.runtime.embedding_pipeline import (
117
+ compute_tile_embeddings_for_slide as compute_tile_embeddings_for_slide_fn,
118
+ )
119
+ tile_embeddings = compute_tile_embeddings_for_slide_fn(
102
120
  loaded,
103
121
  model,
104
122
  slide,
@@ -119,20 +137,27 @@ def main(argv=None) -> int:
119
137
  return 0
120
138
  assigned_slides = [paired_by_sample[sample_id][0] for sample_id in assigned_ids]
121
139
  assigned_tiling_results = [paired_by_sample[sample_id][1] for sample_id in assigned_ids]
122
- embedded_slides = _compute_embedded_slides(
123
- model,
124
- assigned_slides,
125
- assigned_tiling_results,
126
- preprocessing=preprocessing,
127
- execution=execution,
128
- )
129
- for embedded_slide in embedded_slides:
140
+
141
+ def _persist_embedded_slide(slide, tiling_result, embedded_slide) -> None:
130
142
  payload = {
131
143
  "tile_embeddings": _to_cpu_payload(embedded_slide.tile_embeddings),
132
144
  "slide_embedding": _to_cpu_payload(embedded_slide.slide_embedding),
133
145
  "latents": _to_cpu_payload(embedded_slide.latents),
134
146
  }
135
147
  torch.save(payload, coordination_dir / f"{embedded_slide.sample_id}.embedded.pt")
148
+
149
+ compute_embedded_slides_fn = getattr(inference, "_compute_embedded_slides", None)
150
+ if not callable(compute_embedded_slides_fn):
151
+ from slide2vec.runtime.embedding_pipeline import compute_embedded_slides as compute_embedded_slides_fn
152
+ compute_embedded_slides_fn(
153
+ model,
154
+ assigned_slides,
155
+ assigned_tiling_results,
156
+ preprocessing=preprocessing,
157
+ execution=execution,
158
+ on_embedded_slide=_persist_embedded_slide,
159
+ collect_results=False,
160
+ )
136
161
  return 0
137
162
  finally:
138
163
  if dist.is_available() and dist.is_initialized():
@@ -17,12 +17,9 @@ def main(argv=None) -> int:
17
17
  import torch.distributed as dist
18
18
 
19
19
  import slide2vec.distributed as distributed
20
+ import slide2vec.inference as inference
20
21
  from slide2vec.api import Model
21
- from slide2vec.inference import (
22
- _compute_embedded_slides,
23
- _persist_embedded_slide,
24
- load_successful_tiled_slides,
25
- )
22
+ from slide2vec.runtime.persist_callbacks import build_incremental_persist_callback
26
23
  from slide2vec.progress import JsonlProgressReporter, activate_progress_reporter
27
24
  from slide2vec.runtime.serialization import deserialize_execution, deserialize_preprocessing
28
25
 
@@ -46,7 +43,21 @@ def main(argv=None) -> int:
46
43
  )
47
44
  preprocessing = deserialize_preprocessing(request["preprocessing"])
48
45
  execution = deserialize_execution(request["execution"])
49
- slide_records, tiling_results = load_successful_tiled_slides(output_dir)
46
+ tiling_input_dir = Path(request.get("tiling_input_dir", str(output_dir)))
47
+ load_successful_tiled_slides_fn = getattr(inference, "load_successful_tiled_slides", None)
48
+ if not callable(load_successful_tiled_slides_fn):
49
+ from slide2vec.runtime.manifest import load_successful_tiled_slides as load_successful_tiled_slides_fn
50
+ slide_records, tiling_results = load_successful_tiled_slides_fn(tiling_input_dir)
51
+ requested_sample_ids = request.get("sample_ids")
52
+ if requested_sample_ids is not None:
53
+ requested_sample_id_set = {str(sample_id) for sample_id in requested_sample_ids}
54
+ paired = [
55
+ (slide, tiling_result)
56
+ for slide, tiling_result in zip(slide_records, tiling_results)
57
+ if slide.sample_id in requested_sample_id_set
58
+ ]
59
+ slide_records = [slide for slide, _ in paired]
60
+ tiling_results = [tiling_result for _, tiling_result in paired]
50
61
  assignments = assign_slides_to_ranks(slide_records, tiling_results, num_gpus=world_size)
51
62
  assigned_ids = assignments.get(global_rank, [])
52
63
  if not assigned_ids:
@@ -69,21 +80,25 @@ def main(argv=None) -> int:
69
80
  )
70
81
  context = activate_progress_reporter(reporter) if reporter is not None else nullcontext()
71
82
  with context:
72
- embedded_slides = _compute_embedded_slides(
83
+ build_incremental_persist_callback_fn = getattr(inference, "_build_incremental_persist_callback", build_incremental_persist_callback)
84
+ persist_callback, _, _ = build_incremental_persist_callback_fn(
85
+ model=model,
86
+ preprocessing=preprocessing,
87
+ execution=execution,
88
+ process_list_path=None,
89
+ )
90
+ compute_embedded_slides_fn = getattr(inference, "_compute_embedded_slides", None)
91
+ if not callable(compute_embedded_slides_fn):
92
+ from slide2vec.runtime.embedding_pipeline import compute_embedded_slides as compute_embedded_slides_fn
93
+ compute_embedded_slides_fn(
73
94
  model,
74
95
  assigned_slides,
75
96
  assigned_tiling_results,
76
97
  preprocessing=preprocessing,
77
98
  execution=execution,
99
+ on_embedded_slide=persist_callback,
100
+ collect_results=False,
78
101
  )
79
- for embedded_slide, tiling_result in zip(embedded_slides, assigned_tiling_results):
80
- _persist_embedded_slide(
81
- model,
82
- embedded_slide,
83
- tiling_result,
84
- preprocessing=preprocessing,
85
- execution=execution,
86
- )
87
102
  return 0
88
103
  finally:
89
104
  if dist.is_available() and dist.is_initialized():
@@ -21,7 +21,7 @@ from slide2vec.encoders.registry import (
21
21
  )
22
22
 
23
23
  # Trigger registration of all built-in encoders.
24
- from slide2vec.encoders import models as _models_pkg # noqa: F401
24
+ from slide2vec.encoders import models # noqa: F401
25
25
 
26
26
  __all__ = [
27
27
  "Encoder",
@@ -63,14 +63,18 @@ def validate_encoder_config(
63
63
  if not mismatches:
64
64
  return
65
65
 
66
- message = (
67
- f"Model '{encoder_name}' is configured with "
68
- f"{'; '.join(mismatches)}. "
69
- "Set `model.allow_non_recommended_settings=true` in YAML/CLI or "
70
- "`allow_non_recommended_settings=True` in `Model.from_preset(...)` "
71
- "to continue with a warning."
72
- )
73
66
  if allow_non_recommended:
74
- logger.warning(message)
67
+ logger.warning(
68
+ f"Model '{encoder_name}' is configured with "
69
+ f"{'; '.join(mismatches)}. "
70
+ "Warning-only mode is enabled because "
71
+ "`allow_non_recommended_settings=True`."
72
+ )
75
73
  else:
76
- raise ValueError(message)
74
+ raise ValueError(
75
+ f"Model '{encoder_name}' is configured with "
76
+ f"{'; '.join(mismatches)}. "
77
+ "Set `model.allow_non_recommended_settings=true` in YAML/CLI or "
78
+ "`allow_non_recommended_settings=True` in `Model.from_preset(...)` "
79
+ "to continue."
80
+ )