slide2vec 4.7.0__tar.gz → 5.0.0__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (101):
  1. {slide2vec-4.7.0 → slide2vec-5.0.0}/PKG-INFO +4 -4
  2. {slide2vec-4.7.0 → slide2vec-5.0.0}/README.md +1 -1
  3. {slide2vec-4.7.0 → slide2vec-5.0.0}/pyproject.toml +4 -4
  4. {slide2vec-4.7.0 → slide2vec-5.0.0}/slide2vec/__init__.py +1 -1
  5. {slide2vec-4.7.0 → slide2vec-5.0.0}/slide2vec/api.py +159 -8
  6. {slide2vec-4.7.0 → slide2vec-5.0.0}/slide2vec/artifacts.py +71 -5
  7. {slide2vec-4.7.0 → slide2vec-5.0.0}/slide2vec/configs/default.yaml +18 -1
  8. {slide2vec-4.7.0 → slide2vec-5.0.0}/slide2vec/distributed/direct_embed_worker.py +24 -9
  9. {slide2vec-4.7.0 → slide2vec-5.0.0}/slide2vec/distributed/pipeline_worker.py +21 -17
  10. {slide2vec-4.7.0 → slide2vec-5.0.0}/slide2vec/inference.py +17 -0
  11. slide2vec-5.0.0/slide2vec/runtime/artifacts_collect.py +299 -0
  12. {slide2vec-4.7.0 → slide2vec-5.0.0}/slide2vec/runtime/distributed.py +85 -9
  13. {slide2vec-4.7.0 → slide2vec-5.0.0}/slide2vec/runtime/distributed_stage.py +30 -12
  14. {slide2vec-4.7.0 → slide2vec-5.0.0}/slide2vec/runtime/embedding.py +16 -0
  15. {slide2vec-4.7.0 → slide2vec-5.0.0}/slide2vec/runtime/embedding_persist.py +7 -0
  16. {slide2vec-4.7.0 → slide2vec-5.0.0}/slide2vec/runtime/embedding_pipeline.py +3 -0
  17. {slide2vec-4.7.0 → slide2vec-5.0.0}/slide2vec/runtime/persist_callbacks.py +63 -11
  18. slide2vec-5.0.0/slide2vec/runtime/persistence.py +299 -0
  19. {slide2vec-4.7.0 → slide2vec-5.0.0}/slide2vec/runtime/process_list.py +59 -13
  20. {slide2vec-4.7.0 → slide2vec-5.0.0}/slide2vec/runtime/serialization.py +5 -2
  21. {slide2vec-4.7.0 → slide2vec-5.0.0}/slide2vec/runtime/tiling.py +32 -8
  22. {slide2vec-4.7.0 → slide2vec-5.0.0}/slide2vec/runtime/tiling_pipeline.py +31 -15
  23. {slide2vec-4.7.0 → slide2vec-5.0.0}/slide2vec/utils/tiling_io.py +11 -0
  24. {slide2vec-4.7.0 → slide2vec-5.0.0}/slide2vec.egg-info/PKG-INFO +4 -4
  25. {slide2vec-4.7.0 → slide2vec-5.0.0}/slide2vec.egg-info/requires.txt +2 -2
  26. {slide2vec-4.7.0 → slide2vec-5.0.0}/tests/test_output_consistency.py +10 -1
  27. {slide2vec-4.7.0 → slide2vec-5.0.0}/tests/test_progress.py +1 -1
  28. {slide2vec-4.7.0 → slide2vec-5.0.0}/tests/test_regression_core.py +349 -5
  29. {slide2vec-4.7.0 → slide2vec-5.0.0}/tests/test_regression_inference.py +1901 -43
  30. {slide2vec-4.7.0 → slide2vec-5.0.0}/tests/test_regression_models.py +182 -114
  31. slide2vec-4.7.0/slide2vec/runtime/artifacts_collect.py +0 -155
  32. slide2vec-4.7.0/slide2vec/runtime/persistence.py +0 -188
  33. {slide2vec-4.7.0 → slide2vec-5.0.0}/LICENSE +0 -0
  34. {slide2vec-4.7.0 → slide2vec-5.0.0}/setup.cfg +0 -0
  35. {slide2vec-4.7.0 → slide2vec-5.0.0}/slide2vec/__main__.py +0 -0
  36. {slide2vec-4.7.0 → slide2vec-5.0.0}/slide2vec/cli.py +0 -0
  37. {slide2vec-4.7.0 → slide2vec-5.0.0}/slide2vec/configs/__init__.py +0 -0
  38. {slide2vec-4.7.0 → slide2vec-5.0.0}/slide2vec/configs/resources.py +0 -0
  39. {slide2vec-4.7.0 → slide2vec-5.0.0}/slide2vec/data/__init__.py +0 -0
  40. {slide2vec-4.7.0 → slide2vec-5.0.0}/slide2vec/data/dataset.py +0 -0
  41. {slide2vec-4.7.0 → slide2vec-5.0.0}/slide2vec/data/tile_reader.py +0 -0
  42. {slide2vec-4.7.0 → slide2vec-5.0.0}/slide2vec/data/tile_store.py +0 -0
  43. {slide2vec-4.7.0 → slide2vec-5.0.0}/slide2vec/distributed/__init__.py +0 -0
  44. {slide2vec-4.7.0 → slide2vec-5.0.0}/slide2vec/encoders/__init__.py +0 -0
  45. {slide2vec-4.7.0 → slide2vec-5.0.0}/slide2vec/encoders/base.py +0 -0
  46. {slide2vec-4.7.0 → slide2vec-5.0.0}/slide2vec/encoders/models/__init__.py +0 -0
  47. {slide2vec-4.7.0 → slide2vec-5.0.0}/slide2vec/encoders/models/conch.py +0 -0
  48. {slide2vec-4.7.0 → slide2vec-5.0.0}/slide2vec/encoders/models/gigapath.py +0 -0
  49. {slide2vec-4.7.0 → slide2vec-5.0.0}/slide2vec/encoders/models/hibou.py +0 -0
  50. {slide2vec-4.7.0 → slide2vec-5.0.0}/slide2vec/encoders/models/hoptimus.py +0 -0
  51. {slide2vec-4.7.0 → slide2vec-5.0.0}/slide2vec/encoders/models/lunit.py +0 -0
  52. {slide2vec-4.7.0 → slide2vec-5.0.0}/slide2vec/encoders/models/midnight.py +0 -0
  53. {slide2vec-4.7.0 → slide2vec-5.0.0}/slide2vec/encoders/models/moozy/__init__.py +0 -0
  54. {slide2vec-4.7.0 → slide2vec-5.0.0}/slide2vec/encoders/models/moozy/blocks.py +0 -0
  55. {slide2vec-4.7.0 → slide2vec-5.0.0}/slide2vec/encoders/models/moozy/case.py +0 -0
  56. {slide2vec-4.7.0 → slide2vec-5.0.0}/slide2vec/encoders/models/moozy/loading.py +0 -0
  57. {slide2vec-4.7.0 → slide2vec-5.0.0}/slide2vec/encoders/models/moozy/slide.py +0 -0
  58. {slide2vec-4.7.0 → slide2vec-5.0.0}/slide2vec/encoders/models/moozy/types.py +0 -0
  59. {slide2vec-4.7.0 → slide2vec-5.0.0}/slide2vec/encoders/models/musk.py +0 -0
  60. {slide2vec-4.7.0 → slide2vec-5.0.0}/slide2vec/encoders/models/phikon.py +0 -0
  61. {slide2vec-4.7.0 → slide2vec-5.0.0}/slide2vec/encoders/models/prism.py +0 -0
  62. {slide2vec-4.7.0 → slide2vec-5.0.0}/slide2vec/encoders/models/prost40m.py +0 -0
  63. {slide2vec-4.7.0 → slide2vec-5.0.0}/slide2vec/encoders/models/titan.py +0 -0
  64. {slide2vec-4.7.0 → slide2vec-5.0.0}/slide2vec/encoders/models/uni.py +0 -0
  65. {slide2vec-4.7.0 → slide2vec-5.0.0}/slide2vec/encoders/models/virchow.py +0 -0
  66. {slide2vec-4.7.0 → slide2vec-5.0.0}/slide2vec/encoders/registry.py +0 -0
  67. {slide2vec-4.7.0 → slide2vec-5.0.0}/slide2vec/encoders/validation.py +0 -0
  68. {slide2vec-4.7.0 → slide2vec-5.0.0}/slide2vec/progress.py +0 -0
  69. {slide2vec-4.7.0 → slide2vec-5.0.0}/slide2vec/runtime/__init__.py +0 -0
  70. {slide2vec-4.7.0 → slide2vec-5.0.0}/slide2vec/runtime/batching.py +0 -0
  71. {slide2vec-4.7.0 → slide2vec-5.0.0}/slide2vec/runtime/cpu_budget.py +0 -0
  72. {slide2vec-4.7.0 → slide2vec-5.0.0}/slide2vec/runtime/dense_regions.py +0 -0
  73. {slide2vec-4.7.0 → slide2vec-5.0.0}/slide2vec/runtime/hierarchical.py +0 -0
  74. {slide2vec-4.7.0 → slide2vec-5.0.0}/slide2vec/runtime/manifest.py +0 -0
  75. {slide2vec-4.7.0 → slide2vec-5.0.0}/slide2vec/runtime/model_settings.py +0 -0
  76. {slide2vec-4.7.0 → slide2vec-5.0.0}/slide2vec/runtime/patient_pipeline.py +0 -0
  77. {slide2vec-4.7.0 → slide2vec-5.0.0}/slide2vec/runtime/progress_bridge.py +0 -0
  78. {slide2vec-4.7.0 → slide2vec-5.0.0}/slide2vec/runtime/registry.py +0 -0
  79. {slide2vec-4.7.0 → slide2vec-5.0.0}/slide2vec/runtime/slide_encode.py +0 -0
  80. {slide2vec-4.7.0 → slide2vec-5.0.0}/slide2vec/runtime/types.py +0 -0
  81. {slide2vec-4.7.0 → slide2vec-5.0.0}/slide2vec/runtime/worker_io.py +0 -0
  82. {slide2vec-4.7.0 → slide2vec-5.0.0}/slide2vec/utils/__init__.py +0 -0
  83. {slide2vec-4.7.0 → slide2vec-5.0.0}/slide2vec/utils/config.py +0 -0
  84. {slide2vec-4.7.0 → slide2vec-5.0.0}/slide2vec/utils/coordinates.py +0 -0
  85. {slide2vec-4.7.0 → slide2vec-5.0.0}/slide2vec/utils/log_utils.py +0 -0
  86. {slide2vec-4.7.0 → slide2vec-5.0.0}/slide2vec/utils/utils.py +0 -0
  87. {slide2vec-4.7.0 → slide2vec-5.0.0}/slide2vec.egg-info/SOURCES.txt +0 -0
  88. {slide2vec-4.7.0 → slide2vec-5.0.0}/slide2vec.egg-info/dependency_links.txt +0 -0
  89. {slide2vec-4.7.0 → slide2vec-5.0.0}/slide2vec.egg-info/entry_points.txt +0 -0
  90. {slide2vec-4.7.0 → slide2vec-5.0.0}/slide2vec.egg-info/not-zip-safe +0 -0
  91. {slide2vec-4.7.0 → slide2vec-5.0.0}/slide2vec.egg-info/top_level.txt +0 -0
  92. {slide2vec-4.7.0 → slide2vec-5.0.0}/tests/test_architecture_runtime_split.py +0 -0
  93. {slide2vec-4.7.0 → slide2vec-5.0.0}/tests/test_attention_extraction.py +0 -0
  94. {slide2vec-4.7.0 → slide2vec-5.0.0}/tests/test_dense_extraction.py +0 -0
  95. {slide2vec-4.7.0 → slide2vec-5.0.0}/tests/test_dense_locality_gated.py +0 -0
  96. {slide2vec-4.7.0 → slide2vec-5.0.0}/tests/test_dense_regions.py +0 -0
  97. {slide2vec-4.7.0 → slide2vec-5.0.0}/tests/test_encoder_registry.py +0 -0
  98. {slide2vec-4.7.0 → slide2vec-5.0.0}/tests/test_hs2p_package_cutover.py +0 -0
  99. {slide2vec-4.7.0 → slide2vec-5.0.0}/tests/test_runtime_batching.py +0 -0
  100. {slide2vec-4.7.0 → slide2vec-5.0.0}/tests/test_tile_store.py +0 -0
  101. {slide2vec-4.7.0 → slide2vec-5.0.0}/tests/test_tiling_pipeline.py +0 -0
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.4
2
2
  Name: slide2vec
3
- Version: 4.7.0
3
+ Version: 5.0.0
4
4
  Summary: Embedding of whole slide images with Foundation Models
5
5
  Author-email: Clément Grisi <clement.grisi@radboudumc.nl>
6
6
  License-Expression: Apache-2.0
@@ -15,7 +15,7 @@ Classifier: Programming Language :: Python :: 3.13
15
15
  Requires-Python: >=3.10
16
16
  Description-Content-Type: text/markdown
17
17
  License-File: LICENSE
18
- Requires-Dist: hs2p[asap,cucim,openslide,sam2,vips]>=4.0.8
18
+ Requires-Dist: hs2p[asap,cucim,openslide,sam2,vips]>=4.2.0
19
19
  Requires-Dist: omegaconf
20
20
  Requires-Dist: matplotlib
21
21
  Requires-Dist: numpy<2
@@ -65,7 +65,7 @@ Requires-Dist: numpy<2; extra == "fm"
65
65
  Requires-Dist: pandas; extra == "fm"
66
66
  Requires-Dist: pillow; extra == "fm"
67
67
  Requires-Dist: rich; extra == "fm"
68
- Requires-Dist: hs2p[asap,cucim,openslide,sam2,vips]>=4.0.8; extra == "fm"
68
+ Requires-Dist: hs2p[asap,cucim,openslide,sam2,vips]>=4.2.0; extra == "fm"
69
69
  Requires-Dist: wandb; extra == "fm"
70
70
  Requires-Dist: torch<2.8,>=2.3; extra == "fm"
71
71
  Requires-Dist: torchvision>=0.18.0; extra == "fm"
@@ -169,7 +169,7 @@ pipeline = Pipeline(
169
169
  preprocessing=PreprocessingConfig(
170
170
  requested_spacing_um=0.5,
171
171
  requested_tile_size_px=224,
172
- tissue_threshold=0.1,
172
+ masks={"min_coverage": {"tissue": 0.1}},
173
173
  ),
174
174
  execution=ExecutionOptions(output_dir="outputs/demo"),
175
175
  )
@@ -63,7 +63,7 @@ pipeline = Pipeline(
63
63
  preprocessing=PreprocessingConfig(
64
64
  requested_spacing_um=0.5,
65
65
  requested_tile_size_px=224,
66
- tissue_threshold=0.1,
66
+ masks={"min_coverage": {"tissue": 0.1}},
67
67
  ),
68
68
  execution=ExecutionOptions(output_dir="outputs/demo"),
69
69
  )
@@ -4,7 +4,7 @@ build-backend = "setuptools.build_meta"
4
4
 
5
5
  [project]
6
6
  name = "slide2vec"
7
- version = "4.7.0"
7
+ version = "5.0.0"
8
8
  description = "Embedding of whole slide images with Foundation Models"
9
9
  readme = "README.md"
10
10
  requires-python = ">=3.10"
@@ -21,7 +21,7 @@ classifiers = [
21
21
  "Programming Language :: Python :: 3.13",
22
22
  ]
23
23
  dependencies = [
24
- "hs2p[asap,cucim,openslide,sam2,vips]>=4.0.8",
24
+ "hs2p[asap,cucim,openslide,sam2,vips]>=4.2.0",
25
25
  "omegaconf",
26
26
  "matplotlib",
27
27
  "numpy<2",
@@ -88,7 +88,7 @@ fm = [
88
88
  "pandas",
89
89
  "pillow",
90
90
  "rich",
91
- "hs2p[asap,cucim,openslide,sam2,vips]>=4.0.8",
91
+ "hs2p[asap,cucim,openslide,sam2,vips]>=4.2.0",
92
92
  "wandb",
93
93
  "torch>=2.3,<2.8",
94
94
  "torchvision>=0.18.0",
@@ -164,7 +164,7 @@ no_implicit_reexport = true
164
164
  max-line-length = 160
165
165
 
166
166
  [tool.bumpver]
167
- current_version = "4.7.0"
167
+ current_version = "5.0.0"
168
168
  version_pattern = "MAJOR.MINOR.PATCH"
169
169
  commit = false # We do version bumping in CI, not as a commit
170
170
  tag = false # Git tag already exists — we don't auto-tag
@@ -11,7 +11,7 @@ from slide2vec.api import (
11
11
  from slide2vec.artifacts import HierarchicalEmbeddingArtifact, SlideEmbeddingArtifact, TileEmbeddingArtifact
12
12
 
13
13
 
14
- __version__ = "4.7.0"
14
+ __version__ = "5.0.0"
15
15
 
16
16
  __all__ = [
17
17
  "Model",
@@ -1,4 +1,5 @@
1
1
 
2
+ import copy
2
3
  import logging
3
4
  import os
4
5
  from dataclasses import dataclass, field, replace
@@ -40,6 +41,55 @@ SlideSequence = Sequence[SlideInput]
40
41
  TilingResultsInput = Sequence[Any] | Mapping[str, Any]
41
42
 
42
43
 
44
+ #: Default annotation-mask vocabulary — plain binary tissue tiling. Mirrors hs2p's
45
+ #: shipped default ``{background: 0, tissue: 1}``; leaving it untouched keeps a run
46
+ #: behaving exactly as a tissue-only run. ``min_coverage.tissue`` is the single source
47
+ #: of truth for the tissue threshold (the standalone ``tissue_threshold`` knob is gone).
48
+ #: A :class:`PreprocessingConfig` ``masks`` value is deep-merged over this default, so
49
+ #: callers only state what they override (e.g. ``{"min_coverage": {"tissue": 0.1}}``).
50
+ DEFAULT_MASKS: dict[str, Any] = {
51
+ "output_mode": "per_annotation",
52
+ "pixel_mapping": {"background": 0, "tissue": 1},
53
+ "colors": {"background": None, "tissue": [157, 219, 129]},
54
+ "min_coverage": {"background": None, "tissue": 0.01},
55
+ }
56
+
57
+
58
+ def _deep_merge_masks(base: Mapping[str, Any], override: Mapping[str, Any]) -> dict[str, Any]:
59
+ """Deep-merge *override* onto a copy of *base* (nested dicts merge key-by-key)."""
60
+ merged = copy.deepcopy(dict(base))
61
+ for key, value in override.items():
62
+ existing = merged.get(key)
63
+ if isinstance(value, Mapping) and isinstance(existing, dict):
64
+ merged[key] = _deep_merge_masks(existing, value)
65
+ else:
66
+ merged[key] = copy.deepcopy(value)
67
+ return merged
68
+
69
+
70
+ def resolve_masks(masks: Mapping[str, Any] | None) -> dict[str, Any]:
71
+ """Complete a (possibly partial) ``masks`` mapping by merging it over :data:`DEFAULT_MASKS`."""
72
+ if not masks:
73
+ return copy.deepcopy(DEFAULT_MASKS)
74
+ return _deep_merge_masks(DEFAULT_MASKS, masks)
75
+
76
+
77
+ def _masks_to_plain_dict(node: Any) -> dict[str, Any]:
78
+ """Normalize a masks config node (OmegaConf, mapping, or namespace) to a plain dict."""
79
+ if node is None:
80
+ return {}
81
+ try:
82
+ from omegaconf import OmegaConf
83
+
84
+ if OmegaConf.is_config(node):
85
+ return copy.deepcopy(OmegaConf.to_container(node, resolve=True)) # type: ignore[return-value]
86
+ except ImportError:
87
+ pass
88
+ if isinstance(node, Mapping):
89
+ return copy.deepcopy(dict(node))
90
+ return copy.deepcopy(dict(vars(node)))
91
+
92
+
43
93
  @dataclass(frozen=True, kw_only=True)
44
94
  class PreprocessingConfig:
45
95
  """Configuration for slide tiling and preprocessing."""
@@ -62,8 +112,6 @@ class PreprocessingConfig:
62
112
  tolerance: float = 0.05
63
113
  #: Fractional tile overlap (``0.0`` = no overlap).
64
114
  overlap: float = 0.0
65
- #: Minimum tissue fraction required to keep a tile (default ``0.01``).
66
- tissue_threshold: float = 0.01
67
115
  #: Directory containing pre-extracted tile coordinates to reuse, skipping tiling.
68
116
  read_coordinates_from: Path | None = None
69
117
  #: Directory containing pre-extracted tile images to skip the tiling step entirely.
@@ -90,6 +138,20 @@ class PreprocessingConfig:
90
138
  #: Controls whether hs2p writes mask and tiling preview images.
91
139
  #: Keys: ``save_mask_preview``, ``save_tiling_preview``, ``downsample``.
92
140
  preview: dict[str, Any] = field(default_factory=dict)
141
+ #: Annotation-mask vocabulary forwarded to hs2p's sampling resolver. Keys:
142
+ #: ``output_mode``, ``pixel_mapping``, ``colors``, ``min_coverage``. A partial
143
+ #: mapping is deep-merged over :data:`DEFAULT_MASKS`, so callers only state what
144
+ #: they override (e.g. ``{"min_coverage": {"tissue": 0.1}}``). The default
145
+ #: ``{background, tissue}`` block is plain tissue tiling; ``min_coverage.tissue``
146
+ #: is the single source of truth for the tissue threshold.
147
+ masks: dict[str, Any] = field(default_factory=dict)
148
+ #: When annotation sampling is active, tile each class independently (``True``)
149
+ #: vs jointly across classes (``False``).
150
+ independent_sampling: bool = True
151
+
152
+ def __post_init__(self) -> None:
153
+ # Complete a (possibly partial) masks mapping against the shipped default.
154
+ object.__setattr__(self, "masks", resolve_masks(self.masks))
93
155
 
94
156
  @classmethod
95
157
  def from_config(cls, cfg: Any) -> "PreprocessingConfig":
@@ -121,7 +183,8 @@ class PreprocessingConfig:
121
183
  region_tile_multiple=int(region_tile_multiple) if region_tile_multiple is not None else None,
122
184
  tolerance=float(tiling.params.tolerance),
123
185
  overlap=float(tiling.params.overlap),
124
- tissue_threshold=float(tiling.params.tissue_threshold),
186
+ masks=_masks_to_plain_dict(getattr(tiling, "masks", None)),
187
+ independent_sampling=bool(getattr(tiling, "independent_sampling", True)),
125
188
  read_coordinates_from=Path(read_coordinates_from) if read_coordinates_from else None,
126
189
  read_tiles_from=(
127
190
  Path(read_tiles_from) if read_tiles_from else None
@@ -288,6 +351,11 @@ class EmbeddedSlide:
288
351
  image_path: Path
289
352
  #: Path to the tissue mask used for tiling, if any.
290
353
  mask_path: Path | None = None
354
+ #: Annotation class this bag of tiles was sampled for. ``"tissue"`` for the
355
+ #: default tissue-only path, ``"merged"`` for the union output mode, or the
356
+ #: class name (e.g. ``"tumor"``) when annotation-aware sampling fans a slide
357
+ #: out into one bag per class. See the annotation-aware sampling documentation.
358
+ annotation: str | None = None
291
359
  #: Number of tiles extracted from the slide.
292
360
  num_tiles: int | None = None
293
361
  #: Path to the mask preview image, if generated.
@@ -379,12 +447,13 @@ class Model:
379
447
  self,
380
448
  slide: SlideInput,
381
449
  *,
450
+ annotation: str | list[str] | None = None,
382
451
  preprocessing: PreprocessingConfig | None = None,
383
452
  execution: ExecutionOptions | None = None,
384
453
  sample_id: str | None = None,
385
454
  mask_path: PathLike | None = None,
386
455
  spacing_at_level_0: float | None = None,
387
- ) -> EmbeddedSlide:
456
+ ) -> EmbeddedSlide | list[EmbeddedSlide]:
388
457
  if isinstance(slide, (str, Path)):
389
458
  slide = {
390
459
  "sample_id": sample_id or Path(slide).stem,
@@ -396,31 +465,42 @@ class Model:
396
465
  raise ValueError(
397
466
  "sample_id, mask_path, and spacing_at_level_0 overrides are only supported when slide is a path-like input"
398
467
  )
399
- return self.embed_slides(
468
+ requested = None if isinstance(annotation, str) else annotation
469
+ grouped = self.embed_slides(
400
470
  [slide],
471
+ annotations=requested,
401
472
  preprocessing=preprocessing,
402
473
  execution=execution,
403
- )[0]
474
+ )
475
+ # Single slide in → at most one outer key out. Flatten to the inner
476
+ # {label: EmbeddedSlide} mapping (empty when the run produced nothing).
477
+ bags: dict[str, EmbeddedSlide] = {}
478
+ for inner in grouped.values():
479
+ bags = inner
480
+ break
481
+ return _select_embedded_bag(bags, annotation)
404
482
 
405
483
  def embed_slides(
406
484
  self,
407
485
  slides: SlideSequence,
408
486
  *,
487
+ annotations: list[str] | None = None,
409
488
  preprocessing: PreprocessingConfig | None = None,
410
489
  execution: ExecutionOptions | None = None,
411
- ) -> list[EmbeddedSlide]:
490
+ ) -> dict[str, dict[str, EmbeddedSlide]]:
412
491
  from slide2vec.inference import embed_slides
413
492
 
414
493
  resolved = _coerce_execution_options(execution, model=self)
415
494
  resolved_preprocessing = _resolve_direct_api_preprocessing(self, preprocessing)
416
495
  with _auto_progress_reporting(output_dir=resolved.output_dir):
417
496
  _validate_model_config(self, resolved_preprocessing, resolved)
418
- return embed_slides(
497
+ embedded = embed_slides(
419
498
  self,
420
499
  slides,
421
500
  preprocessing=resolved_preprocessing,
422
501
  execution=resolved,
423
502
  )
503
+ return _group_embedded_slides(embedded, annotations=annotations)
424
504
 
425
505
  def embed_patient(
426
506
  self,
@@ -587,6 +667,77 @@ class Pipeline:
587
667
  )
588
668
 
589
669
 
670
+ def _select_embedded_bag(
671
+ bags: Mapping[str, EmbeddedSlide],
672
+ annotation: str | list[str] | None,
673
+ ) -> EmbeddedSlide | list[EmbeddedSlide]:
674
+ """Select per-class bag(s) from a single slide's ``{label: EmbeddedSlide}`` map.
675
+
676
+ numpy-style shape-in/shape-out:
677
+
678
+ - a single class string returns one :class:`EmbeddedSlide`;
679
+ - a list of class strings returns a list in the requested order;
680
+ - ``None`` returns the single bag when the run produced exactly one,
681
+ otherwise raises naming the available bags and directing to
682
+ :meth:`Model.embed_slides`.
683
+
684
+ Requesting a class the run did not produce raises naming what is available.
685
+ """
686
+ available = sorted(bags)
687
+ if isinstance(annotation, str):
688
+ if annotation not in bags:
689
+ raise ValueError(
690
+ f"embed_slide() found no '{annotation}' annotation bag for this "
691
+ f"slide; available bags: {available}."
692
+ )
693
+ return bags[annotation]
694
+ if annotation is not None:
695
+ selected: list[EmbeddedSlide] = []
696
+ for label in annotation:
697
+ if label not in bags:
698
+ raise ValueError(
699
+ f"embed_slide() found no '{label}' annotation bag for this "
700
+ f"slide; available bags: {available}."
701
+ )
702
+ selected.append(bags[label])
703
+ return selected
704
+ if len(bags) == 1:
705
+ return next(iter(bags.values()))
706
+ raise ValueError(
707
+ f"embed_slide() received {len(bags)} annotation bags for this slide "
708
+ f"({available}); annotation-aware sampling produces one bag per class. "
709
+ "Pass annotation=... to select a class, or use Model.embed_slides(...) "
710
+ "to receive every per-class EmbeddedSlide (each carries its .annotation)."
711
+ )
712
+
713
+
714
+ def _group_embedded_slides(
715
+ embedded: Sequence[EmbeddedSlide],
716
+ *,
717
+ annotations: list[str] | None = None,
718
+ ) -> dict[str, dict[str, EmbeddedSlide]]:
719
+ """Group flat per-row :class:`EmbeddedSlide` results into a nested mapping.
720
+
721
+ The outer key is ``sample_id``; the inner key is the bag's informative
722
+ annotation label (``"tissue"``/``"merged"``/class name), never ``None``.
723
+ A bag whose ``.annotation`` is ``None`` (defensive — post-#173 real runs
724
+ always carry a label) does not produce a ``None`` key.
725
+
726
+ When *annotations* is given, the inner keys are restricted to the named
727
+ classes (in encounter order).
728
+ """
729
+ requested = None if annotations is None else set(annotations)
730
+ grouped: dict[str, dict[str, EmbeddedSlide]] = {}
731
+ for bag in embedded:
732
+ label = bag.annotation
733
+ if label is None:
734
+ continue
735
+ if requested is not None and label not in requested:
736
+ continue
737
+ grouped.setdefault(bag.sample_id, {})[label] = bag
738
+ return grouped
739
+
740
+
590
741
  def _coerce_execution_options(
591
742
  options: ExecutionOptions | None,
592
743
  *,
@@ -5,6 +5,7 @@ from typing import Any
5
5
 
6
6
  import numpy as np
7
7
  import torch
8
+ from hs2p.fileops import is_flattened_annotation
8
9
 
9
10
 
10
11
  @dataclass(frozen=True, kw_only=True)
@@ -29,6 +30,7 @@ class SlideEmbeddingArtifact:
29
30
  format: str
30
31
  feature_dim: int
31
32
  latent_path: Path | None = None
33
+ annotation: str | None = None
32
34
 
33
35
  @property
34
36
  def metadata(self) -> dict[str, Any]:
@@ -58,6 +60,7 @@ class HierarchicalEmbeddingArtifact:
58
60
  feature_dim: int
59
61
  num_regions: int
60
62
  tiles_per_region: int
63
+ annotation: str | None = None
61
64
 
62
65
  @property
63
66
  def metadata(self) -> dict[str, Any]:
@@ -90,6 +93,53 @@ def _write_metadata(path: Path, metadata: dict[str, Any]) -> None:
90
93
  path.write_text(json.dumps(metadata, indent=2, sort_keys=True), encoding="utf-8")
91
94
 
92
95
 
96
+ def tile_embeddings_subdir(annotation: str | None) -> str:
97
+ """Namespace the ``tile_embeddings`` output dir per annotation class.
98
+
99
+ Reuses hs2p's flatten rule (the single source of truth): ``None`` and the sentinel
100
+ ``"tissue"`` collapse to the flat ``tile_embeddings`` root, so the default tissue-only
101
+ path is byte-for-byte unchanged; any real class label gets its own
102
+ ``tile_embeddings/<class>`` subdirectory.
103
+ """
104
+ if is_flattened_annotation(annotation):
105
+ return "tile_embeddings"
106
+ return f"tile_embeddings/{annotation}"
107
+
108
+
109
+ def slide_embeddings_subdir(annotation: str | None) -> str:
110
+ """Namespace the ``slide_embeddings`` output dir per annotation class.
111
+
112
+ Reuses hs2p's flatten rule (the single source of truth, shared with
113
+ :func:`tile_embeddings_subdir`): ``None`` and the sentinel ``"tissue"`` collapse to the
114
+ flat ``slide_embeddings`` root, so the default tissue-only path is byte-for-byte
115
+ unchanged; any real class label gets its own ``slide_embeddings/<class>`` subdirectory.
116
+ """
117
+ if is_flattened_annotation(annotation):
118
+ return "slide_embeddings"
119
+ return f"slide_embeddings/{annotation}"
120
+
121
+
122
+ def slide_latents_subdir(annotation: str | None) -> str:
123
+ """Namespace the ``slide_latents`` output dir per annotation class (mirrors slide embeddings)."""
124
+ if is_flattened_annotation(annotation):
125
+ return "slide_latents"
126
+ return f"slide_latents/{annotation}"
127
+
128
+
129
+ def hierarchical_embeddings_subdir(annotation: str | None) -> str:
130
+ """Namespace the ``hierarchical_embeddings`` output dir per annotation class.
131
+
132
+ Reuses hs2p's flatten rule (the single source of truth, shared with
133
+ :func:`tile_embeddings_subdir` and :func:`slide_embeddings_subdir`): ``None`` and the
134
+ sentinel ``"tissue"`` collapse to the flat ``hierarchical_embeddings`` root, so the
135
+ default tissue-only path is byte-for-byte unchanged; any real class label gets its own
136
+ ``hierarchical_embeddings/<class>`` subdirectory.
137
+ """
138
+ if is_flattened_annotation(annotation):
139
+ return "hierarchical_embeddings"
140
+ return f"hierarchical_embeddings/{annotation}"
141
+
142
+
93
143
  def _setup_artifact_paths(
94
144
  output_dir: str | Path, subdir: str, sample_id: str, output_format: str
95
145
  ) -> tuple[Path, Path]:
@@ -142,9 +192,12 @@ def write_tile_embeddings(
142
192
  output_format: str = "pt",
143
193
  metadata: dict[str, Any] | None = None,
144
194
  tile_index: Any | None = None,
195
+ annotation: str | None = None,
145
196
  ) -> TileEmbeddingArtifact:
146
197
  output_format = _validate_output_format(output_format)
147
- artifact_path, metadata_path = _setup_artifact_paths(output_dir, "tile_embeddings", sample_id, output_format)
198
+ artifact_path, metadata_path = _setup_artifact_paths(
199
+ output_dir, tile_embeddings_subdir(annotation), sample_id, output_format
200
+ )
148
201
  feature_array = _ensure_array(features)
149
202
  if output_format == "pt":
150
203
  torch.save(_ensure_tensor(features), artifact_path)
@@ -180,9 +233,12 @@ def write_tile_embedding_metadata(
180
233
  feature_dim: int | None = None,
181
234
  num_tiles: int = 0,
182
235
  metadata: dict[str, Any] | None = None,
236
+ annotation: str | None = None,
183
237
  ) -> Path:
184
238
  output_format = _validate_output_format(output_format)
185
- _, metadata_path = _setup_artifact_paths(output_dir, "tile_embeddings", sample_id, output_format)
239
+ _, metadata_path = _setup_artifact_paths(
240
+ output_dir, tile_embeddings_subdir(annotation), sample_id, output_format
241
+ )
186
242
  tile_metadata = _build_tile_embedding_metadata(
187
243
  sample_id,
188
244
  output_format=output_format,
@@ -202,9 +258,12 @@ def write_slide_embeddings(
202
258
  output_format: str = "pt",
203
259
  metadata: dict[str, Any] | None = None,
204
260
  latents: Any | None = None,
261
+ annotation: str | None = None,
205
262
  ) -> SlideEmbeddingArtifact:
206
263
  output_format = _validate_output_format(output_format)
207
- artifact_path, metadata_path = _setup_artifact_paths(output_dir, "slide_embeddings", sample_id, output_format)
264
+ artifact_path, metadata_path = _setup_artifact_paths(
265
+ output_dir, slide_embeddings_subdir(annotation), sample_id, output_format
266
+ )
208
267
  embedding_array = _ensure_array(embedding)
209
268
  latent_path = None
210
269
  if output_format == "pt":
@@ -212,7 +271,9 @@ def write_slide_embeddings(
212
271
  else:
213
272
  np.savez_compressed(artifact_path, features=embedding_array)
214
273
  if latents is not None:
215
- latent_path, _ = _setup_artifact_paths(output_dir, "slide_latents", sample_id, output_format)
274
+ latent_path, _ = _setup_artifact_paths(
275
+ output_dir, slide_latents_subdir(annotation), sample_id, output_format
276
+ )
216
277
  if output_format == "pt":
217
278
  torch.save(_ensure_tensor(latents), latent_path)
218
279
  else:
@@ -234,6 +295,7 @@ def write_slide_embeddings(
234
295
  format=output_format,
235
296
  feature_dim=slide_metadata["feature_dim"],
236
297
  latent_path=latent_path,
298
+ annotation=annotation,
237
299
  )
238
300
 
239
301
 
@@ -283,9 +345,12 @@ def write_hierarchical_embeddings(
283
345
  output_dir: str | Path,
284
346
  output_format: str = "pt",
285
347
  metadata: dict[str, Any] | None = None,
348
+ annotation: str | None = None,
286
349
  ) -> HierarchicalEmbeddingArtifact:
287
350
  output_format = _validate_output_format(output_format)
288
- artifact_path, metadata_path = _setup_artifact_paths(output_dir, "hierarchical_embeddings", sample_id, output_format)
351
+ artifact_path, metadata_path = _setup_artifact_paths(
352
+ output_dir, hierarchical_embeddings_subdir(annotation), sample_id, output_format
353
+ )
289
354
  feature_array = _ensure_array(features)
290
355
  if feature_array.ndim != 3:
291
356
  raise ValueError(
@@ -315,4 +380,5 @@ def write_hierarchical_embeddings(
315
380
  feature_dim=int(hierarchical_metadata["feature_dim"]),
316
381
  num_regions=int(hierarchical_metadata["num_regions"]),
317
382
  tiles_per_region=int(hierarchical_metadata["tiles_per_region"]),
383
+ annotation=annotation,
318
384
  )
@@ -26,6 +26,24 @@ tiling:
26
26
  read_coordinates_from: # path to an existing directory containing pre-extracted `.coordinates.npz` / `.coordinates.meta.json` artifacts to reuse instead of starting tiling from scratch
27
27
  read_tiles_from: # path to an existing directory containing pre-extracted `.tiles.tar` tile stores to reuse instead of starting tiling from scratch
28
28
  backend: "auto" # backend to use for slide reading; "auto" lets hs2p resolve the best backend per slide, preferring cuCIM when available
29
+ independent_sampling: true # selection strategy when annotation sampling is active. true: sample each class independently against its own binary mask (independent selection); false: sample once over the union of active classes, then post-filter per class by coverage (joint selection). Ignored when the masks vocabulary is left at the tissue-only default.
30
+ masks:
31
+ # Annotation-mask vocabulary forwarded to hs2p's sampling resolver. The shipped default
32
+ # ({background:0, tissue:1}) is plain binary tissue tiling — leave it untouched and the run
33
+ # behaves exactly as a tissue-only run. Customising the vocabulary (e.g. adding a `tumor`
34
+ # class with its own pixel value + min_coverage) opts the run into annotation-aware sampling,
35
+ # where `mask_path` is read as a multi-label raster. The `tissue` min_coverage entry below is
36
+ # the single source of truth for the tissue threshold.
37
+ output_mode: per_annotation # how sampled tiles are grouped into artifacts. per_annotation: one flat artifact set per sampled class, namespaced under a `<class>/` subdir (the `tissue` class collapses to the flat root). merged: a single flat artifact set per slide over the union of tiles passing any active class threshold — it carries no class label, so it lands at the flat output root (no `<class>/` subdir).
38
+ pixel_mapping: # {class_name: integer pixel value in the mask raster}
39
+ background: 0
40
+ tissue: 1
41
+ colors: # {class_name: [r, g, b] | null} used when rendering previews
42
+ background:
43
+ tissue: [157, 219, 129]
44
+ min_coverage: # {class_name: float | null}; minimum fraction of a tile that must be covered to keep it; null = don't sample that class
45
+ background:
46
+ tissue: 0.1
29
47
  params:
30
48
  requested_spacing_um: # spacing at which to tile the slide, in microns per pixel; filled from a preset model when available
31
49
  tolerance: 0.05 # tolerance for matching the spacing (float between 0 and 1, deciding how much the spacing can deviate from the one specified in the slide metadata)
@@ -33,7 +51,6 @@ tiling:
33
51
  requested_region_size_px: # size of hierarchical parent regions in pixels; when unset and region_tile_multiple is set, derived from requested_tile_size_px * region_tile_multiple
34
52
  region_tile_multiple: # hierarchical region grid width/height in tiles; e.g. 6 means 6x6 tiles per region
35
53
  overlap: 0.0 # fraction of overlap between two consecutive tiles (float between 0 and 1)
36
- tissue_threshold: 0.1 # minimum fraction of pixels that must be tissue to keep a tile (float between 0 and 1)
37
54
  seg_params:
38
55
  # downsample controls which pyramid level is read for tissue segmentation.
39
56
  # Larger values are faster and use less memory; smaller values can improve mask precision.
@@ -49,12 +49,21 @@ def main(argv=None) -> int:
49
49
  )
50
50
  preprocessing = deserialize_preprocessing(request["preprocessing"])
51
51
  execution = deserialize_execution(request["execution"])
52
+ from slide2vec.runtime.distributed import (
53
+ decode_work_unit,
54
+ encode_work_unit,
55
+ work_unit_shard_stem,
56
+ )
57
+ from slide2vec.runtime.embedding import tiling_result_annotation
58
+
52
59
  load_successful_tiled_slides_fn = getattr(inference, "load_successful_tiled_slides", None)
53
60
  if not callable(load_successful_tiled_slides_fn):
54
61
  from slide2vec.runtime.manifest import load_successful_tiled_slides as load_successful_tiled_slides_fn
55
62
  slide_records, tiling_results = load_successful_tiled_slides_fn(output_dir)
56
- paired_by_sample = {
57
- slide.sample_id: (slide, tiling_result)
63
+ # Key by the composite (sample_id, annotation) work unit so a multi-class slide's sibling
64
+ # classes never overwrite each other; flat units collapse to the bare sample_id key.
65
+ paired_by_unit = {
66
+ encode_work_unit(slide.sample_id, tiling_result_annotation(tiling_result)): (slide, tiling_result)
58
67
  for slide, tiling_result in zip(slide_records, tiling_results)
59
68
  }
60
69
  progress_events_path = request.get("progress_events_path")
@@ -71,8 +80,9 @@ def main(argv=None) -> int:
71
80
 
72
81
  with context:
73
82
  if request["strategy"] == "tile_shard":
74
- sample_id = request["sample_id"]
75
- slide, tiling_result = paired_by_sample[sample_id]
83
+ work_unit = request["work_unit"]
84
+ shard_stem = work_unit_shard_stem(*decode_work_unit(work_unit))
85
+ slide, tiling_result = paired_by_unit[work_unit]
76
86
  loaded = model._load_backend()
77
87
  if is_hierarchical_preprocessing(preprocessing):
78
88
  geometry = resolve_hierarchical_geometry(preprocessing, tiling_result)
@@ -103,7 +113,7 @@ def main(argv=None) -> int:
103
113
  "flat_index": torch.as_tensor(shard_indices, dtype=torch.long),
104
114
  "tile_embeddings": tile_embeddings.detach().cpu() if torch.is_tensor(tile_embeddings) else torch.as_tensor(tile_embeddings),
105
115
  }
106
- torch.save(payload, coordination_dir / f"{sample_id}.hier.rank{global_rank}.pt")
116
+ torch.save(payload, coordination_dir / f"{shard_stem}.hier.rank{global_rank}.pt")
107
117
  else:
108
118
  num_tiles = len(tiling_result.x)
109
119
  tile_indices = np.array_split(np.arange(num_tiles, dtype=np.int64), world_size)[global_rank]
@@ -129,14 +139,14 @@ def main(argv=None) -> int:
129
139
  "tile_index": torch.as_tensor(tile_indices, dtype=torch.long),
130
140
  "tile_embeddings": tile_embeddings.detach().cpu() if torch.is_tensor(tile_embeddings) else torch.as_tensor(tile_embeddings),
131
141
  }
132
- torch.save(payload, coordination_dir / f"{sample_id}.tiles.rank{global_rank}.pt")
142
+ torch.save(payload, coordination_dir / f"{shard_stem}.tiles.rank{global_rank}.pt")
133
143
  return 0
134
144
 
135
145
  assigned_ids = list(request.get("assignments", {}).get(str(global_rank), []))
136
146
  if not assigned_ids:
137
147
  return 0
138
- assigned_slides = [paired_by_sample[sample_id][0] for sample_id in assigned_ids]
139
- assigned_tiling_results = [paired_by_sample[sample_id][1] for sample_id in assigned_ids]
148
+ assigned_slides = [paired_by_unit[unit_key][0] for unit_key in assigned_ids]
149
+ assigned_tiling_results = [paired_by_unit[unit_key][1] for unit_key in assigned_ids]
140
150
 
141
151
  def _persist_embedded_slide(slide, tiling_result, embedded_slide) -> None:
142
152
  payload = {
@@ -144,7 +154,12 @@ def main(argv=None) -> int:
144
154
  "slide_embedding": _to_cpu_payload(embedded_slide.slide_embedding),
145
155
  "latents": _to_cpu_payload(embedded_slide.latents),
146
156
  }
147
- torch.save(payload, coordination_dir / f"{embedded_slide.sample_id}.embedded.pt")
157
+ # Stem by (sample_id, annotation) so two classes of one slide never overwrite each
158
+ # other; flat units keep the bare-sample_id filename for backward compatibility.
159
+ stem = work_unit_shard_stem(
160
+ embedded_slide.sample_id, tiling_result_annotation(tiling_result)
161
+ )
162
+ torch.save(payload, coordination_dir / f"{stem}.embedded.pt")
148
163
 
149
164
  compute_embedded_slides_fn = getattr(inference, "_compute_embedded_slides", None)
150
165
  if not callable(compute_embedded_slides_fn):
@@ -3,7 +3,8 @@ from contextlib import nullcontext
3
3
  import json
4
4
  from pathlib import Path
5
5
 
6
- from slide2vec.runtime.distributed import assign_slides_to_ranks
6
+ from slide2vec.runtime.distributed import assign_slides_to_ranks, encode_work_unit
7
+ from slide2vec.runtime.embedding import tiling_result_annotation
7
8
 
8
9
 
9
10
  def get_args_parser(add_help: bool = True) -> argparse.ArgumentParser:
@@ -48,26 +49,29 @@ def main(argv=None) -> int:
48
49
  if not callable(load_successful_tiled_slides_fn):
49
50
  from slide2vec.runtime.manifest import load_successful_tiled_slides as load_successful_tiled_slides_fn
50
51
  slide_records, tiling_results = load_successful_tiled_slides_fn(tiling_input_dir)
51
- requested_sample_ids = request.get("sample_ids")
52
- if requested_sample_ids is not None:
53
- requested_sample_id_set = {str(sample_id) for sample_id in requested_sample_ids}
54
- paired = [
55
- (slide, tiling_result)
56
- for slide, tiling_result in zip(slide_records, tiling_results)
57
- if slide.sample_id in requested_sample_id_set
58
- ]
59
- slide_records = [slide for slide, _ in paired]
60
- tiling_results = [tiling_result for _, tiling_result in paired]
52
+ # Each (sample_id, annotation) row is an independent work unit; key by the composite so a
53
+ # multi-class slide's sibling classes never overwrite each other. Flat units (None / tissue /
54
+ # merged) encode to the bare sample_id, byte-identical to pre-#168 single-class runs.
55
+ paired_by_unit = {
56
+ encode_work_unit(slide.sample_id, tiling_result_annotation(tiling_result)): (slide, tiling_result)
57
+ for slide, tiling_result in zip(slide_records, tiling_results)
58
+ }
59
+ requested_work_units = request.get("work_units")
60
+ if requested_work_units is not None:
61
+ requested_unit_set = {str(unit) for unit in requested_work_units}
62
+ paired_by_unit = {
63
+ unit_key: pair
64
+ for unit_key, pair in paired_by_unit.items()
65
+ if unit_key in requested_unit_set
66
+ }
67
+ slide_records = [slide for slide, _ in paired_by_unit.values()]
68
+ tiling_results = [tiling_result for _, tiling_result in paired_by_unit.values()]
61
69
  assignments = assign_slides_to_ranks(slide_records, tiling_results, num_gpus=world_size)
62
70
  assigned_ids = assignments.get(global_rank, [])
63
71
  if not assigned_ids:
64
72
  return 0
65
- paired_by_sample = {
66
- slide.sample_id: (slide, tiling_result)
67
- for slide, tiling_result in zip(slide_records, tiling_results)
68
- }
69
- assigned_slides = [paired_by_sample[sample_id][0] for sample_id in assigned_ids]
70
- assigned_tiling_results = [paired_by_sample[sample_id][1] for sample_id in assigned_ids]
73
+ assigned_slides = [paired_by_unit[unit_key][0] for unit_key in assigned_ids]
74
+ assigned_tiling_results = [paired_by_unit[unit_key][1] for unit_key in assigned_ids]
71
75
  progress_events_path = request.get("progress_events_path")
72
76
  reporter = (
73
77
  JsonlProgressReporter(