slide2vec 4.7.0__tar.gz → 4.8.0__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (101) hide show
  1. {slide2vec-4.7.0 → slide2vec-4.8.0}/PKG-INFO +3 -3
  2. {slide2vec-4.7.0 → slide2vec-4.8.0}/pyproject.toml +4 -4
  3. {slide2vec-4.7.0 → slide2vec-4.8.0}/slide2vec/__init__.py +1 -1
  4. {slide2vec-4.7.0 → slide2vec-4.8.0}/slide2vec/api.py +66 -3
  5. {slide2vec-4.7.0 → slide2vec-4.8.0}/slide2vec/artifacts.py +71 -5
  6. {slide2vec-4.7.0 → slide2vec-4.8.0}/slide2vec/configs/default.yaml +18 -1
  7. {slide2vec-4.7.0 → slide2vec-4.8.0}/slide2vec/distributed/direct_embed_worker.py +24 -9
  8. {slide2vec-4.7.0 → slide2vec-4.8.0}/slide2vec/distributed/pipeline_worker.py +21 -17
  9. {slide2vec-4.7.0 → slide2vec-4.8.0}/slide2vec/inference.py +17 -0
  10. slide2vec-4.8.0/slide2vec/runtime/artifacts_collect.py +300 -0
  11. {slide2vec-4.7.0 → slide2vec-4.8.0}/slide2vec/runtime/distributed.py +85 -9
  12. {slide2vec-4.7.0 → slide2vec-4.8.0}/slide2vec/runtime/distributed_stage.py +30 -12
  13. {slide2vec-4.7.0 → slide2vec-4.8.0}/slide2vec/runtime/embedding.py +16 -0
  14. {slide2vec-4.7.0 → slide2vec-4.8.0}/slide2vec/runtime/embedding_persist.py +6 -0
  15. {slide2vec-4.7.0 → slide2vec-4.8.0}/slide2vec/runtime/embedding_pipeline.py +3 -0
  16. {slide2vec-4.7.0 → slide2vec-4.8.0}/slide2vec/runtime/persist_callbacks.py +63 -11
  17. slide2vec-4.8.0/slide2vec/runtime/persistence.py +298 -0
  18. {slide2vec-4.7.0 → slide2vec-4.8.0}/slide2vec/runtime/process_list.py +59 -13
  19. {slide2vec-4.7.0 → slide2vec-4.8.0}/slide2vec/runtime/serialization.py +5 -2
  20. {slide2vec-4.7.0 → slide2vec-4.8.0}/slide2vec/runtime/tiling.py +31 -8
  21. {slide2vec-4.7.0 → slide2vec-4.8.0}/slide2vec/runtime/tiling_pipeline.py +31 -15
  22. {slide2vec-4.7.0 → slide2vec-4.8.0}/slide2vec/utils/tiling_io.py +12 -0
  23. {slide2vec-4.7.0 → slide2vec-4.8.0}/slide2vec.egg-info/PKG-INFO +3 -3
  24. {slide2vec-4.7.0 → slide2vec-4.8.0}/slide2vec.egg-info/requires.txt +2 -2
  25. {slide2vec-4.7.0 → slide2vec-4.8.0}/tests/test_output_consistency.py +10 -1
  26. {slide2vec-4.7.0 → slide2vec-4.8.0}/tests/test_progress.py +1 -1
  27. {slide2vec-4.7.0 → slide2vec-4.8.0}/tests/test_regression_core.py +273 -5
  28. {slide2vec-4.7.0 → slide2vec-4.8.0}/tests/test_regression_inference.py +1889 -57
  29. slide2vec-4.7.0/slide2vec/runtime/artifacts_collect.py +0 -155
  30. slide2vec-4.7.0/slide2vec/runtime/persistence.py +0 -188
  31. {slide2vec-4.7.0 → slide2vec-4.8.0}/LICENSE +0 -0
  32. {slide2vec-4.7.0 → slide2vec-4.8.0}/README.md +0 -0
  33. {slide2vec-4.7.0 → slide2vec-4.8.0}/setup.cfg +0 -0
  34. {slide2vec-4.7.0 → slide2vec-4.8.0}/slide2vec/__main__.py +0 -0
  35. {slide2vec-4.7.0 → slide2vec-4.8.0}/slide2vec/cli.py +0 -0
  36. {slide2vec-4.7.0 → slide2vec-4.8.0}/slide2vec/configs/__init__.py +0 -0
  37. {slide2vec-4.7.0 → slide2vec-4.8.0}/slide2vec/configs/resources.py +0 -0
  38. {slide2vec-4.7.0 → slide2vec-4.8.0}/slide2vec/data/__init__.py +0 -0
  39. {slide2vec-4.7.0 → slide2vec-4.8.0}/slide2vec/data/dataset.py +0 -0
  40. {slide2vec-4.7.0 → slide2vec-4.8.0}/slide2vec/data/tile_reader.py +0 -0
  41. {slide2vec-4.7.0 → slide2vec-4.8.0}/slide2vec/data/tile_store.py +0 -0
  42. {slide2vec-4.7.0 → slide2vec-4.8.0}/slide2vec/distributed/__init__.py +0 -0
  43. {slide2vec-4.7.0 → slide2vec-4.8.0}/slide2vec/encoders/__init__.py +0 -0
  44. {slide2vec-4.7.0 → slide2vec-4.8.0}/slide2vec/encoders/base.py +0 -0
  45. {slide2vec-4.7.0 → slide2vec-4.8.0}/slide2vec/encoders/models/__init__.py +0 -0
  46. {slide2vec-4.7.0 → slide2vec-4.8.0}/slide2vec/encoders/models/conch.py +0 -0
  47. {slide2vec-4.7.0 → slide2vec-4.8.0}/slide2vec/encoders/models/gigapath.py +0 -0
  48. {slide2vec-4.7.0 → slide2vec-4.8.0}/slide2vec/encoders/models/hibou.py +0 -0
  49. {slide2vec-4.7.0 → slide2vec-4.8.0}/slide2vec/encoders/models/hoptimus.py +0 -0
  50. {slide2vec-4.7.0 → slide2vec-4.8.0}/slide2vec/encoders/models/lunit.py +0 -0
  51. {slide2vec-4.7.0 → slide2vec-4.8.0}/slide2vec/encoders/models/midnight.py +0 -0
  52. {slide2vec-4.7.0 → slide2vec-4.8.0}/slide2vec/encoders/models/moozy/__init__.py +0 -0
  53. {slide2vec-4.7.0 → slide2vec-4.8.0}/slide2vec/encoders/models/moozy/blocks.py +0 -0
  54. {slide2vec-4.7.0 → slide2vec-4.8.0}/slide2vec/encoders/models/moozy/case.py +0 -0
  55. {slide2vec-4.7.0 → slide2vec-4.8.0}/slide2vec/encoders/models/moozy/loading.py +0 -0
  56. {slide2vec-4.7.0 → slide2vec-4.8.0}/slide2vec/encoders/models/moozy/slide.py +0 -0
  57. {slide2vec-4.7.0 → slide2vec-4.8.0}/slide2vec/encoders/models/moozy/types.py +0 -0
  58. {slide2vec-4.7.0 → slide2vec-4.8.0}/slide2vec/encoders/models/musk.py +0 -0
  59. {slide2vec-4.7.0 → slide2vec-4.8.0}/slide2vec/encoders/models/phikon.py +0 -0
  60. {slide2vec-4.7.0 → slide2vec-4.8.0}/slide2vec/encoders/models/prism.py +0 -0
  61. {slide2vec-4.7.0 → slide2vec-4.8.0}/slide2vec/encoders/models/prost40m.py +0 -0
  62. {slide2vec-4.7.0 → slide2vec-4.8.0}/slide2vec/encoders/models/titan.py +0 -0
  63. {slide2vec-4.7.0 → slide2vec-4.8.0}/slide2vec/encoders/models/uni.py +0 -0
  64. {slide2vec-4.7.0 → slide2vec-4.8.0}/slide2vec/encoders/models/virchow.py +0 -0
  65. {slide2vec-4.7.0 → slide2vec-4.8.0}/slide2vec/encoders/registry.py +0 -0
  66. {slide2vec-4.7.0 → slide2vec-4.8.0}/slide2vec/encoders/validation.py +0 -0
  67. {slide2vec-4.7.0 → slide2vec-4.8.0}/slide2vec/progress.py +0 -0
  68. {slide2vec-4.7.0 → slide2vec-4.8.0}/slide2vec/runtime/__init__.py +0 -0
  69. {slide2vec-4.7.0 → slide2vec-4.8.0}/slide2vec/runtime/batching.py +0 -0
  70. {slide2vec-4.7.0 → slide2vec-4.8.0}/slide2vec/runtime/cpu_budget.py +0 -0
  71. {slide2vec-4.7.0 → slide2vec-4.8.0}/slide2vec/runtime/dense_regions.py +0 -0
  72. {slide2vec-4.7.0 → slide2vec-4.8.0}/slide2vec/runtime/hierarchical.py +0 -0
  73. {slide2vec-4.7.0 → slide2vec-4.8.0}/slide2vec/runtime/manifest.py +0 -0
  74. {slide2vec-4.7.0 → slide2vec-4.8.0}/slide2vec/runtime/model_settings.py +0 -0
  75. {slide2vec-4.7.0 → slide2vec-4.8.0}/slide2vec/runtime/patient_pipeline.py +0 -0
  76. {slide2vec-4.7.0 → slide2vec-4.8.0}/slide2vec/runtime/progress_bridge.py +0 -0
  77. {slide2vec-4.7.0 → slide2vec-4.8.0}/slide2vec/runtime/registry.py +0 -0
  78. {slide2vec-4.7.0 → slide2vec-4.8.0}/slide2vec/runtime/slide_encode.py +0 -0
  79. {slide2vec-4.7.0 → slide2vec-4.8.0}/slide2vec/runtime/types.py +0 -0
  80. {slide2vec-4.7.0 → slide2vec-4.8.0}/slide2vec/runtime/worker_io.py +0 -0
  81. {slide2vec-4.7.0 → slide2vec-4.8.0}/slide2vec/utils/__init__.py +0 -0
  82. {slide2vec-4.7.0 → slide2vec-4.8.0}/slide2vec/utils/config.py +0 -0
  83. {slide2vec-4.7.0 → slide2vec-4.8.0}/slide2vec/utils/coordinates.py +0 -0
  84. {slide2vec-4.7.0 → slide2vec-4.8.0}/slide2vec/utils/log_utils.py +0 -0
  85. {slide2vec-4.7.0 → slide2vec-4.8.0}/slide2vec/utils/utils.py +0 -0
  86. {slide2vec-4.7.0 → slide2vec-4.8.0}/slide2vec.egg-info/SOURCES.txt +0 -0
  87. {slide2vec-4.7.0 → slide2vec-4.8.0}/slide2vec.egg-info/dependency_links.txt +0 -0
  88. {slide2vec-4.7.0 → slide2vec-4.8.0}/slide2vec.egg-info/entry_points.txt +0 -0
  89. {slide2vec-4.7.0 → slide2vec-4.8.0}/slide2vec.egg-info/not-zip-safe +0 -0
  90. {slide2vec-4.7.0 → slide2vec-4.8.0}/slide2vec.egg-info/top_level.txt +0 -0
  91. {slide2vec-4.7.0 → slide2vec-4.8.0}/tests/test_architecture_runtime_split.py +0 -0
  92. {slide2vec-4.7.0 → slide2vec-4.8.0}/tests/test_attention_extraction.py +0 -0
  93. {slide2vec-4.7.0 → slide2vec-4.8.0}/tests/test_dense_extraction.py +0 -0
  94. {slide2vec-4.7.0 → slide2vec-4.8.0}/tests/test_dense_locality_gated.py +0 -0
  95. {slide2vec-4.7.0 → slide2vec-4.8.0}/tests/test_dense_regions.py +0 -0
  96. {slide2vec-4.7.0 → slide2vec-4.8.0}/tests/test_encoder_registry.py +0 -0
  97. {slide2vec-4.7.0 → slide2vec-4.8.0}/tests/test_hs2p_package_cutover.py +0 -0
  98. {slide2vec-4.7.0 → slide2vec-4.8.0}/tests/test_regression_models.py +0 -0
  99. {slide2vec-4.7.0 → slide2vec-4.8.0}/tests/test_runtime_batching.py +0 -0
  100. {slide2vec-4.7.0 → slide2vec-4.8.0}/tests/test_tile_store.py +0 -0
  101. {slide2vec-4.7.0 → slide2vec-4.8.0}/tests/test_tiling_pipeline.py +0 -0
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.4
2
2
  Name: slide2vec
3
- Version: 4.7.0
3
+ Version: 4.8.0
4
4
  Summary: Embedding of whole slide images with Foundation Models
5
5
  Author-email: Clément Grisi <clement.grisi@radboudumc.nl>
6
6
  License-Expression: Apache-2.0
@@ -15,7 +15,7 @@ Classifier: Programming Language :: Python :: 3.13
15
15
  Requires-Python: >=3.10
16
16
  Description-Content-Type: text/markdown
17
17
  License-File: LICENSE
18
- Requires-Dist: hs2p[asap,cucim,openslide,sam2,vips]>=4.0.8
18
+ Requires-Dist: hs2p[asap,cucim,openslide,sam2,vips]>=4.1.1
19
19
  Requires-Dist: omegaconf
20
20
  Requires-Dist: matplotlib
21
21
  Requires-Dist: numpy<2
@@ -65,7 +65,7 @@ Requires-Dist: numpy<2; extra == "fm"
65
65
  Requires-Dist: pandas; extra == "fm"
66
66
  Requires-Dist: pillow; extra == "fm"
67
67
  Requires-Dist: rich; extra == "fm"
68
- Requires-Dist: hs2p[asap,cucim,openslide,sam2,vips]>=4.0.8; extra == "fm"
68
+ Requires-Dist: hs2p[asap,cucim,openslide,sam2,vips]>=4.1.1; extra == "fm"
69
69
  Requires-Dist: wandb; extra == "fm"
70
70
  Requires-Dist: torch<2.8,>=2.3; extra == "fm"
71
71
  Requires-Dist: torchvision>=0.18.0; extra == "fm"
@@ -4,7 +4,7 @@ build-backend = "setuptools.build_meta"
4
4
 
5
5
  [project]
6
6
  name = "slide2vec"
7
- version = "4.7.0"
7
+ version = "4.8.0"
8
8
  description = "Embedding of whole slide images with Foundation Models"
9
9
  readme = "README.md"
10
10
  requires-python = ">=3.10"
@@ -21,7 +21,7 @@ classifiers = [
21
21
  "Programming Language :: Python :: 3.13",
22
22
  ]
23
23
  dependencies = [
24
- "hs2p[asap,cucim,openslide,sam2,vips]>=4.0.8",
24
+ "hs2p[asap,cucim,openslide,sam2,vips]>=4.1.1",
25
25
  "omegaconf",
26
26
  "matplotlib",
27
27
  "numpy<2",
@@ -88,7 +88,7 @@ fm = [
88
88
  "pandas",
89
89
  "pillow",
90
90
  "rich",
91
- "hs2p[asap,cucim,openslide,sam2,vips]>=4.0.8",
91
+ "hs2p[asap,cucim,openslide,sam2,vips]>=4.1.1",
92
92
  "wandb",
93
93
  "torch>=2.3,<2.8",
94
94
  "torchvision>=0.18.0",
@@ -164,7 +164,7 @@ no_implicit_reexport = true
164
164
  max-line-length = 160
165
165
 
166
166
  [tool.bumpver]
167
- current_version = "4.7.0"
167
+ current_version = "4.8.0"
168
168
  version_pattern = "MAJOR.MINOR.PATCH"
169
169
  commit = false # We do version bumping in CI, not as a commit
170
170
  tag = false # Git tag already exists — we don't auto-tag
@@ -11,7 +11,7 @@ from slide2vec.api import (
11
11
  from slide2vec.artifacts import HierarchicalEmbeddingArtifact, SlideEmbeddingArtifact, TileEmbeddingArtifact
12
12
 
13
13
 
14
- __version__ = "4.7.0"
14
+ __version__ = "4.8.0"
15
15
 
16
16
  __all__ = [
17
17
  "Model",
@@ -1,4 +1,5 @@
1
1
 
2
+ import copy
2
3
  import logging
3
4
  import os
4
5
  from dataclasses import dataclass, field, replace
@@ -40,6 +41,55 @@ SlideSequence = Sequence[SlideInput]
40
41
  TilingResultsInput = Sequence[Any] | Mapping[str, Any]
41
42
 
42
43
 
44
+ #: Default annotation-mask vocabulary — plain binary tissue tiling. Mirrors hs2p's
45
+ #: shipped default ``{background: 0, tissue: 1}``; leaving it untouched keeps a run
46
+ #: behaving exactly as a tissue-only run. ``min_coverage.tissue`` is the single source
47
+ #: of truth for the tissue threshold (the standalone ``tissue_threshold`` knob is gone).
48
+ #: A :class:`PreprocessingConfig` ``masks`` value is deep-merged over this default, so
49
+ #: callers only state what they override (e.g. ``{"min_coverage": {"tissue": 0.1}}``).
50
+ DEFAULT_MASKS: dict[str, Any] = {
51
+ "output_mode": "per_annotation",
52
+ "pixel_mapping": {"background": 0, "tissue": 1},
53
+ "colors": {"background": None, "tissue": [157, 219, 129]},
54
+ "min_coverage": {"background": None, "tissue": 0.01},
55
+ }
56
+
57
+
58
+ def _deep_merge_masks(base: Mapping[str, Any], override: Mapping[str, Any]) -> dict[str, Any]:
59
+ """Deep-merge *override* onto a copy of *base* (nested dicts merge key-by-key)."""
60
+ merged = copy.deepcopy(dict(base))
61
+ for key, value in override.items():
62
+ existing = merged.get(key)
63
+ if isinstance(value, Mapping) and isinstance(existing, dict):
64
+ merged[key] = _deep_merge_masks(existing, value)
65
+ else:
66
+ merged[key] = copy.deepcopy(value)
67
+ return merged
68
+
69
+
70
+ def resolve_masks(masks: Mapping[str, Any] | None) -> dict[str, Any]:
71
+ """Complete a (possibly partial) ``masks`` mapping by merging it over :data:`DEFAULT_MASKS`."""
72
+ if not masks:
73
+ return copy.deepcopy(DEFAULT_MASKS)
74
+ return _deep_merge_masks(DEFAULT_MASKS, masks)
75
+
76
+
77
+ def _masks_to_plain_dict(node: Any) -> dict[str, Any]:
78
+ """Normalize a masks config node (OmegaConf, mapping, or namespace) to a plain dict."""
79
+ if node is None:
80
+ return {}
81
+ try:
82
+ from omegaconf import OmegaConf
83
+
84
+ if OmegaConf.is_config(node):
85
+ return copy.deepcopy(OmegaConf.to_container(node, resolve=True)) # type: ignore[return-value]
86
+ except ImportError:
87
+ pass
88
+ if isinstance(node, Mapping):
89
+ return copy.deepcopy(dict(node))
90
+ return copy.deepcopy(dict(vars(node)))
91
+
92
+
43
93
  @dataclass(frozen=True, kw_only=True)
44
94
  class PreprocessingConfig:
45
95
  """Configuration for slide tiling and preprocessing."""
@@ -62,8 +112,6 @@ class PreprocessingConfig:
62
112
  tolerance: float = 0.05
63
113
  #: Fractional tile overlap (``0.0`` = no overlap).
64
114
  overlap: float = 0.0
65
- #: Minimum tissue fraction required to keep a tile (default ``0.01``).
66
- tissue_threshold: float = 0.01
67
115
  #: Directory containing pre-extracted tile coordinates to reuse, skipping tiling.
68
116
  read_coordinates_from: Path | None = None
69
117
  #: Directory containing pre-extracted tile images to skip the tiling step entirely.
@@ -90,6 +138,20 @@ class PreprocessingConfig:
90
138
  #: Controls whether hs2p writes mask and tiling preview images.
91
139
  #: Keys: ``save_mask_preview``, ``save_tiling_preview``, ``downsample``.
92
140
  preview: dict[str, Any] = field(default_factory=dict)
141
+ #: Annotation-mask vocabulary forwarded to hs2p's sampling resolver. Keys:
142
+ #: ``output_mode``, ``pixel_mapping``, ``colors``, ``min_coverage``. A partial
143
+ #: mapping is deep-merged over :data:`DEFAULT_MASKS`, so callers only state what
144
+ #: they override (e.g. ``{"min_coverage": {"tissue": 0.1}}``). The default
145
+ #: ``{background, tissue}`` block is plain tissue tiling; ``min_coverage.tissue``
146
+ #: is the single source of truth for the tissue threshold.
147
+ masks: dict[str, Any] = field(default_factory=dict)
148
+ #: When annotation sampling is active, tile each class independently (``True``)
149
+ #: vs jointly across classes (``False``).
150
+ independent_sampling: bool = True
151
+
152
+ def __post_init__(self) -> None:
153
+ # Complete a (possibly partial) masks mapping against the shipped default.
154
+ object.__setattr__(self, "masks", resolve_masks(self.masks))
93
155
 
94
156
  @classmethod
95
157
  def from_config(cls, cfg: Any) -> "PreprocessingConfig":
@@ -121,7 +183,8 @@ class PreprocessingConfig:
121
183
  region_tile_multiple=int(region_tile_multiple) if region_tile_multiple is not None else None,
122
184
  tolerance=float(tiling.params.tolerance),
123
185
  overlap=float(tiling.params.overlap),
124
- tissue_threshold=float(tiling.params.tissue_threshold),
186
+ masks=_masks_to_plain_dict(getattr(tiling, "masks", None)),
187
+ independent_sampling=bool(getattr(tiling, "independent_sampling", True)),
125
188
  read_coordinates_from=Path(read_coordinates_from) if read_coordinates_from else None,
126
189
  read_tiles_from=(
127
190
  Path(read_tiles_from) if read_tiles_from else None
@@ -5,6 +5,7 @@ from typing import Any
5
5
 
6
6
  import numpy as np
7
7
  import torch
8
+ from hs2p.fileops import is_flattened_annotation
8
9
 
9
10
 
10
11
  @dataclass(frozen=True, kw_only=True)
@@ -29,6 +30,7 @@ class SlideEmbeddingArtifact:
29
30
  format: str
30
31
  feature_dim: int
31
32
  latent_path: Path | None = None
33
+ annotation: str | None = None
32
34
 
33
35
  @property
34
36
  def metadata(self) -> dict[str, Any]:
@@ -58,6 +60,7 @@ class HierarchicalEmbeddingArtifact:
58
60
  feature_dim: int
59
61
  num_regions: int
60
62
  tiles_per_region: int
63
+ annotation: str | None = None
61
64
 
62
65
  @property
63
66
  def metadata(self) -> dict[str, Any]:
@@ -90,6 +93,53 @@ def _write_metadata(path: Path, metadata: dict[str, Any]) -> None:
90
93
  path.write_text(json.dumps(metadata, indent=2, sort_keys=True), encoding="utf-8")
91
94
 
92
95
 
96
+ def tile_embeddings_subdir(annotation: str | None) -> str:
97
+ """Namespace the ``tile_embeddings`` output dir per annotation class.
98
+
99
+ Reuses hs2p's flatten rule (the single source of truth): ``None`` and the sentinel
100
+ ``"tissue"`` collapse to the flat ``tile_embeddings`` root, so the default tissue-only
101
+ path is byte-for-byte unchanged; any real class label gets its own
102
+ ``tile_embeddings/<class>`` subdirectory.
103
+ """
104
+ if is_flattened_annotation(annotation):
105
+ return "tile_embeddings"
106
+ return f"tile_embeddings/{annotation}"
107
+
108
+
109
+ def slide_embeddings_subdir(annotation: str | None) -> str:
110
+ """Namespace the ``slide_embeddings`` output dir per annotation class.
111
+
112
+ Reuses hs2p's flatten rule (the single source of truth, shared with
113
+ :func:`tile_embeddings_subdir`): ``None`` and the sentinel ``"tissue"`` collapse to the
114
+ flat ``slide_embeddings`` root, so the default tissue-only path is byte-for-byte
115
+ unchanged; any real class label gets its own ``slide_embeddings/<class>`` subdirectory.
116
+ """
117
+ if is_flattened_annotation(annotation):
118
+ return "slide_embeddings"
119
+ return f"slide_embeddings/{annotation}"
120
+
121
+
122
+ def slide_latents_subdir(annotation: str | None) -> str:
123
+ """Namespace the ``slide_latents`` output dir per annotation class (mirrors slide embeddings)."""
124
+ if is_flattened_annotation(annotation):
125
+ return "slide_latents"
126
+ return f"slide_latents/{annotation}"
127
+
128
+
129
+ def hierarchical_embeddings_subdir(annotation: str | None) -> str:
130
+ """Namespace the ``hierarchical_embeddings`` output dir per annotation class.
131
+
132
+ Reuses hs2p's flatten rule (the single source of truth, shared with
133
+ :func:`tile_embeddings_subdir` and :func:`slide_embeddings_subdir`): ``None`` and the
134
+ sentinel ``"tissue"`` collapse to the flat ``hierarchical_embeddings`` root, so the
135
+ default tissue-only path is byte-for-byte unchanged; any real class label gets its own
136
+ ``hierarchical_embeddings/<class>`` subdirectory.
137
+ """
138
+ if is_flattened_annotation(annotation):
139
+ return "hierarchical_embeddings"
140
+ return f"hierarchical_embeddings/{annotation}"
141
+
142
+
93
143
  def _setup_artifact_paths(
94
144
  output_dir: str | Path, subdir: str, sample_id: str, output_format: str
95
145
  ) -> tuple[Path, Path]:
@@ -142,9 +192,12 @@ def write_tile_embeddings(
142
192
  output_format: str = "pt",
143
193
  metadata: dict[str, Any] | None = None,
144
194
  tile_index: Any | None = None,
195
+ annotation: str | None = None,
145
196
  ) -> TileEmbeddingArtifact:
146
197
  output_format = _validate_output_format(output_format)
147
- artifact_path, metadata_path = _setup_artifact_paths(output_dir, "tile_embeddings", sample_id, output_format)
198
+ artifact_path, metadata_path = _setup_artifact_paths(
199
+ output_dir, tile_embeddings_subdir(annotation), sample_id, output_format
200
+ )
148
201
  feature_array = _ensure_array(features)
149
202
  if output_format == "pt":
150
203
  torch.save(_ensure_tensor(features), artifact_path)
@@ -180,9 +233,12 @@ def write_tile_embedding_metadata(
180
233
  feature_dim: int | None = None,
181
234
  num_tiles: int = 0,
182
235
  metadata: dict[str, Any] | None = None,
236
+ annotation: str | None = None,
183
237
  ) -> Path:
184
238
  output_format = _validate_output_format(output_format)
185
- _, metadata_path = _setup_artifact_paths(output_dir, "tile_embeddings", sample_id, output_format)
239
+ _, metadata_path = _setup_artifact_paths(
240
+ output_dir, tile_embeddings_subdir(annotation), sample_id, output_format
241
+ )
186
242
  tile_metadata = _build_tile_embedding_metadata(
187
243
  sample_id,
188
244
  output_format=output_format,
@@ -202,9 +258,12 @@ def write_slide_embeddings(
202
258
  output_format: str = "pt",
203
259
  metadata: dict[str, Any] | None = None,
204
260
  latents: Any | None = None,
261
+ annotation: str | None = None,
205
262
  ) -> SlideEmbeddingArtifact:
206
263
  output_format = _validate_output_format(output_format)
207
- artifact_path, metadata_path = _setup_artifact_paths(output_dir, "slide_embeddings", sample_id, output_format)
264
+ artifact_path, metadata_path = _setup_artifact_paths(
265
+ output_dir, slide_embeddings_subdir(annotation), sample_id, output_format
266
+ )
208
267
  embedding_array = _ensure_array(embedding)
209
268
  latent_path = None
210
269
  if output_format == "pt":
@@ -212,7 +271,9 @@ def write_slide_embeddings(
212
271
  else:
213
272
  np.savez_compressed(artifact_path, features=embedding_array)
214
273
  if latents is not None:
215
- latent_path, _ = _setup_artifact_paths(output_dir, "slide_latents", sample_id, output_format)
274
+ latent_path, _ = _setup_artifact_paths(
275
+ output_dir, slide_latents_subdir(annotation), sample_id, output_format
276
+ )
216
277
  if output_format == "pt":
217
278
  torch.save(_ensure_tensor(latents), latent_path)
218
279
  else:
@@ -234,6 +295,7 @@ def write_slide_embeddings(
234
295
  format=output_format,
235
296
  feature_dim=slide_metadata["feature_dim"],
236
297
  latent_path=latent_path,
298
+ annotation=annotation,
237
299
  )
238
300
 
239
301
 
@@ -283,9 +345,12 @@ def write_hierarchical_embeddings(
283
345
  output_dir: str | Path,
284
346
  output_format: str = "pt",
285
347
  metadata: dict[str, Any] | None = None,
348
+ annotation: str | None = None,
286
349
  ) -> HierarchicalEmbeddingArtifact:
287
350
  output_format = _validate_output_format(output_format)
288
- artifact_path, metadata_path = _setup_artifact_paths(output_dir, "hierarchical_embeddings", sample_id, output_format)
351
+ artifact_path, metadata_path = _setup_artifact_paths(
352
+ output_dir, hierarchical_embeddings_subdir(annotation), sample_id, output_format
353
+ )
289
354
  feature_array = _ensure_array(features)
290
355
  if feature_array.ndim != 3:
291
356
  raise ValueError(
@@ -315,4 +380,5 @@ def write_hierarchical_embeddings(
315
380
  feature_dim=int(hierarchical_metadata["feature_dim"]),
316
381
  num_regions=int(hierarchical_metadata["num_regions"]),
317
382
  tiles_per_region=int(hierarchical_metadata["tiles_per_region"]),
383
+ annotation=annotation,
318
384
  )
@@ -26,6 +26,24 @@ tiling:
26
26
  read_coordinates_from: # path to an existing directory containing pre-extracted `.coordinates.npz` / `.coordinates.meta.json` artifacts to reuse instead of starting tiling from scratch
27
27
  read_tiles_from: # path to an existing directory containing pre-extracted `.tiles.tar` tile stores to reuse instead of starting tiling from scratch
28
28
  backend: "auto" # backend to use for slide reading; "auto" lets hs2p resolve the best backend per slide, preferring cuCIM when available
29
+ independent_sampling: true # selection strategy when annotation sampling is active. true: sample each class independently against its own binary mask (independent selection); false: sample once over the union of active classes, then post-filter per class by coverage (joint selection). Ignored when the masks vocabulary is left at the tissue-only default.
30
+ masks:
31
+ # Annotation-mask vocabulary forwarded to hs2p's sampling resolver. The shipped default
32
+ # ({background:0, tissue:1}) is plain binary tissue tiling — leave it untouched and the run
33
+ # behaves exactly as a tissue-only run. Customising the vocabulary (e.g. adding a `tumor`
34
+ # class with its own pixel value + min_coverage) opts the run into annotation-aware sampling,
35
+ # where `mask_path` is read as a multi-label raster. The `tissue` min_coverage entry below is
36
+ # the single source of truth for the tissue threshold.
37
+ output_mode: per_annotation # how sampled tiles are grouped into artifacts. per_annotation: one flat artifact set per sampled class, namespaced under a `<class>/` subdir (the `tissue` class collapses to the flat root). merged: a single flat artifact set per slide over the union of tiles passing any active class threshold — it carries no class label, so it lands at the flat output root (no `<class>/` subdir).
38
+ pixel_mapping: # {class_name: integer pixel value in the mask raster}
39
+ background: 0
40
+ tissue: 1
41
+ colors: # {class_name: [r, g, b] | null} used when rendering previews
42
+ background:
43
+ tissue: [157, 219, 129]
44
+ min_coverage: # {class_name: float | null}; minimum fraction of a tile that must be covered to keep it; null = don't sample that class
45
+ background:
46
+ tissue: 0.1
29
47
  params:
30
48
  requested_spacing_um: # spacing at which to tile the slide, in microns per pixel; filled from a preset model when available
31
49
  tolerance: 0.05 # tolerance for matching the spacing (float between 0 and 1, deciding how much the spacing can deviate from the one specified in the slide metadata)
@@ -33,7 +51,6 @@ tiling:
33
51
  requested_region_size_px: # size of hierarchical parent regions in pixels; when unset and region_tile_multiple is set, derived from requested_tile_size_px * region_tile_multiple
34
52
  region_tile_multiple: # hierarchical region grid width/height in tiles; e.g. 6 means 6x6 tiles per region
35
53
  overlap: 0.0 # percentage of overlap between two consecutive tiles (float between 0 and 1)
36
- tissue_threshold: 0.1 # minimum fraction of pixels that must be tissue to keep a tile (float between 0 and 1)
37
54
  seg_params:
38
55
  # downsample controls which pyramid level is read for tissue segmentation.
39
56
  # Larger values are faster and use less memory; smaller values can improve mask precision.
@@ -49,12 +49,21 @@ def main(argv=None) -> int:
49
49
  )
50
50
  preprocessing = deserialize_preprocessing(request["preprocessing"])
51
51
  execution = deserialize_execution(request["execution"])
52
+ from slide2vec.runtime.distributed import (
53
+ decode_work_unit,
54
+ encode_work_unit,
55
+ work_unit_shard_stem,
56
+ )
57
+ from slide2vec.runtime.embedding import tiling_result_annotation
58
+
52
59
  load_successful_tiled_slides_fn = getattr(inference, "load_successful_tiled_slides", None)
53
60
  if not callable(load_successful_tiled_slides_fn):
54
61
  from slide2vec.runtime.manifest import load_successful_tiled_slides as load_successful_tiled_slides_fn
55
62
  slide_records, tiling_results = load_successful_tiled_slides_fn(output_dir)
56
- paired_by_sample = {
57
- slide.sample_id: (slide, tiling_result)
63
+ # Key by the composite (sample_id, annotation) work unit so a multi-class slide's sibling
64
+ # classes never overwrite each other; flat units collapse to the bare sample_id key.
65
+ paired_by_unit = {
66
+ encode_work_unit(slide.sample_id, tiling_result_annotation(tiling_result)): (slide, tiling_result)
58
67
  for slide, tiling_result in zip(slide_records, tiling_results)
59
68
  }
60
69
  progress_events_path = request.get("progress_events_path")
@@ -71,8 +80,9 @@ def main(argv=None) -> int:
71
80
 
72
81
  with context:
73
82
  if request["strategy"] == "tile_shard":
74
- sample_id = request["sample_id"]
75
- slide, tiling_result = paired_by_sample[sample_id]
83
+ work_unit = request["work_unit"]
84
+ shard_stem = work_unit_shard_stem(*decode_work_unit(work_unit))
85
+ slide, tiling_result = paired_by_unit[work_unit]
76
86
  loaded = model._load_backend()
77
87
  if is_hierarchical_preprocessing(preprocessing):
78
88
  geometry = resolve_hierarchical_geometry(preprocessing, tiling_result)
@@ -103,7 +113,7 @@ def main(argv=None) -> int:
103
113
  "flat_index": torch.as_tensor(shard_indices, dtype=torch.long),
104
114
  "tile_embeddings": tile_embeddings.detach().cpu() if torch.is_tensor(tile_embeddings) else torch.as_tensor(tile_embeddings),
105
115
  }
106
- torch.save(payload, coordination_dir / f"{sample_id}.hier.rank{global_rank}.pt")
116
+ torch.save(payload, coordination_dir / f"{shard_stem}.hier.rank{global_rank}.pt")
107
117
  else:
108
118
  num_tiles = len(tiling_result.x)
109
119
  tile_indices = np.array_split(np.arange(num_tiles, dtype=np.int64), world_size)[global_rank]
@@ -129,14 +139,14 @@ def main(argv=None) -> int:
129
139
  "tile_index": torch.as_tensor(tile_indices, dtype=torch.long),
130
140
  "tile_embeddings": tile_embeddings.detach().cpu() if torch.is_tensor(tile_embeddings) else torch.as_tensor(tile_embeddings),
131
141
  }
132
- torch.save(payload, coordination_dir / f"{sample_id}.tiles.rank{global_rank}.pt")
142
+ torch.save(payload, coordination_dir / f"{shard_stem}.tiles.rank{global_rank}.pt")
133
143
  return 0
134
144
 
135
145
  assigned_ids = list(request.get("assignments", {}).get(str(global_rank), []))
136
146
  if not assigned_ids:
137
147
  return 0
138
- assigned_slides = [paired_by_sample[sample_id][0] for sample_id in assigned_ids]
139
- assigned_tiling_results = [paired_by_sample[sample_id][1] for sample_id in assigned_ids]
148
+ assigned_slides = [paired_by_unit[unit_key][0] for unit_key in assigned_ids]
149
+ assigned_tiling_results = [paired_by_unit[unit_key][1] for unit_key in assigned_ids]
140
150
 
141
151
  def _persist_embedded_slide(slide, tiling_result, embedded_slide) -> None:
142
152
  payload = {
@@ -144,7 +154,12 @@ def main(argv=None) -> int:
144
154
  "slide_embedding": _to_cpu_payload(embedded_slide.slide_embedding),
145
155
  "latents": _to_cpu_payload(embedded_slide.latents),
146
156
  }
147
- torch.save(payload, coordination_dir / f"{embedded_slide.sample_id}.embedded.pt")
157
+ # Stem by (sample_id, annotation) so two classes of one slide never overwrite each
158
+ # other; flat units keep the bare-sample_id filename for backward compatibility.
159
+ stem = work_unit_shard_stem(
160
+ embedded_slide.sample_id, tiling_result_annotation(tiling_result)
161
+ )
162
+ torch.save(payload, coordination_dir / f"{stem}.embedded.pt")
148
163
 
149
164
  compute_embedded_slides_fn = getattr(inference, "_compute_embedded_slides", None)
150
165
  if not callable(compute_embedded_slides_fn):
@@ -3,7 +3,8 @@ from contextlib import nullcontext
3
3
  import json
4
4
  from pathlib import Path
5
5
 
6
- from slide2vec.runtime.distributed import assign_slides_to_ranks
6
+ from slide2vec.runtime.distributed import assign_slides_to_ranks, encode_work_unit
7
+ from slide2vec.runtime.embedding import tiling_result_annotation
7
8
 
8
9
 
9
10
  def get_args_parser(add_help: bool = True) -> argparse.ArgumentParser:
@@ -48,26 +49,29 @@ def main(argv=None) -> int:
48
49
  if not callable(load_successful_tiled_slides_fn):
49
50
  from slide2vec.runtime.manifest import load_successful_tiled_slides as load_successful_tiled_slides_fn
50
51
  slide_records, tiling_results = load_successful_tiled_slides_fn(tiling_input_dir)
51
- requested_sample_ids = request.get("sample_ids")
52
- if requested_sample_ids is not None:
53
- requested_sample_id_set = {str(sample_id) for sample_id in requested_sample_ids}
54
- paired = [
55
- (slide, tiling_result)
56
- for slide, tiling_result in zip(slide_records, tiling_results)
57
- if slide.sample_id in requested_sample_id_set
58
- ]
59
- slide_records = [slide for slide, _ in paired]
60
- tiling_results = [tiling_result for _, tiling_result in paired]
52
+ # Each (sample_id, annotation) row is an independent work unit; key by the composite so a
53
+ # multi-class slide's sibling classes never overwrite each other. Flat units (None / tissue /
54
+ # merged) encode to the bare sample_id, byte-identical to pre-#168 single-class runs.
55
+ paired_by_unit = {
56
+ encode_work_unit(slide.sample_id, tiling_result_annotation(tiling_result)): (slide, tiling_result)
57
+ for slide, tiling_result in zip(slide_records, tiling_results)
58
+ }
59
+ requested_work_units = request.get("work_units")
60
+ if requested_work_units is not None:
61
+ requested_unit_set = {str(unit) for unit in requested_work_units}
62
+ paired_by_unit = {
63
+ unit_key: pair
64
+ for unit_key, pair in paired_by_unit.items()
65
+ if unit_key in requested_unit_set
66
+ }
67
+ slide_records = [slide for slide, _ in paired_by_unit.values()]
68
+ tiling_results = [tiling_result for _, tiling_result in paired_by_unit.values()]
61
69
  assignments = assign_slides_to_ranks(slide_records, tiling_results, num_gpus=world_size)
62
70
  assigned_ids = assignments.get(global_rank, [])
63
71
  if not assigned_ids:
64
72
  return 0
65
- paired_by_sample = {
66
- slide.sample_id: (slide, tiling_result)
67
- for slide, tiling_result in zip(slide_records, tiling_results)
68
- }
69
- assigned_slides = [paired_by_sample[sample_id][0] for sample_id in assigned_ids]
70
- assigned_tiling_results = [paired_by_sample[sample_id][1] for sample_id in assigned_ids]
73
+ assigned_slides = [paired_by_unit[unit_key][0] for unit_key in assigned_ids]
74
+ assigned_tiling_results = [paired_by_unit[unit_key][1] for unit_key in assigned_ids]
71
75
  progress_events_path = request.get("progress_events_path")
72
76
  reporter = (
73
77
  JsonlProgressReporter(
@@ -156,6 +156,7 @@ def _reconcile_embedding_process_list(
156
156
  process_list_path,
157
157
  embeddable_slides,
158
158
  output_dir,
159
+ embeddable_tiling_results=None,
159
160
  ):
160
161
  """Reconcile the process_list with the embeddings on disk once, at end of run.
161
162
 
@@ -170,6 +171,16 @@ def _reconcile_embedding_process_list(
170
171
  persist_hierarchical_embeddings = hierarchical.is_hierarchical_preprocessing(preprocessing)
171
172
  include_slide_embeddings = model.level == "slide"
172
173
  include_tile_embeddings = persist_tile_embeddings and not persist_hierarchical_embeddings
174
+ annotations = None
175
+ if (include_slide_embeddings or persist_hierarchical_embeddings) and embeddable_tiling_results is not None:
176
+ # Re-read each class's namespaced slide- or hierarchical-embedding artifact so the
177
+ # final reconcile records the per-class feature path instead of collapsing every
178
+ # annotation row onto the flat path. The default tissue-only path leaves annotations
179
+ # None.
180
+ annotations = [
181
+ embedding.tiling_result_annotation(tiling_result)
182
+ for tiling_result in embeddable_tiling_results
183
+ ]
173
184
  tile_artifacts, hierarchical_artifacts, slide_artifacts = artifacts_collect.collect_pipeline_artifacts(
174
185
  embeddable_slides,
175
186
  output_dir=output_dir,
@@ -177,6 +188,7 @@ def _reconcile_embedding_process_list(
177
188
  include_tile_embeddings=include_tile_embeddings,
178
189
  include_hierarchical_embeddings=persist_hierarchical_embeddings,
179
190
  include_slide_embeddings=include_slide_embeddings,
191
+ annotations=annotations,
180
192
  )
181
193
  if process_list_path is not None and Path(process_list_path).is_file():
182
194
  persistence.update_process_list_after_embedding(
@@ -323,6 +335,7 @@ def embed_slides(
323
335
  process_list_path=process_list_path,
324
336
  embeddable_slides=embeddable_slides,
325
337
  output_dir=Path(execution.output_dir),
338
+ embeddable_tiling_results=embeddable_tiling_results,
326
339
  )
327
340
  emit_progress(
328
341
  "embedding.finished",
@@ -574,6 +587,7 @@ def embed_tiles(
574
587
  backend=tiling.resolve_slide_backend(resolved_preprocessing, tiling_result),
575
588
  preprocessing=resolved_preprocessing,
576
589
  ),
590
+ annotation=embedding.tiling_result_annotation(tiling_result),
577
591
  )
578
592
  else:
579
593
  features = embedding_pipeline.compute_tile_embeddings_for_slide(
@@ -597,6 +611,7 @@ def embed_tiles(
597
611
  features,
598
612
  execution=execution,
599
613
  metadata=metadata,
614
+ annotation=embedding.tiling_result_annotation(tiling_result),
600
615
  )
601
616
  artifacts.append(artifact)
602
617
  return artifacts
@@ -854,6 +869,7 @@ def run_pipeline(
854
869
  process_list_path=process_list_path,
855
870
  embeddable_slides=embeddable_slides,
856
871
  output_dir=output_dir,
872
+ embeddable_tiling_results=embeddable_tiling_results,
857
873
  )
858
874
  emit_progress(
859
875
  "embedding.finished",
@@ -974,6 +990,7 @@ def run_pipeline_with_coordinates(
974
990
  process_list_path=process_list_path,
975
991
  embeddable_slides=embeddable_slides,
976
992
  output_dir=output_dir,
993
+ embeddable_tiling_results=embeddable_tiling_results,
977
994
  )
978
995
  return RunResult(
979
996
  tile_artifacts=tile_artifacts,