PyPI - slide2vec - Versions diffs - 4.7.0__tar.gz → 4.8.0__tar.gz - Mend

slide2vec 4.7.0.tar.gz → 4.8.0.tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (101) hide show

{slide2vec-4.7.0 → slide2vec-4.8.0}/PKG-INFO RENAMED Viewed

@@ -1,6 +1,6 @@
 Metadata-Version: 2.4
 Name: slide2vec
-Version: 4.7.0
+Version: 4.8.0
 Summary: Embedding of whole slide images with Foundation Models
 Author-email: Clément Grisi <clement.grisi@radboudumc.nl>
 License-Expression: Apache-2.0
@@ -15,7 +15,7 @@ Classifier: Programming Language :: Python :: 3.13
 Requires-Python: >=3.10
 Description-Content-Type: text/markdown
 License-File: LICENSE
-Requires-Dist: hs2p[asap,cucim,openslide,sam2,vips]>=4.0.8
+Requires-Dist: hs2p[asap,cucim,openslide,sam2,vips]>=4.1.1
 Requires-Dist: omegaconf
 Requires-Dist: matplotlib
 Requires-Dist: numpy<2
@@ -65,7 +65,7 @@ Requires-Dist: numpy<2; extra == "fm"
 Requires-Dist: pandas; extra == "fm"
 Requires-Dist: pillow; extra == "fm"
 Requires-Dist: rich; extra == "fm"
-Requires-Dist: hs2p[asap,cucim,openslide,sam2,vips]>=4.0.8; extra == "fm"
+Requires-Dist: hs2p[asap,cucim,openslide,sam2,vips]>=4.1.1; extra == "fm"
 Requires-Dist: wandb; extra == "fm"
 Requires-Dist: torch<2.8,>=2.3; extra == "fm"
 Requires-Dist: torchvision>=0.18.0; extra == "fm"

{slide2vec-4.7.0 → slide2vec-4.8.0}/pyproject.toml RENAMED Viewed

@@ -4,7 +4,7 @@ build-backend = "setuptools.build_meta"
 [project]
 name = "slide2vec"
-version = "4.7.0"
+version = "4.8.0"
 description = "Embedding of whole slide images with Foundation Models"
 readme = "README.md"
 requires-python = ">=3.10"
@@ -21,7 +21,7 @@ classifiers = [
     "Programming Language :: Python :: 3.13",
 ]
 dependencies = [
-    "hs2p[asap,cucim,openslide,sam2,vips]>=4.0.8",
+    "hs2p[asap,cucim,openslide,sam2,vips]>=4.1.1",
     "omegaconf",
     "matplotlib",
     "numpy<2",
@@ -88,7 +88,7 @@ fm = [
     "pandas",
     "pillow",
     "rich",
-    "hs2p[asap,cucim,openslide,sam2,vips]>=4.0.8",
+    "hs2p[asap,cucim,openslide,sam2,vips]>=4.1.1",
     "wandb",
     "torch>=2.3,<2.8",
     "torchvision>=0.18.0",
@@ -164,7 +164,7 @@ no_implicit_reexport = true
 max-line-length = 160
 [tool.bumpver]
-current_version = "4.7.0"
+current_version = "4.8.0"
 version_pattern = "MAJOR.MINOR.PATCH"
 commit = false       # We do version bumping in CI, not as a commit
 tag = false          # Git tag already exists — we don't auto-tag

{slide2vec-4.7.0 → slide2vec-4.8.0}/slide2vec/__init__.py RENAMED Viewed

@@ -11,7 +11,7 @@ from slide2vec.api import (
 from slide2vec.artifacts import HierarchicalEmbeddingArtifact, SlideEmbeddingArtifact, TileEmbeddingArtifact
-__version__ = "4.7.0"
+__version__ = "4.8.0"
 __all__ = [
     "Model",

{slide2vec-4.7.0 → slide2vec-4.8.0}/slide2vec/api.py RENAMED Viewed

@@ -1,4 +1,5 @@
+import copy
 import logging
 import os
 from dataclasses import dataclass, field, replace
@@ -40,6 +41,55 @@ SlideSequence = Sequence[SlideInput]
 TilingResultsInput = Sequence[Any] | Mapping[str, Any]
+#: Default annotation-mask vocabulary — plain binary tissue tiling. Mirrors hs2p's
+#: shipped default ``{background: 0, tissue: 1}``; leaving it untouched keeps a run
+#: behaving exactly as a tissue-only run. ``min_coverage.tissue`` is the single source
+#: of truth for the tissue threshold (the standalone ``tissue_threshold`` knob is gone).
+#: A :class:`PreprocessingConfig` ``masks`` value is deep-merged over this default, so
+#: callers only state what they override (e.g. ``{"min_coverage": {"tissue": 0.1}}``).
+DEFAULT_MASKS: dict[str, Any] = {
+    "output_mode": "per_annotation",
+    "pixel_mapping": {"background": 0, "tissue": 1},
+    "colors": {"background": None, "tissue": [157, 219, 129]},
+    "min_coverage": {"background": None, "tissue": 0.01},
+}
+def _deep_merge_masks(base: Mapping[str, Any], override: Mapping[str, Any]) -> dict[str, Any]:
+    """Deep-merge *override* onto a copy of *base* (nested dicts merge key-by-key)."""
+    merged = copy.deepcopy(dict(base))
+    for key, value in override.items():
+        existing = merged.get(key)
+        if isinstance(value, Mapping) and isinstance(existing, dict):
+            merged[key] = _deep_merge_masks(existing, value)
+        else:
+            merged[key] = copy.deepcopy(value)
+    return merged
+def resolve_masks(masks: Mapping[str, Any] | None) -> dict[str, Any]:
+    """Complete a (possibly partial) ``masks`` mapping by merging it over :data:`DEFAULT_MASKS`."""
+    if not masks:
+        return copy.deepcopy(DEFAULT_MASKS)
+    return _deep_merge_masks(DEFAULT_MASKS, masks)
+def _masks_to_plain_dict(node: Any) -> dict[str, Any]:
+    """Normalize a masks config node (OmegaConf, mapping, or namespace) to a plain dict."""
+    if node is None:
+        return {}
+    try:
+        from omegaconf import OmegaConf
+        if OmegaConf.is_config(node):
+            return copy.deepcopy(OmegaConf.to_container(node, resolve=True))  # type: ignore[return-value]
+    except ImportError:
+        pass
+    if isinstance(node, Mapping):
+        return copy.deepcopy(dict(node))
+    return copy.deepcopy(dict(vars(node)))
 @dataclass(frozen=True, kw_only=True)
 class PreprocessingConfig:
     """Configuration for slide tiling and preprocessing."""
@@ -62,8 +112,6 @@ class PreprocessingConfig:
     tolerance: float = 0.05
     #: Fractional tile overlap (``0.0`` = no overlap).
     overlap: float = 0.0
-    #: Minimum tissue fraction required to keep a tile (default ``0.01``).
-    tissue_threshold: float = 0.01
     #: Directory containing pre-extracted tile coordinates to reuse, skipping tiling.
     read_coordinates_from: Path | None = None
     #: Directory containing pre-extracted tile images to skip the tiling step entirely.
@@ -90,6 +138,20 @@ class PreprocessingConfig:
     #: Controls whether hs2p writes mask and tiling preview images.
     #: Keys: ``save_mask_preview``, ``save_tiling_preview``, ``downsample``.
     preview: dict[str, Any] = field(default_factory=dict)
+    #: Annotation-mask vocabulary forwarded to hs2p's sampling resolver. Keys:
+    #: ``output_mode``, ``pixel_mapping``, ``colors``, ``min_coverage``. A partial
+    #: mapping is deep-merged over :data:`DEFAULT_MASKS`, so callers only state what
+    #: they override (e.g. ``{"min_coverage": {"tissue": 0.1}}``). The default
+    #: ``{background, tissue}`` block is plain tissue tiling; ``min_coverage.tissue``
+    #: is the single source of truth for the tissue threshold.
+    masks: dict[str, Any] = field(default_factory=dict)
+    #: When annotation sampling is active, tile each class independently (``True``)
+    #: vs jointly across classes (``False``).
+    independent_sampling: bool = True
+    def __post_init__(self) -> None:
+        # Complete a (possibly partial) masks mapping against the shipped default.
+        object.__setattr__(self, "masks", resolve_masks(self.masks))
     @classmethod
     def from_config(cls, cfg: Any) -> "PreprocessingConfig":
@@ -121,7 +183,8 @@ class PreprocessingConfig:
             region_tile_multiple=int(region_tile_multiple) if region_tile_multiple is not None else None,
             tolerance=float(tiling.params.tolerance),
             overlap=float(tiling.params.overlap),
-            tissue_threshold=float(tiling.params.tissue_threshold),
+            masks=_masks_to_plain_dict(getattr(tiling, "masks", None)),
+            independent_sampling=bool(getattr(tiling, "independent_sampling", True)),
             read_coordinates_from=Path(read_coordinates_from) if read_coordinates_from else None,
             read_tiles_from=(
                 Path(read_tiles_from) if read_tiles_from else None

{slide2vec-4.7.0 → slide2vec-4.8.0}/slide2vec/artifacts.py RENAMED Viewed

@@ -5,6 +5,7 @@ from typing import Any
 import numpy as np
 import torch
+from hs2p.fileops import is_flattened_annotation
 @dataclass(frozen=True, kw_only=True)
@@ -29,6 +30,7 @@ class SlideEmbeddingArtifact:
     format: str
     feature_dim: int
     latent_path: Path | None = None
+    annotation: str | None = None
     @property
     def metadata(self) -> dict[str, Any]:
@@ -58,6 +60,7 @@ class HierarchicalEmbeddingArtifact:
     feature_dim: int
     num_regions: int
     tiles_per_region: int
+    annotation: str | None = None
     @property
     def metadata(self) -> dict[str, Any]:
@@ -90,6 +93,53 @@ def _write_metadata(path: Path, metadata: dict[str, Any]) -> None:
     path.write_text(json.dumps(metadata, indent=2, sort_keys=True), encoding="utf-8")
+def tile_embeddings_subdir(annotation: str | None) -> str:
+    """Namespace the ``tile_embeddings`` output dir per annotation class.
+    Reuses hs2p's flatten rule (the single source of truth): ``None`` and the sentinel
+    ``"tissue"`` collapse to the flat ``tile_embeddings`` root, so the default tissue-only
+    path is byte-for-byte unchanged; any real class label gets its own
+    ``tile_embeddings/<class>`` subdirectory.
+    """
+    if is_flattened_annotation(annotation):
+        return "tile_embeddings"
+    return f"tile_embeddings/{annotation}"
+def slide_embeddings_subdir(annotation: str | None) -> str:
+    """Namespace the ``slide_embeddings`` output dir per annotation class.
+    Reuses hs2p's flatten rule (the single source of truth, shared with
+    :func:`tile_embeddings_subdir`): ``None`` and the sentinel ``"tissue"`` collapse to the
+    flat ``slide_embeddings`` root, so the default tissue-only path is byte-for-byte
+    unchanged; any real class label gets its own ``slide_embeddings/<class>`` subdirectory.
+    """
+    if is_flattened_annotation(annotation):
+        return "slide_embeddings"
+    return f"slide_embeddings/{annotation}"
+def slide_latents_subdir(annotation: str | None) -> str:
+    """Namespace the ``slide_latents`` output dir per annotation class (mirrors slide embeddings)."""
+    if is_flattened_annotation(annotation):
+        return "slide_latents"
+    return f"slide_latents/{annotation}"
+def hierarchical_embeddings_subdir(annotation: str | None) -> str:
+    """Namespace the ``hierarchical_embeddings`` output dir per annotation class.
+    Reuses hs2p's flatten rule (the single source of truth, shared with
+    :func:`tile_embeddings_subdir` and :func:`slide_embeddings_subdir`): ``None`` and the
+    sentinel ``"tissue"`` collapse to the flat ``hierarchical_embeddings`` root, so the
+    default tissue-only path is byte-for-byte unchanged; any real class label gets its own
+    ``hierarchical_embeddings/<class>`` subdirectory.
+    """
+    if is_flattened_annotation(annotation):
+        return "hierarchical_embeddings"
+    return f"hierarchical_embeddings/{annotation}"
 def _setup_artifact_paths(
     output_dir: str | Path, subdir: str, sample_id: str, output_format: str
 ) -> tuple[Path, Path]:
@@ -142,9 +192,12 @@ def write_tile_embeddings(
     output_format: str = "pt",
     metadata: dict[str, Any] | None = None,
     tile_index: Any | None = None,
+    annotation: str | None = None,
 ) -> TileEmbeddingArtifact:
     output_format = _validate_output_format(output_format)
-    artifact_path, metadata_path = _setup_artifact_paths(output_dir, "tile_embeddings", sample_id, output_format)
+    artifact_path, metadata_path = _setup_artifact_paths(
+        output_dir, tile_embeddings_subdir(annotation), sample_id, output_format
+    )
     feature_array = _ensure_array(features)
     if output_format == "pt":
         torch.save(_ensure_tensor(features), artifact_path)
@@ -180,9 +233,12 @@ def write_tile_embedding_metadata(
     feature_dim: int | None = None,
     num_tiles: int = 0,
     metadata: dict[str, Any] | None = None,
+    annotation: str | None = None,
 ) -> Path:
     output_format = _validate_output_format(output_format)
-    _, metadata_path = _setup_artifact_paths(output_dir, "tile_embeddings", sample_id, output_format)
+    _, metadata_path = _setup_artifact_paths(
+        output_dir, tile_embeddings_subdir(annotation), sample_id, output_format
+    )
     tile_metadata = _build_tile_embedding_metadata(
         sample_id,
         output_format=output_format,
@@ -202,9 +258,12 @@ def write_slide_embeddings(
     output_format: str = "pt",
     metadata: dict[str, Any] | None = None,
     latents: Any | None = None,
+    annotation: str | None = None,
 ) -> SlideEmbeddingArtifact:
     output_format = _validate_output_format(output_format)
-    artifact_path, metadata_path = _setup_artifact_paths(output_dir, "slide_embeddings", sample_id, output_format)
+    artifact_path, metadata_path = _setup_artifact_paths(
+        output_dir, slide_embeddings_subdir(annotation), sample_id, output_format
+    )
     embedding_array = _ensure_array(embedding)
     latent_path = None
     if output_format == "pt":
@@ -212,7 +271,9 @@ def write_slide_embeddings(
     else:
         np.savez_compressed(artifact_path, features=embedding_array)
     if latents is not None:
-        latent_path, _ = _setup_artifact_paths(output_dir, "slide_latents", sample_id, output_format)
+        latent_path, _ = _setup_artifact_paths(
+            output_dir, slide_latents_subdir(annotation), sample_id, output_format
+        )
         if output_format == "pt":
             torch.save(_ensure_tensor(latents), latent_path)
         else:
@@ -234,6 +295,7 @@ def write_slide_embeddings(
         format=output_format,
         feature_dim=slide_metadata["feature_dim"],
         latent_path=latent_path,
+        annotation=annotation,
     )
@@ -283,9 +345,12 @@ def write_hierarchical_embeddings(
     output_dir: str | Path,
     output_format: str = "pt",
     metadata: dict[str, Any] | None = None,
+    annotation: str | None = None,
 ) -> HierarchicalEmbeddingArtifact:
     output_format = _validate_output_format(output_format)
-    artifact_path, metadata_path = _setup_artifact_paths(output_dir, "hierarchical_embeddings", sample_id, output_format)
+    artifact_path, metadata_path = _setup_artifact_paths(
+        output_dir, hierarchical_embeddings_subdir(annotation), sample_id, output_format
+    )
     feature_array = _ensure_array(features)
     if feature_array.ndim != 3:
         raise ValueError(
@@ -315,4 +380,5 @@ def write_hierarchical_embeddings(
         feature_dim=int(hierarchical_metadata["feature_dim"]),
         num_regions=int(hierarchical_metadata["num_regions"]),
         tiles_per_region=int(hierarchical_metadata["tiles_per_region"]),
+        annotation=annotation,
     )

{slide2vec-4.7.0 → slide2vec-4.8.0}/slide2vec/configs/default.yaml RENAMED Viewed

@@ -26,6 +26,24 @@ tiling:
   read_coordinates_from: # path to an existing directory containing pre-extracted `.coordinates.npz` / `.coordinates.meta.json` artifacts to reuse instead of starting tiling from scratch
   read_tiles_from: # path to an existing directory containing pre-extracted `.tiles.tar` tile stores to reuse instead of starting tiling from scratch
   backend: "auto" # backend to use for slide reading; "auto" lets hs2p resolve the best backend per slide, preferring cuCIM when available
+  independent_sampling: true # selection strategy when annotation sampling is active. true: sample each class independently against its own binary mask (independent selection); false: sample once over the union of active classes, then post-filter per class by coverage (joint selection). Ignored when the masks vocabulary is left at the tissue-only default.
+  masks:
+    # Annotation-mask vocabulary forwarded to hs2p's sampling resolver. The shipped default
+    # ({background:0, tissue:1}) is plain binary tissue tiling — leave it untouched and the run
+    # behaves exactly as a tissue-only run. Customising the vocabulary (e.g. adding a `tumor`
+    # class with its own pixel value + min_coverage) opts the run into annotation-aware sampling,
+    # where `mask_path` is read as a multi-label raster. The `tissue` min_coverage entry below is
+    # the single source of truth for the tissue threshold.
+    output_mode: per_annotation # how sampled tiles are grouped into artifacts. per_annotation: one flat artifact set per sampled class, namespaced under a `<class>/` subdir (the `tissue` class collapses to the flat root). merged: a single flat artifact set per slide over the union of tiles passing any active class threshold — it carries no class label, so it lands at the flat output root (no `<class>/` subdir).
+    pixel_mapping: # {class_name: integer pixel value in the mask raster}
+      background: 0
+      tissue: 1
+    colors: # {class_name: [r, g, b] | null} used when rendering previews
+      background:
+      tissue: [157, 219, 129]
+    min_coverage: # {class_name: float | null}; minimum fraction of a tile that must be covered to keep it; null = don't sample that class
+      background:
+      tissue: 0.1
   params:
     requested_spacing_um: # spacing at which to tile the slide, in microns per pixel; filled from a preset model when available
     tolerance: 0.05 # tolerance for matching the spacing (float between 0 and 1, deciding how much the spacing can deviate from the one specified in the slide metadata)
@@ -33,7 +51,6 @@ tiling:
     requested_region_size_px: # size of hierarchical parent regions in pixels; when unset and region_tile_multiple is set, derived from requested_tile_size_px * region_tile_multiple
     region_tile_multiple: # hierarchical region grid width/height in tiles; e.g. 6 means 6x6 tiles per region
     overlap: 0.0 # percentage of overlap between two consecutive tiles (float between 0 and 1)
-    tissue_threshold: 0.1 # minimum fraction of pixels that must be tissue to keep a tile (float between 0 and 1)
   seg_params:
     # downsample controls which pyramid level is read for tissue segmentation.
     # Larger values are faster and use less memory; smaller values can improve mask precision.

{slide2vec-4.7.0 → slide2vec-4.8.0}/slide2vec/distributed/direct_embed_worker.py RENAMED Viewed

@@ -49,12 +49,21 @@ def main(argv=None) -> int:
         )
         preprocessing = deserialize_preprocessing(request["preprocessing"])
         execution = deserialize_execution(request["execution"])
+        from slide2vec.runtime.distributed import (
+            decode_work_unit,
+            encode_work_unit,
+            work_unit_shard_stem,
+        )
+        from slide2vec.runtime.embedding import tiling_result_annotation
         load_successful_tiled_slides_fn = getattr(inference, "load_successful_tiled_slides", None)
         if not callable(load_successful_tiled_slides_fn):
             from slide2vec.runtime.manifest import load_successful_tiled_slides as load_successful_tiled_slides_fn
         slide_records, tiling_results = load_successful_tiled_slides_fn(output_dir)
-        paired_by_sample = {
-            slide.sample_id: (slide, tiling_result)
+        # Key by the composite (sample_id, annotation) work unit so a multi-class slide's sibling
+        # classes never overwrite each other; flat units collapse to the bare sample_id key.
+        paired_by_unit = {
+            encode_work_unit(slide.sample_id, tiling_result_annotation(tiling_result)): (slide, tiling_result)
             for slide, tiling_result in zip(slide_records, tiling_results)
         }
         progress_events_path = request.get("progress_events_path")
@@ -71,8 +80,9 @@ def main(argv=None) -> int:
         with context:
             if request["strategy"] == "tile_shard":
-                sample_id = request["sample_id"]
-                slide, tiling_result = paired_by_sample[sample_id]
+                work_unit = request["work_unit"]
+                shard_stem = work_unit_shard_stem(*decode_work_unit(work_unit))
+                slide, tiling_result = paired_by_unit[work_unit]
                 loaded = model._load_backend()
                 if is_hierarchical_preprocessing(preprocessing):
                     geometry = resolve_hierarchical_geometry(preprocessing, tiling_result)
@@ -103,7 +113,7 @@ def main(argv=None) -> int:
                         "flat_index": torch.as_tensor(shard_indices, dtype=torch.long),
                         "tile_embeddings": tile_embeddings.detach().cpu() if torch.is_tensor(tile_embeddings) else torch.as_tensor(tile_embeddings),
                     }
-                    torch.save(payload, coordination_dir / f"{sample_id}.hier.rank{global_rank}.pt")
+                    torch.save(payload, coordination_dir / f"{shard_stem}.hier.rank{global_rank}.pt")
                 else:
                     num_tiles = len(tiling_result.x)
                     tile_indices = np.array_split(np.arange(num_tiles, dtype=np.int64), world_size)[global_rank]
@@ -129,14 +139,14 @@ def main(argv=None) -> int:
                         "tile_index": torch.as_tensor(tile_indices, dtype=torch.long),
                         "tile_embeddings": tile_embeddings.detach().cpu() if torch.is_tensor(tile_embeddings) else torch.as_tensor(tile_embeddings),
                     }
-                    torch.save(payload, coordination_dir / f"{sample_id}.tiles.rank{global_rank}.pt")
+                    torch.save(payload, coordination_dir / f"{shard_stem}.tiles.rank{global_rank}.pt")
                 return 0
             assigned_ids = list(request.get("assignments", {}).get(str(global_rank), []))
             if not assigned_ids:
                 return 0
-            assigned_slides = [paired_by_sample[sample_id][0] for sample_id in assigned_ids]
-            assigned_tiling_results = [paired_by_sample[sample_id][1] for sample_id in assigned_ids]
+            assigned_slides = [paired_by_unit[unit_key][0] for unit_key in assigned_ids]
+            assigned_tiling_results = [paired_by_unit[unit_key][1] for unit_key in assigned_ids]
             def _persist_embedded_slide(slide, tiling_result, embedded_slide) -> None:
                 payload = {
@@ -144,7 +154,12 @@ def main(argv=None) -> int:
                     "slide_embedding": _to_cpu_payload(embedded_slide.slide_embedding),
                     "latents": _to_cpu_payload(embedded_slide.latents),
                 }
-                torch.save(payload, coordination_dir / f"{embedded_slide.sample_id}.embedded.pt")
+                # Stem by (sample_id, annotation) so two classes of one slide never overwrite each
+                # other; flat units keep the bare-sample_id filename for backward compatibility.
+                stem = work_unit_shard_stem(
+                    embedded_slide.sample_id, tiling_result_annotation(tiling_result)
+                )
+                torch.save(payload, coordination_dir / f"{stem}.embedded.pt")
             compute_embedded_slides_fn = getattr(inference, "_compute_embedded_slides", None)
             if not callable(compute_embedded_slides_fn):

{slide2vec-4.7.0 → slide2vec-4.8.0}/slide2vec/distributed/pipeline_worker.py RENAMED Viewed

@@ -3,7 +3,8 @@ from contextlib import nullcontext
 import json
 from pathlib import Path
-from slide2vec.runtime.distributed import assign_slides_to_ranks
+from slide2vec.runtime.distributed import assign_slides_to_ranks, encode_work_unit
+from slide2vec.runtime.embedding import tiling_result_annotation
 def get_args_parser(add_help: bool = True) -> argparse.ArgumentParser:
@@ -48,26 +49,29 @@ def main(argv=None) -> int:
         if not callable(load_successful_tiled_slides_fn):
             from slide2vec.runtime.manifest import load_successful_tiled_slides as load_successful_tiled_slides_fn
         slide_records, tiling_results = load_successful_tiled_slides_fn(tiling_input_dir)
-        requested_sample_ids = request.get("sample_ids")
-        if requested_sample_ids is not None:
-            requested_sample_id_set = {str(sample_id) for sample_id in requested_sample_ids}
-            paired = [
-                (slide, tiling_result)
-                for slide, tiling_result in zip(slide_records, tiling_results)
-                if slide.sample_id in requested_sample_id_set
-            ]
-            slide_records = [slide for slide, _ in paired]
-            tiling_results = [tiling_result for _, tiling_result in paired]
+        # Each (sample_id, annotation) row is an independent work unit; key by the composite so a
+        # multi-class slide's sibling classes never overwrite each other. Flat units (None / tissue /
+        # merged) encode to the bare sample_id, byte-identical to pre-#168 single-class runs.
+        paired_by_unit = {
+            encode_work_unit(slide.sample_id, tiling_result_annotation(tiling_result)): (slide, tiling_result)
+            for slide, tiling_result in zip(slide_records, tiling_results)
+        }
+        requested_work_units = request.get("work_units")
+        if requested_work_units is not None:
+            requested_unit_set = {str(unit) for unit in requested_work_units}
+            paired_by_unit = {
+                unit_key: pair
+                for unit_key, pair in paired_by_unit.items()
+                if unit_key in requested_unit_set
+            }
+        slide_records = [slide for slide, _ in paired_by_unit.values()]
+        tiling_results = [tiling_result for _, tiling_result in paired_by_unit.values()]
         assignments = assign_slides_to_ranks(slide_records, tiling_results, num_gpus=world_size)
         assigned_ids = assignments.get(global_rank, [])
         if not assigned_ids:
             return 0
-        paired_by_sample = {
-            slide.sample_id: (slide, tiling_result)
-            for slide, tiling_result in zip(slide_records, tiling_results)
-        }
-        assigned_slides = [paired_by_sample[sample_id][0] for sample_id in assigned_ids]
-        assigned_tiling_results = [paired_by_sample[sample_id][1] for sample_id in assigned_ids]
+        assigned_slides = [paired_by_unit[unit_key][0] for unit_key in assigned_ids]
+        assigned_tiling_results = [paired_by_unit[unit_key][1] for unit_key in assigned_ids]
         progress_events_path = request.get("progress_events_path")
         reporter = (
             JsonlProgressReporter(

{slide2vec-4.7.0 → slide2vec-4.8.0}/slide2vec/inference.py RENAMED Viewed

@@ -156,6 +156,7 @@ def _reconcile_embedding_process_list(
     process_list_path,
     embeddable_slides,
     output_dir,
+    embeddable_tiling_results=None,
 ):
     """Reconcile the process_list with the embeddings on disk once, at end of run.
@@ -170,6 +171,16 @@ def _reconcile_embedding_process_list(
     persist_hierarchical_embeddings = hierarchical.is_hierarchical_preprocessing(preprocessing)
     include_slide_embeddings = model.level == "slide"
     include_tile_embeddings = persist_tile_embeddings and not persist_hierarchical_embeddings
+    annotations = None
+    if (include_slide_embeddings or persist_hierarchical_embeddings) and embeddable_tiling_results is not None:
+        # Re-read each class's namespaced slide- or hierarchical-embedding artifact so the
+        # final reconcile records the per-class feature path instead of collapsing every
+        # annotation row onto the flat path. The default tissue-only path leaves annotations
+        # None.
+        annotations = [
+            embedding.tiling_result_annotation(tiling_result)
+            for tiling_result in embeddable_tiling_results
+        ]
     tile_artifacts, hierarchical_artifacts, slide_artifacts = artifacts_collect.collect_pipeline_artifacts(
         embeddable_slides,
         output_dir=output_dir,
@@ -177,6 +188,7 @@ def _reconcile_embedding_process_list(
         include_tile_embeddings=include_tile_embeddings,
         include_hierarchical_embeddings=persist_hierarchical_embeddings,
         include_slide_embeddings=include_slide_embeddings,
+        annotations=annotations,
     )
     if process_list_path is not None and Path(process_list_path).is_file():
         persistence.update_process_list_after_embedding(
@@ -323,6 +335,7 @@ def embed_slides(
                     process_list_path=process_list_path,
                     embeddable_slides=embeddable_slides,
                     output_dir=Path(execution.output_dir),
+                    embeddable_tiling_results=embeddable_tiling_results,
                 )
             emit_progress(
                 "embedding.finished",
@@ -574,6 +587,7 @@ def embed_tiles(
                     backend=tiling.resolve_slide_backend(resolved_preprocessing, tiling_result),
                     preprocessing=resolved_preprocessing,
                 ),
+                annotation=embedding.tiling_result_annotation(tiling_result),
             )
         else:
             features = embedding_pipeline.compute_tile_embeddings_for_slide(
@@ -597,6 +611,7 @@ def embed_tiles(
                 features,
                 execution=execution,
                 metadata=metadata,
+                annotation=embedding.tiling_result_annotation(tiling_result),
             )
         artifacts.append(artifact)
     return artifacts
@@ -854,6 +869,7 @@ def run_pipeline(
             process_list_path=process_list_path,
             embeddable_slides=embeddable_slides,
             output_dir=output_dir,
+            embeddable_tiling_results=embeddable_tiling_results,
         )
         emit_progress(
             "embedding.finished",
@@ -974,6 +990,7 @@ def run_pipeline_with_coordinates(
             process_list_path=process_list_path,
             embeddable_slides=embeddable_slides,
             output_dir=output_dir,
+            embeddable_tiling_results=embeddable_tiling_results,
         )
         return RunResult(
             tile_artifacts=tile_artifacts,

slide2vec 4.7.0__tar.gz → 4.8.0__tar.gz

slide2vec 4.7.0.tar.gz → 4.8.0.tar.gz