PyPI - slide2vec - Versions diffs - 4.7.0__tar.gz → 5.0.0__tar.gz - Mend

slide2vec 4.7.0__tar.gz → 5.0.0__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (101) hide show

{slide2vec-4.7.0 → slide2vec-5.0.0}/PKG-INFO RENAMED Viewed

@@ -1,6 +1,6 @@
 Metadata-Version: 2.4
 Name: slide2vec
-Version: 4.7.0
+Version: 5.0.0
 Summary: Embedding of whole slide images with Foundation Models
 Author-email: Clément Grisi <clement.grisi@radboudumc.nl>
 License-Expression: Apache-2.0
@@ -15,7 +15,7 @@ Classifier: Programming Language :: Python :: 3.13
 Requires-Python: >=3.10
 Description-Content-Type: text/markdown
 License-File: LICENSE
-Requires-Dist: hs2p[asap,cucim,openslide,sam2,vips]>=4.0.8
+Requires-Dist: hs2p[asap,cucim,openslide,sam2,vips]>=4.2.0
 Requires-Dist: omegaconf
 Requires-Dist: matplotlib
 Requires-Dist: numpy<2
@@ -65,7 +65,7 @@ Requires-Dist: numpy<2; extra == "fm"
 Requires-Dist: pandas; extra == "fm"
 Requires-Dist: pillow; extra == "fm"
 Requires-Dist: rich; extra == "fm"
-Requires-Dist: hs2p[asap,cucim,openslide,sam2,vips]>=4.0.8; extra == "fm"
+Requires-Dist: hs2p[asap,cucim,openslide,sam2,vips]>=4.2.0; extra == "fm"
 Requires-Dist: wandb; extra == "fm"
 Requires-Dist: torch<2.8,>=2.3; extra == "fm"
 Requires-Dist: torchvision>=0.18.0; extra == "fm"
@@ -169,7 +169,7 @@ pipeline = Pipeline(
     preprocessing=PreprocessingConfig(
         requested_spacing_um=0.5,
         requested_tile_size_px=224,
-        tissue_threshold=0.1,
+        masks={"min_coverage": {"tissue": 0.1}},
     ),
     execution=ExecutionOptions(output_dir="outputs/demo"),
 )

{slide2vec-4.7.0 → slide2vec-5.0.0}/README.md RENAMED Viewed

@@ -63,7 +63,7 @@ pipeline = Pipeline(
     preprocessing=PreprocessingConfig(
         requested_spacing_um=0.5,
         requested_tile_size_px=224,
-        tissue_threshold=0.1,
+        masks={"min_coverage": {"tissue": 0.1}},
     ),
     execution=ExecutionOptions(output_dir="outputs/demo"),
 )

{slide2vec-4.7.0 → slide2vec-5.0.0}/pyproject.toml RENAMED Viewed

@@ -4,7 +4,7 @@ build-backend = "setuptools.build_meta"
 [project]
 name = "slide2vec"
-version = "4.7.0"
+version = "5.0.0"
 description = "Embedding of whole slide images with Foundation Models"
 readme = "README.md"
 requires-python = ">=3.10"
@@ -21,7 +21,7 @@ classifiers = [
     "Programming Language :: Python :: 3.13",
 ]
 dependencies = [
-    "hs2p[asap,cucim,openslide,sam2,vips]>=4.0.8",
+    "hs2p[asap,cucim,openslide,sam2,vips]>=4.2.0",
     "omegaconf",
     "matplotlib",
     "numpy<2",
@@ -88,7 +88,7 @@ fm = [
     "pandas",
     "pillow",
     "rich",
-    "hs2p[asap,cucim,openslide,sam2,vips]>=4.0.8",
+    "hs2p[asap,cucim,openslide,sam2,vips]>=4.2.0",
     "wandb",
     "torch>=2.3,<2.8",
     "torchvision>=0.18.0",
@@ -164,7 +164,7 @@ no_implicit_reexport = true
 max-line-length = 160
 [tool.bumpver]
-current_version = "4.7.0"
+current_version = "5.0.0"
 version_pattern = "MAJOR.MINOR.PATCH"
 commit = false       # We do version bumping in CI, not as a commit
 tag = false          # Git tag already exists — we don't auto-tag

{slide2vec-4.7.0 → slide2vec-5.0.0}/slide2vec/__init__.py RENAMED Viewed

@@ -11,7 +11,7 @@ from slide2vec.api import (
 from slide2vec.artifacts import HierarchicalEmbeddingArtifact, SlideEmbeddingArtifact, TileEmbeddingArtifact
-__version__ = "4.7.0"
+__version__ = "5.0.0"
 __all__ = [
     "Model",

{slide2vec-4.7.0 → slide2vec-5.0.0}/slide2vec/api.py RENAMED Viewed

@@ -1,4 +1,5 @@
+import copy
 import logging
 import os
 from dataclasses import dataclass, field, replace
@@ -40,6 +41,55 @@ SlideSequence = Sequence[SlideInput]
 TilingResultsInput = Sequence[Any] | Mapping[str, Any]
+#: Default annotation-mask vocabulary — plain binary tissue tiling. Mirrors hs2p's
+#: shipped default ``{background: 0, tissue: 1}``; leaving it untouched keeps a run
+#: behaving exactly as a tissue-only run. ``min_coverage.tissue`` is the single source
+#: of truth for the tissue threshold (the standalone ``tissue_threshold`` knob is gone).
+#: A :class:`PreprocessingConfig` ``masks`` value is deep-merged over this default, so
+#: callers only state what they override (e.g. ``{"min_coverage": {"tissue": 0.1}}``).
+DEFAULT_MASKS: dict[str, Any] = {
+    "output_mode": "per_annotation",
+    "pixel_mapping": {"background": 0, "tissue": 1},
+    "colors": {"background": None, "tissue": [157, 219, 129]},
+    "min_coverage": {"background": None, "tissue": 0.01},
+}
+def _deep_merge_masks(base: Mapping[str, Any], override: Mapping[str, Any]) -> dict[str, Any]:
+    """Deep-merge *override* onto a copy of *base* (nested dicts merge key-by-key)."""
+    merged = copy.deepcopy(dict(base))
+    for key, value in override.items():
+        existing = merged.get(key)
+        if isinstance(value, Mapping) and isinstance(existing, dict):
+            merged[key] = _deep_merge_masks(existing, value)
+        else:
+            merged[key] = copy.deepcopy(value)
+    return merged
+def resolve_masks(masks: Mapping[str, Any] | None) -> dict[str, Any]:
+    """Complete a (possibly partial) ``masks`` mapping by merging it over :data:`DEFAULT_MASKS`."""
+    if not masks:
+        return copy.deepcopy(DEFAULT_MASKS)
+    return _deep_merge_masks(DEFAULT_MASKS, masks)
+def _masks_to_plain_dict(node: Any) -> dict[str, Any]:
+    """Normalize a masks config node (OmegaConf, mapping, or namespace) to a plain dict."""
+    if node is None:
+        return {}
+    try:
+        from omegaconf import OmegaConf
+        if OmegaConf.is_config(node):
+            return copy.deepcopy(OmegaConf.to_container(node, resolve=True))  # type: ignore[return-value]
+    except ImportError:
+        pass
+    if isinstance(node, Mapping):
+        return copy.deepcopy(dict(node))
+    return copy.deepcopy(dict(vars(node)))
 @dataclass(frozen=True, kw_only=True)
 class PreprocessingConfig:
     """Configuration for slide tiling and preprocessing."""
@@ -62,8 +112,6 @@ class PreprocessingConfig:
     tolerance: float = 0.05
     #: Fractional tile overlap (``0.0`` = no overlap).
     overlap: float = 0.0
-    #: Minimum tissue fraction required to keep a tile (default ``0.01``).
-    tissue_threshold: float = 0.01
     #: Directory containing pre-extracted tile coordinates to reuse, skipping tiling.
     read_coordinates_from: Path | None = None
     #: Directory containing pre-extracted tile images to skip the tiling step entirely.
@@ -90,6 +138,20 @@ class PreprocessingConfig:
     #: Controls whether hs2p writes mask and tiling preview images.
     #: Keys: ``save_mask_preview``, ``save_tiling_preview``, ``downsample``.
     preview: dict[str, Any] = field(default_factory=dict)
+    #: Annotation-mask vocabulary forwarded to hs2p's sampling resolver. Keys:
+    #: ``output_mode``, ``pixel_mapping``, ``colors``, ``min_coverage``. A partial
+    #: mapping is deep-merged over :data:`DEFAULT_MASKS`, so callers only state what
+    #: they override (e.g. ``{"min_coverage": {"tissue": 0.1}}``). The default
+    #: ``{background, tissue}`` block is plain tissue tiling; ``min_coverage.tissue``
+    #: is the single source of truth for the tissue threshold.
+    masks: dict[str, Any] = field(default_factory=dict)
+    #: When annotation sampling is active, tile each class independently (``True``)
+    #: vs jointly across classes (``False``).
+    independent_sampling: bool = True
+    def __post_init__(self) -> None:
+        # Complete a (possibly partial) masks mapping against the shipped default.
+        object.__setattr__(self, "masks", resolve_masks(self.masks))
     @classmethod
     def from_config(cls, cfg: Any) -> "PreprocessingConfig":
@@ -121,7 +183,8 @@ class PreprocessingConfig:
             region_tile_multiple=int(region_tile_multiple) if region_tile_multiple is not None else None,
             tolerance=float(tiling.params.tolerance),
             overlap=float(tiling.params.overlap),
-            tissue_threshold=float(tiling.params.tissue_threshold),
+            masks=_masks_to_plain_dict(getattr(tiling, "masks", None)),
+            independent_sampling=bool(getattr(tiling, "independent_sampling", True)),
             read_coordinates_from=Path(read_coordinates_from) if read_coordinates_from else None,
             read_tiles_from=(
                 Path(read_tiles_from) if read_tiles_from else None
@@ -288,6 +351,11 @@ class EmbeddedSlide:
     image_path: Path
     #: Path to the tissue mask used for tiling, if any.
     mask_path: Path | None = None
+    #: Annotation class this bag of tiles was sampled for. ``"tissue"`` for the
+    #: default tissue-only path, ``"merged"`` for the union output mode, or the
+    #: class name (e.g. ``"tumor"``) when annotation-aware sampling fans a slide
+    #: out into one bag per class. See the annotation-aware sampling documentation.
+    annotation: str | None = None
     #: Number of tiles extracted from the slide.
     num_tiles: int | None = None
     #: Path to the mask preview image, if generated.
@@ -379,12 +447,13 @@ class Model:
         self,
         slide: SlideInput,
         *,
+        annotation: str | list[str] | None = None,
         preprocessing: PreprocessingConfig | None = None,
         execution: ExecutionOptions | None = None,
         sample_id: str | None = None,
         mask_path: PathLike | None = None,
         spacing_at_level_0: float | None = None,
-    ) -> EmbeddedSlide:
+    ) -> EmbeddedSlide | list[EmbeddedSlide]:
         if isinstance(slide, (str, Path)):
             slide = {
                 "sample_id": sample_id or Path(slide).stem,
@@ -396,31 +465,42 @@ class Model:
             raise ValueError(
                 "sample_id, mask_path, and spacing_at_level_0 overrides are only supported when slide is a path-like input"
             )
-        return self.embed_slides(
+        requested = None if isinstance(annotation, str) else annotation
+        grouped = self.embed_slides(
             [slide],
+            annotations=requested,
             preprocessing=preprocessing,
             execution=execution,
-        )[0]
+        )
+        # Single slide in → at most one outer key out. Flatten to the inner
+        # {label: EmbeddedSlide} mapping (empty when the run produced nothing).
+        bags: dict[str, EmbeddedSlide] = {}
+        for inner in grouped.values():
+            bags = inner
+            break
+        return _select_embedded_bag(bags, annotation)
     def embed_slides(
         self,
         slides: SlideSequence,
         *,
+        annotations: list[str] | None = None,
         preprocessing: PreprocessingConfig | None = None,
         execution: ExecutionOptions | None = None,
-    ) -> list[EmbeddedSlide]:
+    ) -> dict[str, dict[str, EmbeddedSlide]]:
         from slide2vec.inference import embed_slides
         resolved = _coerce_execution_options(execution, model=self)
         resolved_preprocessing = _resolve_direct_api_preprocessing(self, preprocessing)
         with _auto_progress_reporting(output_dir=resolved.output_dir):
             _validate_model_config(self, resolved_preprocessing, resolved)
-            return embed_slides(
+            embedded = embed_slides(
                 self,
                 slides,
                 preprocessing=resolved_preprocessing,
                 execution=resolved,
             )
+        return _group_embedded_slides(embedded, annotations=annotations)
     def embed_patient(
         self,
@@ -587,6 +667,77 @@ class Pipeline:
             )
+def _select_embedded_bag(
+    bags: Mapping[str, EmbeddedSlide],
+    annotation: str | list[str] | None,
+) -> EmbeddedSlide | list[EmbeddedSlide]:
+    """Select per-class bag(s) from a single slide's ``{label: EmbeddedSlide}`` map.
+    numpy-style shape-in/shape-out:
+    - a single class string returns one :class:`EmbeddedSlide`;
+    - a list of class strings returns a list in the requested order;
+    - ``None`` returns the single bag when the run produced exactly one,
+      otherwise raises naming the available bags and directing to
+      :meth:`Model.embed_slides`.
+    Requesting a class the run did not produce raises naming what is available.
+    """
+    available = sorted(bags)
+    if isinstance(annotation, str):
+        if annotation not in bags:
+            raise ValueError(
+                f"embed_slide() found no '{annotation}' annotation bag for this "
+                f"slide; available bags: {available}."
+            )
+        return bags[annotation]
+    if annotation is not None:
+        selected: list[EmbeddedSlide] = []
+        for label in annotation:
+            if label not in bags:
+                raise ValueError(
+                    f"embed_slide() found no '{label}' annotation bag for this "
+                    f"slide; available bags: {available}."
+                )
+            selected.append(bags[label])
+        return selected
+    if len(bags) == 1:
+        return next(iter(bags.values()))
+    raise ValueError(
+        f"embed_slide() received {len(bags)} annotation bags for this slide "
+        f"({available}); annotation-aware sampling produces one bag per class. "
+        "Pass annotation=... to select a class, or use Model.embed_slides(...) "
+        "to receive every per-class EmbeddedSlide (each carries its .annotation)."
+    )
+def _group_embedded_slides(
+    embedded: Sequence[EmbeddedSlide],
+    *,
+    annotations: list[str] | None = None,
+) -> dict[str, dict[str, EmbeddedSlide]]:
+    """Group flat per-row :class:`EmbeddedSlide` results into a nested mapping.
+    The outer key is ``sample_id``; the inner key is the bag's informative
+    annotation label (``"tissue"``/``"merged"``/class name), never ``None``.
+    A bag whose ``.annotation`` is ``None`` (defensive — post-#173 real runs
+    always carry a label) does not produce a ``None`` key.
+    When *annotations* is given, the inner keys are restricted to the named
+    classes (in encounter order).
+    """
+    requested = None if annotations is None else set(annotations)
+    grouped: dict[str, dict[str, EmbeddedSlide]] = {}
+    for bag in embedded:
+        label = bag.annotation
+        if label is None:
+            continue
+        if requested is not None and label not in requested:
+            continue
+        grouped.setdefault(bag.sample_id, {})[label] = bag
+    return grouped
 def _coerce_execution_options(
     options: ExecutionOptions | None,
     *,

{slide2vec-4.7.0 → slide2vec-5.0.0}/slide2vec/artifacts.py RENAMED Viewed

@@ -5,6 +5,7 @@ from typing import Any
 import numpy as np
 import torch
+from hs2p.fileops import is_flattened_annotation
 @dataclass(frozen=True, kw_only=True)
@@ -29,6 +30,7 @@ class SlideEmbeddingArtifact:
     format: str
     feature_dim: int
     latent_path: Path | None = None
+    annotation: str | None = None
     @property
     def metadata(self) -> dict[str, Any]:
@@ -58,6 +60,7 @@ class HierarchicalEmbeddingArtifact:
     feature_dim: int
     num_regions: int
     tiles_per_region: int
+    annotation: str | None = None
     @property
     def metadata(self) -> dict[str, Any]:
@@ -90,6 +93,53 @@ def _write_metadata(path: Path, metadata: dict[str, Any]) -> None:
     path.write_text(json.dumps(metadata, indent=2, sort_keys=True), encoding="utf-8")
+def tile_embeddings_subdir(annotation: str | None) -> str:
+    """Namespace the ``tile_embeddings`` output dir per annotation class.
+    Reuses hs2p's flatten rule (the single source of truth): ``None`` and the sentinel
+    ``"tissue"`` collapse to the flat ``tile_embeddings`` root, so the default tissue-only
+    path is byte-for-byte unchanged; any real class label gets its own
+    ``tile_embeddings/<class>`` subdirectory.
+    """
+    if is_flattened_annotation(annotation):
+        return "tile_embeddings"
+    return f"tile_embeddings/{annotation}"
+def slide_embeddings_subdir(annotation: str | None) -> str:
+    """Namespace the ``slide_embeddings`` output dir per annotation class.
+    Reuses hs2p's flatten rule (the single source of truth, shared with
+    :func:`tile_embeddings_subdir`): ``None`` and the sentinel ``"tissue"`` collapse to the
+    flat ``slide_embeddings`` root, so the default tissue-only path is byte-for-byte
+    unchanged; any real class label gets its own ``slide_embeddings/<class>`` subdirectory.
+    """
+    if is_flattened_annotation(annotation):
+        return "slide_embeddings"
+    return f"slide_embeddings/{annotation}"
+def slide_latents_subdir(annotation: str | None) -> str:
+    """Namespace the ``slide_latents`` output dir per annotation class (mirrors slide embeddings)."""
+    if is_flattened_annotation(annotation):
+        return "slide_latents"
+    return f"slide_latents/{annotation}"
+def hierarchical_embeddings_subdir(annotation: str | None) -> str:
+    """Namespace the ``hierarchical_embeddings`` output dir per annotation class.
+    Reuses hs2p's flatten rule (the single source of truth, shared with
+    :func:`tile_embeddings_subdir` and :func:`slide_embeddings_subdir`): ``None`` and the
+    sentinel ``"tissue"`` collapse to the flat ``hierarchical_embeddings`` root, so the
+    default tissue-only path is byte-for-byte unchanged; any real class label gets its own
+    ``hierarchical_embeddings/<class>`` subdirectory.
+    """
+    if is_flattened_annotation(annotation):
+        return "hierarchical_embeddings"
+    return f"hierarchical_embeddings/{annotation}"
 def _setup_artifact_paths(
     output_dir: str | Path, subdir: str, sample_id: str, output_format: str
 ) -> tuple[Path, Path]:
@@ -142,9 +192,12 @@ def write_tile_embeddings(
     output_format: str = "pt",
     metadata: dict[str, Any] | None = None,
     tile_index: Any | None = None,
+    annotation: str | None = None,
 ) -> TileEmbeddingArtifact:
     output_format = _validate_output_format(output_format)
-    artifact_path, metadata_path = _setup_artifact_paths(output_dir, "tile_embeddings", sample_id, output_format)
+    artifact_path, metadata_path = _setup_artifact_paths(
+        output_dir, tile_embeddings_subdir(annotation), sample_id, output_format
+    )
     feature_array = _ensure_array(features)
     if output_format == "pt":
         torch.save(_ensure_tensor(features), artifact_path)
@@ -180,9 +233,12 @@ def write_tile_embedding_metadata(
     feature_dim: int | None = None,
     num_tiles: int = 0,
     metadata: dict[str, Any] | None = None,
+    annotation: str | None = None,
 ) -> Path:
     output_format = _validate_output_format(output_format)
-    _, metadata_path = _setup_artifact_paths(output_dir, "tile_embeddings", sample_id, output_format)
+    _, metadata_path = _setup_artifact_paths(
+        output_dir, tile_embeddings_subdir(annotation), sample_id, output_format
+    )
     tile_metadata = _build_tile_embedding_metadata(
         sample_id,
         output_format=output_format,
@@ -202,9 +258,12 @@ def write_slide_embeddings(
     output_format: str = "pt",
     metadata: dict[str, Any] | None = None,
     latents: Any | None = None,
+    annotation: str | None = None,
 ) -> SlideEmbeddingArtifact:
     output_format = _validate_output_format(output_format)
-    artifact_path, metadata_path = _setup_artifact_paths(output_dir, "slide_embeddings", sample_id, output_format)
+    artifact_path, metadata_path = _setup_artifact_paths(
+        output_dir, slide_embeddings_subdir(annotation), sample_id, output_format
+    )
     embedding_array = _ensure_array(embedding)
     latent_path = None
     if output_format == "pt":
@@ -212,7 +271,9 @@ def write_slide_embeddings(
     else:
         np.savez_compressed(artifact_path, features=embedding_array)
     if latents is not None:
-        latent_path, _ = _setup_artifact_paths(output_dir, "slide_latents", sample_id, output_format)
+        latent_path, _ = _setup_artifact_paths(
+            output_dir, slide_latents_subdir(annotation), sample_id, output_format
+        )
         if output_format == "pt":
             torch.save(_ensure_tensor(latents), latent_path)
         else:
@@ -234,6 +295,7 @@ def write_slide_embeddings(
         format=output_format,
         feature_dim=slide_metadata["feature_dim"],
         latent_path=latent_path,
+        annotation=annotation,
     )
@@ -283,9 +345,12 @@ def write_hierarchical_embeddings(
     output_dir: str | Path,
     output_format: str = "pt",
     metadata: dict[str, Any] | None = None,
+    annotation: str | None = None,
 ) -> HierarchicalEmbeddingArtifact:
     output_format = _validate_output_format(output_format)
-    artifact_path, metadata_path = _setup_artifact_paths(output_dir, "hierarchical_embeddings", sample_id, output_format)
+    artifact_path, metadata_path = _setup_artifact_paths(
+        output_dir, hierarchical_embeddings_subdir(annotation), sample_id, output_format
+    )
     feature_array = _ensure_array(features)
     if feature_array.ndim != 3:
         raise ValueError(
@@ -315,4 +380,5 @@ def write_hierarchical_embeddings(
         feature_dim=int(hierarchical_metadata["feature_dim"]),
         num_regions=int(hierarchical_metadata["num_regions"]),
         tiles_per_region=int(hierarchical_metadata["tiles_per_region"]),
+        annotation=annotation,
     )

{slide2vec-4.7.0 → slide2vec-5.0.0}/slide2vec/configs/default.yaml RENAMED Viewed

@@ -26,6 +26,24 @@ tiling:
   read_coordinates_from: # path to an existing directory containing pre-extracted `.coordinates.npz` / `.coordinates.meta.json` artifacts to reuse instead of starting tiling from scratch
   read_tiles_from: # path to an existing directory containing pre-extracted `.tiles.tar` tile stores to reuse instead of starting tiling from scratch
   backend: "auto" # backend to use for slide reading; "auto" lets hs2p resolve the best backend per slide, preferring cuCIM when available
+  independent_sampling: true # selection strategy when annotation sampling is active. true: sample each class independently against its own binary mask (independent selection); false: sample once over the union of active classes, then post-filter per class by coverage (joint selection). Ignored when the masks vocabulary is left at the tissue-only default.
+  masks:
+    # Annotation-mask vocabulary forwarded to hs2p's sampling resolver. The shipped default
+    # ({background:0, tissue:1}) is plain binary tissue tiling — leave it untouched and the run
+    # behaves exactly as a tissue-only run. Customising the vocabulary (e.g. adding a `tumor`
+    # class with its own pixel value + min_coverage) opts the run into annotation-aware sampling,
+    # where `mask_path` is read as a multi-label raster. The `tissue` min_coverage entry below is
+    # the single source of truth for the tissue threshold.
+    output_mode: per_annotation # how sampled tiles are grouped into artifacts. per_annotation: one flat artifact set per sampled class, namespaced under a `<class>/` subdir (the `tissue` class collapses to the flat root). merged: a single flat artifact set per slide over the union of tiles passing any active class threshold — it carries no class label, so it lands at the flat output root (no `<class>/` subdir).
+    pixel_mapping: # {class_name: integer pixel value in the mask raster}
+      background: 0
+      tissue: 1
+    colors: # {class_name: [r, g, b] | null} used when rendering previews
+      background:
+      tissue: [157, 219, 129]
+    min_coverage: # {class_name: float | null}; minimum fraction of a tile that must be covered to keep it; null = don't sample that class
+      background:
+      tissue: 0.1
   params:
     requested_spacing_um: # spacing at which to tile the slide, in microns per pixel; filled from a preset model when available
     tolerance: 0.05 # tolerance for matching the spacing (float between 0 and 1, deciding how much the spacing can deviate from the one specified in the slide metadata)
@@ -33,7 +51,6 @@ tiling:
     requested_region_size_px: # size of hierarchical parent regions in pixels; when unset and region_tile_multiple is set, derived from requested_tile_size_px * region_tile_multiple
     region_tile_multiple: # hierarchical region grid width/height in tiles; e.g. 6 means 6x6 tiles per region
     overlap: 0.0 # percentage of overlap between two consecutive tiles (float between 0 and 1)
-    tissue_threshold: 0.1 # minimum fraction of pixels that must be tissue to keep a tile (float between 0 and 1)
   seg_params:
     # downsample controls which pyramid level is read for tissue segmentation.
     # Larger values are faster and use less memory; smaller values can improve mask precision.

{slide2vec-4.7.0 → slide2vec-5.0.0}/slide2vec/distributed/direct_embed_worker.py RENAMED Viewed

@@ -49,12 +49,21 @@ def main(argv=None) -> int:
         )
         preprocessing = deserialize_preprocessing(request["preprocessing"])
         execution = deserialize_execution(request["execution"])
+        from slide2vec.runtime.distributed import (
+            decode_work_unit,
+            encode_work_unit,
+            work_unit_shard_stem,
+        )
+        from slide2vec.runtime.embedding import tiling_result_annotation
         load_successful_tiled_slides_fn = getattr(inference, "load_successful_tiled_slides", None)
         if not callable(load_successful_tiled_slides_fn):
             from slide2vec.runtime.manifest import load_successful_tiled_slides as load_successful_tiled_slides_fn
         slide_records, tiling_results = load_successful_tiled_slides_fn(output_dir)
-        paired_by_sample = {
-            slide.sample_id: (slide, tiling_result)
+        # Key by the composite (sample_id, annotation) work unit so a multi-class slide's sibling
+        # classes never overwrite each other; flat units collapse to the bare sample_id key.
+        paired_by_unit = {
+            encode_work_unit(slide.sample_id, tiling_result_annotation(tiling_result)): (slide, tiling_result)
             for slide, tiling_result in zip(slide_records, tiling_results)
         }
         progress_events_path = request.get("progress_events_path")
@@ -71,8 +80,9 @@ def main(argv=None) -> int:
         with context:
             if request["strategy"] == "tile_shard":
-                sample_id = request["sample_id"]
-                slide, tiling_result = paired_by_sample[sample_id]
+                work_unit = request["work_unit"]
+                shard_stem = work_unit_shard_stem(*decode_work_unit(work_unit))
+                slide, tiling_result = paired_by_unit[work_unit]
                 loaded = model._load_backend()
                 if is_hierarchical_preprocessing(preprocessing):
                     geometry = resolve_hierarchical_geometry(preprocessing, tiling_result)
@@ -103,7 +113,7 @@ def main(argv=None) -> int:
                         "flat_index": torch.as_tensor(shard_indices, dtype=torch.long),
                         "tile_embeddings": tile_embeddings.detach().cpu() if torch.is_tensor(tile_embeddings) else torch.as_tensor(tile_embeddings),
                     }
-                    torch.save(payload, coordination_dir / f"{sample_id}.hier.rank{global_rank}.pt")
+                    torch.save(payload, coordination_dir / f"{shard_stem}.hier.rank{global_rank}.pt")
                 else:
                     num_tiles = len(tiling_result.x)
                     tile_indices = np.array_split(np.arange(num_tiles, dtype=np.int64), world_size)[global_rank]
@@ -129,14 +139,14 @@ def main(argv=None) -> int:
                         "tile_index": torch.as_tensor(tile_indices, dtype=torch.long),
                         "tile_embeddings": tile_embeddings.detach().cpu() if torch.is_tensor(tile_embeddings) else torch.as_tensor(tile_embeddings),
                     }
-                    torch.save(payload, coordination_dir / f"{sample_id}.tiles.rank{global_rank}.pt")
+                    torch.save(payload, coordination_dir / f"{shard_stem}.tiles.rank{global_rank}.pt")
                 return 0
             assigned_ids = list(request.get("assignments", {}).get(str(global_rank), []))
             if not assigned_ids:
                 return 0
-            assigned_slides = [paired_by_sample[sample_id][0] for sample_id in assigned_ids]
-            assigned_tiling_results = [paired_by_sample[sample_id][1] for sample_id in assigned_ids]
+            assigned_slides = [paired_by_unit[unit_key][0] for unit_key in assigned_ids]
+            assigned_tiling_results = [paired_by_unit[unit_key][1] for unit_key in assigned_ids]
             def _persist_embedded_slide(slide, tiling_result, embedded_slide) -> None:
                 payload = {
@@ -144,7 +154,12 @@ def main(argv=None) -> int:
                     "slide_embedding": _to_cpu_payload(embedded_slide.slide_embedding),
                     "latents": _to_cpu_payload(embedded_slide.latents),
                 }
-                torch.save(payload, coordination_dir / f"{embedded_slide.sample_id}.embedded.pt")
+                # Stem by (sample_id, annotation) so two classes of one slide never overwrite each
+                # other; flat units keep the bare-sample_id filename for backward compatibility.
+                stem = work_unit_shard_stem(
+                    embedded_slide.sample_id, tiling_result_annotation(tiling_result)
+                )
+                torch.save(payload, coordination_dir / f"{stem}.embedded.pt")
             compute_embedded_slides_fn = getattr(inference, "_compute_embedded_slides", None)
             if not callable(compute_embedded_slides_fn):

{slide2vec-4.7.0 → slide2vec-5.0.0}/slide2vec/distributed/pipeline_worker.py RENAMED Viewed

@@ -3,7 +3,8 @@ from contextlib import nullcontext
 import json
 from pathlib import Path
-from slide2vec.runtime.distributed import assign_slides_to_ranks
+from slide2vec.runtime.distributed import assign_slides_to_ranks, encode_work_unit
+from slide2vec.runtime.embedding import tiling_result_annotation
 def get_args_parser(add_help: bool = True) -> argparse.ArgumentParser:
@@ -48,26 +49,29 @@ def main(argv=None) -> int:
         if not callable(load_successful_tiled_slides_fn):
             from slide2vec.runtime.manifest import load_successful_tiled_slides as load_successful_tiled_slides_fn
         slide_records, tiling_results = load_successful_tiled_slides_fn(tiling_input_dir)
-        requested_sample_ids = request.get("sample_ids")
-        if requested_sample_ids is not None:
-            requested_sample_id_set = {str(sample_id) for sample_id in requested_sample_ids}
-            paired = [
-                (slide, tiling_result)
-                for slide, tiling_result in zip(slide_records, tiling_results)
-                if slide.sample_id in requested_sample_id_set
-            ]
-            slide_records = [slide for slide, _ in paired]
-            tiling_results = [tiling_result for _, tiling_result in paired]
+        # Each (sample_id, annotation) row is an independent work unit; key by the composite so a
+        # multi-class slide's sibling classes never overwrite each other. Flat units (None / tissue /
+        # merged) encode to the bare sample_id, byte-identical to pre-#168 single-class runs.
+        paired_by_unit = {
+            encode_work_unit(slide.sample_id, tiling_result_annotation(tiling_result)): (slide, tiling_result)
+            for slide, tiling_result in zip(slide_records, tiling_results)
+        }
+        requested_work_units = request.get("work_units")
+        if requested_work_units is not None:
+            requested_unit_set = {str(unit) for unit in requested_work_units}
+            paired_by_unit = {
+                unit_key: pair
+                for unit_key, pair in paired_by_unit.items()
+                if unit_key in requested_unit_set
+            }
+        slide_records = [slide for slide, _ in paired_by_unit.values()]
+        tiling_results = [tiling_result for _, tiling_result in paired_by_unit.values()]
         assignments = assign_slides_to_ranks(slide_records, tiling_results, num_gpus=world_size)
         assigned_ids = assignments.get(global_rank, [])
         if not assigned_ids:
             return 0
-        paired_by_sample = {
-            slide.sample_id: (slide, tiling_result)
-            for slide, tiling_result in zip(slide_records, tiling_results)
-        }
-        assigned_slides = [paired_by_sample[sample_id][0] for sample_id in assigned_ids]
-        assigned_tiling_results = [paired_by_sample[sample_id][1] for sample_id in assigned_ids]
+        assigned_slides = [paired_by_unit[unit_key][0] for unit_key in assigned_ids]
+        assigned_tiling_results = [paired_by_unit[unit_key][1] for unit_key in assigned_ids]
         progress_events_path = request.get("progress_events_path")
         reporter = (
             JsonlProgressReporter(

slide2vec 4.7.0__tar.gz → 5.0.0__tar.gz

slide2vec 4.7.0__tar.gz → 5.0.0__tar.gz