slide2vec 4.0.2__tar.gz → 4.0.4__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (66)
  1. {slide2vec-4.0.2 → slide2vec-4.0.4}/PKG-INFO +1 -1
  2. {slide2vec-4.0.2 → slide2vec-4.0.4}/pyproject.toml +2 -2
  3. {slide2vec-4.0.2 → slide2vec-4.0.4}/slide2vec/__init__.py +1 -1
  4. {slide2vec-4.0.2 → slide2vec-4.0.4}/slide2vec/api.py +14 -7
  5. {slide2vec-4.0.2 → slide2vec-4.0.4}/slide2vec/configs/default.yaml +2 -2
  6. {slide2vec-4.0.2 → slide2vec-4.0.4}/slide2vec/inference.py +68 -28
  7. {slide2vec-4.0.2 → slide2vec-4.0.4}/slide2vec/utils/tiling_io.py +9 -0
  8. {slide2vec-4.0.2 → slide2vec-4.0.4}/slide2vec/utils/utils.py +2 -1
  9. {slide2vec-4.0.2 → slide2vec-4.0.4}/slide2vec.egg-info/PKG-INFO +1 -1
  10. {slide2vec-4.0.2 → slide2vec-4.0.4}/tests/test_hs2p_package_cutover.py +6 -0
  11. {slide2vec-4.0.2 → slide2vec-4.0.4}/tests/test_output_consistency.py +1 -2
  12. {slide2vec-4.0.2 → slide2vec-4.0.4}/tests/test_regression_core.py +54 -0
  13. {slide2vec-4.0.2 → slide2vec-4.0.4}/tests/test_regression_inference.py +162 -5
  14. {slide2vec-4.0.2 → slide2vec-4.0.4}/LICENSE +0 -0
  15. {slide2vec-4.0.2 → slide2vec-4.0.4}/README.md +0 -0
  16. {slide2vec-4.0.2 → slide2vec-4.0.4}/setup.cfg +0 -0
  17. {slide2vec-4.0.2 → slide2vec-4.0.4}/slide2vec/__main__.py +0 -0
  18. {slide2vec-4.0.2 → slide2vec-4.0.4}/slide2vec/artifacts.py +0 -0
  19. {slide2vec-4.0.2 → slide2vec-4.0.4}/slide2vec/cli.py +0 -0
  20. {slide2vec-4.0.2 → slide2vec-4.0.4}/slide2vec/configs/__init__.py +0 -0
  21. {slide2vec-4.0.2 → slide2vec-4.0.4}/slide2vec/data/__init__.py +0 -0
  22. {slide2vec-4.0.2 → slide2vec-4.0.4}/slide2vec/data/dataset.py +0 -0
  23. {slide2vec-4.0.2 → slide2vec-4.0.4}/slide2vec/data/tile_reader.py +0 -0
  24. {slide2vec-4.0.2 → slide2vec-4.0.4}/slide2vec/data/tile_store.py +0 -0
  25. {slide2vec-4.0.2 → slide2vec-4.0.4}/slide2vec/distributed/__init__.py +0 -0
  26. {slide2vec-4.0.2 → slide2vec-4.0.4}/slide2vec/distributed/direct_embed_worker.py +0 -0
  27. {slide2vec-4.0.2 → slide2vec-4.0.4}/slide2vec/distributed/pipeline_worker.py +0 -0
  28. {slide2vec-4.0.2 → slide2vec-4.0.4}/slide2vec/encoders/__init__.py +0 -0
  29. {slide2vec-4.0.2 → slide2vec-4.0.4}/slide2vec/encoders/base.py +0 -0
  30. {slide2vec-4.0.2 → slide2vec-4.0.4}/slide2vec/encoders/models/__init__.py +0 -0
  31. {slide2vec-4.0.2 → slide2vec-4.0.4}/slide2vec/encoders/models/conch.py +0 -0
  32. {slide2vec-4.0.2 → slide2vec-4.0.4}/slide2vec/encoders/models/gigapath.py +0 -0
  33. {slide2vec-4.0.2 → slide2vec-4.0.4}/slide2vec/encoders/models/hibou.py +0 -0
  34. {slide2vec-4.0.2 → slide2vec-4.0.4}/slide2vec/encoders/models/hoptimus.py +0 -0
  35. {slide2vec-4.0.2 → slide2vec-4.0.4}/slide2vec/encoders/models/midnight.py +0 -0
  36. {slide2vec-4.0.2 → slide2vec-4.0.4}/slide2vec/encoders/models/musk.py +0 -0
  37. {slide2vec-4.0.2 → slide2vec-4.0.4}/slide2vec/encoders/models/phikon.py +0 -0
  38. {slide2vec-4.0.2 → slide2vec-4.0.4}/slide2vec/encoders/models/prism.py +0 -0
  39. {slide2vec-4.0.2 → slide2vec-4.0.4}/slide2vec/encoders/models/prost40m.py +0 -0
  40. {slide2vec-4.0.2 → slide2vec-4.0.4}/slide2vec/encoders/models/titan.py +0 -0
  41. {slide2vec-4.0.2 → slide2vec-4.0.4}/slide2vec/encoders/models/uni.py +0 -0
  42. {slide2vec-4.0.2 → slide2vec-4.0.4}/slide2vec/encoders/models/virchow.py +0 -0
  43. {slide2vec-4.0.2 → slide2vec-4.0.4}/slide2vec/encoders/registry.py +0 -0
  44. {slide2vec-4.0.2 → slide2vec-4.0.4}/slide2vec/encoders/validation.py +0 -0
  45. {slide2vec-4.0.2 → slide2vec-4.0.4}/slide2vec/main.py +0 -0
  46. {slide2vec-4.0.2 → slide2vec-4.0.4}/slide2vec/model_settings.py +0 -0
  47. {slide2vec-4.0.2 → slide2vec-4.0.4}/slide2vec/progress.py +0 -0
  48. {slide2vec-4.0.2 → slide2vec-4.0.4}/slide2vec/registry.py +0 -0
  49. {slide2vec-4.0.2 → slide2vec-4.0.4}/slide2vec/resources.py +0 -0
  50. {slide2vec-4.0.2 → slide2vec-4.0.4}/slide2vec/runtime_types.py +0 -0
  51. {slide2vec-4.0.2 → slide2vec-4.0.4}/slide2vec/utils/__init__.py +0 -0
  52. {slide2vec-4.0.2 → slide2vec-4.0.4}/slide2vec/utils/config.py +0 -0
  53. {slide2vec-4.0.2 → slide2vec-4.0.4}/slide2vec/utils/coordinates.py +0 -0
  54. {slide2vec-4.0.2 → slide2vec-4.0.4}/slide2vec/utils/log_utils.py +0 -0
  55. {slide2vec-4.0.2 → slide2vec-4.0.4}/slide2vec.egg-info/SOURCES.txt +0 -0
  56. {slide2vec-4.0.2 → slide2vec-4.0.4}/slide2vec.egg-info/dependency_links.txt +0 -0
  57. {slide2vec-4.0.2 → slide2vec-4.0.4}/slide2vec.egg-info/entry_points.txt +0 -0
  58. {slide2vec-4.0.2 → slide2vec-4.0.4}/slide2vec.egg-info/not-zip-safe +0 -0
  59. {slide2vec-4.0.2 → slide2vec-4.0.4}/slide2vec.egg-info/requires.txt +0 -0
  60. {slide2vec-4.0.2 → slide2vec-4.0.4}/slide2vec.egg-info/top_level.txt +0 -0
  61. {slide2vec-4.0.2 → slide2vec-4.0.4}/tests/test_batch_collator_timing.py +0 -0
  62. {slide2vec-4.0.2 → slide2vec-4.0.4}/tests/test_encoder_registry.py +0 -0
  63. {slide2vec-4.0.2 → slide2vec-4.0.4}/tests/test_packaging_metadata.py +0 -0
  64. {slide2vec-4.0.2 → slide2vec-4.0.4}/tests/test_progress.py +0 -0
  65. {slide2vec-4.0.2 → slide2vec-4.0.4}/tests/test_regression_models.py +0 -0
  66. {slide2vec-4.0.2 → slide2vec-4.0.4}/tests/test_tile_store.py +0 -0
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.4
2
2
  Name: slide2vec
3
- Version: 4.0.2
3
+ Version: 4.0.4
4
4
  Summary: Embedding of whole slide images with Foundation Models
5
5
  Author-email: Clément Grisi <clement.grisi@radboudumc.nl>
6
6
  License-Expression: Apache-2.0
@@ -4,7 +4,7 @@ build-backend = "setuptools.build_meta"
4
4
 
5
5
  [project]
6
6
  name = "slide2vec"
7
- version = "4.0.2"
7
+ version = "4.0.4"
8
8
  description = "Embedding of whole slide images with Foundation Models"
9
9
  readme = "README.md"
10
10
  requires-python = ">=3.10"
@@ -154,7 +154,7 @@ no_implicit_reexport = true
154
154
  max-line-length = 160
155
155
 
156
156
  [tool.bumpver]
157
- current_version = "4.0.2"
157
+ current_version = "4.0.4"
158
158
  version_pattern = "MAJOR.MINOR.PATCH"
159
159
  commit = false # We do version bumping in CI, not as a commit
160
160
  tag = false # Git tag already exists — we don't auto-tag
@@ -2,7 +2,7 @@ from slide2vec.api import EmbeddedSlide, ExecutionOptions, Model, Pipeline, Prep
2
2
  from slide2vec.artifacts import HierarchicalEmbeddingArtifact, SlideEmbeddingArtifact, TileEmbeddingArtifact
3
3
 
4
4
 
5
- __version__ = "4.0.2"
5
+ __version__ = "4.0.4"
6
6
 
7
7
  __all__ = [
8
8
  "Model",
@@ -120,7 +120,7 @@ class ExecutionOptions:
120
120
  output_dir: Path | None = None
121
121
  output_format: str = "pt"
122
122
  batch_size: int = 1
123
- num_workers: int = 0
123
+ num_workers: int | None = None
124
124
  num_preprocessing_workers: int | None = None
125
125
  num_gpus: int | None = None
126
126
  precision: str | None = None
@@ -140,7 +140,7 @@ class ExecutionOptions:
140
140
  output_dir=Path(cfg.output_dir),
141
141
  output_format="pt",
142
142
  batch_size=int(cfg.model.batch_size),
143
- num_workers=int(num_workers),
143
+ num_workers=int(num_workers) if num_workers is not None else None,
144
144
  num_preprocessing_workers=(
145
145
  int(cfg.speed.num_preprocessing_workers)
146
146
  if cfg.speed.num_preprocessing_workers is not None
@@ -165,23 +165,30 @@ class ExecutionOptions:
165
165
  cap = cpu_worker_limit()
166
166
  cpu_count = os.cpu_count() or 1
167
167
  slurm_limit = slurm_cpu_limit()
168
- capped_num_workers = min(self.num_workers, cap)
169
168
  capped_num_preprocessing_workers = (
170
169
  cap if self.num_preprocessing_workers is None else min(self.num_preprocessing_workers, cap)
171
170
  )
172
- object.__setattr__(self, "num_workers", capped_num_workers)
173
171
  object.__setattr__(self, "num_preprocessing_workers", capped_num_preprocessing_workers)
174
172
  logger = logging.getLogger(__name__)
175
173
  cap_source = f"slurm_cpu_limit={slurm_limit}" if slurm_limit is not None else f"cpu_count={cpu_count}"
174
+ resolved_num_workers = self.resolved_num_workers()
175
+ num_workers_label = (
176
+ f"{resolved_num_workers} (requested=auto)"
177
+ if self.num_workers is None
178
+ else str(resolved_num_workers)
179
+ )
176
180
  logger.info(
177
- "ExecutionOptions: num_workers=%d, num_preprocessing_workers=%d "
178
- "(cap=%d via %s)",
179
- capped_num_workers,
181
+ "ExecutionOptions: num_workers=%s, num_preprocessing_workers=%d "
182
+ "(preprocessing cap=%d via %s)",
183
+ num_workers_label,
180
184
  capped_num_preprocessing_workers,
181
185
  cap,
182
186
  cap_source,
183
187
  )
184
188
 
189
+ def resolved_num_workers(self) -> int:
190
+ return cpu_worker_limit() if self.num_workers is None else int(self.num_workers)
191
+
185
192
  def with_output_dir(self, output_dir: PathLike | None) -> "ExecutionOptions":
186
193
  if output_dir is None:
187
194
  return self
@@ -66,9 +66,9 @@ tiling:
66
66
 
67
67
  speed:
68
68
  precision: # model inference precision ["fp32", "fp16", "bf16"]; if not set, determined automatically based on model recommendations
69
- num_dataloader_workers: 8 # number of DataLoader worker processes for reading tiles during embedding (tar path); on-the-fly path derives this automatically from cpu_count // speed.num_cucim_workers
69
+ num_dataloader_workers: # number of DataLoader worker processes for reading tiles during embedding; defaults to auto (job CPU budget, except cuCIM on-the-fly uses cpu_budget // speed.num_cucim_workers)
70
70
  num_gpus: # number of GPUs to use for feature extraction; defaults to all available GPUs
71
- num_preprocessing_workers: # number of workers for hs2p tiling (WSI reading, JPEG encoding, tar writing); defaults to the CPU budget at runtime
71
+ num_preprocessing_workers: # number of workers for hs2p tiling (WSI reading, JPEG encoding, tar writing); defaults to the runtime CPU budget capped at 64
72
72
  num_cucim_workers: 4 # number of internal cucim threads per read_region call (embedding path, on-the-fly only); DataLoader workers are auto-set to cpu_count // num_cucim_workers
73
73
  prefetch_factor_embedding: 4 # prefetch factor for tile embedding dataloaders
74
74
  persistent_workers_embedding: true # keep DataLoader workers alive across epochs/batches
@@ -39,7 +39,11 @@ from slide2vec.artifacts import (
39
39
  write_tile_embedding_metadata,
40
40
  write_tile_embeddings,
41
41
  )
42
- from slide2vec.encoders.registry import encoder_registry, resolve_preprocessing_defaults
42
+ from slide2vec.encoders.registry import (
43
+ encoder_registry,
44
+ resolve_encoder_output,
45
+ resolve_preprocessing_defaults,
46
+ )
43
47
  from slide2vec.model_settings import canonicalize_model_name
44
48
  from slide2vec.runtime_types import LoadedModel
45
49
  from slide2vec.progress import (
@@ -386,6 +390,8 @@ def embed_slides(
386
390
  persist_tile_embeddings=persist_tile_embeddings,
387
391
  persist_hierarchical_embeddings=persist_hierarchical_embeddings,
388
392
  include_slide_embeddings=include_slide_embeddings,
393
+ encoder_name=model.name,
394
+ output_variant=_resolved_process_list_output_variant(model),
389
395
  tile_artifacts=tile_artifacts,
390
396
  hierarchical_artifacts=hierarchical_artifacts,
391
397
  slide_artifacts=slide_artifacts,
@@ -717,6 +723,8 @@ def run_pipeline(
717
723
  persist_tile_embeddings=persist_tile_embeddings,
718
724
  persist_hierarchical_embeddings=persist_hierarchical_embeddings,
719
725
  include_slide_embeddings=include_slide_embeddings,
726
+ encoder_name=model.name,
727
+ output_variant=_resolved_process_list_output_variant(model),
720
728
  tile_artifacts=tile_artifacts,
721
729
  hierarchical_artifacts=hierarchical_artifacts,
722
730
  slide_artifacts=slide_artifacts,
@@ -903,6 +911,8 @@ def _build_incremental_persist_callback(
903
911
  persist_tile_embeddings=persist_tile_embeddings,
904
912
  persist_hierarchical_embeddings=persist_hierarchical_embeddings,
905
913
  include_slide_embeddings=include_slide_embeddings,
914
+ encoder_name=model.name,
915
+ output_variant=_resolved_process_list_output_variant(model),
906
916
  tile_artifacts=[tile_artifact] if isinstance(tile_artifact, TileEmbeddingArtifact) else [],
907
917
  hierarchical_artifacts=[tile_artifact] if isinstance(tile_artifact, HierarchicalEmbeddingArtifact) else [],
908
918
  slide_artifacts=[slide_artifact] if slide_artifact is not None else [],
@@ -1054,6 +1064,8 @@ def _collect_distributed_pipeline_artifacts(
1054
1064
  persist_tile_embeddings=persist_tile_embeddings,
1055
1065
  persist_hierarchical_embeddings=persist_hierarchical_embeddings,
1056
1066
  include_slide_embeddings=include_slide_embeddings,
1067
+ encoder_name=model.name,
1068
+ output_variant=_resolved_process_list_output_variant(model),
1057
1069
  tile_artifacts=tile_artifacts,
1058
1070
  hierarchical_artifacts=hierarchical_artifacts,
1059
1071
  slide_artifacts=slide_artifacts,
@@ -1206,7 +1218,7 @@ def _compute_tile_embeddings_for_slide(
1206
1218
  )
1207
1219
  loader_kwargs = _embedding_dataloader_kwargs(loaded, execution)
1208
1220
  resolved_backend = _resolve_slide_backend(preprocessing, tiling_result)
1209
- if preprocessing.on_the_fly and preprocessing.read_tiles_from is None:
1221
+ if preprocessing.on_the_fly and preprocessing.read_tiles_from is None and resolved_backend == "cucim":
1210
1222
  effective_num_workers, worker_context = _resolve_on_the_fly_num_workers(preprocessing.num_cucim_workers)
1211
1223
  if effective_num_workers != execution.num_workers:
1212
1224
  logging.getLogger(__name__).info(
@@ -1289,18 +1301,19 @@ def _compute_hierarchical_embeddings_for_slide(
1289
1301
  target_tile_size_px=int(geometry["target_tile_size_px"]),
1290
1302
  )
1291
1303
  loader_kwargs = _embedding_dataloader_kwargs(loaded, execution)
1292
- effective_num_workers, worker_context = _resolve_on_the_fly_num_workers(preprocessing.num_cucim_workers)
1293
1304
  resolved_backend = _resolve_slide_backend(preprocessing, tiling_result)
1294
- if effective_num_workers != execution.num_workers:
1295
- logging.getLogger(__name__).info(
1296
- f"on-the-fly hierarchical mode: setting DataLoader num_workers={effective_num_workers} "
1297
- f"({worker_context}); "
1298
- f"ignoring speed.num_dataloader_workers={execution.num_workers}"
1299
- )
1300
- loader_kwargs["num_workers"] = effective_num_workers
1301
- if effective_num_workers == 0:
1302
- loader_kwargs.pop("persistent_workers", None)
1303
- loader_kwargs.pop("prefetch_factor", None)
1305
+ if resolved_backend == "cucim":
1306
+ effective_num_workers, worker_context = _resolve_on_the_fly_num_workers(preprocessing.num_cucim_workers)
1307
+ if effective_num_workers != execution.num_workers:
1308
+ logging.getLogger(__name__).info(
1309
+ f"on-the-fly hierarchical mode: setting DataLoader num_workers={effective_num_workers} "
1310
+ f"({worker_context}); "
1311
+ f"ignoring speed.num_dataloader_workers={execution.num_workers}"
1312
+ )
1313
+ loader_kwargs["num_workers"] = effective_num_workers
1314
+ if effective_num_workers == 0:
1315
+ loader_kwargs.pop("persistent_workers", None)
1316
+ loader_kwargs.pop("prefetch_factor", None)
1304
1317
  _configure_cucim_worker_stderr(
1305
1318
  loader_kwargs,
1306
1319
  backend=resolved_backend,
@@ -1376,12 +1389,13 @@ def _compute_hierarchical_embedding_shard_for_slide(
1376
1389
  target_tile_size_px=int(geometry["target_tile_size_px"]),
1377
1390
  )
1378
1391
  loader_kwargs = _embedding_dataloader_kwargs(loaded, execution)
1379
- effective_num_workers, _worker_context = _resolve_on_the_fly_num_workers(preprocessing.num_cucim_workers)
1380
1392
  resolved_backend = _resolve_slide_backend(preprocessing, tiling_result)
1381
- loader_kwargs["num_workers"] = effective_num_workers
1382
- if effective_num_workers == 0:
1383
- loader_kwargs.pop("persistent_workers", None)
1384
- loader_kwargs.pop("prefetch_factor", None)
1393
+ if resolved_backend == "cucim":
1394
+ effective_num_workers, _worker_context = _resolve_on_the_fly_num_workers(preprocessing.num_cucim_workers)
1395
+ loader_kwargs["num_workers"] = effective_num_workers
1396
+ if effective_num_workers == 0:
1397
+ loader_kwargs.pop("persistent_workers", None)
1398
+ loader_kwargs.pop("prefetch_factor", None)
1385
1399
  _configure_cucim_worker_stderr(
1386
1400
  loader_kwargs,
1387
1401
  backend=resolved_backend,
@@ -1686,11 +1700,12 @@ def _write_hierarchical_embedding_artifact(
1686
1700
 
1687
1701
 
1688
1702
  def _embedding_dataloader_kwargs(loaded: LoadedModel, execution: ExecutionOptions) -> dict[str, Any]:
1703
+ resolved_num_workers = execution.resolved_num_workers()
1689
1704
  kwargs: dict[str, Any] = {
1690
- "num_workers": execution.num_workers,
1705
+ "num_workers": resolved_num_workers,
1691
1706
  "pin_memory": _uses_cuda_runtime(loaded.device),
1692
1707
  }
1693
- if execution.num_workers > 0:
1708
+ if resolved_num_workers > 0:
1694
1709
  kwargs["persistent_workers"] = bool(execution.persistent_workers)
1695
1710
  kwargs["prefetch_factor"] = int(execution.prefetch_factor)
1696
1711
  return kwargs
@@ -2245,6 +2260,17 @@ def _should_persist_tile_embeddings(model, execution: ExecutionOptions) -> bool:
2245
2260
  return True
2246
2261
 
2247
2262
 
2263
+ def _resolved_process_list_output_variant(model) -> str | None:
2264
+ requested_output_variant = getattr(model, "_output_variant", None)
2265
+ if not hasattr(model, "name") or model.name not in encoder_registry:
2266
+ return requested_output_variant
2267
+ resolved = resolve_encoder_output(
2268
+ model.name,
2269
+ requested_output_variant=requested_output_variant,
2270
+ )
2271
+ return str(resolved["output_variant"])
2272
+
2273
+
2248
2274
  def _prepare_tiled_slides(
2249
2275
  slide_records: Sequence[SlideSpec],
2250
2276
  preprocessing: PreprocessingConfig,
@@ -3043,7 +3069,7 @@ def deserialize_preprocessing(payload: dict[str, Any]) -> PreprocessingConfig:
3043
3069
  def deserialize_execution(payload: dict[str, Any]) -> ExecutionOptions:
3044
3070
  output_dir = payload["output_dir"] if "output_dir" in payload else None
3045
3071
  batch_size = payload["batch_size"] if "batch_size" in payload else None
3046
- num_workers = payload["num_workers"] if "num_workers" in payload else 0
3072
+ num_workers = payload["num_workers"] if "num_workers" in payload else None
3047
3073
  num_gpus = payload["num_gpus"] if "num_gpus" in payload else 1
3048
3074
  precision = payload["precision"] if "precision" in payload else "fp32"
3049
3075
  prefetch_factor = payload["prefetch_factor"] if "prefetch_factor" in payload else 4
@@ -3058,7 +3084,7 @@ def deserialize_execution(payload: dict[str, Any]) -> ExecutionOptions:
3058
3084
  output_dir=Path(output_dir) if output_dir is not None else None,
3059
3085
  output_format=payload["output_format"] if "output_format" in payload else "pt",
3060
3086
  batch_size=batch_size,
3061
- num_workers=int(num_workers),
3087
+ num_workers=int(num_workers) if num_workers is not None else None,
3062
3088
  num_gpus=int(num_gpus),
3063
3089
  precision=precision,
3064
3090
  prefetch_factor=int(prefetch_factor),
@@ -3163,6 +3189,8 @@ def _update_process_list_after_embedding(
3163
3189
  persist_tile_embeddings: bool,
3164
3190
  persist_hierarchical_embeddings: bool,
3165
3191
  include_slide_embeddings: bool,
3192
+ encoder_name: str,
3193
+ output_variant: str | None,
3166
3194
  tile_artifacts: Sequence[TileEmbeddingArtifact],
3167
3195
  hierarchical_artifacts: Sequence[HierarchicalEmbeddingArtifact],
3168
3196
  slide_artifacts: Sequence[SlideEmbeddingArtifact],
@@ -3177,6 +3205,12 @@ def _update_process_list_after_embedding(
3177
3205
  df["feature_status"] = ["tbp"] * len(df)
3178
3206
  if "feature_path" not in df.columns:
3179
3207
  df["feature_path"] = [None] * len(df)
3208
+ if "encoder_name" not in df.columns:
3209
+ df["encoder_name"] = [None] * len(df)
3210
+ if "output_variant" not in df.columns:
3211
+ df["output_variant"] = [None] * len(df)
3212
+ if "feature_kind" not in df.columns:
3213
+ df["feature_kind"] = [None] * len(df)
3180
3214
  if include_slide_embeddings and "aggregation_status" not in df.columns:
3181
3215
  df["aggregation_status"] = ["tbp"] * len(df)
3182
3216
  tile_success_ids = {artifact.sample_id for artifact in tile_artifacts}
@@ -3184,28 +3218,34 @@ def _update_process_list_after_embedding(
3184
3218
  slide_success_ids = {artifact.sample_id for artifact in slide_artifacts}
3185
3219
  if slide_artifacts:
3186
3220
  feature_path_by_sample_id = {artifact.sample_id: _resolve_path_str(artifact.path) for artifact in slide_artifacts}
3221
+ feature_kind = "slide"
3222
+ feature_success_ids = slide_success_ids
3187
3223
  elif persist_hierarchical_embeddings:
3188
3224
  feature_path_by_sample_id = {
3189
3225
  artifact.sample_id: _resolve_path_str(artifact.path) for artifact in hierarchical_artifacts
3190
3226
  }
3227
+ feature_kind = "hierarchical"
3228
+ feature_success_ids = hierarchical_success_ids
3191
3229
  elif persist_tile_embeddings:
3192
3230
  feature_path_by_sample_id = {
3193
3231
  artifact.sample_id: _resolve_path_str(artifact.path) for artifact in tile_artifacts
3194
3232
  }
3233
+ feature_kind = "tile"
3234
+ feature_success_ids = tile_success_ids
3195
3235
  else:
3196
3236
  feature_path_by_sample_id = {}
3237
+ feature_kind = None
3238
+ feature_success_ids = {slide.sample_id for slide in successful_slides}
3197
3239
  for slide in successful_slides:
3198
3240
  mask = df["sample_id"].astype(str) == slide.sample_id
3199
- if persist_hierarchical_embeddings:
3200
- feature_status = "success" if slide.sample_id in hierarchical_success_ids else "error"
3201
- elif persist_tile_embeddings:
3202
- feature_status = "success" if slide.sample_id in tile_success_ids else "error"
3203
- else:
3204
- feature_status = "success"
3241
+ feature_status = "success" if slide.sample_id in feature_success_ids else "error"
3205
3242
  df.loc[mask, "feature_status"] = feature_status
3206
3243
  mapped_feature_path = feature_path_by_sample_id.get(slide.sample_id)
3207
3244
  if mapped_feature_path is not None:
3208
3245
  df.loc[mask, "feature_path"] = mapped_feature_path
3246
+ df.loc[mask, "encoder_name"] = encoder_name
3247
+ df.loc[mask, "output_variant"] = output_variant
3248
+ df.loc[mask, "feature_kind"] = feature_kind
3209
3249
  if include_slide_embeddings:
3210
3250
  df.loc[mask, "aggregation_status"] = (
3211
3251
  "success" if slide.sample_id in slide_success_ids else "error"
@@ -53,6 +53,9 @@ BASE_EMBEDDING_ORDERED_COLUMNS = (
53
53
  "tiling_preview_path",
54
54
  "feature_status",
55
55
  "feature_path",
56
+ "encoder_name",
57
+ "output_variant",
58
+ "feature_kind",
56
59
  "error",
57
60
  "traceback",
58
61
  )
@@ -157,6 +160,12 @@ def load_embedding_process_df(
157
160
  df["feature_status"] = ["tbp"] * len(df)
158
161
  if "feature_path" not in df.columns:
159
162
  df["feature_path"] = [None] * len(df)
163
+ if "encoder_name" not in df.columns:
164
+ df["encoder_name"] = [None] * len(df)
165
+ if "output_variant" not in df.columns:
166
+ df["output_variant"] = [None] * len(df)
167
+ if "feature_kind" not in df.columns:
168
+ df["feature_kind"] = [None] * len(df)
160
169
  if include_aggregation_status and "aggregation_status" not in df.columns:
161
170
  df["aggregation_status"] = ["tbp"] * len(df)
162
171
  ordered_columns = list(BASE_EMBEDDING_ORDERED_COLUMNS)
@@ -90,7 +90,8 @@ def cpu_worker_limit() -> int:
90
90
  """Return the largest safe worker count for CPU-bound tiling work."""
91
91
  cpu_count = os.cpu_count() or 1
92
92
  slurm_limit = slurm_cpu_limit()
93
- return min(cpu_count, slurm_limit) if slurm_limit is not None else cpu_count
93
+ available = min(cpu_count, slurm_limit) if slurm_limit is not None else cpu_count
94
+ return min(available, 64)
94
95
 
95
96
 
96
97
  def _parse_positive_cpu_value(value: str) -> int | None:
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.4
2
2
  Name: slide2vec
3
- Version: 4.0.2
3
+ Version: 4.0.4
4
4
  Summary: Embedding of whole slide images with Foundation Models
5
5
  Author-email: Clément Grisi <clement.grisi@radboudumc.nl>
6
6
  License-Expression: Apache-2.0
@@ -133,12 +133,18 @@ def test_load_embedding_process_df_accepts_hs2p_process_list_columns(tmp_path: P
133
133
  "tiling_preview_path",
134
134
  "feature_status",
135
135
  "feature_path",
136
+ "encoder_name",
137
+ "output_variant",
138
+ "feature_kind",
136
139
  "aggregation_status",
137
140
  "error",
138
141
  "traceback",
139
142
  ]
140
143
  assert df.loc[0, "feature_status"] == "tbp"
141
144
  assert pd.isna(df.loc[0, "feature_path"])
145
+ assert pd.isna(df.loc[0, "encoder_name"])
146
+ assert pd.isna(df.loc[0, "output_variant"])
147
+ assert pd.isna(df.loc[0, "feature_kind"])
142
148
 
143
149
 
144
150
  def test_load_tiling_process_df_rejects_legacy_mask_columns(tmp_path: Path):
@@ -60,8 +60,7 @@ MODEL_PARAMS = dict(
60
60
  # -- speed --
61
61
  SPEED_PARAMS = dict(
62
62
  precision="fp16", # override (default: fp32)
63
- num_workers=4, # override (default: 8)
64
- num_workers_embedding=4, # override (default: 8)
63
+ num_dataloader_workers=0, # keep the Prism subprocess path single-process to avoid worker SHM pressure
65
64
  )
66
65
 
67
66
  # ---------------------------------------------------------------------------
@@ -310,9 +310,45 @@ def test_execution_options_defaults_preprocessing_workers_to_cpu_budget(monkeypa
310
310
 
311
311
  assert api.ExecutionOptions().num_preprocessing_workers == 24
312
312
 
313
+ def test_execution_options_preserves_explicit_dataloader_workers(monkeypatch):
314
+ import slide2vec.api as api
315
+
316
+ monkeypatch.setattr(api, "cpu_worker_limit", lambda: 2)
317
+ monkeypatch.setattr(api, "slurm_cpu_limit", lambda: 2)
318
+
319
+ execution = api.ExecutionOptions(num_workers=3)
320
+
321
+ assert execution.num_workers == 3
322
+ assert execution.num_preprocessing_workers == 2
323
+
324
+ def test_cpu_worker_limit_caps_large_cpu_budget_to_sixty_four(monkeypatch):
325
+ import slide2vec.utils.utils as utils
326
+
327
+ monkeypatch.setattr(utils.os, "cpu_count", lambda: 128)
328
+ monkeypatch.setattr(utils, "slurm_cpu_limit", lambda: 96)
329
+
330
+ assert utils.cpu_worker_limit() == 64
331
+
313
332
  def test_execution_options_default_batch_size_is_one():
314
333
  assert ExecutionOptions().batch_size == 1
315
334
 
335
+ def test_execution_options_default_num_workers_is_auto():
336
+ assert ExecutionOptions().num_workers is None
337
+
338
+ def test_execution_options_logs_resolved_auto_num_workers(monkeypatch, caplog):
339
+ import slide2vec.api as api
340
+
341
+ monkeypatch.setattr(api, "cpu_worker_limit", lambda: 18)
342
+ monkeypatch.setattr(api, "slurm_cpu_limit", lambda: 18)
343
+ monkeypatch.setattr(api.os, "cpu_count", lambda: 64)
344
+
345
+ with caplog.at_level("INFO"):
346
+ execution = api.ExecutionOptions()
347
+
348
+ assert execution.num_workers is None
349
+ assert "ExecutionOptions: num_workers=18 (requested=auto)" in caplog.text
350
+ assert "num_workers=auto" not in caplog.text
351
+
316
352
  def test_execution_options_from_config_maps_cli_fields(tmp_path: Path):
317
353
  cfg = SimpleNamespace(
318
354
  output_dir=str(tmp_path),
@@ -368,6 +404,24 @@ def test_execution_options_from_config_defaults_preprocessing_workers_to_cpu_bud
368
404
 
369
405
  assert execution.num_preprocessing_workers == 18
370
406
 
407
+ def test_execution_options_from_config_preserves_auto_num_workers(tmp_path: Path):
408
+ cfg = SimpleNamespace(
409
+ output_dir=str(tmp_path),
410
+ model=SimpleNamespace(batch_size=4, save_tile_embeddings=False, save_latents=False),
411
+ speed=SimpleNamespace(
412
+ precision="fp16",
413
+ num_dataloader_workers=None,
414
+ num_preprocessing_workers=None,
415
+ num_gpus=3,
416
+ prefetch_factor_embedding=5,
417
+ persistent_workers_embedding=False,
418
+ ),
419
+ )
420
+
421
+ execution = ExecutionOptions.from_config(cfg)
422
+
423
+ assert execution.num_workers is None
424
+
371
425
  def test_execution_options_from_config_defaults_to_all_available_gpus_when_unset(monkeypatch, tmp_path: Path):
372
426
  import slide2vec.api as api
373
427
 
@@ -176,6 +176,8 @@ def test_collect_distributed_pipeline_artifacts_runs_stage_collects_and_updates(
176
176
  persist_tile_embeddings,
177
177
  persist_hierarchical_embeddings,
178
178
  include_slide_embeddings,
179
+ encoder_name,
180
+ output_variant,
179
181
  tile_artifacts,
180
182
  hierarchical_artifacts,
181
183
  slide_artifacts,
@@ -186,6 +188,8 @@ def test_collect_distributed_pipeline_artifacts_runs_stage_collects_and_updates(
186
188
  "persist_tile_embeddings": persist_tile_embeddings,
187
189
  "persist_hierarchical_embeddings": persist_hierarchical_embeddings,
188
190
  "include_slide_embeddings": include_slide_embeddings,
191
+ "encoder_name": encoder_name,
192
+ "output_variant": output_variant,
189
193
  "tile_artifacts": tile_artifacts,
190
194
  "hierarchical_artifacts": hierarchical_artifacts,
191
195
  "slide_artifacts": slide_artifacts,
@@ -220,6 +224,8 @@ def test_collect_distributed_pipeline_artifacts_runs_stage_collects_and_updates(
220
224
  assert captured["update"]["persist_tile_embeddings"] is True
221
225
  assert captured["update"]["persist_hierarchical_embeddings"] is False
222
226
  assert captured["update"]["include_slide_embeddings"] is True
227
+ assert captured["update"]["encoder_name"] == "prism"
228
+ assert captured["update"]["output_variant"] == "default"
223
229
  assert captured["update"]["tile_artifacts"] == ["tile-artifact"]
224
230
  assert captured["update"]["hierarchical_artifacts"] == []
225
231
  assert captured["update"]["slide_artifacts"] == ["slide-artifact"]
@@ -292,9 +298,18 @@ def test_has_complete_local_embedding_outputs_uses_hierarchical_artifacts_for_hi
292
298
  )
293
299
 
294
300
 
295
- @pytest.mark.parametrize("persist_hierarchical_embeddings", [False, True])
296
- def test_update_process_list_after_embedding_writes_feature_path(
301
+ @pytest.mark.parametrize(
302
+ ("persist_hierarchical_embeddings", "include_slide_embeddings", "expected_feature_kind"),
303
+ [
304
+ (False, False, "tile"),
305
+ (True, False, "hierarchical"),
306
+ (False, True, "slide"),
307
+ ],
308
+ )
309
+ def test_update_process_list_after_embedding_writes_feature_provenance(
297
310
  persist_hierarchical_embeddings: bool,
311
+ include_slide_embeddings: bool,
312
+ expected_feature_kind: str,
298
313
  tmp_path: Path,
299
314
  ):
300
315
  import slide2vec.inference as inference
@@ -306,7 +321,19 @@ def test_update_process_list_after_embedding_writes_feature_path(
306
321
  "slide-a,/tmp/slide-a.svs,,asap,asap,,success,1,/tmp/slide-a.coordinates.npz,/tmp/slide-a.coordinates.meta.json,tbp,,\n",
307
322
  encoding="utf-8",
308
323
  )
309
- if persist_hierarchical_embeddings:
324
+ slide_artifacts = []
325
+ if include_slide_embeddings:
326
+ artifact = write_slide_embeddings(
327
+ "slide-a",
328
+ np.zeros((4,), dtype=np.float32),
329
+ output_dir=tmp_path,
330
+ output_format="pt",
331
+ metadata={"image_path": "/tmp/slide-a.svs"},
332
+ )
333
+ tile_artifacts = []
334
+ hierarchical_artifacts = []
335
+ slide_artifacts = [artifact]
336
+ elif persist_hierarchical_embeddings:
310
337
  artifact = write_hierarchical_embeddings(
311
338
  "slide-a",
312
339
  np.zeros((1, 2, 4), dtype=np.float32),
@@ -332,15 +359,20 @@ def test_update_process_list_after_embedding_writes_feature_path(
332
359
  successful_slides=[slide],
333
360
  persist_tile_embeddings=not persist_hierarchical_embeddings,
334
361
  persist_hierarchical_embeddings=persist_hierarchical_embeddings,
335
- include_slide_embeddings=False,
362
+ include_slide_embeddings=include_slide_embeddings,
363
+ encoder_name="virchow2" if not include_slide_embeddings else "prism",
364
+ output_variant="cls" if not include_slide_embeddings else "default",
336
365
  tile_artifacts=tile_artifacts,
337
366
  hierarchical_artifacts=hierarchical_artifacts,
338
- slide_artifacts=[],
367
+ slide_artifacts=slide_artifacts,
339
368
  )
340
369
 
341
370
  recorded = pd.read_csv(process_list_path).set_index("sample_id")
342
371
  assert recorded.loc["slide-a", "feature_status"] == "success"
343
372
  assert recorded.loc["slide-a", "feature_path"] == str(artifact.path)
373
+ assert recorded.loc["slide-a", "encoder_name"] == ("virchow2" if not include_slide_embeddings else "prism")
374
+ assert recorded.loc["slide-a", "output_variant"] == ("cls" if not include_slide_embeddings else "default")
375
+ assert recorded.loc["slide-a", "feature_kind"] == expected_feature_kind
344
376
 
345
377
 
346
378
  def test_model_embed_slide_updates_process_list_feature_status_and_path_in_distributed_path(
@@ -2033,6 +2065,128 @@ def test_serialize_execution_preserves_loader_optimization_fields():
2033
2065
  assert restored.precision == "bf16"
2034
2066
 
2035
2067
 
2068
+ def test_deserialize_execution_defaults_num_workers_to_auto():
2069
+ import slide2vec.inference as inference
2070
+
2071
+ restored = inference.deserialize_execution({"batch_size": 4, "num_gpus": 1})
2072
+
2073
+ assert restored.num_workers is None
2074
+
2075
+
2076
+ def test_deserialize_execution_preserves_auto_num_workers():
2077
+ import slide2vec.inference as inference
2078
+
2079
+ restored = inference.deserialize_execution({"batch_size": 4, "num_workers": None, "num_gpus": 1})
2080
+
2081
+ assert restored.num_workers is None
2082
+
2083
+
2084
+ def test_embedding_dataloader_kwargs_resolve_auto_mode_to_cpu_budget(monkeypatch):
2085
+ import slide2vec.api as api
2086
+ import slide2vec.inference as inference
2087
+ torch = pytest.importorskip("torch")
2088
+
2089
+ monkeypatch.setattr(api, "cpu_worker_limit", lambda: 24)
2090
+
2091
+ loaded = inference.LoadedModel(
2092
+ name="test",
2093
+ level="tile",
2094
+ model=object(),
2095
+ transforms=object(),
2096
+ feature_dim=3,
2097
+ device=torch.device("cpu"),
2098
+ )
2099
+
2100
+ kwargs = inference._embedding_dataloader_kwargs(
2101
+ loaded,
2102
+ ExecutionOptions(num_workers=None, num_gpus=1),
2103
+ )
2104
+
2105
+ assert kwargs["num_workers"] == 24
2106
+ assert kwargs["persistent_workers"] is True
2107
+ assert kwargs["prefetch_factor"] == 4
2108
+
2109
+
2110
+ def test_compute_tile_embeddings_for_slide_uses_cpu_budget_for_auto_workers_on_non_cucim_on_the_fly(monkeypatch):
2111
+ import slide2vec.api as api
2112
+ import slide2vec.inference as inference
2113
+ torch = pytest.importorskip("torch")
2114
+
2115
+ captured = {}
2116
+
2117
+ class DummyLoader:
2118
+ def __init__(self, dataset, **kwargs):
2119
+ captured["kwargs"] = kwargs
2120
+
2121
+ def __iter__(self):
2122
+ yield (
2123
+ torch.tensor([0, 1], dtype=torch.long),
2124
+ torch.zeros((2, 3, 4, 4), dtype=torch.uint8),
2125
+ {"worker_batch_ms": 0.0, "reader_open_ms": 0.0, "reader_read_ms": 0.0},
2126
+ )
2127
+
2128
+ def __len__(self):
2129
+ return 1
2130
+
2131
+ class DummyEncoder:
2132
+ pretrained_cfg = {}
2133
+
2134
+ class DummyModel:
2135
+ encoder = DummyEncoder()
2136
+
2137
+ def encode_tiles(self, image):
2138
+ return torch.ones((image.shape[0], 3), dtype=torch.float32, device=image.device)
2139
+
2140
+ class DummyCollator:
2141
+ ordered_indices = None
2142
+
2143
+ def __init__(self, **kwargs):
2144
+ captured["wsd_collator_kwargs"] = kwargs
2145
+
2146
+ def __call__(self, batch_indices):
2147
+ tile_indices = torch.as_tensor(batch_indices, dtype=torch.long)
2148
+ batch = torch.zeros((len(batch_indices), 3, 4, 4), dtype=torch.uint8)
2149
+ return tile_indices, batch, {"worker_batch_ms": 0.0, "reader_open_ms": 0.0, "reader_read_ms": 0.0}
2150
+
2151
+ monkeypatch.setattr(inference, "OnTheFlyBatchTileCollator", DummyCollator)
2152
+ monkeypatch.setattr(torch.utils.data, "DataLoader", DummyLoader)
2153
+ monkeypatch.setattr(inference, "_build_batch_preprocessor", lambda *args, **kwargs: lambda batch: batch.float())
2154
+ monkeypatch.setattr(api, "cpu_worker_limit", lambda: 24)
2155
+
2156
+ loaded = inference.LoadedModel(
2157
+ name="prov-gigapath",
2158
+ level="tile",
2159
+ model=DummyModel(),
2160
+ transforms=object(),
2161
+ feature_dim=3,
2162
+ device=torch.device("cpu"),
2163
+ )
2164
+
2165
+ result = inference._compute_tile_embeddings_for_slide(
2166
+ loaded,
2167
+ SimpleNamespace(level="tile"),
2168
+ make_slide("slide-a"),
2169
+ SimpleNamespace(
2170
+ x=np.array([0, 10]),
2171
+ y=np.array([5, 15]),
2172
+ backend="asap",
2173
+ target_spacing_um=0.5,
2174
+ target_tile_size_px=4,
2175
+ read_spacing_um=0.5,
2176
+ read_tile_size_px=4,
2177
+ tile_size_lv0=224,
2178
+ ),
2179
+ preprocessing=replace(DEFAULT_PREPROCESSING, on_the_fly=True, backend="auto", num_cucim_workers=4),
2180
+ execution=ExecutionOptions(batch_size=2, num_workers=None, num_gpus=1),
2181
+ )
2182
+
2183
+ assert result.shape == (2, 3)
2184
+ assert captured["kwargs"]["num_workers"] == 24
2185
+ assert captured["kwargs"]["persistent_workers"] is True
2186
+ assert captured["kwargs"]["prefetch_factor"] == 4
2187
+ assert captured["wsd_collator_kwargs"]["backend"] == "asap"
2188
+
2189
+
2036
2190
  def test_compute_tile_embeddings_for_slide_uses_batched_loader_knobs(monkeypatch):
2037
2191
  import slide2vec.inference as inference
2038
2192
  torch = pytest.importorskip("torch")
@@ -2541,6 +2695,9 @@ def test_compute_tile_embeddings_for_slide_uses_resolved_wsd_backend_when_auto(m
2541
2695
  )
2542
2696
 
2543
2697
  assert result.shape == (2, 3)
2698
+ assert captured["kwargs"]["num_workers"] == 8
2699
+ assert captured["kwargs"]["persistent_workers"] is True
2700
+ assert captured["kwargs"]["prefetch_factor"] == 4
2544
2701
  assert captured["wsd_collator_kwargs"]["backend"] == "asap"
2545
2702
 
2546
2703
 
File without changes
File without changes
File without changes
File without changes
File without changes