slide2vec 4.0.2__tar.gz → 4.0.3__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (66) hide show
  1. {slide2vec-4.0.2 → slide2vec-4.0.3}/PKG-INFO +1 -1
  2. {slide2vec-4.0.2 → slide2vec-4.0.3}/pyproject.toml +2 -2
  3. {slide2vec-4.0.2 → slide2vec-4.0.3}/slide2vec/__init__.py +1 -1
  4. {slide2vec-4.0.2 → slide2vec-4.0.3}/slide2vec/api.py +14 -7
  5. {slide2vec-4.0.2 → slide2vec-4.0.3}/slide2vec/configs/default.yaml +2 -2
  6. {slide2vec-4.0.2 → slide2vec-4.0.3}/slide2vec/inference.py +67 -28
  7. {slide2vec-4.0.2 → slide2vec-4.0.3}/slide2vec/utils/tiling_io.py +9 -0
  8. {slide2vec-4.0.2 → slide2vec-4.0.3}/slide2vec/utils/utils.py +2 -1
  9. {slide2vec-4.0.2 → slide2vec-4.0.3}/slide2vec.egg-info/PKG-INFO +1 -1
  10. {slide2vec-4.0.2 → slide2vec-4.0.3}/tests/test_hs2p_package_cutover.py +6 -0
  11. {slide2vec-4.0.2 → slide2vec-4.0.3}/tests/test_output_consistency.py +1 -2
  12. {slide2vec-4.0.2 → slide2vec-4.0.3}/tests/test_regression_core.py +54 -0
  13. {slide2vec-4.0.2 → slide2vec-4.0.3}/tests/test_regression_inference.py +156 -5
  14. {slide2vec-4.0.2 → slide2vec-4.0.3}/LICENSE +0 -0
  15. {slide2vec-4.0.2 → slide2vec-4.0.3}/README.md +0 -0
  16. {slide2vec-4.0.2 → slide2vec-4.0.3}/setup.cfg +0 -0
  17. {slide2vec-4.0.2 → slide2vec-4.0.3}/slide2vec/__main__.py +0 -0
  18. {slide2vec-4.0.2 → slide2vec-4.0.3}/slide2vec/artifacts.py +0 -0
  19. {slide2vec-4.0.2 → slide2vec-4.0.3}/slide2vec/cli.py +0 -0
  20. {slide2vec-4.0.2 → slide2vec-4.0.3}/slide2vec/configs/__init__.py +0 -0
  21. {slide2vec-4.0.2 → slide2vec-4.0.3}/slide2vec/data/__init__.py +0 -0
  22. {slide2vec-4.0.2 → slide2vec-4.0.3}/slide2vec/data/dataset.py +0 -0
  23. {slide2vec-4.0.2 → slide2vec-4.0.3}/slide2vec/data/tile_reader.py +0 -0
  24. {slide2vec-4.0.2 → slide2vec-4.0.3}/slide2vec/data/tile_store.py +0 -0
  25. {slide2vec-4.0.2 → slide2vec-4.0.3}/slide2vec/distributed/__init__.py +0 -0
  26. {slide2vec-4.0.2 → slide2vec-4.0.3}/slide2vec/distributed/direct_embed_worker.py +0 -0
  27. {slide2vec-4.0.2 → slide2vec-4.0.3}/slide2vec/distributed/pipeline_worker.py +0 -0
  28. {slide2vec-4.0.2 → slide2vec-4.0.3}/slide2vec/encoders/__init__.py +0 -0
  29. {slide2vec-4.0.2 → slide2vec-4.0.3}/slide2vec/encoders/base.py +0 -0
  30. {slide2vec-4.0.2 → slide2vec-4.0.3}/slide2vec/encoders/models/__init__.py +0 -0
  31. {slide2vec-4.0.2 → slide2vec-4.0.3}/slide2vec/encoders/models/conch.py +0 -0
  32. {slide2vec-4.0.2 → slide2vec-4.0.3}/slide2vec/encoders/models/gigapath.py +0 -0
  33. {slide2vec-4.0.2 → slide2vec-4.0.3}/slide2vec/encoders/models/hibou.py +0 -0
  34. {slide2vec-4.0.2 → slide2vec-4.0.3}/slide2vec/encoders/models/hoptimus.py +0 -0
  35. {slide2vec-4.0.2 → slide2vec-4.0.3}/slide2vec/encoders/models/midnight.py +0 -0
  36. {slide2vec-4.0.2 → slide2vec-4.0.3}/slide2vec/encoders/models/musk.py +0 -0
  37. {slide2vec-4.0.2 → slide2vec-4.0.3}/slide2vec/encoders/models/phikon.py +0 -0
  38. {slide2vec-4.0.2 → slide2vec-4.0.3}/slide2vec/encoders/models/prism.py +0 -0
  39. {slide2vec-4.0.2 → slide2vec-4.0.3}/slide2vec/encoders/models/prost40m.py +0 -0
  40. {slide2vec-4.0.2 → slide2vec-4.0.3}/slide2vec/encoders/models/titan.py +0 -0
  41. {slide2vec-4.0.2 → slide2vec-4.0.3}/slide2vec/encoders/models/uni.py +0 -0
  42. {slide2vec-4.0.2 → slide2vec-4.0.3}/slide2vec/encoders/models/virchow.py +0 -0
  43. {slide2vec-4.0.2 → slide2vec-4.0.3}/slide2vec/encoders/registry.py +0 -0
  44. {slide2vec-4.0.2 → slide2vec-4.0.3}/slide2vec/encoders/validation.py +0 -0
  45. {slide2vec-4.0.2 → slide2vec-4.0.3}/slide2vec/main.py +0 -0
  46. {slide2vec-4.0.2 → slide2vec-4.0.3}/slide2vec/model_settings.py +0 -0
  47. {slide2vec-4.0.2 → slide2vec-4.0.3}/slide2vec/progress.py +0 -0
  48. {slide2vec-4.0.2 → slide2vec-4.0.3}/slide2vec/registry.py +0 -0
  49. {slide2vec-4.0.2 → slide2vec-4.0.3}/slide2vec/resources.py +0 -0
  50. {slide2vec-4.0.2 → slide2vec-4.0.3}/slide2vec/runtime_types.py +0 -0
  51. {slide2vec-4.0.2 → slide2vec-4.0.3}/slide2vec/utils/__init__.py +0 -0
  52. {slide2vec-4.0.2 → slide2vec-4.0.3}/slide2vec/utils/config.py +0 -0
  53. {slide2vec-4.0.2 → slide2vec-4.0.3}/slide2vec/utils/coordinates.py +0 -0
  54. {slide2vec-4.0.2 → slide2vec-4.0.3}/slide2vec/utils/log_utils.py +0 -0
  55. {slide2vec-4.0.2 → slide2vec-4.0.3}/slide2vec.egg-info/SOURCES.txt +0 -0
  56. {slide2vec-4.0.2 → slide2vec-4.0.3}/slide2vec.egg-info/dependency_links.txt +0 -0
  57. {slide2vec-4.0.2 → slide2vec-4.0.3}/slide2vec.egg-info/entry_points.txt +0 -0
  58. {slide2vec-4.0.2 → slide2vec-4.0.3}/slide2vec.egg-info/not-zip-safe +0 -0
  59. {slide2vec-4.0.2 → slide2vec-4.0.3}/slide2vec.egg-info/requires.txt +0 -0
  60. {slide2vec-4.0.2 → slide2vec-4.0.3}/slide2vec.egg-info/top_level.txt +0 -0
  61. {slide2vec-4.0.2 → slide2vec-4.0.3}/tests/test_batch_collator_timing.py +0 -0
  62. {slide2vec-4.0.2 → slide2vec-4.0.3}/tests/test_encoder_registry.py +0 -0
  63. {slide2vec-4.0.2 → slide2vec-4.0.3}/tests/test_packaging_metadata.py +0 -0
  64. {slide2vec-4.0.2 → slide2vec-4.0.3}/tests/test_progress.py +0 -0
  65. {slide2vec-4.0.2 → slide2vec-4.0.3}/tests/test_regression_models.py +0 -0
  66. {slide2vec-4.0.2 → slide2vec-4.0.3}/tests/test_tile_store.py +0 -0
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.4
2
2
  Name: slide2vec
3
- Version: 4.0.2
3
+ Version: 4.0.3
4
4
  Summary: Embedding of whole slide images with Foundation Models
5
5
  Author-email: Clément Grisi <clement.grisi@radboudumc.nl>
6
6
  License-Expression: Apache-2.0
@@ -4,7 +4,7 @@ build-backend = "setuptools.build_meta"
4
4
 
5
5
  [project]
6
6
  name = "slide2vec"
7
- version = "4.0.2"
7
+ version = "4.0.3"
8
8
  description = "Embedding of whole slide images with Foundation Models"
9
9
  readme = "README.md"
10
10
  requires-python = ">=3.10"
@@ -154,7 +154,7 @@ no_implicit_reexport = true
154
154
  max-line-length = 160
155
155
 
156
156
  [tool.bumpver]
157
- current_version = "4.0.2"
157
+ current_version = "4.0.3"
158
158
  version_pattern = "MAJOR.MINOR.PATCH"
159
159
  commit = false # We do version bumping in CI, not as a commit
160
160
  tag = false # Git tag already exists — we don't auto-tag
@@ -2,7 +2,7 @@ from slide2vec.api import EmbeddedSlide, ExecutionOptions, Model, Pipeline, Prep
2
2
  from slide2vec.artifacts import HierarchicalEmbeddingArtifact, SlideEmbeddingArtifact, TileEmbeddingArtifact
3
3
 
4
4
 
5
- __version__ = "4.0.2"
5
+ __version__ = "4.0.3"
6
6
 
7
7
  __all__ = [
8
8
  "Model",
@@ -120,7 +120,7 @@ class ExecutionOptions:
120
120
  output_dir: Path | None = None
121
121
  output_format: str = "pt"
122
122
  batch_size: int = 1
123
- num_workers: int = 0
123
+ num_workers: int | None = None
124
124
  num_preprocessing_workers: int | None = None
125
125
  num_gpus: int | None = None
126
126
  precision: str | None = None
@@ -140,7 +140,7 @@ class ExecutionOptions:
140
140
  output_dir=Path(cfg.output_dir),
141
141
  output_format="pt",
142
142
  batch_size=int(cfg.model.batch_size),
143
- num_workers=int(num_workers),
143
+ num_workers=int(num_workers) if num_workers is not None else None,
144
144
  num_preprocessing_workers=(
145
145
  int(cfg.speed.num_preprocessing_workers)
146
146
  if cfg.speed.num_preprocessing_workers is not None
@@ -165,23 +165,30 @@ class ExecutionOptions:
165
165
  cap = cpu_worker_limit()
166
166
  cpu_count = os.cpu_count() or 1
167
167
  slurm_limit = slurm_cpu_limit()
168
- capped_num_workers = min(self.num_workers, cap)
169
168
  capped_num_preprocessing_workers = (
170
169
  cap if self.num_preprocessing_workers is None else min(self.num_preprocessing_workers, cap)
171
170
  )
172
- object.__setattr__(self, "num_workers", capped_num_workers)
173
171
  object.__setattr__(self, "num_preprocessing_workers", capped_num_preprocessing_workers)
174
172
  logger = logging.getLogger(__name__)
175
173
  cap_source = f"slurm_cpu_limit={slurm_limit}" if slurm_limit is not None else f"cpu_count={cpu_count}"
174
+ resolved_num_workers = self.resolved_num_workers()
175
+ num_workers_label = (
176
+ f"{resolved_num_workers} (requested=auto)"
177
+ if self.num_workers is None
178
+ else str(resolved_num_workers)
179
+ )
176
180
  logger.info(
177
- "ExecutionOptions: num_workers=%d, num_preprocessing_workers=%d "
178
- "(cap=%d via %s)",
179
- capped_num_workers,
181
+ "ExecutionOptions: num_workers=%s, num_preprocessing_workers=%d "
182
+ "(preprocessing cap=%d via %s)",
183
+ num_workers_label,
180
184
  capped_num_preprocessing_workers,
181
185
  cap,
182
186
  cap_source,
183
187
  )
184
188
 
189
+ def resolved_num_workers(self) -> int:
190
+ return cpu_worker_limit() if self.num_workers is None else int(self.num_workers)
191
+
185
192
  def with_output_dir(self, output_dir: PathLike | None) -> "ExecutionOptions":
186
193
  if output_dir is None:
187
194
  return self
@@ -66,9 +66,9 @@ tiling:
66
66
 
67
67
  speed:
68
68
  precision: # model inference precision ["fp32", "fp16", "bf16"]; if not set, determined automatically based on model recommendations
69
- num_dataloader_workers: 8 # number of DataLoader worker processes for reading tiles during embedding (tar path); on-the-fly path derives this automatically from cpu_count // speed.num_cucim_workers
69
+ num_dataloader_workers: # number of DataLoader worker processes for reading tiles during embedding; defaults to auto (job CPU budget, except cuCIM on-the-fly uses cpu_budget // speed.num_cucim_workers)
70
70
  num_gpus: # number of GPUs to use for feature extraction; defaults to all available GPUs
71
- num_preprocessing_workers: # number of workers for hs2p tiling (WSI reading, JPEG encoding, tar writing); defaults to the CPU budget at runtime
71
+ num_preprocessing_workers: # number of workers for hs2p tiling (WSI reading, JPEG encoding, tar writing); defaults to the runtime CPU budget capped at 64
72
72
  num_cucim_workers: 4 # number of internal cucim threads per read_region call (embedding path, on-the-fly only); DataLoader workers are auto-set to cpu_count // num_cucim_workers
73
73
  prefetch_factor_embedding: 4 # prefetch factor for tile embedding dataloaders
74
74
  persistent_workers_embedding: true # keep DataLoader workers alive across epochs/batches
@@ -39,7 +39,11 @@ from slide2vec.artifacts import (
39
39
  write_tile_embedding_metadata,
40
40
  write_tile_embeddings,
41
41
  )
42
- from slide2vec.encoders.registry import encoder_registry, resolve_preprocessing_defaults
42
+ from slide2vec.encoders.registry import (
43
+ encoder_registry,
44
+ resolve_encoder_output,
45
+ resolve_preprocessing_defaults,
46
+ )
43
47
  from slide2vec.model_settings import canonicalize_model_name
44
48
  from slide2vec.runtime_types import LoadedModel
45
49
  from slide2vec.progress import (
@@ -386,6 +390,8 @@ def embed_slides(
386
390
  persist_tile_embeddings=persist_tile_embeddings,
387
391
  persist_hierarchical_embeddings=persist_hierarchical_embeddings,
388
392
  include_slide_embeddings=include_slide_embeddings,
393
+ encoder_name=model.name,
394
+ output_variant=_resolved_process_list_output_variant(model),
389
395
  tile_artifacts=tile_artifacts,
390
396
  hierarchical_artifacts=hierarchical_artifacts,
391
397
  slide_artifacts=slide_artifacts,
@@ -717,6 +723,8 @@ def run_pipeline(
717
723
  persist_tile_embeddings=persist_tile_embeddings,
718
724
  persist_hierarchical_embeddings=persist_hierarchical_embeddings,
719
725
  include_slide_embeddings=include_slide_embeddings,
726
+ encoder_name=model.name,
727
+ output_variant=_resolved_process_list_output_variant(model),
720
728
  tile_artifacts=tile_artifacts,
721
729
  hierarchical_artifacts=hierarchical_artifacts,
722
730
  slide_artifacts=slide_artifacts,
@@ -903,6 +911,8 @@ def _build_incremental_persist_callback(
903
911
  persist_tile_embeddings=persist_tile_embeddings,
904
912
  persist_hierarchical_embeddings=persist_hierarchical_embeddings,
905
913
  include_slide_embeddings=include_slide_embeddings,
914
+ encoder_name=model.name,
915
+ output_variant=_resolved_process_list_output_variant(model),
906
916
  tile_artifacts=[tile_artifact] if isinstance(tile_artifact, TileEmbeddingArtifact) else [],
907
917
  hierarchical_artifacts=[tile_artifact] if isinstance(tile_artifact, HierarchicalEmbeddingArtifact) else [],
908
918
  slide_artifacts=[slide_artifact] if slide_artifact is not None else [],
@@ -1054,6 +1064,8 @@ def _collect_distributed_pipeline_artifacts(
1054
1064
  persist_tile_embeddings=persist_tile_embeddings,
1055
1065
  persist_hierarchical_embeddings=persist_hierarchical_embeddings,
1056
1066
  include_slide_embeddings=include_slide_embeddings,
1067
+ encoder_name=model.name,
1068
+ output_variant=_resolved_process_list_output_variant(model),
1057
1069
  tile_artifacts=tile_artifacts,
1058
1070
  hierarchical_artifacts=hierarchical_artifacts,
1059
1071
  slide_artifacts=slide_artifacts,
@@ -1206,7 +1218,7 @@ def _compute_tile_embeddings_for_slide(
1206
1218
  )
1207
1219
  loader_kwargs = _embedding_dataloader_kwargs(loaded, execution)
1208
1220
  resolved_backend = _resolve_slide_backend(preprocessing, tiling_result)
1209
- if preprocessing.on_the_fly and preprocessing.read_tiles_from is None:
1221
+ if preprocessing.on_the_fly and preprocessing.read_tiles_from is None and resolved_backend == "cucim":
1210
1222
  effective_num_workers, worker_context = _resolve_on_the_fly_num_workers(preprocessing.num_cucim_workers)
1211
1223
  if effective_num_workers != execution.num_workers:
1212
1224
  logging.getLogger(__name__).info(
@@ -1289,18 +1301,19 @@ def _compute_hierarchical_embeddings_for_slide(
1289
1301
  target_tile_size_px=int(geometry["target_tile_size_px"]),
1290
1302
  )
1291
1303
  loader_kwargs = _embedding_dataloader_kwargs(loaded, execution)
1292
- effective_num_workers, worker_context = _resolve_on_the_fly_num_workers(preprocessing.num_cucim_workers)
1293
1304
  resolved_backend = _resolve_slide_backend(preprocessing, tiling_result)
1294
- if effective_num_workers != execution.num_workers:
1295
- logging.getLogger(__name__).info(
1296
- f"on-the-fly hierarchical mode: setting DataLoader num_workers={effective_num_workers} "
1297
- f"({worker_context}); "
1298
- f"ignoring speed.num_dataloader_workers={execution.num_workers}"
1299
- )
1300
- loader_kwargs["num_workers"] = effective_num_workers
1301
- if effective_num_workers == 0:
1302
- loader_kwargs.pop("persistent_workers", None)
1303
- loader_kwargs.pop("prefetch_factor", None)
1305
+ if resolved_backend == "cucim":
1306
+ effective_num_workers, worker_context = _resolve_on_the_fly_num_workers(preprocessing.num_cucim_workers)
1307
+ if effective_num_workers != execution.num_workers:
1308
+ logging.getLogger(__name__).info(
1309
+ f"on-the-fly hierarchical mode: setting DataLoader num_workers={effective_num_workers} "
1310
+ f"({worker_context}); "
1311
+ f"ignoring speed.num_dataloader_workers={execution.num_workers}"
1312
+ )
1313
+ loader_kwargs["num_workers"] = effective_num_workers
1314
+ if effective_num_workers == 0:
1315
+ loader_kwargs.pop("persistent_workers", None)
1316
+ loader_kwargs.pop("prefetch_factor", None)
1304
1317
  _configure_cucim_worker_stderr(
1305
1318
  loader_kwargs,
1306
1319
  backend=resolved_backend,
@@ -1376,12 +1389,13 @@ def _compute_hierarchical_embedding_shard_for_slide(
1376
1389
  target_tile_size_px=int(geometry["target_tile_size_px"]),
1377
1390
  )
1378
1391
  loader_kwargs = _embedding_dataloader_kwargs(loaded, execution)
1379
- effective_num_workers, _worker_context = _resolve_on_the_fly_num_workers(preprocessing.num_cucim_workers)
1380
1392
  resolved_backend = _resolve_slide_backend(preprocessing, tiling_result)
1381
- loader_kwargs["num_workers"] = effective_num_workers
1382
- if effective_num_workers == 0:
1383
- loader_kwargs.pop("persistent_workers", None)
1384
- loader_kwargs.pop("prefetch_factor", None)
1393
+ if resolved_backend == "cucim":
1394
+ effective_num_workers, _worker_context = _resolve_on_the_fly_num_workers(preprocessing.num_cucim_workers)
1395
+ loader_kwargs["num_workers"] = effective_num_workers
1396
+ if effective_num_workers == 0:
1397
+ loader_kwargs.pop("persistent_workers", None)
1398
+ loader_kwargs.pop("prefetch_factor", None)
1385
1399
  _configure_cucim_worker_stderr(
1386
1400
  loader_kwargs,
1387
1401
  backend=resolved_backend,
@@ -1686,11 +1700,12 @@ def _write_hierarchical_embedding_artifact(
1686
1700
 
1687
1701
 
1688
1702
  def _embedding_dataloader_kwargs(loaded: LoadedModel, execution: ExecutionOptions) -> dict[str, Any]:
1703
+ resolved_num_workers = execution.resolved_num_workers()
1689
1704
  kwargs: dict[str, Any] = {
1690
- "num_workers": execution.num_workers,
1705
+ "num_workers": resolved_num_workers,
1691
1706
  "pin_memory": _uses_cuda_runtime(loaded.device),
1692
1707
  }
1693
- if execution.num_workers > 0:
1708
+ if resolved_num_workers > 0:
1694
1709
  kwargs["persistent_workers"] = bool(execution.persistent_workers)
1695
1710
  kwargs["prefetch_factor"] = int(execution.prefetch_factor)
1696
1711
  return kwargs
@@ -2245,6 +2260,16 @@ def _should_persist_tile_embeddings(model, execution: ExecutionOptions) -> bool:
2245
2260
  return True
2246
2261
 
2247
2262
 
2263
+ def _resolved_process_list_output_variant(model) -> str | None:
2264
+ if not hasattr(model, "name") or model.name not in encoder_registry:
2265
+ return model._output_variant if hasattr(model, "_output_variant") else None
2266
+ resolved = resolve_encoder_output(
2267
+ model.name,
2268
+ requested_output_variant=model._output_variant,
2269
+ )
2270
+ return str(resolved["output_variant"])
2271
+
2272
+
2248
2273
  def _prepare_tiled_slides(
2249
2274
  slide_records: Sequence[SlideSpec],
2250
2275
  preprocessing: PreprocessingConfig,
@@ -3043,7 +3068,7 @@ def deserialize_preprocessing(payload: dict[str, Any]) -> PreprocessingConfig:
3043
3068
  def deserialize_execution(payload: dict[str, Any]) -> ExecutionOptions:
3044
3069
  output_dir = payload["output_dir"] if "output_dir" in payload else None
3045
3070
  batch_size = payload["batch_size"] if "batch_size" in payload else None
3046
- num_workers = payload["num_workers"] if "num_workers" in payload else 0
3071
+ num_workers = payload["num_workers"] if "num_workers" in payload else None
3047
3072
  num_gpus = payload["num_gpus"] if "num_gpus" in payload else 1
3048
3073
  precision = payload["precision"] if "precision" in payload else "fp32"
3049
3074
  prefetch_factor = payload["prefetch_factor"] if "prefetch_factor" in payload else 4
@@ -3058,7 +3083,7 @@ def deserialize_execution(payload: dict[str, Any]) -> ExecutionOptions:
3058
3083
  output_dir=Path(output_dir) if output_dir is not None else None,
3059
3084
  output_format=payload["output_format"] if "output_format" in payload else "pt",
3060
3085
  batch_size=batch_size,
3061
- num_workers=int(num_workers),
3086
+ num_workers=int(num_workers) if num_workers is not None else None,
3062
3087
  num_gpus=int(num_gpus),
3063
3088
  precision=precision,
3064
3089
  prefetch_factor=int(prefetch_factor),
@@ -3163,6 +3188,8 @@ def _update_process_list_after_embedding(
3163
3188
  persist_tile_embeddings: bool,
3164
3189
  persist_hierarchical_embeddings: bool,
3165
3190
  include_slide_embeddings: bool,
3191
+ encoder_name: str,
3192
+ output_variant: str | None,
3166
3193
  tile_artifacts: Sequence[TileEmbeddingArtifact],
3167
3194
  hierarchical_artifacts: Sequence[HierarchicalEmbeddingArtifact],
3168
3195
  slide_artifacts: Sequence[SlideEmbeddingArtifact],
@@ -3177,6 +3204,12 @@ def _update_process_list_after_embedding(
3177
3204
  df["feature_status"] = ["tbp"] * len(df)
3178
3205
  if "feature_path" not in df.columns:
3179
3206
  df["feature_path"] = [None] * len(df)
3207
+ if "encoder_name" not in df.columns:
3208
+ df["encoder_name"] = [None] * len(df)
3209
+ if "output_variant" not in df.columns:
3210
+ df["output_variant"] = [None] * len(df)
3211
+ if "feature_kind" not in df.columns:
3212
+ df["feature_kind"] = [None] * len(df)
3180
3213
  if include_slide_embeddings and "aggregation_status" not in df.columns:
3181
3214
  df["aggregation_status"] = ["tbp"] * len(df)
3182
3215
  tile_success_ids = {artifact.sample_id for artifact in tile_artifacts}
@@ -3184,28 +3217,34 @@ def _update_process_list_after_embedding(
3184
3217
  slide_success_ids = {artifact.sample_id for artifact in slide_artifacts}
3185
3218
  if slide_artifacts:
3186
3219
  feature_path_by_sample_id = {artifact.sample_id: _resolve_path_str(artifact.path) for artifact in slide_artifacts}
3220
+ feature_kind = "slide"
3221
+ feature_success_ids = slide_success_ids
3187
3222
  elif persist_hierarchical_embeddings:
3188
3223
  feature_path_by_sample_id = {
3189
3224
  artifact.sample_id: _resolve_path_str(artifact.path) for artifact in hierarchical_artifacts
3190
3225
  }
3226
+ feature_kind = "hierarchical"
3227
+ feature_success_ids = hierarchical_success_ids
3191
3228
  elif persist_tile_embeddings:
3192
3229
  feature_path_by_sample_id = {
3193
3230
  artifact.sample_id: _resolve_path_str(artifact.path) for artifact in tile_artifacts
3194
3231
  }
3232
+ feature_kind = "tile"
3233
+ feature_success_ids = tile_success_ids
3195
3234
  else:
3196
3235
  feature_path_by_sample_id = {}
3236
+ feature_kind = None
3237
+ feature_success_ids = {slide.sample_id for slide in successful_slides}
3197
3238
  for slide in successful_slides:
3198
3239
  mask = df["sample_id"].astype(str) == slide.sample_id
3199
- if persist_hierarchical_embeddings:
3200
- feature_status = "success" if slide.sample_id in hierarchical_success_ids else "error"
3201
- elif persist_tile_embeddings:
3202
- feature_status = "success" if slide.sample_id in tile_success_ids else "error"
3203
- else:
3204
- feature_status = "success"
3240
+ feature_status = "success" if slide.sample_id in feature_success_ids else "error"
3205
3241
  df.loc[mask, "feature_status"] = feature_status
3206
3242
  mapped_feature_path = feature_path_by_sample_id.get(slide.sample_id)
3207
3243
  if mapped_feature_path is not None:
3208
3244
  df.loc[mask, "feature_path"] = mapped_feature_path
3245
+ df.loc[mask, "encoder_name"] = encoder_name
3246
+ df.loc[mask, "output_variant"] = output_variant
3247
+ df.loc[mask, "feature_kind"] = feature_kind
3209
3248
  if include_slide_embeddings:
3210
3249
  df.loc[mask, "aggregation_status"] = (
3211
3250
  "success" if slide.sample_id in slide_success_ids else "error"
@@ -53,6 +53,9 @@ BASE_EMBEDDING_ORDERED_COLUMNS = (
53
53
  "tiling_preview_path",
54
54
  "feature_status",
55
55
  "feature_path",
56
+ "encoder_name",
57
+ "output_variant",
58
+ "feature_kind",
56
59
  "error",
57
60
  "traceback",
58
61
  )
@@ -157,6 +160,12 @@ def load_embedding_process_df(
157
160
  df["feature_status"] = ["tbp"] * len(df)
158
161
  if "feature_path" not in df.columns:
159
162
  df["feature_path"] = [None] * len(df)
163
+ if "encoder_name" not in df.columns:
164
+ df["encoder_name"] = [None] * len(df)
165
+ if "output_variant" not in df.columns:
166
+ df["output_variant"] = [None] * len(df)
167
+ if "feature_kind" not in df.columns:
168
+ df["feature_kind"] = [None] * len(df)
160
169
  if include_aggregation_status and "aggregation_status" not in df.columns:
161
170
  df["aggregation_status"] = ["tbp"] * len(df)
162
171
  ordered_columns = list(BASE_EMBEDDING_ORDERED_COLUMNS)
@@ -90,7 +90,8 @@ def cpu_worker_limit() -> int:
90
90
  """Return the largest safe worker count for CPU-bound tiling work."""
91
91
  cpu_count = os.cpu_count() or 1
92
92
  slurm_limit = slurm_cpu_limit()
93
- return min(cpu_count, slurm_limit) if slurm_limit is not None else cpu_count
93
+ available = min(cpu_count, slurm_limit) if slurm_limit is not None else cpu_count
94
+ return min(available, 64)
94
95
 
95
96
 
96
97
  def _parse_positive_cpu_value(value: str) -> int | None:
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.4
2
2
  Name: slide2vec
3
- Version: 4.0.2
3
+ Version: 4.0.3
4
4
  Summary: Embedding of whole slide images with Foundation Models
5
5
  Author-email: Clément Grisi <clement.grisi@radboudumc.nl>
6
6
  License-Expression: Apache-2.0
@@ -133,12 +133,18 @@ def test_load_embedding_process_df_accepts_hs2p_process_list_columns(tmp_path: P
133
133
  "tiling_preview_path",
134
134
  "feature_status",
135
135
  "feature_path",
136
+ "encoder_name",
137
+ "output_variant",
138
+ "feature_kind",
136
139
  "aggregation_status",
137
140
  "error",
138
141
  "traceback",
139
142
  ]
140
143
  assert df.loc[0, "feature_status"] == "tbp"
141
144
  assert pd.isna(df.loc[0, "feature_path"])
145
+ assert pd.isna(df.loc[0, "encoder_name"])
146
+ assert pd.isna(df.loc[0, "output_variant"])
147
+ assert pd.isna(df.loc[0, "feature_kind"])
142
148
 
143
149
 
144
150
  def test_load_tiling_process_df_rejects_legacy_mask_columns(tmp_path: Path):
@@ -60,8 +60,7 @@ MODEL_PARAMS = dict(
60
60
  # -- speed --
61
61
  SPEED_PARAMS = dict(
62
62
  precision="fp16", # override (default: fp32)
63
- num_workers=4, # override (default: 8)
64
- num_workers_embedding=4, # override (default: 8)
63
+ num_dataloader_workers=0, # keep the Prism subprocess path single-process to avoid worker SHM pressure
65
64
  )
66
65
 
67
66
  # ---------------------------------------------------------------------------
@@ -310,9 +310,45 @@ def test_execution_options_defaults_preprocessing_workers_to_cpu_budget(monkeypa
310
310
 
311
311
  assert api.ExecutionOptions().num_preprocessing_workers == 24
312
312
 
313
+ def test_execution_options_preserves_explicit_dataloader_workers(monkeypatch):
314
+ import slide2vec.api as api
315
+
316
+ monkeypatch.setattr(api, "cpu_worker_limit", lambda: 2)
317
+ monkeypatch.setattr(api, "slurm_cpu_limit", lambda: 2)
318
+
319
+ execution = api.ExecutionOptions(num_workers=3)
320
+
321
+ assert execution.num_workers == 3
322
+ assert execution.num_preprocessing_workers == 2
323
+
324
+ def test_cpu_worker_limit_caps_large_cpu_budget_to_sixty_four(monkeypatch):
325
+ import slide2vec.utils.utils as utils
326
+
327
+ monkeypatch.setattr(utils.os, "cpu_count", lambda: 128)
328
+ monkeypatch.setattr(utils, "slurm_cpu_limit", lambda: 96)
329
+
330
+ assert utils.cpu_worker_limit() == 64
331
+
313
332
  def test_execution_options_default_batch_size_is_one():
314
333
  assert ExecutionOptions().batch_size == 1
315
334
 
335
+ def test_execution_options_default_num_workers_is_auto():
336
+ assert ExecutionOptions().num_workers is None
337
+
338
+ def test_execution_options_logs_resolved_auto_num_workers(monkeypatch, caplog):
339
+ import slide2vec.api as api
340
+
341
+ monkeypatch.setattr(api, "cpu_worker_limit", lambda: 18)
342
+ monkeypatch.setattr(api, "slurm_cpu_limit", lambda: 18)
343
+ monkeypatch.setattr(api.os, "cpu_count", lambda: 64)
344
+
345
+ with caplog.at_level("INFO"):
346
+ execution = api.ExecutionOptions()
347
+
348
+ assert execution.num_workers is None
349
+ assert "ExecutionOptions: num_workers=18 (requested=auto)" in caplog.text
350
+ assert "num_workers=auto" not in caplog.text
351
+
316
352
  def test_execution_options_from_config_maps_cli_fields(tmp_path: Path):
317
353
  cfg = SimpleNamespace(
318
354
  output_dir=str(tmp_path),
@@ -368,6 +404,24 @@ def test_execution_options_from_config_defaults_preprocessing_workers_to_cpu_bud
368
404
 
369
405
  assert execution.num_preprocessing_workers == 18
370
406
 
407
+ def test_execution_options_from_config_preserves_auto_num_workers(tmp_path: Path):
408
+ cfg = SimpleNamespace(
409
+ output_dir=str(tmp_path),
410
+ model=SimpleNamespace(batch_size=4, save_tile_embeddings=False, save_latents=False),
411
+ speed=SimpleNamespace(
412
+ precision="fp16",
413
+ num_dataloader_workers=None,
414
+ num_preprocessing_workers=None,
415
+ num_gpus=3,
416
+ prefetch_factor_embedding=5,
417
+ persistent_workers_embedding=False,
418
+ ),
419
+ )
420
+
421
+ execution = ExecutionOptions.from_config(cfg)
422
+
423
+ assert execution.num_workers is None
424
+
371
425
  def test_execution_options_from_config_defaults_to_all_available_gpus_when_unset(monkeypatch, tmp_path: Path):
372
426
  import slide2vec.api as api
373
427
 
@@ -292,9 +292,18 @@ def test_has_complete_local_embedding_outputs_uses_hierarchical_artifacts_for_hi
292
292
  )
293
293
 
294
294
 
295
- @pytest.mark.parametrize("persist_hierarchical_embeddings", [False, True])
296
- def test_update_process_list_after_embedding_writes_feature_path(
295
+ @pytest.mark.parametrize(
296
+ ("persist_hierarchical_embeddings", "include_slide_embeddings", "expected_feature_kind"),
297
+ [
298
+ (False, False, "tile"),
299
+ (True, False, "hierarchical"),
300
+ (False, True, "slide"),
301
+ ],
302
+ )
303
+ def test_update_process_list_after_embedding_writes_feature_provenance(
297
304
  persist_hierarchical_embeddings: bool,
305
+ include_slide_embeddings: bool,
306
+ expected_feature_kind: str,
298
307
  tmp_path: Path,
299
308
  ):
300
309
  import slide2vec.inference as inference
@@ -306,7 +315,19 @@ def test_update_process_list_after_embedding_writes_feature_path(
306
315
  "slide-a,/tmp/slide-a.svs,,asap,asap,,success,1,/tmp/slide-a.coordinates.npz,/tmp/slide-a.coordinates.meta.json,tbp,,\n",
307
316
  encoding="utf-8",
308
317
  )
309
- if persist_hierarchical_embeddings:
318
+ slide_artifacts = []
319
+ if include_slide_embeddings:
320
+ artifact = write_slide_embeddings(
321
+ "slide-a",
322
+ np.zeros((4,), dtype=np.float32),
323
+ output_dir=tmp_path,
324
+ output_format="pt",
325
+ metadata={"image_path": "/tmp/slide-a.svs"},
326
+ )
327
+ tile_artifacts = []
328
+ hierarchical_artifacts = []
329
+ slide_artifacts = [artifact]
330
+ elif persist_hierarchical_embeddings:
310
331
  artifact = write_hierarchical_embeddings(
311
332
  "slide-a",
312
333
  np.zeros((1, 2, 4), dtype=np.float32),
@@ -332,15 +353,20 @@ def test_update_process_list_after_embedding_writes_feature_path(
332
353
  successful_slides=[slide],
333
354
  persist_tile_embeddings=not persist_hierarchical_embeddings,
334
355
  persist_hierarchical_embeddings=persist_hierarchical_embeddings,
335
- include_slide_embeddings=False,
356
+ include_slide_embeddings=include_slide_embeddings,
357
+ encoder_name="virchow2" if not include_slide_embeddings else "prism",
358
+ output_variant="cls" if not include_slide_embeddings else "default",
336
359
  tile_artifacts=tile_artifacts,
337
360
  hierarchical_artifacts=hierarchical_artifacts,
338
- slide_artifacts=[],
361
+ slide_artifacts=slide_artifacts,
339
362
  )
340
363
 
341
364
  recorded = pd.read_csv(process_list_path).set_index("sample_id")
342
365
  assert recorded.loc["slide-a", "feature_status"] == "success"
343
366
  assert recorded.loc["slide-a", "feature_path"] == str(artifact.path)
367
+ assert recorded.loc["slide-a", "encoder_name"] == ("virchow2" if not include_slide_embeddings else "prism")
368
+ assert recorded.loc["slide-a", "output_variant"] == ("cls" if not include_slide_embeddings else "default")
369
+ assert recorded.loc["slide-a", "feature_kind"] == expected_feature_kind
344
370
 
345
371
 
346
372
  def test_model_embed_slide_updates_process_list_feature_status_and_path_in_distributed_path(
@@ -2033,6 +2059,128 @@ def test_serialize_execution_preserves_loader_optimization_fields():
2033
2059
  assert restored.precision == "bf16"
2034
2060
 
2035
2061
 
2062
+ def test_deserialize_execution_defaults_num_workers_to_auto():
2063
+ import slide2vec.inference as inference
2064
+
2065
+ restored = inference.deserialize_execution({"batch_size": 4, "num_gpus": 1})
2066
+
2067
+ assert restored.num_workers is None
2068
+
2069
+
2070
+ def test_deserialize_execution_preserves_auto_num_workers():
2071
+ import slide2vec.inference as inference
2072
+
2073
+ restored = inference.deserialize_execution({"batch_size": 4, "num_workers": None, "num_gpus": 1})
2074
+
2075
+ assert restored.num_workers is None
2076
+
2077
+
2078
+ def test_embedding_dataloader_kwargs_resolve_auto_mode_to_cpu_budget(monkeypatch):
2079
+ import slide2vec.api as api
2080
+ import slide2vec.inference as inference
2081
+ torch = pytest.importorskip("torch")
2082
+
2083
+ monkeypatch.setattr(api, "cpu_worker_limit", lambda: 24)
2084
+
2085
+ loaded = inference.LoadedModel(
2086
+ name="test",
2087
+ level="tile",
2088
+ model=object(),
2089
+ transforms=object(),
2090
+ feature_dim=3,
2091
+ device=torch.device("cpu"),
2092
+ )
2093
+
2094
+ kwargs = inference._embedding_dataloader_kwargs(
2095
+ loaded,
2096
+ ExecutionOptions(num_workers=None, num_gpus=1),
2097
+ )
2098
+
2099
+ assert kwargs["num_workers"] == 24
2100
+ assert kwargs["persistent_workers"] is True
2101
+ assert kwargs["prefetch_factor"] == 4
2102
+
2103
+
2104
+ def test_compute_tile_embeddings_for_slide_uses_cpu_budget_for_auto_workers_on_non_cucim_on_the_fly(monkeypatch):
2105
+ import slide2vec.api as api
2106
+ import slide2vec.inference as inference
2107
+ torch = pytest.importorskip("torch")
2108
+
2109
+ captured = {}
2110
+
2111
+ class DummyLoader:
2112
+ def __init__(self, dataset, **kwargs):
2113
+ captured["kwargs"] = kwargs
2114
+
2115
+ def __iter__(self):
2116
+ yield (
2117
+ torch.tensor([0, 1], dtype=torch.long),
2118
+ torch.zeros((2, 3, 4, 4), dtype=torch.uint8),
2119
+ {"worker_batch_ms": 0.0, "reader_open_ms": 0.0, "reader_read_ms": 0.0},
2120
+ )
2121
+
2122
+ def __len__(self):
2123
+ return 1
2124
+
2125
+ class DummyEncoder:
2126
+ pretrained_cfg = {}
2127
+
2128
+ class DummyModel:
2129
+ encoder = DummyEncoder()
2130
+
2131
+ def encode_tiles(self, image):
2132
+ return torch.ones((image.shape[0], 3), dtype=torch.float32, device=image.device)
2133
+
2134
+ class DummyCollator:
2135
+ ordered_indices = None
2136
+
2137
+ def __init__(self, **kwargs):
2138
+ captured["wsd_collator_kwargs"] = kwargs
2139
+
2140
+ def __call__(self, batch_indices):
2141
+ tile_indices = torch.as_tensor(batch_indices, dtype=torch.long)
2142
+ batch = torch.zeros((len(batch_indices), 3, 4, 4), dtype=torch.uint8)
2143
+ return tile_indices, batch, {"worker_batch_ms": 0.0, "reader_open_ms": 0.0, "reader_read_ms": 0.0}
2144
+
2145
+ monkeypatch.setattr(inference, "OnTheFlyBatchTileCollator", DummyCollator)
2146
+ monkeypatch.setattr(torch.utils.data, "DataLoader", DummyLoader)
2147
+ monkeypatch.setattr(inference, "_build_batch_preprocessor", lambda *args, **kwargs: lambda batch: batch.float())
2148
+ monkeypatch.setattr(api, "cpu_worker_limit", lambda: 24)
2149
+
2150
+ loaded = inference.LoadedModel(
2151
+ name="prov-gigapath",
2152
+ level="tile",
2153
+ model=DummyModel(),
2154
+ transforms=object(),
2155
+ feature_dim=3,
2156
+ device=torch.device("cpu"),
2157
+ )
2158
+
2159
+ result = inference._compute_tile_embeddings_for_slide(
2160
+ loaded,
2161
+ SimpleNamespace(level="tile"),
2162
+ make_slide("slide-a"),
2163
+ SimpleNamespace(
2164
+ x=np.array([0, 10]),
2165
+ y=np.array([5, 15]),
2166
+ backend="asap",
2167
+ target_spacing_um=0.5,
2168
+ target_tile_size_px=4,
2169
+ read_spacing_um=0.5,
2170
+ read_tile_size_px=4,
2171
+ tile_size_lv0=224,
2172
+ ),
2173
+ preprocessing=replace(DEFAULT_PREPROCESSING, on_the_fly=True, backend="auto", num_cucim_workers=4),
2174
+ execution=ExecutionOptions(batch_size=2, num_workers=None, num_gpus=1),
2175
+ )
2176
+
2177
+ assert result.shape == (2, 3)
2178
+ assert captured["kwargs"]["num_workers"] == 24
2179
+ assert captured["kwargs"]["persistent_workers"] is True
2180
+ assert captured["kwargs"]["prefetch_factor"] == 4
2181
+ assert captured["wsd_collator_kwargs"]["backend"] == "asap"
2182
+
2183
+
2036
2184
  def test_compute_tile_embeddings_for_slide_uses_batched_loader_knobs(monkeypatch):
2037
2185
  import slide2vec.inference as inference
2038
2186
  torch = pytest.importorskip("torch")
@@ -2541,6 +2689,9 @@ def test_compute_tile_embeddings_for_slide_uses_resolved_wsd_backend_when_auto(m
2541
2689
  )
2542
2690
 
2543
2691
  assert result.shape == (2, 3)
2692
+ assert captured["kwargs"]["num_workers"] == 8
2693
+ assert captured["kwargs"]["persistent_workers"] is True
2694
+ assert captured["kwargs"]["prefetch_factor"] == 4
2544
2695
  assert captured["wsd_collator_kwargs"]["backend"] == "asap"
2545
2696
 
2546
2697
 
File without changes
File without changes
File without changes
File without changes
File without changes