euler-preprocess 2.1.0__tar.gz → 2.2.0__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (48) hide show
  1. {euler_preprocess-2.1.0 → euler_preprocess-2.2.0}/PKG-INFO +86 -1
  2. {euler_preprocess-2.1.0 → euler_preprocess-2.2.0}/README.md +85 -0
  3. {euler_preprocess-2.1.0 → euler_preprocess-2.2.0}/euler_preprocess/cli.py +131 -2
  4. {euler_preprocess-2.1.0 → euler_preprocess-2.2.0}/euler_preprocess/common/output.py +164 -8
  5. euler_preprocess-2.2.0/euler_preprocess/fog/augmentations.py +318 -0
  6. {euler_preprocess-2.1.0 → euler_preprocess-2.2.0}/euler_preprocess/fog/models.py +35 -6
  7. {euler_preprocess-2.1.0 → euler_preprocess-2.2.0}/euler_preprocess/fog/transform.py +467 -65
  8. {euler_preprocess-2.1.0 → euler_preprocess-2.2.0}/euler_preprocess.egg-info/PKG-INFO +86 -1
  9. {euler_preprocess-2.1.0 → euler_preprocess-2.2.0}/euler_preprocess.egg-info/SOURCES.txt +2 -0
  10. {euler_preprocess-2.1.0 → euler_preprocess-2.2.0}/pyproject.toml +1 -1
  11. euler_preprocess-2.2.0/tests/test_cli_sample_selection.py +127 -0
  12. {euler_preprocess-2.1.0 → euler_preprocess-2.2.0}/tests/test_fog_aux_outputs.py +110 -0
  13. {euler_preprocess-2.1.0 → euler_preprocess-2.2.0}/euler_preprocess/__init__.py +0 -0
  14. {euler_preprocess-2.1.0 → euler_preprocess-2.2.0}/euler_preprocess/common/__init__.py +0 -0
  15. {euler_preprocess-2.1.0 → euler_preprocess-2.2.0}/euler_preprocess/common/dataset.py +0 -0
  16. {euler_preprocess-2.1.0 → euler_preprocess-2.2.0}/euler_preprocess/common/device.py +0 -0
  17. {euler_preprocess-2.1.0 → euler_preprocess-2.2.0}/euler_preprocess/common/intrinsics.py +0 -0
  18. {euler_preprocess-2.1.0 → euler_preprocess-2.2.0}/euler_preprocess/common/io.py +0 -0
  19. {euler_preprocess-2.1.0 → euler_preprocess-2.2.0}/euler_preprocess/common/logging.py +0 -0
  20. {euler_preprocess-2.1.0 → euler_preprocess-2.2.0}/euler_preprocess/common/noise.py +0 -0
  21. {euler_preprocess-2.1.0 → euler_preprocess-2.2.0}/euler_preprocess/common/normalize.py +0 -0
  22. {euler_preprocess-2.1.0 → euler_preprocess-2.2.0}/euler_preprocess/common/sampling.py +0 -0
  23. {euler_preprocess-2.1.0 → euler_preprocess-2.2.0}/euler_preprocess/common/transform.py +0 -0
  24. {euler_preprocess-2.1.0 → euler_preprocess-2.2.0}/euler_preprocess/fog/__init__.py +0 -0
  25. {euler_preprocess-2.1.0 → euler_preprocess-2.2.0}/euler_preprocess/fog/airlight_from_sky.py +0 -0
  26. {euler_preprocess-2.1.0 → euler_preprocess-2.2.0}/euler_preprocess/fog/dcp_airlight.py +0 -0
  27. {euler_preprocess-2.1.0 → euler_preprocess-2.2.0}/euler_preprocess/fog/dcp_airlight_torch.py +0 -0
  28. {euler_preprocess-2.1.0 → euler_preprocess-2.2.0}/euler_preprocess/fog/dcp_heuristic_airlight.py +0 -0
  29. {euler_preprocess-2.1.0 → euler_preprocess-2.2.0}/euler_preprocess/fog/dcp_heuristic_airlight_torch.py +0 -0
  30. {euler_preprocess-2.1.0 → euler_preprocess-2.2.0}/euler_preprocess/fog/foggify.py +0 -0
  31. {euler_preprocess-2.1.0 → euler_preprocess-2.2.0}/euler_preprocess/fog/foggify_logging.py +0 -0
  32. {euler_preprocess-2.1.0 → euler_preprocess-2.2.0}/euler_preprocess/fog/logging.py +0 -0
  33. {euler_preprocess-2.1.0 → euler_preprocess-2.2.0}/euler_preprocess/radial/__init__.py +0 -0
  34. {euler_preprocess-2.1.0 → euler_preprocess-2.2.0}/euler_preprocess/radial/transform.py +0 -0
  35. {euler_preprocess-2.1.0 → euler_preprocess-2.2.0}/euler_preprocess/sky_depth/__init__.py +0 -0
  36. {euler_preprocess-2.1.0 → euler_preprocess-2.2.0}/euler_preprocess/sky_depth/transform.py +0 -0
  37. {euler_preprocess-2.1.0 → euler_preprocess-2.2.0}/euler_preprocess.egg-info/dependency_links.txt +0 -0
  38. {euler_preprocess-2.1.0 → euler_preprocess-2.2.0}/euler_preprocess.egg-info/entry_points.txt +0 -0
  39. {euler_preprocess-2.1.0 → euler_preprocess-2.2.0}/euler_preprocess.egg-info/requires.txt +0 -0
  40. {euler_preprocess-2.1.0 → euler_preprocess-2.2.0}/euler_preprocess.egg-info/top_level.txt +0 -0
  41. {euler_preprocess-2.1.0 → euler_preprocess-2.2.0}/setup.cfg +0 -0
  42. {euler_preprocess-2.1.0 → euler_preprocess-2.2.0}/tests/test_airlight_fallback.py +0 -0
  43. {euler_preprocess-2.1.0 → euler_preprocess-2.2.0}/tests/test_dcp_heuristic_airlight.py +0 -0
  44. {euler_preprocess-2.1.0 → euler_preprocess-2.2.0}/tests/test_foggify_integration.py +0 -0
  45. {euler_preprocess-2.1.0 → euler_preprocess-2.2.0}/tests/test_radial.py +0 -0
  46. {euler_preprocess-2.1.0 → euler_preprocess-2.2.0}/tests/test_sky_depth.py +0 -0
  47. {euler_preprocess-2.1.0 → euler_preprocess-2.2.0}/tests/test_source_backed_output.py +0 -0
  48. {euler_preprocess-2.1.0 → euler_preprocess-2.2.0}/tests/test_zip_output.py +0 -0
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.4
2
2
  Name: euler-preprocess
3
- Version: 2.1.0
3
+ Version: 2.2.0
4
4
  Summary: Physics-based preprocessing (fog, etc.) for RGB+depth datasets
5
5
  Requires-Python: >=3.9
6
6
  Description-Content-Type: text/markdown
@@ -49,6 +49,7 @@ Every subcommand takes a **dataset config** JSON that points to the input data a
49
49
  "transform_config_path": "configs/run1.json",
50
50
  "output_path": "/path/to/output",
51
51
  "output_slot": "rgb",
52
+ "sample": 42,
52
53
  "modalities": {
53
54
  "rgb": {"path": "/path/to/rgb", "split": "train"},
54
55
  "depth": "/path/to/depth",
@@ -78,6 +79,8 @@ Every subcommand takes a **dataset config** JSON that points to the input data a
78
79
  | `transform_config_path` | Path to the transform-specific config (see below). `fog_config_path` is also accepted for backward compatibility. |
79
80
  | `output_path` | Output root used when no pipeline target overrides it. Optional if `pipeline.output_root` or `pipeline.output_targets[].path` supplies the destination. |
80
81
  | `output_slot` | Optional slot selector when `pipeline.output_targets` contains multiple entries. Defaults to `rgb` for `fog`, `depth` for `sky-depth`, and `depth` for `radial`. |
82
+ | `sample` | Optional 0-based euler-loading dataset index. When set, only `dataset[sample]` is transformed, which is useful for small augmented benchmark slices from large datasets. |
83
+ | `samples` | Optional multi-sample selector. Use a list of 0-based indices (`[0, 10, 20]`) or a slice object such as `{"start": 0, "stop": 1000, "step": 2, "count": 100}`. `stop` is exclusive; `count` caps the selected indices after slicing. Do not set both `sample` and `samples`. |
81
84
  | `modalities` | Regular modalities that participate in sample-ID intersection. Each value is either a plain path string or an object with a `path` key and an optional `split` key (see below). Which modalities are required depends on the transform (see table below). |
82
85
  | `hierarchical_modalities` | Per-scene data (e.g. intrinsics). Same format as `modalities`. Loaded once per scene and cached. |
83
86
  | `pipeline` | Optional runtime routing block compatible with `euler-inference` (`output_root`, `outputs_manifest_path`, `output_targets`). |
@@ -86,6 +89,11 @@ Every subcommand takes a **dataset config** JSON that points to the input data a
86
89
 
87
90
  When a modality directory contains [ds-crawler](https://github.com/d-rothen/ds-crawler) split files (`.ds_crawler/split_<name>.json`), you can select a subset of the data by setting the `split` key on that modality. Sample IDs are matched by intersection across all modalities, so specifying a split on a single modality is sufficient to restrict the entire dataset.
88
91
 
92
+ For quick slices after euler-loading has matched modalities, set `samples`.
93
+ For example, `{"samples": {"step": 2}}` processes every second matched sample,
94
+ and `{"samples": {"start": 10, "step": 5, "count": 20}}` processes 20 samples
95
+ starting at index 10 with stride 5.
96
+
89
97
  **Required modalities per transform:**
90
98
 
91
99
  | Transform | `modalities` | `hierarchical_modalities` |
@@ -142,6 +150,7 @@ Controls the fog simulation.
142
150
  "contrast_threshold": 0.05,
143
151
  "device": "cpu",
144
152
  "gpu_batch_size": 4,
153
+ "augmentations": { ... },
145
154
  "selection": { ... },
146
155
  "models": { ... }
147
156
  }
@@ -156,6 +165,7 @@ Controls the fog simulation.
156
165
  | `contrast_threshold` | Threshold *C_t* used in the visibility-to-attenuation conversion (default `0.05`). |
157
166
  | `device` | `"cpu"`, `"cuda"`, `"mps"`, or `"gpu"` (alias for cuda). |
158
167
  | `gpu_batch_size` | Batch size when running on GPU. Uniform-model samples are batched; heterogeneous samples are processed individually. |
168
+ | `augmentations` | Optional stepped augmentation set. When present, every input sample produces every configured augmentation and uses the file-id hierarchy output layout described below. |
159
169
 
160
170
  ### Fog Model
161
171
 
@@ -256,6 +266,65 @@ Each model specifies a `visibility_m` distribution from which a visibility dista
256
266
 
257
267
  The sampled visibility *V* is converted to the attenuation coefficient: **k = -ln(C_t) / V**.
258
268
 
269
+ ### Stepped Augmentations
270
+
271
+ For benchmark generation, set `augmentations` in the fog config. This switches
272
+ the fog transform from one sampled output per input to one output per configured
273
+ variant:
274
+
275
+ ```json
276
+ {
277
+ "airlight": "from_sky",
278
+ "seed": 1337,
279
+ "contrast_threshold": 0.05,
280
+ "augmentations": {
281
+ "file_id_hierarchy_name": "file_id",
282
+ "attribute_key": "fog_augmentation",
283
+ "models": ["uniform"],
284
+ "visibility_m": [10, 20, 40, 70, 100],
285
+ "airlight_methods": ["from_sky"]
286
+ }
287
+ }
288
+ ```
289
+
290
+ The matrix form above expands as the Cartesian product of `models`,
291
+ `visibility_m` (MOR in metres), optional `scattering_coefficients` / `beta`, and
292
+ airlight choices. `file_id_hierarchy_name` names the inserted hierarchy level
293
+ when the underlying ds-crawler writer has a hierarchy separator; the directory
294
+ name is the source file id in either case. For tighter control, use explicit
295
+ variants:
296
+
297
+ ```json
298
+ "augmentations": {
299
+ "variants": [
300
+ {
301
+ "id": "mor_010m_sky",
302
+ "model": "uniform",
303
+ "visibility_m": 10,
304
+ "airlight_method": "from_sky"
305
+ },
306
+ {
307
+ "id": "beta_0.15_white",
308
+ "model": "heterogeneous_k",
309
+ "scattering_coefficient": 0.15,
310
+ "atmospheric_light": [1.0, 1.0, 1.0],
311
+ "k_hetero": {
312
+ "scales": "auto",
313
+ "min_factor": 0.5,
314
+ "max_factor": 1.5,
315
+ "normalize_to_mean": true
316
+ }
317
+ }
318
+ ]
319
+ }
320
+ ```
321
+
322
+ Each output entry receives per-file ds-crawler attributes under
323
+ `fog_augmentation`, including the augmentation id, source id, source full id,
324
+ model, actual scattering coefficient, actual atmospheric light, and configured
325
+ MOR/beta descriptors when available. euler-loading exposes these as
326
+ `sample["attributes"]["rgb"]["fog_augmentation"]`.
327
+
259
328
  ### Heterogeneous Noise Fields
260
329
 
261
330
  Both `k_hetero` and `ls_hetero` use Perlin FBM (fractional Brownian motion) to generate spatially-varying factor fields:
@@ -297,6 +366,22 @@ When a pipeline target is present, `pipeline.output_targets[].path` replaces
297
366
  `output_path` entirely. Standalone/direct `FogTransform(...)` usage without the
298
367
  CLI still uses the legacy per-model layout with `config.json` sidecars.
299
368
 
369
+ With `augmentations` enabled, source-backed outputs are written one level below
370
+ the source file id instead:
371
+
372
+ ```
373
+ <output_path>/
374
+ .ds_crawler/output.json
375
+ Scene01/
376
+ Camera_0/
377
+ 00000/
378
+ mor_10m_airlight_from_sky.png
379
+ mor_20m_airlight_from_sky.png
380
+ ```
381
+
382
+ Auxiliary `scattering_coefficient` and `atmospheric_light` pipeline targets use
383
+ the same file-id hierarchy and write matching `.npy` augmentation files.
384
+
300
385
  ---
301
386
 
302
387
  ## Sky-Depth Transform
@@ -35,6 +35,7 @@ Every subcommand takes a **dataset config** JSON that points to the input data a
35
35
  "transform_config_path": "configs/run1.json",
36
36
  "output_path": "/path/to/output",
37
37
  "output_slot": "rgb",
38
+ "sample": 42,
38
39
  "modalities": {
39
40
  "rgb": {"path": "/path/to/rgb", "split": "train"},
40
41
  "depth": "/path/to/depth",
@@ -64,6 +65,8 @@ Every subcommand takes a **dataset config** JSON that points to the input data a
64
65
  | `transform_config_path` | Path to the transform-specific config (see below). `fog_config_path` is also accepted for backward compatibility. |
65
66
  | `output_path` | Output root used when no pipeline target overrides it. Optional if `pipeline.output_root` or `pipeline.output_targets[].path` supplies the destination. |
66
67
  | `output_slot` | Optional slot selector when `pipeline.output_targets` contains multiple entries. Defaults to `rgb` for `fog`, `depth` for `sky-depth`, and `depth` for `radial`. |
68
+ | `sample` | Optional 0-based euler-loading dataset index. When set, only `dataset[sample]` is transformed, which is useful for small augmented benchmark slices from large datasets. |
69
+ | `samples` | Optional multi-sample selector. Use a list of 0-based indices (`[0, 10, 20]`) or a slice object such as `{"start": 0, "stop": 1000, "step": 2, "count": 100}`. `stop` is exclusive; `count` caps the selected indices after slicing. Do not set both `sample` and `samples`. |
67
70
  | `modalities` | Regular modalities that participate in sample-ID intersection. Each value is either a plain path string or an object with a `path` key and an optional `split` key (see below). Which modalities are required depends on the transform (see table below). |
68
71
  | `hierarchical_modalities` | Per-scene data (e.g. intrinsics). Same format as `modalities`. Loaded once per scene and cached. |
69
72
  | `pipeline` | Optional runtime routing block compatible with `euler-inference` (`output_root`, `outputs_manifest_path`, `output_targets`). |
@@ -72,6 +75,11 @@ Every subcommand takes a **dataset config** JSON that points to the input data a
72
75
 
73
76
  When a modality directory contains [ds-crawler](https://github.com/d-rothen/ds-crawler) split files (`.ds_crawler/split_<name>.json`), you can select a subset of the data by setting the `split` key on that modality. Sample IDs are matched by intersection across all modalities, so specifying a split on a single modality is sufficient to restrict the entire dataset.
74
77
 
78
+ For quick slices after euler-loading has matched modalities, set `samples`.
79
+ For example, `{"samples": {"step": 2}}` processes every second matched sample,
80
+ and `{"samples": {"start": 10, "step": 5, "count": 20}}` processes 20 samples
81
+ starting at index 10 with stride 5.
82
+
75
83
  **Required modalities per transform:**
76
84
 
77
85
  | Transform | `modalities` | `hierarchical_modalities` |
@@ -128,6 +136,7 @@ Controls the fog simulation.
128
136
  "contrast_threshold": 0.05,
129
137
  "device": "cpu",
130
138
  "gpu_batch_size": 4,
139
+ "augmentations": { ... },
131
140
  "selection": { ... },
132
141
  "models": { ... }
133
142
  }
@@ -142,6 +151,7 @@ Controls the fog simulation.
142
151
  | `contrast_threshold` | Threshold *C_t* used in the visibility-to-attenuation conversion (default `0.05`). |
143
152
  | `device` | `"cpu"`, `"cuda"`, `"mps"`, or `"gpu"` (alias for cuda). |
144
153
  | `gpu_batch_size` | Batch size when running on GPU. Uniform-model samples are batched; heterogeneous samples are processed individually. |
154
+ | `augmentations` | Optional stepped augmentation set. When present, every input sample produces every configured augmentation and uses the file-id hierarchy output layout described below. |
145
155
 
146
156
  ### Fog Model
147
157
 
@@ -242,6 +252,65 @@ Each model specifies a `visibility_m` distribution from which a visibility dista
242
252
 
243
253
  The sampled visibility *V* is converted to the attenuation coefficient: **k = -ln(C_t) / V**.
244
254
 
255
+ ### Stepped Augmentations
256
+
257
+ For benchmark generation, set `augmentations` in the fog config. This switches
258
+ the fog transform from one sampled output per input to one output per configured
259
+ variant:
260
+
261
+ ```json
262
+ {
263
+ "airlight": "from_sky",
264
+ "seed": 1337,
265
+ "contrast_threshold": 0.05,
266
+ "augmentations": {
267
+ "file_id_hierarchy_name": "file_id",
268
+ "attribute_key": "fog_augmentation",
269
+ "models": ["uniform"],
270
+ "visibility_m": [10, 20, 40, 70, 100],
271
+ "airlight_methods": ["from_sky"]
272
+ }
273
+ }
274
+ ```
275
+
276
+ The matrix form above expands as the Cartesian product of `models`,
277
+ `visibility_m` (MOR in metres), optional `scattering_coefficients` / `beta`, and
278
+ airlight choices. `file_id_hierarchy_name` names the inserted hierarchy level
279
+ when the underlying ds-crawler writer has a hierarchy separator; the directory
280
+ name is the source file id in either case. For tighter control, use explicit
281
+ variants:
282
+
283
+ ```json
284
+ "augmentations": {
285
+ "variants": [
286
+ {
287
+ "id": "mor_010m_sky",
288
+ "model": "uniform",
289
+ "visibility_m": 10,
290
+ "airlight_method": "from_sky"
291
+ },
292
+ {
293
+ "id": "beta_0.15_white",
294
+ "model": "heterogeneous_k",
295
+ "scattering_coefficient": 0.15,
296
+ "atmospheric_light": [1.0, 1.0, 1.0],
297
+ "k_hetero": {
298
+ "scales": "auto",
299
+ "min_factor": 0.5,
300
+ "max_factor": 1.5,
301
+ "normalize_to_mean": true
302
+ }
303
+ }
304
+ ]
305
+ }
306
+ ```
307
+
308
+ Each output entry receives per-file ds-crawler attributes under
309
+ `fog_augmentation`, including the augmentation id, source id, source full id,
310
+ model, actual scattering coefficient, actual atmospheric light, and configured
311
+ MOR/beta descriptors when available. euler-loading exposes these as
312
+ `sample["attributes"]["rgb"]["fog_augmentation"]`.
313
+
245
314
  ### Heterogeneous Noise Fields
246
315
 
247
316
  Both `k_hetero` and `ls_hetero` use Perlin FBM (fractional Brownian motion) to generate spatially-varying factor fields:
@@ -283,6 +352,22 @@ When a pipeline target is present, `pipeline.output_targets[].path` replaces
283
352
  `output_path` entirely. Standalone/direct `FogTransform(...)` usage without the
284
353
  CLI still uses the legacy per-model layout with `config.json` sidecars.
285
354
 
355
+ With `augmentations` enabled, source-backed outputs are written one level below
356
+ the source file id instead:
357
+
358
+ ```
359
+ <output_path>/
360
+ .ds_crawler/output.json
361
+ Scene01/
362
+ Camera_0/
363
+ 00000/
364
+ mor_10m_airlight_from_sky.png
365
+ mor_20m_airlight_from_sky.png
366
+ ```
367
+
368
+ Auxiliary `scattering_coefficient` and `atmospheric_light` pipeline targets use
369
+ the same file-id hierarchy and write matching `.npy` augmentation files.
370
+
286
371
  ---
287
372
 
288
373
  ## Sky-Depth Transform
@@ -8,7 +8,9 @@ from __future__ import annotations
8
8
  import argparse
9
9
  import inspect
10
10
  import json
11
+ from collections.abc import Iterable, Iterator, Sequence
11
12
  from pathlib import Path
13
+ from typing import Any
12
14
 
13
15
  from euler_preprocess.common.dataset import build_dataset
14
16
  from euler_preprocess.common.logging import get_logger, log_dataset_info
@@ -27,6 +29,132 @@ def _resolve(path_str: str, config_dir: Path) -> Path:
27
29
  return (config_dir / p).resolve()
28
30
 
29
31
 
32
+ class _SelectedSamples(Sequence):
33
+ """Lazy view over selected euler-loading dataset entries."""
34
+
35
+ def __init__(self, dataset, indices: Iterable[int]) -> None:
36
+ self.dataset = dataset
37
+ self.indices = tuple(indices)
38
+
39
+ def __len__(self) -> int:
40
+ return len(self.indices)
41
+
42
+ def __iter__(self) -> Iterator[dict]:
43
+ for index in self.indices:
44
+ yield self.dataset[index]
45
+
46
+ def __getitem__(self, index: int | slice):
47
+ if isinstance(index, slice):
48
+ return [self.dataset[i] for i in self.indices[index]]
49
+ return self.dataset[self.indices[index]]
50
+
51
+
52
+ def _validate_sample_index(value: Any, *, key: str, dataset_size: int) -> int:
53
+ if isinstance(value, bool) or not isinstance(value, int):
54
+ raise ValueError(f"{key} must be a non-negative integer index")
55
+ if value < 0:
56
+ raise ValueError(f"{key} must be a non-negative integer index")
57
+ if value >= dataset_size:
58
+ raise IndexError(
59
+ f"{key} {value} out of range for dataset of length {dataset_size}"
60
+ )
61
+ return value
62
+
63
+
64
+ def _positive_int(value: Any, *, key: str) -> int:
65
+ if isinstance(value, bool) or not isinstance(value, int):
66
+ raise ValueError(f"{key} must be a positive integer")
67
+ if value <= 0:
68
+ raise ValueError(f"{key} must be a positive integer")
69
+ return value
70
+
71
+
72
+ def _non_negative_int(value: Any, *, key: str) -> int:
73
+ if isinstance(value, bool) or not isinstance(value, int):
74
+ raise ValueError(f"{key} must be a non-negative integer")
75
+ if value < 0:
76
+ raise ValueError(f"{key} must be a non-negative integer")
77
+ return value
78
+
79
+
80
def _resolve_sample_indices(selection: Any, *, dataset_size: int) -> tuple[int, ...]:
    """Expand the ``samples`` config value into concrete dataset indices.

    ``selection`` is either a list of 0-based indices or an object with the
    optional keys ``start``, ``stop``, ``step`` and ``count``.  A key that is
    missing or explicitly ``null`` falls back to its default (``start=0``,
    ``stop=dataset_size``, ``step=1``, no ``count`` cap).  ``stop`` is
    exclusive and is clamped to ``dataset_size``; ``count`` caps the number of
    indices kept after slicing.

    Raises:
        ValueError: if ``selection`` is neither a list nor an object, contains
            unknown keys or invalid values, or selects no entries.
        IndexError: if a listed index or ``start`` lies beyond the dataset.
    """
    if isinstance(selection, list):
        indices = tuple(
            _validate_sample_index(value, key="samples[]", dataset_size=dataset_size)
            for value in selection
        )
        if not indices:
            raise ValueError("samples must select at least one dataset entry")
        return indices

    if not isinstance(selection, dict):
        raise ValueError("samples must be an object or a list of integer indices")

    allowed = {"start", "stop", "step", "count"}
    unknown = sorted(set(selection) - allowed)
    if unknown:
        raise ValueError(f"samples contains unknown keys: {', '.join(unknown)}")

    # Treat an explicit null exactly like a missing key for every option, so
    # "stop": null and "start": null behave the same way.
    start_value = selection.get("start")
    start = (
        0
        if start_value is None
        else _non_negative_int(start_value, key="samples.start")
    )
    stop_value = selection.get("stop")
    stop = (
        dataset_size
        if stop_value is None
        else _non_negative_int(stop_value, key="samples.stop")
    )
    step_value = selection.get("step")
    step = 1 if step_value is None else _positive_int(step_value, key="samples.step")

    if start >= dataset_size:
        raise IndexError(
            f"samples.start {start} out of range for dataset of length {dataset_size}"
        )

    # Slice the range object itself so a small count on a huge dataset does
    # not build the full index tuple first.
    window = range(start, min(stop, dataset_size), step)
    count_value = selection.get("count")
    if count_value is not None:
        window = window[: _positive_int(count_value, key="samples.count")]

    indices = tuple(window)
    if not indices:
        raise ValueError("samples must select at least one dataset entry")
    return indices
119
+
120
+
121
+ def _select_configured_samples(config: dict, dataset, logger):
122
+ """Apply optional top-level sample selection from the dataset config."""
123
+ has_sample = "sample" in config
124
+ has_samples = "samples" in config
125
+ if has_sample and has_samples:
126
+ raise ValueError("Use either sample or samples, not both")
127
+ if not has_sample and not has_samples:
128
+ return dataset
129
+
130
+ dataset_size = len(dataset)
131
+ if has_sample:
132
+ sample_index = _validate_sample_index(
133
+ config["sample"],
134
+ key="sample",
135
+ dataset_size=dataset_size,
136
+ )
137
+ sample = dataset[sample_index]
138
+ logger.info(
139
+ "Sample selection: using sample=%d of %d (id=%s, full_id=%s)",
140
+ sample_index,
141
+ dataset_size,
142
+ sample.get("id"),
143
+ sample.get("full_id"),
144
+ )
145
+ return [sample]
146
+
147
+ indices = _resolve_sample_indices(config["samples"], dataset_size=dataset_size)
148
+ logger.info(
149
+ "Sample selection: using %d/%d samples (first_index=%d, last_index=%d)",
150
+ len(indices),
151
+ dataset_size,
152
+ indices[0],
153
+ indices[-1],
154
+ )
155
+ return _SelectedSamples(dataset, indices)
156
+
157
+
30
158
  def _run_transform(args: argparse.Namespace, transform_class: type) -> int:
31
159
  """Shared logic for all subcommands."""
32
160
  logger = get_logger()
@@ -57,6 +185,7 @@ def _run_transform(args: argparse.Namespace, transform_class: type) -> int:
57
185
  dataset = build_dataset(config, required_modalities, required_hierarchical)
58
186
  output_backends = prepare_output_backends(config, dataset, transform_class)
59
187
  primary_backend = next(iter(output_backends.values()))
188
+ samples = _select_configured_samples(config, dataset, logger)
60
189
  dataset_name = config.get("dataset", "dataset")
61
190
 
62
191
  raw_modalities = {
@@ -69,7 +198,7 @@ def _run_transform(args: argparse.Namespace, transform_class: type) -> int:
69
198
  modality_info[name] = {"path": entry}
70
199
  else:
71
200
  modality_info[name] = entry
72
- log_dataset_info(logger, dataset_name, len(dataset), modality_info, use_gpu)
201
+ log_dataset_info(logger, dataset_name, len(samples), modality_info, use_gpu)
73
202
  for slot, backend in output_backends.items():
74
203
  logger.info("Output path [%s]: %s", slot, backend.root)
75
204
 
@@ -98,7 +227,7 @@ def _run_transform(args: argparse.Namespace, transform_class: type) -> int:
98
227
  )
99
228
  transform = transform_class(**transform_kwargs)
100
229
 
101
- saved_paths = transform.run(dataset)
230
+ saved_paths = transform.run(samples)
102
231
 
103
232
  logger.info("Transform complete. Generated %d outputs.", len(saved_paths))
104
233
  return 0