ds_crawler 1.0.1__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (45) hide show
  1. ds_crawler-1.0.1/.github/workflows/workflow.yml +30 -0
  2. ds_crawler-1.0.1/.gitignore +7 -0
  3. ds_crawler-1.0.1/PKG-INFO +10 -0
  4. ds_crawler-1.0.1/README.md +398 -0
  5. ds_crawler-1.0.1/config_example.json +75 -0
  6. ds_crawler-1.0.1/ds_crawler/__init__.py +50 -0
  7. ds_crawler-1.0.1/ds_crawler/cli.py +130 -0
  8. ds_crawler-1.0.1/ds_crawler/config.py +555 -0
  9. ds_crawler-1.0.1/ds_crawler/handlers/__init__.py +36 -0
  10. ds_crawler-1.0.1/ds_crawler/handlers/base.py +30 -0
  11. ds_crawler-1.0.1/ds_crawler/handlers/generic.py +28 -0
  12. ds_crawler-1.0.1/ds_crawler/handlers/zip_handler.py +53 -0
  13. ds_crawler-1.0.1/ds_crawler/operations.py +682 -0
  14. ds_crawler-1.0.1/ds_crawler/parser.py +863 -0
  15. ds_crawler-1.0.1/ds_crawler/path_filters.py +230 -0
  16. ds_crawler-1.0.1/ds_crawler/schema.py +80 -0
  17. ds_crawler-1.0.1/ds_crawler/traversal.py +324 -0
  18. ds_crawler-1.0.1/ds_crawler/validation.py +346 -0
  19. ds_crawler-1.0.1/ds_crawler/writer.py +556 -0
  20. ds_crawler-1.0.1/ds_crawler/zip_utils.py +234 -0
  21. ds_crawler-1.0.1/example-output-rds.json +101 -0
  22. ds_crawler-1.0.1/examples/config.json +82 -0
  23. ds_crawler-1.0.1/examples/example.py +56 -0
  24. ds_crawler-1.0.1/examples/example_output.json +283 -0
  25. ds_crawler-1.0.1/examples/run.sh +31 -0
  26. ds_crawler-1.0.1/examples/sample_realdrivesim.py +97 -0
  27. ds_crawler-1.0.1/meta/__init__.py +0 -0
  28. ds_crawler-1.0.1/meta/build_meta_schema.py +109 -0
  29. ds_crawler-1.0.1/meta/schema.json +67 -0
  30. ds_crawler-1.0.1/package-lock.json +6 -0
  31. ds_crawler-1.0.1/pyproject.toml +21 -0
  32. ds_crawler-1.0.1/split_rds.py +21 -0
  33. ds_crawler-1.0.1/tests/__init__.py +0 -0
  34. ds_crawler-1.0.1/tests/conftest.py +379 -0
  35. ds_crawler-1.0.1/tests/test_align_datasets.py +292 -0
  36. ds_crawler-1.0.1/tests/test_config.py +1001 -0
  37. ds_crawler-1.0.1/tests/test_copy_dataset.py +566 -0
  38. ds_crawler-1.0.1/tests/test_extract_datasets.py +347 -0
  39. ds_crawler-1.0.1/tests/test_handlers.py +183 -0
  40. ds_crawler-1.0.1/tests/test_integration.py +1040 -0
  41. ds_crawler-1.0.1/tests/test_parser.py +1712 -0
  42. ds_crawler-1.0.1/tests/test_split_dataset.py +1129 -0
  43. ds_crawler-1.0.1/tests/test_validation.py +173 -0
  44. ds_crawler-1.0.1/tests/test_writer.py +534 -0
  45. ds_crawler-1.0.1/tests/test_zip.py +1025 -0
@@ -0,0 +1,30 @@
1
+ name: Publish to PyPI
2
+
3
+ on:
4
+ push:
5
+ tags:
6
+ - "v*" # This triggers the workflow only when you push a tag starting with 'v'
7
+
8
+ jobs:
9
+ build-and-publish:
10
+ name: Build and Publish
11
+ runs-on: ubuntu-latest
12
+ environment: pypi # Must match the GitHub deployment environment configured for trusted PyPI publishing
13
+
14
+ permissions:
15
+ id-token: write # CRITICAL: This is what allows OIDC (passwordless) auth to PyPI
16
+ contents: read # Required to check out the code
17
+
18
+ steps:
19
+ - name: Checkout code
20
+ uses: actions/checkout@v4
21
+
22
+ - name: Install uv
23
+ uses: astral-sh/setup-uv@v5 # The official Astral action
24
+
25
+ - name: Build package
26
+ run: uv build
27
+
28
+ - name: Publish to PyPI
29
+ # uv publish automatically detects it is in GitHub Actions and uses OIDC
30
+ run: uv publish
@@ -0,0 +1,7 @@
1
+ __pycache__
2
+ .DS_Store
3
+ .vscode/
4
+ .claude/
5
+ /.pytest_cache
6
+ SHOULD_MATCH
7
+ /cluster/
@@ -0,0 +1,10 @@
1
+ Metadata-Version: 2.4
2
+ Name: ds_crawler
3
+ Version: 1.0.1
4
+ Summary: Regex-based dataset metadata crawler and indexer
5
+ Requires-Python: >=3.9
6
+ Provides-Extra: dev
7
+ Requires-Dist: pytest; extra == 'dev'
8
+ Requires-Dist: tqdm; extra == 'dev'
9
+ Provides-Extra: progress
10
+ Requires-Dist: tqdm; extra == 'progress'
@@ -0,0 +1,398 @@
1
+ # ds-crawler
2
+
3
+ Regex-based dataset metadata crawler and indexer. Extracts structured
4
+ identifiers from file paths, organises them into a hierarchical index
5
+ (`output.json`), and provides utilities for aligning, copying, splitting
6
+ and writing multi-modal datasets.
7
+
8
+ ```
9
+ pip install . # core
10
+ pip install ".[progress]" # with tqdm progress bars
11
+ pip install ".[dev]" # with pytest + tqdm
12
+ uv pip install "ds-crawler @ git+https://github.com/d-rothen/ds-crawler"
13
+ ```
14
+
15
+ Requires Python >= 3.9. No runtime dependencies.
16
+
17
+ ---
18
+
19
+ ## Quick start
20
+
21
+ ### Index a dataset
22
+
23
+ Every dataset needs a small JSON config (`ds-crawler.json`) that tells
24
+ the crawler how to extract file IDs and (optionally) a hierarchy from
25
+ paths. Place it either inside `.ds_crawler/ds-crawler.json` at the
26
+ dataset root, or pass the config dict directly.
27
+
28
+ ```python
29
+ from ds_crawler import index_dataset_from_path
30
+
31
+ # Reads ds-crawler.json from inside the dataset, returns an output dict
32
+ output = index_dataset_from_path("/data/rgb", save_index=True)
33
+ ```
34
+
35
+ ### Align modalities by ID
36
+
37
+ ```python
38
+ from ds_crawler import align_datasets
39
+
40
+ aligned = align_datasets(
41
+ {"modality": "rgb", "source": "/data/rgb"},
42
+ {"modality": "depth", "source": "/data/depth"},
43
+ )
44
+
45
+ for file_id, mods in aligned.items():
46
+ if "rgb" in mods and "depth" in mods:
47
+ print(mods["rgb"]["path"], mods["depth"]["path"])
48
+ ```
49
+
50
+ ### Split into train / val
51
+
52
+ ```python
53
+ from ds_crawler import split_datasets
54
+
55
+ result = split_datasets(
56
+ source_paths=["/data/rgb", "/data/depth"],
57
+ suffixes=["train", "val"],
58
+ ratios=[80, 20],
59
+ seed=42,
60
+ )
61
+ ```
62
+
63
+ ### Write model outputs back to disk
64
+
65
+ ```python
66
+ from ds_crawler import DatasetWriter
67
+
68
+ writer = DatasetWriter(
69
+ "/output/segmentation",
70
+ name="segmentation",
71
+ type="segmentation",
72
+ euler_train={"used_as": "target", "modality_type": "semantic"},
73
+ )
74
+
75
+ for sample in dataloader:
76
+ pred = model(sample["rgb"])
77
+ path = writer.get_path(sample["full_id"], f"{sample['id']}.png")
78
+ save_image(pred, path)
79
+
80
+ writer.save_index() # writes output.json for later re-indexing
81
+ ```
82
+
83
+ ---
84
+
85
+ ## Configuration
86
+
87
+ ### `ds-crawler.json`
88
+
89
+ Placed at `<dataset_root>/.ds_crawler/ds-crawler.json` (or passed as a
90
+ dict). Minimal example:
91
+
92
+ ```json
93
+ {
94
+ "name": "my_rgb",
95
+ "path": "/data/my_rgb",
96
+ "type": "rgb",
97
+ "id_regex": "^frame_(\\d+)\\.png$",
98
+ "properties": {
99
+ "euler_train": {
100
+ "used_as": "input",
101
+ "modality_type": "rgb"
102
+ }
103
+ }
104
+ }
105
+ ```
106
+
107
+ | Field | Required | Description |
108
+ |---|---|---|
109
+ | `name` | yes | Human-readable dataset name |
110
+ | `path` | yes | Root directory or `.zip` archive |
111
+ | `type` | yes | Semantic label for the data modality (e.g. `"rgb"`, `"depth"`) |
112
+ | `id_regex` | yes | Regex applied to each file's relative path. Capture groups form the file ID (joined by `id_regex_join_char`). |
113
+ | `properties.euler_train.used_as` | yes | `"input"`, `"target"`, or `"condition"` |
114
+ | `properties.euler_train.modality_type` | yes | Identifier token (e.g. `"rgb"`, `"depth"`) |
115
+ | `hierarchy_regex` | no | Regex with named groups to build a hierarchy tree |
116
+ | `named_capture_group_value_separator` | no | Character joining group name and value in hierarchy keys (default `":"`) |
117
+ | `basename_regex` | no | Regex applied to basename only; properties stored per file |
118
+ | `path_regex` | no | Regex applied to full relative path; properties stored per file |
119
+ | `path_filters` | no | Path-level include/exclude rules (regex and/or term whitelist/blacklist) |
120
+ | `intrinsics_regex` | no | Regex matching camera intrinsics files |
121
+ | `extrinsics_regex` | no | Regex matching camera extrinsics files |
122
+ | `id_regex_join_char` | no | Join character for multi-group IDs (default `"+"`) |
123
+ | `file_extensions` | no | Restrict to these extensions (e.g. `[".png", ".jpg"]`) |
124
+ | `flat_ids_unique` | no | If `true`, IDs must be globally unique (not just within hierarchy node) |
125
+ | `output_json` | no | Path to a pre-existing `output.json` to load instead of crawling |
126
+
127
+ #### Path filters (`path_filters`)
128
+
129
+ Use `path_filters` when you want to include/exclude files by relative path
130
+ without changing your `id_regex`:
131
+
132
+ ```json
133
+ "path_filters": {
134
+ "include_terms": ["fog"],
135
+ "exclude_terms": ["night"],
136
+ "term_match_mode": "path_segment"
137
+ }
138
+ ```
139
+
140
+ Supported keys:
141
+
142
+ - `include_regex`: list of regexes, at least one must match when provided
143
+ - `exclude_regex`: list of regexes, none may match
144
+ - `include_terms`: term whitelist, at least one must match when provided
145
+ - `exclude_terms`: term blacklist, none may match
146
+ - `term_match_mode`: `"substring"` (default) or `"path_segment"`
147
+ - `case_sensitive`: `true` (default) or `false`
148
+
149
+ These filters apply to both data files and camera metadata paths
150
+ (`intrinsics_regex` / `extrinsics_regex` matches).
151
+
152
+ Example: keep only VKITTI fog frames:
153
+
154
+ ```json
155
+ "path_filters": {
156
+ "include_terms": ["fog"],
157
+ "term_match_mode": "path_segment"
158
+ }
159
+ ```
160
+
161
+ Example: exclude all fog frames:
162
+
163
+ ```json
164
+ "path_filters": {
165
+ "exclude_terms": ["fog"],
166
+ "term_match_mode": "path_segment"
167
+ }
168
+ ```
169
+
170
+ ### Multi-dataset config
171
+
172
+ For the CLI, wrap multiple dataset configs in:
173
+
174
+ ```json
175
+ {
176
+ "datasets": [
177
+ { "name": "rgb", "path": "/data/rgb", "type": "rgb", "id_regex": "..." },
178
+ { "name": "depth", "path": "/data/depth", "type": "depth", "id_regex": "..." }
179
+ ]
180
+ }
181
+ ```
182
+
183
+ ---
184
+
185
+ ## Output format (`output.json`)
186
+
187
+ The index produced by the crawler:
188
+
189
+ ```json
190
+ {
191
+ "name": "my_rgb",
192
+ "type": "rgb",
193
+ "id_regex": "...",
194
+ "id_regex_join_char": "+",
195
+ "path_filters": {
196
+ "include_terms": ["fog"],
197
+ "term_match_mode": "path_segment"
198
+ },
199
+ "euler_train": { "used_as": "input", "modality_type": "rgb" },
200
+ "named_capture_group_value_separator": ":",
201
+ "dataset": {
202
+ "files": [
203
+ { "path": "frame_001.png", "id": "001", "path_properties": {}, "basename_properties": {} }
204
+ ],
205
+ "children": {
206
+ "scene:Scene01": {
207
+ "files": [ ... ],
208
+ "children": { ... }
209
+ }
210
+ }
211
+ }
212
+ }
213
+ ```
214
+
215
+ `dataset` is a recursive node: each node has `files` (leaf entries) and
216
+ `children` (named sub-nodes). Hierarchy keys follow the pattern
217
+ `<group_name><separator><value>` (e.g. `scene:Scene01`).
218
+
219
+ ---
220
+
221
+ ## CLI
222
+
223
+ ```
224
+ ds-crawler CONFIG [OPTIONS]
225
+ ```
226
+
227
+ | Flag | Description |
228
+ |---|---|
229
+ | `-o, --output PATH` | Write a single combined output file (otherwise writes per-dataset) |
230
+ | `-w, --workdir PATH` | Prepend to relative dataset paths |
231
+ | `-s, --strict` | Abort on duplicate IDs or >20% regex misses |
232
+ | `--sample N` | Keep every Nth matched file |
233
+ | `--match-index PATH` | Only include file IDs present in this output.json |
234
+ | `-v, --verbose` | Log every skipped file |
235
+
236
+ ---
237
+
238
+ ## Python API
239
+
240
+ All public symbols are re-exported from the top-level `ds_crawler`
241
+ package. They live in four submodules:
242
+
243
+ ### Indexing (`ds_crawler.parser`)
244
+
245
+ #### `index_dataset(config, *, strict, save_index, sample, match_index) -> dict`
246
+
247
+ Index a single dataset from a config dict.
248
+
249
+ #### `index_dataset_from_path(path, *, strict, save_index, force_reindex, sample, match_index) -> dict`
250
+
251
+ Index a dataset by path, reading `ds-crawler.json` from the dataset root.
252
+ Returns a cached `output.json` when available (unless `force_reindex=True`).
253
+
254
+ #### `index_dataset_from_files(config, files, *, base_path, strict, sample, match_index) -> dict`
255
+
256
+ Index from pre-collected file paths (useful when files aren't on the
257
+ local filesystem).
258
+
259
+ #### `DatasetParser(config, *, strict, sample, match_index)`
260
+
261
+ Lower-level class wrapping the full `Config` object. Methods:
262
+
263
+ - `parse_all() -> list[dict]`
264
+ - `parse_dataset(ds_config, ...) -> dict`
265
+ - `parse_dataset_from_files(ds_config, files, ...) -> dict`
266
+ - `write_output(output_path)`
267
+ - `write_outputs_per_dataset(filename="output.json") -> list[Path]`
268
+
269
+ ---
270
+
271
+ ### Traversal & filtering (`ds_crawler.traversal`)
272
+
273
+ #### `get_files(output_json) -> list[str]`
274
+
275
+ Flat list of every file path in an output dict (or list of output dicts).
276
+
277
+ #### `collect_qualified_ids(output_json) -> set[tuple[str, ...]]`
278
+
279
+ Set of `(*hierarchy_keys, file_id)` tuples. Qualified IDs distinguish
280
+ files that share the same raw ID but live at different hierarchy levels.
281
+
282
+ #### `filter_index_by_qualified_ids(output_json, qualified_ids) -> dict`
283
+
284
+ Return a pruned copy of the output dict keeping only the given IDs.
285
+
286
+ #### `split_qualified_ids(qualified_ids, ratios, *, seed=None) -> list[set]`
287
+
288
+ Partition qualified IDs by percentage (e.g. `[80, 20]`). Deterministic
289
+ when `seed` is `None` (sorted order) or fixed.
290
+
291
+ ---
292
+
293
+ ### Operations (`ds_crawler.operations`)
294
+
295
+ #### `align_datasets(*args) -> dict[str, dict[str, dict]]`
296
+
297
+ Align multiple modalities by file ID. Each positional argument is a dict
298
+ with `"modality"` (label) and `"source"` (path or output dict). Returns
299
+ `{file_id: {modality: file_entry, ...}, ...}`.
300
+
301
+ #### `copy_dataset(input_path, output_path, *, index=None, sample=None) -> dict`
302
+
303
+ Copy dataset files to a new location (directory or `.zip`), preserving
304
+ structure. Returns `{"copied": int, "missing": int, "missing_files": [...]}`.
305
+
306
+ #### `split_dataset(source_path, ratios, target_paths, *, qualified_ids, seed) -> dict`
307
+
308
+ Split a single dataset into multiple targets by percentage.
309
+
310
+ #### `split_datasets(source_paths, suffixes, ratios, *, seed) -> dict`
311
+
312
+ Split multiple aligned datasets using their common ID intersection.
313
+ Target paths are derived by appending the suffix
314
+ (`/data/rgb` + `"train"` -> `/data/rgb_train`).
315
+
316
+ ---
317
+
318
+ ### Writer (`ds_crawler.writer`)
319
+
320
+ #### `DatasetWriter(root, *, name, type, euler_train, separator=":", **properties)`
321
+
322
+ Stateful helper that turns `(full_id, basename)` pairs into filesystem
323
+ paths while accumulating an `output.json`-compatible index.
324
+
325
+ | Method | Description |
326
+ |---|---|
327
+ | `get_path(full_id, basename, *, source_meta=None) -> Path` | Register a file and get the absolute path to write to. Directories are created automatically. |
328
+ | `build_output() -> dict` | Return the accumulated index as an output dict. |
329
+ | `save_index(filename="output.json") -> Path` | Persist the index to `<root>/.ds_crawler/<filename>`. |
330
+
331
+ `full_id` follows the format produced by euler-loading:
332
+ `/scene:Scene01/camera:Cam0/scene-Scene01+camera-Cam0+frame-00001`
333
+ where each `/key:value` segment maps to a directory level and the final
334
+ component is the ds-crawler file ID.
335
+
336
+ ---
337
+
338
+ ### Validation (`ds_crawler.validation`)
339
+
340
+ #### `validate_crawler_config(config, workdir=None) -> DatasetConfig`
341
+
342
+ Validate a `ds-crawler.json` dict. Raises `ValueError` on failure.
343
+
344
+ #### `validate_output(output) -> dict`
345
+
346
+ Validate an `output.json` object (single dict or list). Raises
347
+ `ValueError` on failure.
348
+
349
+ #### `validate_dataset(path) -> dict`
350
+
351
+ Check a dataset path for valid metadata files. Returns
352
+ `{"path", "has_config", "has_output", "config", "output"}`.
353
+
354
+ ---
355
+
356
+ ### Schema (`ds_crawler.schema`)
357
+
358
+ #### `DatasetDescriptor(name, path, type, properties={})`
359
+
360
+ Minimal dataset description dataclass. Class methods:
361
+
362
+ - `from_output(data, path) -> DatasetDescriptor`
363
+ - `from_output_file(path, dataset_root) -> list[DatasetDescriptor]`
364
+
365
+ ---
366
+
367
+ ### Config (`ds_crawler.config`)
368
+
369
+ #### `DatasetConfig`
370
+
371
+ Full dataset configuration (extends `DatasetDescriptor` with regex
372
+ fields). Usually created via `DatasetConfig.from_dict(data, workdir)` or
373
+ `load_dataset_config(data, workdir)`.
374
+
375
+ #### `Config(datasets: list[DatasetConfig])`
376
+
377
+ Container loaded with `Config.from_file(path, workdir)`.
378
+
379
+ ---
380
+
381
+ ## ZIP support
382
+
383
+ All operations work transparently with `.zip` archives. Paths like
384
+ `/data/dataset.zip` are handled the same as directories: the crawler
385
+ reads file listings from the archive, `copy_dataset` writes into a new
386
+ archive, and `save_index` / `write_outputs_per_dataset` embed
387
+ `output.json` inside the zip.
388
+
389
+ ---
390
+
391
+ ## Examples
392
+
393
+ See the [`examples/`](examples/) directory:
394
+
395
+ - **`example.py`** -- index a dataset, match against an existing index,
396
+ and copy a subset.
397
+ - **`sample_realdrivesim.py`** -- subsample a multi-modal RealDriveSim
398
+ dataset (RGB, depth, segmentation) preserving cross-modality alignment.
@@ -0,0 +1,75 @@
1
+ {
2
+ "datasets": [
3
+ {
4
+ "name": "VKITTI2",
5
+ "path": "/path/to/vkitti2/rgb",
6
+ "type": "rgb",
7
+ "path_regex": "^(?P<scene>Scene\\d+)/(?P<variation>[^/]+)/frames/rgb/(?P<camera>Camera_\\d+)/",
8
+ "basename_regex": "^rgb_(?P<frame>\\d+)\\.(?P<ext>jpg|png)$",
9
+ "named_capture_group_value_separator": ":",
10
+ "intrinsics_regex": "^(?P<scene>Scene\\d+)/(?P<variation>[^/]+)/intrinsics/(?P<camera>Camera_\\d+)_intrinsics\\.txt$",
11
+ "hierarchy_regex": "^(?P<scene>Scene\\d+)/(?P<variation>[^/]+)/frames/rgb/(?P<camera>Camera_\\d+)/rgb_(?P<frame>\\d+)\\.png$",
12
+ "id_regex": "^(?P<scene>Scene\\d+)/(?P<variation>[^/]+)/frames/rgb/(?P<camera>Camera_\\d+)/rgb_(?P<frame>\\d+)\\.(?:jpg|png)$",
13
+ "properties": {
14
+ "gt": true,
15
+ "baseline": true,
16
+ "euler_train": {
17
+ "used_as": "target",
18
+ "slot": "demo.target.rgb",
19
+ "modality_type": "rgb"
20
+ },
21
+ "meta": {
22
+ "range": [0, 255]
23
+ },
24
+ "specifier": "whatever",
25
+ "some_other_prop": 1234566
26
+ }
27
+ },
28
+ {
29
+ "name": "VKITTI2_depth",
30
+ "path": "/path/to/vkitti2/depth",
31
+ "type": "depth",
32
+ "path_regex": "^(?P<scene>Scene\\d+)/(?P<variation>[^/]+)/frames/depth/(?P<camera>Camera_\\d+)/",
33
+ "basename_regex": "^depth_(?P<frame>\\d+)\\.(?P<ext>png)$",
34
+ "named_capture_group_value_separator": ":",
35
+ "hierarchy_regex": "^(?P<scene>Scene\\d+)/(?P<variation>[^/]+)/frames/depth/(?P<camera>Camera_\\d+)/depth_(?P<frame>\\d+)\\.png$",
36
+ "id_regex": "^(?P<scene>Scene\\d+)/(?P<variation>[^/]+)/frames/depth/(?P<camera>Camera_\\d+)/depth_(?P<frame>\\d+)\\.png$",
37
+ "properties": {
38
+ "gt": true,
39
+ "euler_train": {
40
+ "used_as": "target",
41
+ "slot": "demo.target.depth",
42
+ "modality_type": "depth"
43
+ },
44
+ "meta": {
45
+ "radial_depth": false,
46
+ "scale_to_meters": 1.0,
47
+ "range": [0, 65535]
48
+ }
49
+ }
50
+ },
51
+ {
52
+ "name": "predicted_depth",
53
+ "path": "/path/to/predicted_depth",
54
+ "type": "depth",
55
+ "file_extensions": [".png", ".npy"],
56
+ "basename_regex": "^(?P<frame>\\d+)_pred\\.(?P<ext>png|npy)$",
57
+ "named_capture_group_value_separator": ":",
58
+ "id_regex": "^(?P<frame>\\d+)_pred\\.png$",
59
+ "hierarchy_regex": "^(?P<frame>\\d+)_pred\\.(?:png|npy)$",
60
+ "properties": {
61
+ "gt": false,
62
+ "euler_train": {
63
+ "used_as": "input",
64
+ "slot": "demo.input.depth",
65
+ "modality_type": "depth"
66
+ },
67
+ "meta": {
68
+ "radial_depth": false,
69
+ "scale_to_meters": 1.0,
70
+ "range": [0, 65535]
71
+ }
72
+ }
73
+ }
74
+ ]
75
+ }
@@ -0,0 +1,50 @@
1
+ """Dataset crawler package."""
2
+
3
+ from .schema import DatasetDescriptor
4
+ from .config import Config, DatasetConfig, load_dataset_config
5
+ from .validation import validate_crawler_config, validate_dataset, validate_output
6
+ from .parser import (
7
+ DatasetParser,
8
+ index_dataset,
9
+ index_dataset_from_files,
10
+ index_dataset_from_path,
11
+ )
12
+ from .traversal import (
13
+ collect_qualified_ids,
14
+ filter_index_by_qualified_ids,
15
+ get_files,
16
+ split_qualified_ids,
17
+ )
18
+ from .operations import (
19
+ align_datasets,
20
+ copy_dataset,
21
+ extract_datasets,
22
+ split_dataset,
23
+ split_datasets,
24
+ )
25
+ from .writer import DatasetWriter, ZipDatasetWriter
26
+
27
+ __all__ = [
28
+ "DatasetDescriptor",
29
+ "DatasetWriter",
30
+ "ZipDatasetWriter",
31
+ "Config",
32
+ "DatasetConfig",
33
+ "DatasetParser",
34
+ "align_datasets",
35
+ "collect_qualified_ids",
36
+ "copy_dataset",
37
+ "extract_datasets",
38
+ "filter_index_by_qualified_ids",
39
+ "get_files",
40
+ "index_dataset",
41
+ "index_dataset_from_files",
42
+ "index_dataset_from_path",
43
+ "load_dataset_config",
44
+ "split_dataset",
45
+ "split_datasets",
46
+ "split_qualified_ids",
47
+ "validate_crawler_config",
48
+ "validate_dataset",
49
+ "validate_output",
50
+ ]
@@ -0,0 +1,130 @@
1
+ #!/usr/bin/env python3
2
+ """Dataset crawler main entry point."""
3
+
4
+ import argparse
5
+ import json
6
+ import logging
7
+ import sys
8
+ from pathlib import Path
9
+
10
+ from ds_crawler.config import Config
11
+ from ds_crawler.parser import DatasetParser
12
+
13
+
14
def setup_logging(verbose: bool) -> None:
    """Configure root logging with immediate flushing for cluster compatibility.

    Installs a single stderr handler that flushes after every record so log
    output is not lost in buffered environments (e.g. batch clusters).

    Args:
        verbose: When True, log at DEBUG level; otherwise INFO.
    """

    class _EagerFlushHandler(logging.StreamHandler):
        # Flush after each record so buffered stderr shows logs promptly.
        def emit(self, record):
            super().emit(record)
            self.flush()

    log_level = logging.DEBUG if verbose else logging.INFO

    stream_handler = _EagerFlushHandler(sys.stderr)
    stream_handler.setLevel(log_level)
    stream_handler.setFormatter(logging.Formatter("%(levelname)s: %(message)s"))

    # Replace any previously installed handlers so repeated calls don't stack.
    logging.root.handlers = []
    logging.root.addHandler(stream_handler)
    logging.root.setLevel(log_level)
30
+
31
+
32
def _build_arg_parser() -> argparse.ArgumentParser:
    """Construct the command-line argument parser for the crawler CLI."""
    arg_parser = argparse.ArgumentParser(
        description="Crawl datasets and extract metadata based on configuration."
    )
    arg_parser.add_argument(
        "config",
        type=Path,
        help="Path to the configuration JSON file",
    )
    arg_parser.add_argument(
        "-o",
        "--output",
        type=Path,
        default=None,
        help="Single output JSON file path. If not specified, writes output.json to each dataset's root folder.",
    )
    arg_parser.add_argument(
        "-v",
        "--verbose",
        action="store_true",
        help="Enable verbose output (show each skipped file)",
    )
    arg_parser.add_argument(
        "-w",
        "--workdir",
        type=Path,
        default=None,
        help="Working directory to prepend to dataset paths.",
    )
    arg_parser.add_argument(
        "-s",
        "--strict",
        action="store_true",
        help="Strict mode: abort on duplicate IDs or excessive regex misses.",
    )
    arg_parser.add_argument(
        "--sample",
        type=int,
        default=None,
        metavar="N",
        help="Keep every Nth regex-matched file (deterministic subsampling).",
    )
    arg_parser.add_argument(
        "--match-index",
        type=Path,
        default=None,
        metavar="PATH",
        help="Path to an output.json whose file IDs are used as a filter.",
    )
    return arg_parser


def main() -> int:
    """Main entry point.

    Returns:
        Process exit code: 0 on success, 1 on configuration or processing error.
    """
    args = _build_arg_parser().parse_args()

    setup_logging(args.verbose)

    # Load and validate the multi-dataset configuration up front; user-facing
    # errors go to stderr and map to a non-zero exit code.
    try:
        config = Config.from_file(args.config, workdir=args.workdir)
    except FileNotFoundError as e:
        print(f"Error: {e}", file=sys.stderr)
        return 1
    except ValueError as e:
        print(f"Configuration error: {e}", file=sys.stderr)
        return 1

    # Optional ID filter: only file IDs present in this output.json are kept.
    match_index = None
    if args.match_index is not None:
        try:
            with open(args.match_index) as f:
                match_index = json.load(f)
        except (FileNotFoundError, json.JSONDecodeError) as e:
            print(f"Error loading match-index: {e}", file=sys.stderr)
            return 1

    crawler = DatasetParser(
        config,
        strict=args.strict,
        sample=args.sample,
        match_index=match_index,
    )

    logger = logging.getLogger(__name__)

    try:
        if args.output:
            # Single combined output file.
            crawler.write_output(args.output)
            logger.info(f"Output written to: {args.output}")
        else:
            # Default: one output.json per dataset root.
            for written_path in crawler.write_outputs_per_dataset():
                logger.info(f"Output written to: {written_path}")
    except Exception as e:
        print(f"Error processing datasets: {e}", file=sys.stderr)
        return 1

    return 0
127
+
128
+
129
if __name__ == "__main__":
    # Script entry point: propagate main()'s return code as the exit status.
    raise SystemExit(main())