ds_crawler 1.0.1__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- ds_crawler-1.0.1/.github/workflows/workflow.yml +30 -0
- ds_crawler-1.0.1/.gitignore +7 -0
- ds_crawler-1.0.1/PKG-INFO +10 -0
- ds_crawler-1.0.1/README.md +398 -0
- ds_crawler-1.0.1/config_example.json +75 -0
- ds_crawler-1.0.1/ds_crawler/__init__.py +50 -0
- ds_crawler-1.0.1/ds_crawler/cli.py +130 -0
- ds_crawler-1.0.1/ds_crawler/config.py +555 -0
- ds_crawler-1.0.1/ds_crawler/handlers/__init__.py +36 -0
- ds_crawler-1.0.1/ds_crawler/handlers/base.py +30 -0
- ds_crawler-1.0.1/ds_crawler/handlers/generic.py +28 -0
- ds_crawler-1.0.1/ds_crawler/handlers/zip_handler.py +53 -0
- ds_crawler-1.0.1/ds_crawler/operations.py +682 -0
- ds_crawler-1.0.1/ds_crawler/parser.py +863 -0
- ds_crawler-1.0.1/ds_crawler/path_filters.py +230 -0
- ds_crawler-1.0.1/ds_crawler/schema.py +80 -0
- ds_crawler-1.0.1/ds_crawler/traversal.py +324 -0
- ds_crawler-1.0.1/ds_crawler/validation.py +346 -0
- ds_crawler-1.0.1/ds_crawler/writer.py +556 -0
- ds_crawler-1.0.1/ds_crawler/zip_utils.py +234 -0
- ds_crawler-1.0.1/example-output-rds.json +101 -0
- ds_crawler-1.0.1/examples/config.json +82 -0
- ds_crawler-1.0.1/examples/example.py +56 -0
- ds_crawler-1.0.1/examples/example_output.json +283 -0
- ds_crawler-1.0.1/examples/run.sh +31 -0
- ds_crawler-1.0.1/examples/sample_realdrivesim.py +97 -0
- ds_crawler-1.0.1/meta/__init__.py +0 -0
- ds_crawler-1.0.1/meta/build_meta_schema.py +109 -0
- ds_crawler-1.0.1/meta/schema.json +67 -0
- ds_crawler-1.0.1/package-lock.json +6 -0
- ds_crawler-1.0.1/pyproject.toml +21 -0
- ds_crawler-1.0.1/split_rds.py +21 -0
- ds_crawler-1.0.1/tests/__init__.py +0 -0
- ds_crawler-1.0.1/tests/conftest.py +379 -0
- ds_crawler-1.0.1/tests/test_align_datasets.py +292 -0
- ds_crawler-1.0.1/tests/test_config.py +1001 -0
- ds_crawler-1.0.1/tests/test_copy_dataset.py +566 -0
- ds_crawler-1.0.1/tests/test_extract_datasets.py +347 -0
- ds_crawler-1.0.1/tests/test_handlers.py +183 -0
- ds_crawler-1.0.1/tests/test_integration.py +1040 -0
- ds_crawler-1.0.1/tests/test_parser.py +1712 -0
- ds_crawler-1.0.1/tests/test_split_dataset.py +1129 -0
- ds_crawler-1.0.1/tests/test_validation.py +173 -0
- ds_crawler-1.0.1/tests/test_writer.py +534 -0
- ds_crawler-1.0.1/tests/test_zip.py +1025 -0
|
@@ -0,0 +1,30 @@
|
|
|
1
|
+
name: Publish to PyPI
|
|
2
|
+
|
|
3
|
+
on:
|
|
4
|
+
push:
|
|
5
|
+
tags:
|
|
6
|
+
- "v*" # This triggers the workflow only when you push a tag starting with 'v'
|
|
7
|
+
|
|
8
|
+
jobs:
|
|
9
|
+
build-and-publish:
|
|
10
|
+
name: Build and Publish
|
|
11
|
+
runs-on: ubuntu-latest
|
|
12
|
+
environment: pypi # This matches the environment we set up in Step 2
|
|
13
|
+
|
|
14
|
+
permissions:
|
|
15
|
+
id-token: write # CRITICAL: This is what allows OIDC (passwordless) auth to PyPI
|
|
16
|
+
contents: read # Required to check out the code
|
|
17
|
+
|
|
18
|
+
steps:
|
|
19
|
+
- name: Checkout code
|
|
20
|
+
uses: actions/checkout@v4
|
|
21
|
+
|
|
22
|
+
- name: Install uv
|
|
23
|
+
uses: astral-sh/setup-uv@v5 # The official Astral action
|
|
24
|
+
|
|
25
|
+
- name: Build package
|
|
26
|
+
run: uv build
|
|
27
|
+
|
|
28
|
+
- name: Publish to PyPI
|
|
29
|
+
# uv publish automatically detects it is in GitHub Actions and uses OIDC
|
|
30
|
+
run: uv publish
|
|
@@ -0,0 +1,10 @@
|
|
|
1
|
+
Metadata-Version: 2.4
|
|
2
|
+
Name: ds_crawler
|
|
3
|
+
Version: 1.0.1
|
|
4
|
+
Summary: Regex-based dataset metadata crawler and indexer
|
|
5
|
+
Requires-Python: >=3.9
|
|
6
|
+
Provides-Extra: dev
|
|
7
|
+
Requires-Dist: pytest; extra == 'dev'
|
|
8
|
+
Requires-Dist: tqdm; extra == 'dev'
|
|
9
|
+
Provides-Extra: progress
|
|
10
|
+
Requires-Dist: tqdm; extra == 'progress'
|
|
@@ -0,0 +1,398 @@
|
|
|
1
|
+
# ds-crawler
|
|
2
|
+
|
|
3
|
+
Regex-based dataset metadata crawler and indexer. Extracts structured
|
|
4
|
+
identifiers from file paths, organises them into a hierarchical index
|
|
5
|
+
(`output.json`), and provides utilities for aligning, copying, splitting
|
|
6
|
+
and writing multi-modal datasets.
|
|
7
|
+
|
|
8
|
+
```
|
|
9
|
+
pip install . # core
|
|
10
|
+
pip install ".[progress]" # with tqdm progress bars
|
|
11
|
+
pip install ".[dev]" # with pytest + tqdm
|
|
12
|
+
uv pip install "ds-crawler @ git+https://github.com/d-rothen/ds-crawler"
|
|
13
|
+
```
|
|
14
|
+
|
|
15
|
+
Requires Python >= 3.9. No runtime dependencies.
|
|
16
|
+
|
|
17
|
+
---
|
|
18
|
+
|
|
19
|
+
## Quick start
|
|
20
|
+
|
|
21
|
+
### Index a dataset
|
|
22
|
+
|
|
23
|
+
Every dataset needs a small JSON config (`ds-crawler.json`) that tells
|
|
24
|
+
the crawler how to extract file IDs and (optionally) a hierarchy from
|
|
25
|
+
paths. Place it either inside `.ds_crawler/ds-crawler.json` at the
|
|
26
|
+
dataset root, or pass the config dict directly.
|
|
27
|
+
|
|
28
|
+
```python
|
|
29
|
+
from ds_crawler import index_dataset_from_path
|
|
30
|
+
|
|
31
|
+
# Reads ds-crawler.json from inside the dataset, returns an output dict
|
|
32
|
+
output = index_dataset_from_path("/data/rgb", save_index=True)
|
|
33
|
+
```
|
|
34
|
+
|
|
35
|
+
### Align modalities by ID
|
|
36
|
+
|
|
37
|
+
```python
|
|
38
|
+
from ds_crawler import align_datasets
|
|
39
|
+
|
|
40
|
+
aligned = align_datasets(
|
|
41
|
+
{"modality": "rgb", "source": "/data/rgb"},
|
|
42
|
+
{"modality": "depth", "source": "/data/depth"},
|
|
43
|
+
)
|
|
44
|
+
|
|
45
|
+
for file_id, mods in aligned.items():
|
|
46
|
+
if "rgb" in mods and "depth" in mods:
|
|
47
|
+
print(mods["rgb"]["path"], mods["depth"]["path"])
|
|
48
|
+
```
|
|
49
|
+
|
|
50
|
+
### Split into train / val
|
|
51
|
+
|
|
52
|
+
```python
|
|
53
|
+
from ds_crawler import split_datasets
|
|
54
|
+
|
|
55
|
+
result = split_datasets(
|
|
56
|
+
source_paths=["/data/rgb", "/data/depth"],
|
|
57
|
+
suffixes=["train", "val"],
|
|
58
|
+
ratios=[80, 20],
|
|
59
|
+
seed=42,
|
|
60
|
+
)
|
|
61
|
+
```
|
|
62
|
+
|
|
63
|
+
### Write model outputs back to disk
|
|
64
|
+
|
|
65
|
+
```python
|
|
66
|
+
from ds_crawler import DatasetWriter
|
|
67
|
+
|
|
68
|
+
writer = DatasetWriter(
|
|
69
|
+
"/output/segmentation",
|
|
70
|
+
name="segmentation",
|
|
71
|
+
type="segmentation",
|
|
72
|
+
euler_train={"used_as": "target", "modality_type": "semantic"},
|
|
73
|
+
)
|
|
74
|
+
|
|
75
|
+
for sample in dataloader:
|
|
76
|
+
pred = model(sample["rgb"])
|
|
77
|
+
path = writer.get_path(sample["full_id"], f"{sample['id']}.png")
|
|
78
|
+
save_image(pred, path)
|
|
79
|
+
|
|
80
|
+
writer.save_index() # writes output.json for later re-indexing
|
|
81
|
+
```
|
|
82
|
+
|
|
83
|
+
---
|
|
84
|
+
|
|
85
|
+
## Configuration
|
|
86
|
+
|
|
87
|
+
### `ds-crawler.json`
|
|
88
|
+
|
|
89
|
+
Placed at `<dataset_root>/.ds_crawler/ds-crawler.json` (or passed as a
|
|
90
|
+
dict). Minimal example:
|
|
91
|
+
|
|
92
|
+
```json
|
|
93
|
+
{
|
|
94
|
+
"name": "my_rgb",
|
|
95
|
+
"path": "/data/my_rgb",
|
|
96
|
+
"type": "rgb",
|
|
97
|
+
"id_regex": "^frame_(\\d+)\\.png$",
|
|
98
|
+
"properties": {
|
|
99
|
+
"euler_train": {
|
|
100
|
+
"used_as": "input",
|
|
101
|
+
"modality_type": "rgb"
|
|
102
|
+
}
|
|
103
|
+
}
|
|
104
|
+
}
|
|
105
|
+
```
|
|
106
|
+
|
|
107
|
+
| Field | Required | Description |
|
|
108
|
+
|---|---|---|
|
|
109
|
+
| `name` | yes | Human-readable dataset name |
|
|
110
|
+
| `path` | yes | Root directory or `.zip` archive |
|
|
111
|
+
| `type` | yes | Semantic label for the data modality (e.g. `"rgb"`, `"depth"`) |
|
|
112
|
+
| `id_regex` | yes | Regex applied to each file's relative path. Capture groups form the file ID (joined by `id_regex_join_char`). |
|
|
113
|
+
| `properties.euler_train.used_as` | yes | `"input"`, `"target"`, or `"condition"` |
|
|
114
|
+
| `properties.euler_train.modality_type` | yes | Identifier token (e.g. `"rgb"`, `"depth"`) |
|
|
115
|
+
| `hierarchy_regex` | no | Regex with named groups to build a hierarchy tree |
|
|
116
|
+
| `named_capture_group_value_separator` | no | Character joining group name and value in hierarchy keys (default `":"`) |
|
|
117
|
+
| `basename_regex` | no | Regex applied to basename only; properties stored per file |
|
|
118
|
+
| `path_regex` | no | Regex applied to full relative path; properties stored per file |
|
|
119
|
+
| `path_filters` | no | Path-level include/exclude rules (regex and/or term whitelist/blacklist) |
|
|
120
|
+
| `intrinsics_regex` | no | Regex matching camera intrinsics files |
|
|
121
|
+
| `extrinsics_regex` | no | Regex matching camera extrinsics files |
|
|
122
|
+
| `id_regex_join_char` | no | Join character for multi-group IDs (default `"+"`) |
|
|
123
|
+
| `file_extensions` | no | Restrict to these extensions (e.g. `[".png", ".jpg"]`) |
|
|
124
|
+
| `flat_ids_unique` | no | If `true`, IDs must be globally unique (not just within hierarchy node) |
|
|
125
|
+
| `output_json` | no | Path to a pre-existing `output.json` to load instead of crawling |
|
|
126
|
+
|
|
127
|
+
#### Path filters (`path_filters`)
|
|
128
|
+
|
|
129
|
+
Use `path_filters` when you want to include/exclude files by relative path
|
|
130
|
+
without changing your `id_regex`:
|
|
131
|
+
|
|
132
|
+
```json
|
|
133
|
+
"path_filters": {
|
|
134
|
+
"include_terms": ["fog"],
|
|
135
|
+
"exclude_terms": ["night"],
|
|
136
|
+
"term_match_mode": "path_segment"
|
|
137
|
+
}
|
|
138
|
+
```
|
|
139
|
+
|
|
140
|
+
Supported keys:
|
|
141
|
+
|
|
142
|
+
- `include_regex`: list of regexes, at least one must match when provided
|
|
143
|
+
- `exclude_regex`: list of regexes, none may match
|
|
144
|
+
- `include_terms`: term whitelist, at least one must match when provided
|
|
145
|
+
- `exclude_terms`: term blacklist, none may match
|
|
146
|
+
- `term_match_mode`: `"substring"` (default) or `"path_segment"`
|
|
147
|
+
- `case_sensitive`: `true` (default) or `false`
|
|
148
|
+
|
|
149
|
+
These filters apply to both data files and camera metadata paths
|
|
150
|
+
(`intrinsics_regex` / `extrinsics_regex` matches).
|
|
151
|
+
|
|
152
|
+
Example: keep only VKITTI fog frames:
|
|
153
|
+
|
|
154
|
+
```json
|
|
155
|
+
"path_filters": {
|
|
156
|
+
"include_terms": ["fog"],
|
|
157
|
+
"term_match_mode": "path_segment"
|
|
158
|
+
}
|
|
159
|
+
```
|
|
160
|
+
|
|
161
|
+
Example: exclude all fog frames:
|
|
162
|
+
|
|
163
|
+
```json
|
|
164
|
+
"path_filters": {
|
|
165
|
+
"exclude_terms": ["fog"],
|
|
166
|
+
"term_match_mode": "path_segment"
|
|
167
|
+
}
|
|
168
|
+
```
|
|
169
|
+
|
|
170
|
+
### Multi-dataset config
|
|
171
|
+
|
|
172
|
+
For the CLI, wrap multiple dataset configs in:
|
|
173
|
+
|
|
174
|
+
```json
|
|
175
|
+
{
|
|
176
|
+
"datasets": [
|
|
177
|
+
{ "name": "rgb", "path": "/data/rgb", "type": "rgb", "id_regex": "..." },
|
|
178
|
+
{ "name": "depth", "path": "/data/depth", "type": "depth", "id_regex": "..." }
|
|
179
|
+
]
|
|
180
|
+
}
|
|
181
|
+
```
|
|
182
|
+
|
|
183
|
+
---
|
|
184
|
+
|
|
185
|
+
## Output format (`output.json`)
|
|
186
|
+
|
|
187
|
+
The index produced by the crawler:
|
|
188
|
+
|
|
189
|
+
```json
|
|
190
|
+
{
|
|
191
|
+
"name": "my_rgb",
|
|
192
|
+
"type": "rgb",
|
|
193
|
+
"id_regex": "...",
|
|
194
|
+
"id_regex_join_char": "+",
|
|
195
|
+
"path_filters": {
|
|
196
|
+
"include_terms": ["fog"],
|
|
197
|
+
"term_match_mode": "path_segment"
|
|
198
|
+
},
|
|
199
|
+
"euler_train": { "used_as": "input", "modality_type": "rgb" },
|
|
200
|
+
"named_capture_group_value_separator": ":",
|
|
201
|
+
"dataset": {
|
|
202
|
+
"files": [
|
|
203
|
+
{ "path": "frame_001.png", "id": "001", "path_properties": {}, "basename_properties": {} }
|
|
204
|
+
],
|
|
205
|
+
"children": {
|
|
206
|
+
"scene:Scene01": {
|
|
207
|
+
"files": [ ... ],
|
|
208
|
+
"children": { ... }
|
|
209
|
+
}
|
|
210
|
+
}
|
|
211
|
+
}
|
|
212
|
+
}
|
|
213
|
+
```
|
|
214
|
+
|
|
215
|
+
`dataset` is a recursive node: each node has `files` (leaf entries) and
|
|
216
|
+
`children` (named sub-nodes). Hierarchy keys follow the pattern
|
|
217
|
+
`<group_name><separator><value>` (e.g. `scene:Scene01`).
|
|
218
|
+
|
|
219
|
+
---
|
|
220
|
+
|
|
221
|
+
## CLI
|
|
222
|
+
|
|
223
|
+
```
|
|
224
|
+
ds-crawler CONFIG [OPTIONS]
|
|
225
|
+
```
|
|
226
|
+
|
|
227
|
+
| Flag | Description |
|
|
228
|
+
|---|---|
|
|
229
|
+
| `-o, --output PATH` | Write a single combined output file (otherwise writes per-dataset) |
|
|
230
|
+
| `-w, --workdir PATH` | Prepend to relative dataset paths |
|
|
231
|
+
| `-s, --strict` | Abort on duplicate IDs or >20% regex misses |
|
|
232
|
+
| `--sample N` | Keep every Nth matched file |
|
|
233
|
+
| `--match-index PATH` | Only include file IDs present in this output.json |
|
|
234
|
+
| `-v, --verbose` | Log every skipped file |
|
|
235
|
+
|
|
236
|
+
---
|
|
237
|
+
|
|
238
|
+
## Python API
|
|
239
|
+
|
|
240
|
+
All public symbols are re-exported from the top-level `ds_crawler`
|
|
241
|
+
package. They live in four submodules:
|
|
242
|
+
|
|
243
|
+
### Indexing (`ds_crawler.parser`)
|
|
244
|
+
|
|
245
|
+
#### `index_dataset(config, *, strict, save_index, sample, match_index) -> dict`
|
|
246
|
+
|
|
247
|
+
Index a single dataset from a config dict.
|
|
248
|
+
|
|
249
|
+
#### `index_dataset_from_path(path, *, strict, save_index, force_reindex, sample, match_index) -> dict`
|
|
250
|
+
|
|
251
|
+
Index a dataset by path, reading `ds-crawler.json` from the dataset root.
|
|
252
|
+
Returns a cached `output.json` when available (unless `force_reindex=True`).
|
|
253
|
+
|
|
254
|
+
#### `index_dataset_from_files(config, files, *, base_path, strict, sample, match_index) -> dict`
|
|
255
|
+
|
|
256
|
+
Index from pre-collected file paths (useful when files aren't on the
|
|
257
|
+
local filesystem).
|
|
258
|
+
|
|
259
|
+
#### `DatasetParser(config, *, strict, sample, match_index)`
|
|
260
|
+
|
|
261
|
+
Lower-level class wrapping the full `Config` object. Methods:
|
|
262
|
+
|
|
263
|
+
- `parse_all() -> list[dict]`
|
|
264
|
+
- `parse_dataset(ds_config, ...) -> dict`
|
|
265
|
+
- `parse_dataset_from_files(ds_config, files, ...) -> dict`
|
|
266
|
+
- `write_output(output_path)`
|
|
267
|
+
- `write_outputs_per_dataset(filename="output.json") -> list[Path]`
|
|
268
|
+
|
|
269
|
+
---
|
|
270
|
+
|
|
271
|
+
### Traversal & filtering (`ds_crawler.traversal`)
|
|
272
|
+
|
|
273
|
+
#### `get_files(output_json) -> list[str]`
|
|
274
|
+
|
|
275
|
+
Flat list of every file path in an output dict (or list of output dicts).
|
|
276
|
+
|
|
277
|
+
#### `collect_qualified_ids(output_json) -> set[tuple[str, ...]]`
|
|
278
|
+
|
|
279
|
+
Set of `(*hierarchy_keys, file_id)` tuples. Qualified IDs distinguish
|
|
280
|
+
files that share the same raw ID but live at different hierarchy levels.
|
|
281
|
+
|
|
282
|
+
#### `filter_index_by_qualified_ids(output_json, qualified_ids) -> dict`
|
|
283
|
+
|
|
284
|
+
Return a pruned copy of the output dict keeping only the given IDs.
|
|
285
|
+
|
|
286
|
+
#### `split_qualified_ids(qualified_ids, ratios, *, seed=None) -> list[set]`
|
|
287
|
+
|
|
288
|
+
Partition qualified IDs by percentage (e.g. `[80, 20]`). Deterministic
|
|
289
|
+
when `seed` is `None` (sorted order) or fixed.
|
|
290
|
+
|
|
291
|
+
---
|
|
292
|
+
|
|
293
|
+
### Operations (`ds_crawler.operations`)
|
|
294
|
+
|
|
295
|
+
#### `align_datasets(*args) -> dict[str, dict[str, dict]]`
|
|
296
|
+
|
|
297
|
+
Align multiple modalities by file ID. Each positional argument is a dict
|
|
298
|
+
with `"modality"` (label) and `"source"` (path or output dict). Returns
|
|
299
|
+
`{file_id: {modality: file_entry, ...}, ...}`.
|
|
300
|
+
|
|
301
|
+
#### `copy_dataset(input_path, output_path, *, index=None, sample=None) -> dict`
|
|
302
|
+
|
|
303
|
+
Copy dataset files to a new location (directory or `.zip`), preserving
|
|
304
|
+
structure. Returns `{"copied": int, "missing": int, "missing_files": [...]}`.
|
|
305
|
+
|
|
306
|
+
#### `split_dataset(source_path, ratios, target_paths, *, qualified_ids, seed) -> dict`
|
|
307
|
+
|
|
308
|
+
Split a single dataset into multiple targets by percentage.
|
|
309
|
+
|
|
310
|
+
#### `split_datasets(source_paths, suffixes, ratios, *, seed) -> dict`
|
|
311
|
+
|
|
312
|
+
Split multiple aligned datasets using their common ID intersection.
|
|
313
|
+
Target paths are derived by appending the suffix
|
|
314
|
+
(`/data/rgb` + `"train"` -> `/data/rgb_train`).
|
|
315
|
+
|
|
316
|
+
---
|
|
317
|
+
|
|
318
|
+
### Writer (`ds_crawler.writer`)
|
|
319
|
+
|
|
320
|
+
#### `DatasetWriter(root, *, name, type, euler_train, separator=":", **properties)`
|
|
321
|
+
|
|
322
|
+
Stateful helper that turns `(full_id, basename)` pairs into filesystem
|
|
323
|
+
paths while accumulating an `output.json`-compatible index.
|
|
324
|
+
|
|
325
|
+
| Method | Description |
|
|
326
|
+
|---|---|
|
|
327
|
+
| `get_path(full_id, basename, *, source_meta=None) -> Path` | Register a file and get the absolute path to write to. Directories are created automatically. |
|
|
328
|
+
| `build_output() -> dict` | Return the accumulated index as an output dict. |
|
|
329
|
+
| `save_index(filename="output.json") -> Path` | Persist the index to `<root>/.ds_crawler/<filename>`. |
|
|
330
|
+
|
|
331
|
+
`full_id` follows the format produced by euler-loading:
|
|
332
|
+
`/scene:Scene01/camera:Cam0/scene-Scene01+camera-Cam0+frame-00001`
|
|
333
|
+
where each `/key:value` segment maps to a directory level and the final
|
|
334
|
+
component is the ds-crawler file ID.
|
|
335
|
+
|
|
336
|
+
---
|
|
337
|
+
|
|
338
|
+
### Validation (`ds_crawler.validation`)
|
|
339
|
+
|
|
340
|
+
#### `validate_crawler_config(config, workdir=None) -> DatasetConfig`
|
|
341
|
+
|
|
342
|
+
Validate a `ds-crawler.json` dict. Raises `ValueError` on failure.
|
|
343
|
+
|
|
344
|
+
#### `validate_output(output) -> dict`
|
|
345
|
+
|
|
346
|
+
Validate an `output.json` object (single dict or list). Raises
|
|
347
|
+
`ValueError` on failure.
|
|
348
|
+
|
|
349
|
+
#### `validate_dataset(path) -> dict`
|
|
350
|
+
|
|
351
|
+
Check a dataset path for valid metadata files. Returns
|
|
352
|
+
`{"path", "has_config", "has_output", "config", "output"}`.
|
|
353
|
+
|
|
354
|
+
---
|
|
355
|
+
|
|
356
|
+
### Schema (`ds_crawler.schema`)
|
|
357
|
+
|
|
358
|
+
#### `DatasetDescriptor(name, path, type, properties={})`
|
|
359
|
+
|
|
360
|
+
Minimal dataset description dataclass. Class methods:
|
|
361
|
+
|
|
362
|
+
- `from_output(data, path) -> DatasetDescriptor`
|
|
363
|
+
- `from_output_file(path, dataset_root) -> list[DatasetDescriptor]`
|
|
364
|
+
|
|
365
|
+
---
|
|
366
|
+
|
|
367
|
+
### Config (`ds_crawler.config`)
|
|
368
|
+
|
|
369
|
+
#### `DatasetConfig`
|
|
370
|
+
|
|
371
|
+
Full dataset configuration (extends `DatasetDescriptor` with regex
|
|
372
|
+
fields). Usually created via `DatasetConfig.from_dict(data, workdir)` or
|
|
373
|
+
`load_dataset_config(data, workdir)`.
|
|
374
|
+
|
|
375
|
+
#### `Config(datasets: list[DatasetConfig])`
|
|
376
|
+
|
|
377
|
+
Container loaded with `Config.from_file(path, workdir)`.
|
|
378
|
+
|
|
379
|
+
---
|
|
380
|
+
|
|
381
|
+
## ZIP support
|
|
382
|
+
|
|
383
|
+
All operations work transparently with `.zip` archives. Paths like
|
|
384
|
+
`/data/dataset.zip` are handled the same as directories: the crawler
|
|
385
|
+
reads file listings from the archive, `copy_dataset` writes into a new
|
|
386
|
+
archive, and `save_index` / `write_outputs_per_dataset` embed
|
|
387
|
+
`output.json` inside the zip.
|
|
388
|
+
|
|
389
|
+
---
|
|
390
|
+
|
|
391
|
+
## Examples
|
|
392
|
+
|
|
393
|
+
See the [`examples/`](examples/) directory:
|
|
394
|
+
|
|
395
|
+
- **`example.py`** -- index a dataset, match against an existing index,
|
|
396
|
+
and copy a subset.
|
|
397
|
+
- **`sample_realdrivesim.py`** -- subsample a multi-modal RealDriveSim
|
|
398
|
+
dataset (RGB, depth, segmentation) preserving cross-modality alignment.
|
|
@@ -0,0 +1,75 @@
|
|
|
1
|
+
{
|
|
2
|
+
"datasets": [
|
|
3
|
+
{
|
|
4
|
+
"name": "VKITTI2",
|
|
5
|
+
"path": "/path/to/vkitti2/rgb",
|
|
6
|
+
"type": "rgb",
|
|
7
|
+
"path_regex": "^(?P<scene>Scene\\d+)/(?P<variation>[^/]+)/frames/rgb/(?P<camera>Camera_\\d+)/",
|
|
8
|
+
"basename_regex": "^rgb_(?P<frame>\\d+)\\.(?P<ext>jpg|png)$",
|
|
9
|
+
"named_capture_group_value_separator": ":",
|
|
10
|
+
"intrinsics_regex": "^(?P<scene>Scene\\d+)/(?P<variation>[^/]+)/intrinsics/(?P<camera>Camera_\\d+)_intrinsics\\.txt$",
|
|
11
|
+
"hierarchy_regex": "^(?P<scene>Scene\\d+)/(?P<variation>[^/]+)/frames/rgb/(?P<camera>Camera_\\d+)/rgb_(?P<frame>\\d+)\\.png$",
|
|
12
|
+
"id_regex": "^(?P<scene>Scene\\d+)/(?P<variation>[^/]+)/frames/rgb/(?P<camera>Camera_\\d+)/rgb_(?P<frame>\\d+)\\.(?:jpg|png)$",
|
|
13
|
+
"properties": {
|
|
14
|
+
"gt": true,
|
|
15
|
+
"baseline": true,
|
|
16
|
+
"euler_train": {
|
|
17
|
+
"used_as": "target",
|
|
18
|
+
"slot": "demo.target.rgb",
|
|
19
|
+
"modality_type": "rgb"
|
|
20
|
+
},
|
|
21
|
+
"meta": {
|
|
22
|
+
"range": [0, 255]
|
|
23
|
+
},
|
|
24
|
+
"specifier": "whatever",
|
|
25
|
+
"some_other_prop": 1234566
|
|
26
|
+
}
|
|
27
|
+
},
|
|
28
|
+
{
|
|
29
|
+
"name": "VKITTI2_depth",
|
|
30
|
+
"path": "/path/to/vkitti2/depth",
|
|
31
|
+
"type": "depth",
|
|
32
|
+
"path_regex": "^(?P<scene>Scene\\d+)/(?P<variation>[^/]+)/frames/depth/(?P<camera>Camera_\\d+)/",
|
|
33
|
+
"basename_regex": "^depth_(?P<frame>\\d+)\\.(?P<ext>png)$",
|
|
34
|
+
"named_capture_group_value_separator": ":",
|
|
35
|
+
"hierarchy_regex": "^(?P<scene>Scene\\d+)/(?P<variation>[^/]+)/frames/depth/(?P<camera>Camera_\\d+)/depth_(?P<frame>\\d+)\\.png$",
|
|
36
|
+
"id_regex": "^(?P<scene>Scene\\d+)/(?P<variation>[^/]+)/frames/depth/(?P<camera>Camera_\\d+)/depth_(?P<frame>\\d+)\\.png$",
|
|
37
|
+
"properties": {
|
|
38
|
+
"gt": true,
|
|
39
|
+
"euler_train": {
|
|
40
|
+
"used_as": "target",
|
|
41
|
+
"slot": "demo.target.depth",
|
|
42
|
+
"modality_type": "depth"
|
|
43
|
+
},
|
|
44
|
+
"meta": {
|
|
45
|
+
"radial_depth": false,
|
|
46
|
+
"scale_to_meters": 1.0,
|
|
47
|
+
"range": [0, 65535]
|
|
48
|
+
}
|
|
49
|
+
}
|
|
50
|
+
},
|
|
51
|
+
{
|
|
52
|
+
"name": "predicted_depth",
|
|
53
|
+
"path": "/path/to/predicted_depth",
|
|
54
|
+
"type": "depth",
|
|
55
|
+
"file_extensions": [".png", ".npy"],
|
|
56
|
+
"basename_regex": "^(?P<frame>\\d+)_pred\\.(?P<ext>png|npy)$",
|
|
57
|
+
"named_capture_group_value_separator": ":",
|
|
58
|
+
"id_regex": "^(?P<frame>\\d+)_pred\\.png$",
|
|
59
|
+
"hierarchy_regex": "^(?P<frame>\\d+)_pred\\.(?:png|npy)$",
|
|
60
|
+
"properties": {
|
|
61
|
+
"gt": false,
|
|
62
|
+
"euler_train": {
|
|
63
|
+
"used_as": "input",
|
|
64
|
+
"slot": "demo.input.depth",
|
|
65
|
+
"modality_type": "depth"
|
|
66
|
+
},
|
|
67
|
+
"meta": {
|
|
68
|
+
"radial_depth": false,
|
|
69
|
+
"scale_to_meters": 1.0,
|
|
70
|
+
"range": [0, 65535]
|
|
71
|
+
}
|
|
72
|
+
}
|
|
73
|
+
}
|
|
74
|
+
]
|
|
75
|
+
}
|
|
@@ -0,0 +1,50 @@
|
|
|
1
|
+
"""Dataset crawler package."""
|
|
2
|
+
|
|
3
|
+
from .schema import DatasetDescriptor
|
|
4
|
+
from .config import Config, DatasetConfig, load_dataset_config
|
|
5
|
+
from .validation import validate_crawler_config, validate_dataset, validate_output
|
|
6
|
+
from .parser import (
|
|
7
|
+
DatasetParser,
|
|
8
|
+
index_dataset,
|
|
9
|
+
index_dataset_from_files,
|
|
10
|
+
index_dataset_from_path,
|
|
11
|
+
)
|
|
12
|
+
from .traversal import (
|
|
13
|
+
collect_qualified_ids,
|
|
14
|
+
filter_index_by_qualified_ids,
|
|
15
|
+
get_files,
|
|
16
|
+
split_qualified_ids,
|
|
17
|
+
)
|
|
18
|
+
from .operations import (
|
|
19
|
+
align_datasets,
|
|
20
|
+
copy_dataset,
|
|
21
|
+
extract_datasets,
|
|
22
|
+
split_dataset,
|
|
23
|
+
split_datasets,
|
|
24
|
+
)
|
|
25
|
+
from .writer import DatasetWriter, ZipDatasetWriter
|
|
26
|
+
|
|
27
|
+
# Public API of the ds_crawler package: every name re-exported above is
# listed here so `from ds_crawler import *` and documentation tooling see
# exactly the supported surface. Keep alphabetical-ish grouping: classes
# first, then functions.
__all__ = [
    "DatasetDescriptor",
    "DatasetWriter",
    "ZipDatasetWriter",
    "Config",
    "DatasetConfig",
    "DatasetParser",
    "align_datasets",
    "collect_qualified_ids",
    "copy_dataset",
    "extract_datasets",
    "filter_index_by_qualified_ids",
    "get_files",
    "index_dataset",
    "index_dataset_from_files",
    "index_dataset_from_path",
    "load_dataset_config",
    "split_dataset",
    "split_datasets",
    "split_qualified_ids",
    "validate_crawler_config",
    "validate_dataset",
    "validate_output",
]
|
|
@@ -0,0 +1,130 @@
|
|
|
1
|
+
#!/usr/bin/env python3
|
|
2
|
+
"""Dataset crawler main entry point."""
|
|
3
|
+
|
|
4
|
+
import argparse
|
|
5
|
+
import json
|
|
6
|
+
import logging
|
|
7
|
+
import sys
|
|
8
|
+
from pathlib import Path
|
|
9
|
+
|
|
10
|
+
from ds_crawler.config import Config
|
|
11
|
+
from ds_crawler.parser import DatasetParser
|
|
12
|
+
|
|
13
|
+
|
|
14
|
+
def setup_logging(verbose: bool) -> None:
    """Install a root-logger handler that flushes after every record.

    Immediate flushing keeps log lines ordered and visible when stderr is
    buffered, e.g. under batch/cluster schedulers.

    Args:
        verbose: When True, log at DEBUG level; otherwise INFO.
    """

    class _AutoFlushHandler(logging.StreamHandler):
        # Flush right after each record so output appears immediately
        # instead of waiting for the stream buffer to fill.
        def emit(self, record):
            super().emit(record)
            self.flush()

    log_level = logging.DEBUG if verbose else logging.INFO

    stream_handler = _AutoFlushHandler(sys.stderr)
    stream_handler.setLevel(log_level)
    stream_handler.setFormatter(logging.Formatter("%(levelname)s: %(message)s"))

    # Drop any pre-existing handlers so repeated calls don't duplicate output.
    logging.root.handlers = []
    logging.root.addHandler(stream_handler)
    logging.root.setLevel(log_level)
|
|
31
|
+
|
|
32
|
+
def main() -> int:
    """CLI entry point: parse arguments, crawl datasets, write index files.

    Returns:
        Process exit code — 0 on success, 1 on any configuration or
        processing error (a message is printed to stderr).
    """
    arg_parser = argparse.ArgumentParser(
        description="Crawl datasets and extract metadata based on configuration."
    )
    arg_parser.add_argument(
        "config",
        type=Path,
        help="Path to the configuration JSON file",
    )
    arg_parser.add_argument(
        "-o",
        "--output",
        type=Path,
        default=None,
        help="Single output JSON file path. If not specified, writes output.json to each dataset's root folder.",
    )
    arg_parser.add_argument(
        "-v",
        "--verbose",
        action="store_true",
        help="Enable verbose output (show each skipped file)",
    )
    arg_parser.add_argument(
        "-w",
        "--workdir",
        type=Path,
        default=None,
        help="Working directory to prepend to dataset paths.",
    )
    arg_parser.add_argument(
        "-s",
        "--strict",
        action="store_true",
        help="Strict mode: abort on duplicate IDs or excessive regex misses.",
    )
    arg_parser.add_argument(
        "--sample",
        type=int,
        default=None,
        metavar="N",
        help="Keep every Nth regex-matched file (deterministic subsampling).",
    )
    arg_parser.add_argument(
        "--match-index",
        type=Path,
        default=None,
        metavar="PATH",
        help="Path to an output.json whose file IDs are used as a filter.",
    )

    args = arg_parser.parse_args()
    setup_logging(args.verbose)

    # Load and validate the multi-dataset configuration file.
    try:
        config = Config.from_file(args.config, workdir=args.workdir)
    except FileNotFoundError as e:
        print(f"Error: {e}", file=sys.stderr)
        return 1
    except ValueError as e:
        print(f"Configuration error: {e}", file=sys.stderr)
        return 1

    # Optionally load an existing index whose file IDs act as a whitelist.
    match_index = None
    if args.match_index is not None:
        try:
            with open(args.match_index) as f:
                match_index = json.load(f)
        except (FileNotFoundError, json.JSONDecodeError) as e:
            print(f"Error loading match-index: {e}", file=sys.stderr)
            return 1

    log = logging.getLogger(__name__)

    crawler = DatasetParser(
        config,
        strict=args.strict,
        sample=args.sample,
        match_index=match_index,
    )

    # Write either one combined output file or one output.json per dataset.
    try:
        if args.output:
            crawler.write_output(args.output)
            log.info(f"Output written to: {args.output}")
        else:
            for written in crawler.write_outputs_per_dataset():
                log.info(f"Output written to: {written}")
    except Exception as e:
        # Top-level CLI boundary: report and exit non-zero rather than traceback.
        print(f"Error processing datasets: {e}", file=sys.stderr)
        return 1

    return 0
|
127
|
+
|
|
128
|
+
|
|
129
|
+
if __name__ == "__main__":
    # Propagate main()'s return value as the process exit status.
    raise SystemExit(main())
|