tree-distribution-shift 0.1.0__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- tree_distribution_shift-0.1.0/LICENSE +21 -0
- tree_distribution_shift-0.1.0/PKG-INFO +281 -0
- tree_distribution_shift-0.1.0/README.md +265 -0
- tree_distribution_shift-0.1.0/pyproject.toml +30 -0
- tree_distribution_shift-0.1.0/setup.cfg +4 -0
- tree_distribution_shift-0.1.0/src/tree_distribution_shift.egg-info/PKG-INFO +281 -0
- tree_distribution_shift-0.1.0/src/tree_distribution_shift.egg-info/SOURCES.txt +14 -0
- tree_distribution_shift-0.1.0/src/tree_distribution_shift.egg-info/dependency_links.txt +1 -0
- tree_distribution_shift-0.1.0/src/tree_distribution_shift.egg-info/entry_points.txt +2 -0
- tree_distribution_shift-0.1.0/src/tree_distribution_shift.egg-info/requires.txt +4 -0
- tree_distribution_shift-0.1.0/src/tree_distribution_shift.egg-info/top_level.txt +1 -0
- tree_distribution_shift-0.1.0/src/tree_shift/__init__.py +5 -0
- tree_distribution_shift-0.1.0/src/tree_shift/cli.py +90 -0
- tree_distribution_shift-0.1.0/src/tree_shift/constants.py +2 -0
- tree_distribution_shift-0.1.0/src/tree_shift/export.py +139 -0
- tree_distribution_shift-0.1.0/src/tree_shift/hf.py +14 -0
|
@@ -0,0 +1,21 @@
|
|
|
1
|
+
MIT License
|
|
2
|
+
|
|
3
|
+
Copyright (c) 2026 Aaditya Nalawade
|
|
4
|
+
|
|
5
|
+
Permission is hereby granted, free of charge, to any person obtaining a copy
|
|
6
|
+
of this software and associated documentation files (the "Software"), to deal
|
|
7
|
+
in the Software without restriction, including without limitation the rights
|
|
8
|
+
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
|
|
9
|
+
copies of the Software, and to permit persons to whom the Software is
|
|
10
|
+
furnished to do so, subject to the following conditions:
|
|
11
|
+
|
|
12
|
+
The above copyright notice and this permission notice shall be included in all
|
|
13
|
+
copies or substantial portions of the Software.
|
|
14
|
+
|
|
15
|
+
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
|
|
16
|
+
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
|
|
17
|
+
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
|
|
18
|
+
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
|
|
19
|
+
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
|
|
20
|
+
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
|
|
21
|
+
SOFTWARE.
|
|
@@ -0,0 +1,281 @@
|
|
|
1
|
+
Metadata-Version: 2.4
|
|
2
|
+
Name: tree-distribution-shift
|
|
3
|
+
Version: 0.1.0
|
|
4
|
+
Summary: Tree Distribution Shift benchmark: export HF configs to COCO (train/id_test/ood_test + density buckets).
|
|
5
|
+
Author: Aaditya Nalawade
|
|
6
|
+
License-Expression: MIT
|
|
7
|
+
Project-URL: Homepage, https://huggingface.co/datasets/aadityabuilds/tree-distribution-shift
|
|
8
|
+
Requires-Python: >=3.9
|
|
9
|
+
Description-Content-Type: text/markdown
|
|
10
|
+
License-File: LICENSE
|
|
11
|
+
Requires-Dist: datasets>=2.19.0
|
|
12
|
+
Requires-Dist: huggingface_hub>=0.23.0
|
|
13
|
+
Requires-Dist: orjson>=3.9.0
|
|
14
|
+
Requires-Dist: tqdm>=4.66.0
|
|
15
|
+
Dynamic: license-file
|
|
16
|
+
|
|
17
|
+
# tree-distribution-shift
|
|
18
|
+
|
|
19
|
+
A pip-installable benchmark for **Tree Distribution Shift** — export HuggingFace dataset configs to COCO format on disk.
|
|
20
|
+
|
|
21
|
+
**No git. No git-lfs. Works on clusters.**
|
|
22
|
+
|
|
23
|
+
Data lives on the HF Hub: [`aadityabuilds/tree-distribution-shift`](https://huggingface.co/datasets/aadityabuilds/tree-distribution-shift)
|
|
24
|
+
|
|
25
|
+
## Getting started
|
|
26
|
+
|
|
27
|
+
```bash
|
|
28
|
+
pip install -U pip
|
|
29
|
+
pip install tree-distribution-shift
|
|
30
|
+
|
|
31
|
+
# See all available configs (queries HF Hub live)
|
|
32
|
+
tree-shift list
|
|
33
|
+
|
|
34
|
+
# Export a config to COCO format
|
|
35
|
+
tree-shift export --config intl_train_IN__ood_US --out ./data
|
|
36
|
+
```
|
|
37
|
+
|
|
38
|
+
That's it. No cloning, no LFS, no large downloads up front — data streams directly from the Hub.
|
|
39
|
+
|
|
40
|
+
### Authentication (recommended)
|
|
41
|
+
|
|
42
|
+
Set an HF token for higher rate limits and faster downloads:
|
|
43
|
+
|
|
44
|
+
```bash
|
|
45
|
+
export HF_TOKEN=hf_... # recommended
|
|
46
|
+
```
|
|
47
|
+
|
|
48
|
+
Or log in once with the CLI:
|
|
49
|
+
|
|
50
|
+
```bash
|
|
51
|
+
huggingface-cli login
|
|
52
|
+
```
|
|
53
|
+
|
|
54
|
+
You can also pass a token per-command:
|
|
55
|
+
|
|
56
|
+
```bash
|
|
57
|
+
tree-shift list --token hf_...
|
|
58
|
+
tree-shift export --config intl_train_IN__ood_US --out ./data --token hf_...
|
|
59
|
+
```
|
|
60
|
+
|
|
61
|
+
### Export only the splits you need
|
|
62
|
+
|
|
63
|
+
```bash
|
|
64
|
+
tree-shift export \
|
|
65
|
+
--config in_state_train_Karnataka__ood_Rajasthan \
|
|
66
|
+
--out ./data \
|
|
67
|
+
--splits train ood_test
|
|
68
|
+
```
|
|
69
|
+
|
|
70
|
+
### Pin a revision for reproducibility
|
|
71
|
+
|
|
72
|
+
For papers and exact replication, pin a commit hash:
|
|
73
|
+
|
|
74
|
+
```bash
|
|
75
|
+
tree-shift export \
|
|
76
|
+
--config in_state_train_Karnataka__ood_Rajasthan \
|
|
77
|
+
--out ./data \
|
|
78
|
+
--revision abc123def456
|
|
79
|
+
```
|
|
80
|
+
|
|
81
|
+
### Python API
|
|
82
|
+
|
|
83
|
+
```python
|
|
84
|
+
from tree_shift import list_configs, export_coco
|
|
85
|
+
|
|
86
|
+
print(list_configs())
|
|
87
|
+
|
|
88
|
+
export_coco(
|
|
89
|
+
config="in_state_train_Karnataka__ood_Rajasthan",
|
|
90
|
+
out_root="./data",
|
|
91
|
+
splits=["train", "id_test", "ood_test"],
|
|
92
|
+
streaming=True, # default; avoids OOM on low-mem machines
|
|
93
|
+
revision=None, # or pin a commit hash
|
|
94
|
+
)
|
|
95
|
+
```
|
|
96
|
+
|
|
97
|
+
## Output structure
|
|
98
|
+
|
|
99
|
+
```
|
|
100
|
+
./data/<config>/
|
|
101
|
+
train/
|
|
102
|
+
images/
|
|
103
|
+
*.png
|
|
104
|
+
annotations/
|
|
105
|
+
instances_train.json
|
|
106
|
+
id_test/
|
|
107
|
+
images/
|
|
108
|
+
annotations/
|
|
109
|
+
ood_test/
|
|
110
|
+
images/
|
|
111
|
+
annotations/
|
|
112
|
+
ood_same_density/
|
|
113
|
+
images/
|
|
114
|
+
annotations/
|
|
115
|
+
ood_diff_density/
|
|
116
|
+
images/
|
|
117
|
+
annotations/
|
|
118
|
+
```
|
|
119
|
+
|
|
120
|
+
Annotation files follow the [COCO format](https://cocodataset.org/#format-data).
|
|
121
|
+
|
|
122
|
+
## Dataset contract
|
|
123
|
+
|
|
124
|
+
Every example row in the HF dataset contains:
|
|
125
|
+
|
|
126
|
+
| Column | Type | Description |
|
|
127
|
+
|--------|------|-------------|
|
|
128
|
+
| `image_id` | int | Unique image identifier |
|
|
129
|
+
| `filename` | str | Image filename (e.g. `00001.png`) |
|
|
130
|
+
| `width` | int | Image width in pixels |
|
|
131
|
+
| `height` | int | Image height in pixels |
|
|
132
|
+
| `image_bytes` | bytes | Raw image file contents |
|
|
133
|
+
| `coco_annotations` | str | JSON string — list of COCO annotation dicts |
|
|
134
|
+
| `coco_categories` | str | JSON string — list of COCO category dicts |
|
|
135
|
+
| `country` | str | Country code (e.g. `IN`, `US`) |
|
|
136
|
+
| `state` | str | State name |
|
|
137
|
+
| `zone` | str | Geographic zone |
|
|
138
|
+
| `biome` | str | Biome classification |
|
|
139
|
+
| `density_category` | str | Tree density bucket |
|
|
140
|
+
|
|
141
|
+
### Expected splits
|
|
142
|
+
|
|
143
|
+
- **`train`** — training set
|
|
144
|
+
- **`id_test`** — in-distribution test set
|
|
145
|
+
- **`ood_test`** — out-of-distribution test set
|
|
146
|
+
- **`ood_same_density`** — OOD test filtered to same density bucket as train
|
|
147
|
+
- **`ood_diff_density`** — OOD test filtered to different density bucket
|
|
148
|
+
|
|
149
|
+
> **Note:** Some older configs may not include the `ood_same_density` / `ood_diff_density` splits.
|
|
150
|
+
|
|
151
|
+
## Resume support
|
|
152
|
+
|
|
153
|
+
Exports are **resume-safe**. If a job is preempted or interrupted:
|
|
154
|
+
|
|
155
|
+
```bash
|
|
156
|
+
# Just re-run the same command — already-written images are skipped
|
|
157
|
+
tree-shift export --config intl_train_IN__ood_US --out ./data
|
|
158
|
+
```
|
|
159
|
+
|
|
160
|
+
Images that already exist on disk are not re-downloaded. The COCO annotation JSON is always regenerated to stay consistent with the full split.
|
|
161
|
+
|
|
162
|
+
## Rate limits & performance
|
|
163
|
+
|
|
164
|
+
- **Streaming is on by default** — data is fetched shard-by-shard, avoiding OOM and memory-map errors.
|
|
165
|
+
- **`--no-streaming`** downloads the full split into an Arrow table first. This can OOM on large splits or cause `mmap` errors on low-memory machines. Use only if you have plenty of RAM.
|
|
166
|
+
- For faster downloads (when available), set the environment variable:
|
|
167
|
+
```bash
|
|
168
|
+
export HF_HUB_ENABLE_HF_TRANSFER=1
|
|
169
|
+
```
|
|
170
|
+
- Exports call `load_dataset` once per split (not per example), keeping HF API calls minimal.
|
|
171
|
+
- If you hit 429 rate-limit errors, wait a few minutes and re-run — resume will skip already-written images.
|
|
172
|
+
|
|
173
|
+
## Cache location (clusters/shared machines)
|
|
174
|
+
|
|
175
|
+
The exporter uses the standard HuggingFace cache. On clusters with limited home directory space, set a custom cache location on a large disk:
|
|
176
|
+
|
|
177
|
+
```bash
|
|
178
|
+
export HF_HOME=/path/to/big/disk/hf_cache
|
|
179
|
+
```
|
|
180
|
+
|
|
181
|
+
This prevents re-downloading and reduces load on the HF Hub.
|
|
182
|
+
|
|
183
|
+
## Diagnostic command
|
|
184
|
+
|
|
185
|
+
If exports fail or behave unexpectedly, run the doctor command and share the output:
|
|
186
|
+
|
|
187
|
+
```bash
|
|
188
|
+
tree-shift doctor
|
|
189
|
+
```
|
|
190
|
+
|
|
191
|
+
Output includes package versions, environment variables, and Python version — helpful for debugging "works on my machine" issues.
|
|
192
|
+
|
|
193
|
+
## Manifest file
|
|
194
|
+
|
|
195
|
+
Every export writes a `manifest.json` in the config output folder:
|
|
196
|
+
|
|
197
|
+
```json
|
|
198
|
+
{
|
|
199
|
+
"repo_id": "aadityabuilds/tree-distribution-shift",
|
|
200
|
+
"config": "intl_train_IN__ood_US",
|
|
201
|
+
"revision": "abc123def456",
|
|
202
|
+
"splits": ["train", "ood_test"],
|
|
203
|
+
"timestamp": "2026-02-17T04:20:00+00:00",
|
|
204
|
+
"counts": {
|
|
205
|
+
"train": {"images": 11783, "annotations": 181208},
|
|
206
|
+
"ood_test": {"images": 2345, "annotations": 36000}
|
|
207
|
+
}
|
|
208
|
+
}
|
|
209
|
+
```
|
|
210
|
+
|
|
211
|
+
This is essential for reproducibility: it records the exact revision, splits exported, and counts per split. Include `manifest.json` when sharing or publishing data exports.
|
|
212
|
+
|
|
213
|
+
## API reference
|
|
214
|
+
|
|
215
|
+
### `list_configs(repo_id=..., revision=None) -> list[str]`
|
|
216
|
+
|
|
217
|
+
Returns the list of available dataset configs from the Hub. No data is downloaded.
|
|
218
|
+
|
|
219
|
+
### `export_coco(config, out_root, repo_id=..., splits=None, revision=None, streaming=True) -> Path`
|
|
220
|
+
|
|
221
|
+
Exports a config to COCO folders on disk.
|
|
222
|
+
|
|
223
|
+
| Parameter | Default | Description |
|
|
224
|
+
|-------------|----------------------------------|------------------------------------------------|
|
|
225
|
+
| `config` | *(required)* | Name of the HF dataset config |
|
|
226
|
+
| `out_root` | *(required)* | Root output directory |
|
|
227
|
+
| `splits` | all 5 splits | Which splits to export |
|
|
228
|
+
| `streaming` | `True` | Stream data (recommended for low-mem machines) |
|
|
229
|
+
| `revision` | `None` | HF dataset revision / commit hash |
|
|
230
|
+
|
|
231
|
+
## Dependencies
|
|
232
|
+
|
|
233
|
+
This package is intentionally lightweight:
|
|
234
|
+
|
|
235
|
+
- `datasets` — HF dataset loading
|
|
236
|
+
- `huggingface_hub` — Hub API access
|
|
237
|
+
- `orjson` — fast JSON serialization
|
|
238
|
+
- `tqdm` — progress display
|
|
239
|
+
|
|
240
|
+
No `torch`, `pycocotools`, or `opencv` required.
|
|
241
|
+
|
|
242
|
+
## Why this package?
|
|
243
|
+
|
|
244
|
+
- **No git / git-lfs** — works on clusters where git-lfs is painful.
|
|
245
|
+
- **Streaming by default** — avoids OOM and memory-map errors.
|
|
246
|
+
- **Resume-safe** — re-run after interruption without re-downloading.
|
|
247
|
+
- **Uses HF cache** — respects revisions, reproducible.
|
|
248
|
+
- **Thin client** — new configs on the Hub are visible immediately via `list_configs()` (no pip update needed).
|
|
249
|
+
|
|
250
|
+
## Versioning policy
|
|
251
|
+
|
|
252
|
+
The pip package contains tools only. Data versions are pinned with `--revision`.
|
|
253
|
+
|
|
254
|
+
- **0.1.x** — exporter stability, backward compatible CLI/API.
|
|
255
|
+
- **Minor bump** — when on-disk COCO layout or CLI flags change.
|
|
256
|
+
- **Major bump** — if split semantics change (unlikely).
|
|
257
|
+
|
|
258
|
+
New configs on the HF Hub are visible immediately via `tree-shift list`; no pip update required to access new data.
|
|
259
|
+
|
|
260
|
+
## Maintenance
|
|
261
|
+
|
|
262
|
+
- **Adding new configs/data** → push to the HF dataset repo only.
|
|
263
|
+
- **Package updates** → only needed for new CLI flags or exporter behavior changes.
|
|
264
|
+
- **Users see new configs automatically** via `tree-shift list`.
|
|
265
|
+
|
|
266
|
+
## Citation
|
|
267
|
+
|
|
268
|
+
If you use this dataset in research, please cite:
|
|
269
|
+
|
|
270
|
+
```bibtex
|
|
271
|
+
@misc{nalawade2025tree_distribution_shift,
|
|
272
|
+
title={Tree Distribution Shift: A Benchmark for Out-of-Distribution Tree Detection},
|
|
273
|
+
author={Nalawade, Aaditya},
|
|
274
|
+
year={2025},
|
|
275
|
+
howpublished={\url{https://huggingface.co/datasets/aadityabuilds/tree-distribution-shift}}
|
|
276
|
+
}
|
|
277
|
+
```
|
|
278
|
+
|
|
279
|
+
## License
|
|
280
|
+
|
|
281
|
+
MIT
|
|
@@ -0,0 +1,265 @@
|
|
|
1
|
+
# tree-distribution-shift
|
|
2
|
+
|
|
3
|
+
A pip-installable benchmark for **Tree Distribution Shift** — export HuggingFace dataset configs to COCO format on disk.
|
|
4
|
+
|
|
5
|
+
**No git. No git-lfs. Works on clusters.**
|
|
6
|
+
|
|
7
|
+
Data lives on the HF Hub: [`aadityabuilds/tree-distribution-shift`](https://huggingface.co/datasets/aadityabuilds/tree-distribution-shift)
|
|
8
|
+
|
|
9
|
+
## Getting started
|
|
10
|
+
|
|
11
|
+
```bash
|
|
12
|
+
pip install -U pip
|
|
13
|
+
pip install tree-distribution-shift
|
|
14
|
+
|
|
15
|
+
# See all available configs (queries HF Hub live)
|
|
16
|
+
tree-shift list
|
|
17
|
+
|
|
18
|
+
# Export a config to COCO format
|
|
19
|
+
tree-shift export --config intl_train_IN__ood_US --out ./data
|
|
20
|
+
```
|
|
21
|
+
|
|
22
|
+
That's it. No cloning, no LFS, no large downloads up front — data streams directly from the Hub.
|
|
23
|
+
|
|
24
|
+
### Authentication (recommended)
|
|
25
|
+
|
|
26
|
+
Set an HF token for higher rate limits and faster downloads:
|
|
27
|
+
|
|
28
|
+
```bash
|
|
29
|
+
export HF_TOKEN=hf_... # recommended
|
|
30
|
+
```
|
|
31
|
+
|
|
32
|
+
Or log in once with the CLI:
|
|
33
|
+
|
|
34
|
+
```bash
|
|
35
|
+
huggingface-cli login
|
|
36
|
+
```
|
|
37
|
+
|
|
38
|
+
You can also pass a token per-command:
|
|
39
|
+
|
|
40
|
+
```bash
|
|
41
|
+
tree-shift list --token hf_...
|
|
42
|
+
tree-shift export --config intl_train_IN__ood_US --out ./data --token hf_...
|
|
43
|
+
```
|
|
44
|
+
|
|
45
|
+
### Export only the splits you need
|
|
46
|
+
|
|
47
|
+
```bash
|
|
48
|
+
tree-shift export \
|
|
49
|
+
--config in_state_train_Karnataka__ood_Rajasthan \
|
|
50
|
+
--out ./data \
|
|
51
|
+
--splits train ood_test
|
|
52
|
+
```
|
|
53
|
+
|
|
54
|
+
### Pin a revision for reproducibility
|
|
55
|
+
|
|
56
|
+
For papers and exact replication, pin a commit hash:
|
|
57
|
+
|
|
58
|
+
```bash
|
|
59
|
+
tree-shift export \
|
|
60
|
+
--config in_state_train_Karnataka__ood_Rajasthan \
|
|
61
|
+
--out ./data \
|
|
62
|
+
--revision abc123def456
|
|
63
|
+
```
|
|
64
|
+
|
|
65
|
+
### Python API
|
|
66
|
+
|
|
67
|
+
```python
|
|
68
|
+
from tree_shift import list_configs, export_coco
|
|
69
|
+
|
|
70
|
+
print(list_configs())
|
|
71
|
+
|
|
72
|
+
export_coco(
|
|
73
|
+
config="in_state_train_Karnataka__ood_Rajasthan",
|
|
74
|
+
out_root="./data",
|
|
75
|
+
splits=["train", "id_test", "ood_test"],
|
|
76
|
+
streaming=True, # default; avoids OOM on low-mem machines
|
|
77
|
+
revision=None, # or pin a commit hash
|
|
78
|
+
)
|
|
79
|
+
```
|
|
80
|
+
|
|
81
|
+
## Output structure
|
|
82
|
+
|
|
83
|
+
```
|
|
84
|
+
./data/<config>/
|
|
85
|
+
train/
|
|
86
|
+
images/
|
|
87
|
+
*.png
|
|
88
|
+
annotations/
|
|
89
|
+
instances_train.json
|
|
90
|
+
id_test/
|
|
91
|
+
images/
|
|
92
|
+
annotations/
|
|
93
|
+
ood_test/
|
|
94
|
+
images/
|
|
95
|
+
annotations/
|
|
96
|
+
ood_same_density/
|
|
97
|
+
images/
|
|
98
|
+
annotations/
|
|
99
|
+
ood_diff_density/
|
|
100
|
+
images/
|
|
101
|
+
annotations/
|
|
102
|
+
```
|
|
103
|
+
|
|
104
|
+
Annotation files follow the [COCO format](https://cocodataset.org/#format-data).
|
|
105
|
+
|
|
106
|
+
## Dataset contract
|
|
107
|
+
|
|
108
|
+
Every example row in the HF dataset contains:
|
|
109
|
+
|
|
110
|
+
| Column | Type | Description |
|
|
111
|
+
|--------|------|-------------|
|
|
112
|
+
| `image_id` | int | Unique image identifier |
|
|
113
|
+
| `filename` | str | Image filename (e.g. `00001.png`) |
|
|
114
|
+
| `width` | int | Image width in pixels |
|
|
115
|
+
| `height` | int | Image height in pixels |
|
|
116
|
+
| `image_bytes` | bytes | Raw image file contents |
|
|
117
|
+
| `coco_annotations` | str | JSON string — list of COCO annotation dicts |
|
|
118
|
+
| `coco_categories` | str | JSON string — list of COCO category dicts |
|
|
119
|
+
| `country` | str | Country code (e.g. `IN`, `US`) |
|
|
120
|
+
| `state` | str | State name |
|
|
121
|
+
| `zone` | str | Geographic zone |
|
|
122
|
+
| `biome` | str | Biome classification |
|
|
123
|
+
| `density_category` | str | Tree density bucket |
|
|
124
|
+
|
|
125
|
+
### Expected splits
|
|
126
|
+
|
|
127
|
+
- **`train`** — training set
|
|
128
|
+
- **`id_test`** — in-distribution test set
|
|
129
|
+
- **`ood_test`** — out-of-distribution test set
|
|
130
|
+
- **`ood_same_density`** — OOD test filtered to same density bucket as train
|
|
131
|
+
- **`ood_diff_density`** — OOD test filtered to different density bucket
|
|
132
|
+
|
|
133
|
+
> **Note:** Some older configs may not include the `ood_same_density` / `ood_diff_density` splits.
|
|
134
|
+
|
|
135
|
+
## Resume support
|
|
136
|
+
|
|
137
|
+
Exports are **resume-safe**. If a job is preempted or interrupted:
|
|
138
|
+
|
|
139
|
+
```bash
|
|
140
|
+
# Just re-run the same command — already-written images are skipped
|
|
141
|
+
tree-shift export --config intl_train_IN__ood_US --out ./data
|
|
142
|
+
```
|
|
143
|
+
|
|
144
|
+
Images that already exist on disk are not re-downloaded. The COCO annotation JSON is always regenerated to stay consistent with the full split.
|
|
145
|
+
|
|
146
|
+
## Rate limits & performance
|
|
147
|
+
|
|
148
|
+
- **Streaming is on by default** — data is fetched shard-by-shard, avoiding OOM and memory-map errors.
|
|
149
|
+
- **`--no-streaming`** downloads the full split into an Arrow table first. This can OOM on large splits or cause `mmap` errors on low-memory machines. Use only if you have plenty of RAM.
|
|
150
|
+
- For faster downloads (when available), set the environment variable:
|
|
151
|
+
```bash
|
|
152
|
+
export HF_HUB_ENABLE_HF_TRANSFER=1
|
|
153
|
+
```
|
|
154
|
+
- Exports call `load_dataset` once per split (not per example), keeping HF API calls minimal.
|
|
155
|
+
- If you hit 429 rate-limit errors, wait a few minutes and re-run — resume will skip already-written images.
|
|
156
|
+
|
|
157
|
+
## Cache location (clusters/shared machines)
|
|
158
|
+
|
|
159
|
+
The exporter uses the standard HuggingFace cache. On clusters with limited home directory space, set a custom cache location on a large disk:
|
|
160
|
+
|
|
161
|
+
```bash
|
|
162
|
+
export HF_HOME=/path/to/big/disk/hf_cache
|
|
163
|
+
```
|
|
164
|
+
|
|
165
|
+
This prevents re-downloading and reduces load on the HF Hub.
|
|
166
|
+
|
|
167
|
+
## Diagnostic command
|
|
168
|
+
|
|
169
|
+
If exports fail or behave unexpectedly, run the doctor command and share the output:
|
|
170
|
+
|
|
171
|
+
```bash
|
|
172
|
+
tree-shift doctor
|
|
173
|
+
```
|
|
174
|
+
|
|
175
|
+
Output includes package versions, environment variables, and Python version — helpful for debugging "works on my machine" issues.
|
|
176
|
+
|
|
177
|
+
## Manifest file
|
|
178
|
+
|
|
179
|
+
Every export writes a `manifest.json` in the config output folder:
|
|
180
|
+
|
|
181
|
+
```json
|
|
182
|
+
{
|
|
183
|
+
"repo_id": "aadityabuilds/tree-distribution-shift",
|
|
184
|
+
"config": "intl_train_IN__ood_US",
|
|
185
|
+
"revision": "abc123def456",
|
|
186
|
+
"splits": ["train", "ood_test"],
|
|
187
|
+
"timestamp": "2026-02-17T04:20:00+00:00",
|
|
188
|
+
"counts": {
|
|
189
|
+
"train": {"images": 11783, "annotations": 181208},
|
|
190
|
+
"ood_test": {"images": 2345, "annotations": 36000}
|
|
191
|
+
}
|
|
192
|
+
}
|
|
193
|
+
```
|
|
194
|
+
|
|
195
|
+
This is essential for reproducibility: it records the exact revision, splits exported, and counts per split. Include `manifest.json` when sharing or publishing data exports.
|
|
196
|
+
|
|
197
|
+
## API reference
|
|
198
|
+
|
|
199
|
+
### `list_configs(repo_id=..., revision=None) -> list[str]`
|
|
200
|
+
|
|
201
|
+
Returns the list of available dataset configs from the Hub. No data is downloaded.
|
|
202
|
+
|
|
203
|
+
### `export_coco(config, out_root, repo_id=..., splits=None, revision=None, streaming=True) -> Path`
|
|
204
|
+
|
|
205
|
+
Exports a config to COCO folders on disk.
|
|
206
|
+
|
|
207
|
+
| Parameter | Default | Description |
|
|
208
|
+
|-------------|----------------------------------|------------------------------------------------|
|
|
209
|
+
| `config` | *(required)* | Name of the HF dataset config |
|
|
210
|
+
| `out_root` | *(required)* | Root output directory |
|
|
211
|
+
| `splits` | all 5 splits | Which splits to export |
|
|
212
|
+
| `streaming` | `True` | Stream data (recommended for low-mem machines) |
|
|
213
|
+
| `revision` | `None` | HF dataset revision / commit hash |
|
|
214
|
+
|
|
215
|
+
## Dependencies
|
|
216
|
+
|
|
217
|
+
This package is intentionally lightweight:
|
|
218
|
+
|
|
219
|
+
- `datasets` — HF dataset loading
|
|
220
|
+
- `huggingface_hub` — Hub API access
|
|
221
|
+
- `orjson` — fast JSON serialization
|
|
222
|
+
- `tqdm` — progress display
|
|
223
|
+
|
|
224
|
+
No `torch`, `pycocotools`, or `opencv` required.
|
|
225
|
+
|
|
226
|
+
## Why this package?
|
|
227
|
+
|
|
228
|
+
- **No git / git-lfs** — works on clusters where git-lfs is painful.
|
|
229
|
+
- **Streaming by default** — avoids OOM and memory-map errors.
|
|
230
|
+
- **Resume-safe** — re-run after interruption without re-downloading.
|
|
231
|
+
- **Uses HF cache** — respects revisions, reproducible.
|
|
232
|
+
- **Thin client** — new configs on the Hub are visible immediately via `list_configs()` (no pip update needed).
|
|
233
|
+
|
|
234
|
+
## Versioning policy
|
|
235
|
+
|
|
236
|
+
The pip package contains tools only. Data versions are pinned with `--revision`.
|
|
237
|
+
|
|
238
|
+
- **0.1.x** — exporter stability, backward compatible CLI/API.
|
|
239
|
+
- **Minor bump** — when on-disk COCO layout or CLI flags change.
|
|
240
|
+
- **Major bump** — if split semantics change (unlikely).
|
|
241
|
+
|
|
242
|
+
New configs on the HF Hub are visible immediately via `tree-shift list`; no pip update required to access new data.
|
|
243
|
+
|
|
244
|
+
## Maintenance
|
|
245
|
+
|
|
246
|
+
- **Adding new configs/data** → push to the HF dataset repo only.
|
|
247
|
+
- **Package updates** → only needed for new CLI flags or exporter behavior changes.
|
|
248
|
+
- **Users see new configs automatically** via `tree-shift list`.
|
|
249
|
+
|
|
250
|
+
## Citation
|
|
251
|
+
|
|
252
|
+
If you use this dataset in research, please cite:
|
|
253
|
+
|
|
254
|
+
```bibtex
|
|
255
|
+
@misc{nalawade2025tree_distribution_shift,
|
|
256
|
+
title={Tree Distribution Shift: A Benchmark for Out-of-Distribution Tree Detection},
|
|
257
|
+
author={Nalawade, Aaditya},
|
|
258
|
+
year={2025},
|
|
259
|
+
howpublished={\url{https://huggingface.co/datasets/aadityabuilds/tree-distribution-shift}}
|
|
260
|
+
}
|
|
261
|
+
```
|
|
262
|
+
|
|
263
|
+
## License
|
|
264
|
+
|
|
265
|
+
MIT
|
|
@@ -0,0 +1,30 @@
|
|
|
1
|
+
[build-system]
|
|
2
|
+
requires = ["setuptools>=68", "wheel"]
|
|
3
|
+
build-backend = "setuptools.build_meta"
|
|
4
|
+
|
|
5
|
+
[project]
|
|
6
|
+
name = "tree-distribution-shift"
|
|
7
|
+
version = "0.1.0"
|
|
8
|
+
description = "Tree Distribution Shift benchmark: export HF configs to COCO (train/id_test/ood_test + density buckets)."
|
|
9
|
+
readme = "README.md"
|
|
10
|
+
requires-python = ">=3.9"
|
|
11
|
+
license = "MIT"
|
|
12
|
+
authors = [{name = "Aaditya Nalawade"}]
|
|
13
|
+
dependencies = [
|
|
14
|
+
"datasets>=2.19.0",
|
|
15
|
+
"huggingface_hub>=0.23.0",
|
|
16
|
+
"orjson>=3.9.0",
|
|
17
|
+
"tqdm>=4.66.0",
|
|
18
|
+
]
|
|
19
|
+
|
|
20
|
+
[project.urls]
|
|
21
|
+
Homepage = "https://huggingface.co/datasets/aadityabuilds/tree-distribution-shift"
|
|
22
|
+
|
|
23
|
+
[project.scripts]
|
|
24
|
+
tree-shift = "tree_shift.cli:main"
|
|
25
|
+
|
|
26
|
+
[tool.setuptools]
|
|
27
|
+
package-dir = {"" = "src"}
|
|
28
|
+
|
|
29
|
+
[tool.setuptools.packages.find]
|
|
30
|
+
where = ["src"]
|
|
@@ -0,0 +1,281 @@
|
|
|
1
|
+
Metadata-Version: 2.4
|
|
2
|
+
Name: tree-distribution-shift
|
|
3
|
+
Version: 0.1.0
|
|
4
|
+
Summary: Tree Distribution Shift benchmark: export HF configs to COCO (train/id_test/ood_test + density buckets).
|
|
5
|
+
Author: Aaditya Nalawade
|
|
6
|
+
License-Expression: MIT
|
|
7
|
+
Project-URL: Homepage, https://huggingface.co/datasets/aadityabuilds/tree-distribution-shift
|
|
8
|
+
Requires-Python: >=3.9
|
|
9
|
+
Description-Content-Type: text/markdown
|
|
10
|
+
License-File: LICENSE
|
|
11
|
+
Requires-Dist: datasets>=2.19.0
|
|
12
|
+
Requires-Dist: huggingface_hub>=0.23.0
|
|
13
|
+
Requires-Dist: orjson>=3.9.0
|
|
14
|
+
Requires-Dist: tqdm>=4.66.0
|
|
15
|
+
Dynamic: license-file
|
|
16
|
+
|
|
17
|
+
# tree-distribution-shift
|
|
18
|
+
|
|
19
|
+
A pip-installable benchmark for **Tree Distribution Shift** — export HuggingFace dataset configs to COCO format on disk.
|
|
20
|
+
|
|
21
|
+
**No git. No git-lfs. Works on clusters.**
|
|
22
|
+
|
|
23
|
+
Data lives on the HF Hub: [`aadityabuilds/tree-distribution-shift`](https://huggingface.co/datasets/aadityabuilds/tree-distribution-shift)
|
|
24
|
+
|
|
25
|
+
## Getting started
|
|
26
|
+
|
|
27
|
+
```bash
|
|
28
|
+
pip install -U pip
|
|
29
|
+
pip install tree-distribution-shift
|
|
30
|
+
|
|
31
|
+
# See all available configs (queries HF Hub live)
|
|
32
|
+
tree-shift list
|
|
33
|
+
|
|
34
|
+
# Export a config to COCO format
|
|
35
|
+
tree-shift export --config intl_train_IN__ood_US --out ./data
|
|
36
|
+
```
|
|
37
|
+
|
|
38
|
+
That's it. No cloning, no LFS, no large downloads up front — data streams directly from the Hub.
|
|
39
|
+
|
|
40
|
+
### Authentication (recommended)
|
|
41
|
+
|
|
42
|
+
Set an HF token for higher rate limits and faster downloads:
|
|
43
|
+
|
|
44
|
+
```bash
|
|
45
|
+
export HF_TOKEN=hf_... # recommended
|
|
46
|
+
```
|
|
47
|
+
|
|
48
|
+
Or log in once with the CLI:
|
|
49
|
+
|
|
50
|
+
```bash
|
|
51
|
+
huggingface-cli login
|
|
52
|
+
```
|
|
53
|
+
|
|
54
|
+
You can also pass a token per-command:
|
|
55
|
+
|
|
56
|
+
```bash
|
|
57
|
+
tree-shift list --token hf_...
|
|
58
|
+
tree-shift export --config intl_train_IN__ood_US --out ./data --token hf_...
|
|
59
|
+
```
|
|
60
|
+
|
|
61
|
+
### Export only the splits you need
|
|
62
|
+
|
|
63
|
+
```bash
|
|
64
|
+
tree-shift export \
|
|
65
|
+
--config in_state_train_Karnataka__ood_Rajasthan \
|
|
66
|
+
--out ./data \
|
|
67
|
+
--splits train ood_test
|
|
68
|
+
```
|
|
69
|
+
|
|
70
|
+
### Pin a revision for reproducibility
|
|
71
|
+
|
|
72
|
+
For papers and exact replication, pin a commit hash:
|
|
73
|
+
|
|
74
|
+
```bash
|
|
75
|
+
tree-shift export \
|
|
76
|
+
--config in_state_train_Karnataka__ood_Rajasthan \
|
|
77
|
+
--out ./data \
|
|
78
|
+
--revision abc123def456
|
|
79
|
+
```
|
|
80
|
+
|
|
81
|
+
### Python API
|
|
82
|
+
|
|
83
|
+
```python
|
|
84
|
+
from tree_shift import list_configs, export_coco
|
|
85
|
+
|
|
86
|
+
print(list_configs())
|
|
87
|
+
|
|
88
|
+
export_coco(
|
|
89
|
+
config="in_state_train_Karnataka__ood_Rajasthan",
|
|
90
|
+
out_root="./data",
|
|
91
|
+
splits=["train", "id_test", "ood_test"],
|
|
92
|
+
streaming=True, # default; avoids OOM on low-mem machines
|
|
93
|
+
revision=None, # or pin a commit hash
|
|
94
|
+
)
|
|
95
|
+
```
|
|
96
|
+
|
|
97
|
+
## Output structure
|
|
98
|
+
|
|
99
|
+
```
|
|
100
|
+
./data/<config>/
|
|
101
|
+
train/
|
|
102
|
+
images/
|
|
103
|
+
*.png
|
|
104
|
+
annotations/
|
|
105
|
+
instances_train.json
|
|
106
|
+
id_test/
|
|
107
|
+
images/
|
|
108
|
+
annotations/
|
|
109
|
+
ood_test/
|
|
110
|
+
images/
|
|
111
|
+
annotations/
|
|
112
|
+
ood_same_density/
|
|
113
|
+
images/
|
|
114
|
+
annotations/
|
|
115
|
+
ood_diff_density/
|
|
116
|
+
images/
|
|
117
|
+
annotations/
|
|
118
|
+
```
|
|
119
|
+
|
|
120
|
+
Annotation files follow the [COCO format](https://cocodataset.org/#format-data).
|
|
121
|
+
|
|
122
|
+
## Dataset contract
|
|
123
|
+
|
|
124
|
+
Every example row in the HF dataset contains:
|
|
125
|
+
|
|
126
|
+
| Column | Type | Description |
|
|
127
|
+
|--------|------|-------------|
|
|
128
|
+
| `image_id` | int | Unique image identifier |
|
|
129
|
+
| `filename` | str | Image filename (e.g. `00001.png`) |
|
|
130
|
+
| `width` | int | Image width in pixels |
|
|
131
|
+
| `height` | int | Image height in pixels |
|
|
132
|
+
| `image_bytes` | bytes | Raw image file contents |
|
|
133
|
+
| `coco_annotations` | str | JSON string — list of COCO annotation dicts |
|
|
134
|
+
| `coco_categories` | str | JSON string — list of COCO category dicts |
|
|
135
|
+
| `country` | str | Country code (e.g. `IN`, `US`) |
|
|
136
|
+
| `state` | str | State name |
|
|
137
|
+
| `zone` | str | Geographic zone |
|
|
138
|
+
| `biome` | str | Biome classification |
|
|
139
|
+
| `density_category` | str | Tree density bucket |
|
|
140
|
+
|
|
141
|
+
### Expected splits
|
|
142
|
+
|
|
143
|
+
- **`train`** — training set
|
|
144
|
+
- **`id_test`** — in-distribution test set
|
|
145
|
+
- **`ood_test`** — out-of-distribution test set
|
|
146
|
+
- **`ood_same_density`** — OOD test filtered to same density bucket as train
|
|
147
|
+
- **`ood_diff_density`** — OOD test filtered to different density bucket
|
|
148
|
+
|
|
149
|
+
> **Note:** Some older configs may not include the `ood_same_density` / `ood_diff_density` splits.
|
|
150
|
+
|
|
151
|
+
## Resume support
|
|
152
|
+
|
|
153
|
+
Exports are **resume-safe**. If a job is preempted or interrupted:
|
|
154
|
+
|
|
155
|
+
```bash
|
|
156
|
+
# Just re-run the same command — already-written images are skipped
|
|
157
|
+
tree-shift export --config intl_train_IN__ood_US --out ./data
|
|
158
|
+
```
|
|
159
|
+
|
|
160
|
+
Images that already exist on disk are not re-downloaded. The COCO annotation JSON is always regenerated to stay consistent with the full split.
|
|
161
|
+
|
|
162
|
+
## Rate limits & performance
|
|
163
|
+
|
|
164
|
+
- **Streaming is on by default** — data is fetched shard-by-shard, avoiding OOM and memory-map errors.
|
|
165
|
+
- **`--no-streaming`** downloads the full split into an Arrow table first. This can OOM on large splits or cause `mmap` errors on low-memory machines. Use only if you have plenty of RAM.
|
|
166
|
+
- For faster downloads (when available), set the environment variable:
|
|
167
|
+
```bash
|
|
168
|
+
export HF_HUB_ENABLE_HF_TRANSFER=1
|
|
169
|
+
```
|
|
170
|
+
- Exports call `load_dataset` once per split (not per example), keeping HF API calls minimal.
|
|
171
|
+
- If you hit 429 rate-limit errors, wait a few minutes and re-run — resume will skip already-written images.
|
|
172
|
+
|
|
173
|
+
## Cache location (clusters/shared machines)
|
|
174
|
+
|
|
175
|
+
The exporter uses the standard HuggingFace cache. On clusters with limited home directory space, set a custom cache location on a large disk:
|
|
176
|
+
|
|
177
|
+
```bash
|
|
178
|
+
export HF_HOME=/path/to/big/disk/hf_cache
|
|
179
|
+
```
|
|
180
|
+
|
|
181
|
+
This prevents re-downloading and reduces load on the HF Hub.
|
|
182
|
+
|
|
183
|
+
## Diagnostic command
|
|
184
|
+
|
|
185
|
+
If exports fail or behave unexpectedly, run the doctor command and share the output:
|
|
186
|
+
|
|
187
|
+
```bash
|
|
188
|
+
tree-shift doctor
|
|
189
|
+
```
|
|
190
|
+
|
|
191
|
+
Output includes package versions, environment variables, and Python version — helpful for debugging "works on my machine" issues.
|
|
192
|
+
|
|
193
|
+
## Manifest file
|
|
194
|
+
|
|
195
|
+
Every export writes a `manifest.json` in the config output folder:
|
|
196
|
+
|
|
197
|
+
```json
|
|
198
|
+
{
|
|
199
|
+
"repo_id": "aadityabuilds/tree-distribution-shift",
|
|
200
|
+
"config": "intl_train_IN__ood_US",
|
|
201
|
+
"revision": "abc123def456",
|
|
202
|
+
"splits": ["train", "ood_test"],
|
|
203
|
+
"timestamp": "2026-02-17T04:20:00+00:00",
|
|
204
|
+
"counts": {
|
|
205
|
+
"train": {"images": 11783, "annotations": 181208},
|
|
206
|
+
"ood_test": {"images": 2345, "annotations": 36000}
|
|
207
|
+
}
|
|
208
|
+
}
|
|
209
|
+
```
|
|
210
|
+
|
|
211
|
+
This is essential for reproducibility: it records the exact revision, splits exported, and counts per split. Include `manifest.json` when sharing or publishing data exports.
|
|
212
|
+
|
|
213
|
+
## API reference
|
|
214
|
+
|
|
215
|
+
### `list_configs(repo_id=..., revision=None) -> list[str]`
|
|
216
|
+
|
|
217
|
+
Returns the list of available dataset configs from the Hub. No data is downloaded.
|
|
218
|
+
|
|
219
|
+
### `export_coco(config, out_root, repo_id=..., splits=None, revision=None, streaming=True) -> Path`
|
|
220
|
+
|
|
221
|
+
Exports a config to COCO folders on disk.
|
|
222
|
+
|
|
223
|
+
| Parameter | Default | Description |
|
|
224
|
+
|-------------|----------------------------------|------------------------------------------------|
|
|
225
|
+
| `config`    | *(required)*                              | Name of the HF dataset config                  |
| `repo_id`   | `aadityabuilds/tree-distribution-shift`   | HF dataset repository to export from           |
|
|
226
|
+
| `out_root` | *(required)* | Root output directory |
|
|
227
|
+
| `splits` | all 5 splits | Which splits to export |
|
|
228
|
+
| `streaming` | `True` | Stream data (recommended for low-mem machines) |
|
|
229
|
+
| `revision` | `None` | HF dataset revision / commit hash |
|
|
230
|
+
|
|
231
|
+
## Dependencies
|
|
232
|
+
|
|
233
|
+
This package is intentionally lightweight:
|
|
234
|
+
|
|
235
|
+
- `datasets` — HF dataset loading
|
|
236
|
+
- `huggingface_hub` — Hub API access
|
|
237
|
+
- `orjson` — fast JSON serialization
|
|
238
|
+
- `tqdm` — progress display
|
|
239
|
+
|
|
240
|
+
No `torch`, `pycocotools`, or `opencv` required.
|
|
241
|
+
|
|
242
|
+
## Why this package?
|
|
243
|
+
|
|
244
|
+
- **No git / git-lfs** — works on clusters where git-lfs is painful.
|
|
245
|
+
- **Streaming by default** — avoids OOM and memory-map errors.
|
|
246
|
+
- **Resume-safe** — re-run after interruption without re-downloading.
|
|
247
|
+
- **Uses HF cache** — respects revisions, reproducible.
|
|
248
|
+
- **Thin client** — new configs on the Hub are visible immediately via `list_configs()` (no pip update needed).
|
|
249
|
+
|
|
250
|
+
## Versioning policy
|
|
251
|
+
|
|
252
|
+
The pip package contains tools only. Data versions are pinned with `--revision`.
|
|
253
|
+
|
|
254
|
+
- **0.1.x** — exporter stability, backward compatible CLI/API.
|
|
255
|
+
- **Minor bump** — when on-disk COCO layout or CLI flags change.
|
|
256
|
+
- **Major bump** — if split semantics change (unlikely).
|
|
257
|
+
|
|
258
|
+
New configs on the HF Hub are visible immediately via `tree-shift list`; no pip update required to access new data.
|
|
259
|
+
|
|
260
|
+
## Maintenance
|
|
261
|
+
|
|
262
|
+
- **Adding new configs/data** → push to the HF dataset repo only.
|
|
263
|
+
- **Package updates** → only needed for new CLI flags or exporter behavior changes.
|
|
264
|
+
- **Users see new configs automatically** via `tree-shift list`.
|
|
265
|
+
|
|
266
|
+
## Citation
|
|
267
|
+
|
|
268
|
+
If you use this dataset in research, please cite:
|
|
269
|
+
|
|
270
|
+
```bibtex
|
|
271
|
+
@misc{nalawade2025tree_distribution_shift,
|
|
272
|
+
title={Tree Distribution Shift: A Benchmark for Out-of-Distribution Tree Detection},
|
|
273
|
+
author={Nalawade, Aaditya},
|
|
274
|
+
year={2025},
|
|
275
|
+
howpublished={\url{https://huggingface.co/datasets/aadityabuilds/tree-distribution-shift}}
|
|
276
|
+
}
|
|
277
|
+
```
|
|
278
|
+
|
|
279
|
+
## License
|
|
280
|
+
|
|
281
|
+
MIT
|
|
@@ -0,0 +1,14 @@
|
|
|
1
|
+
LICENSE
|
|
2
|
+
README.md
|
|
3
|
+
pyproject.toml
|
|
4
|
+
src/tree_distribution_shift.egg-info/PKG-INFO
|
|
5
|
+
src/tree_distribution_shift.egg-info/SOURCES.txt
|
|
6
|
+
src/tree_distribution_shift.egg-info/dependency_links.txt
|
|
7
|
+
src/tree_distribution_shift.egg-info/entry_points.txt
|
|
8
|
+
src/tree_distribution_shift.egg-info/requires.txt
|
|
9
|
+
src/tree_distribution_shift.egg-info/top_level.txt
|
|
10
|
+
src/tree_shift/__init__.py
|
|
11
|
+
src/tree_shift/cli.py
|
|
12
|
+
src/tree_shift/constants.py
|
|
13
|
+
src/tree_shift/export.py
|
|
14
|
+
src/tree_shift/hf.py
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
tree_shift
|
|
@@ -0,0 +1,90 @@
|
|
|
1
|
+
from __future__ import annotations
|
|
2
|
+
|
|
3
|
+
import argparse
|
|
4
|
+
import logging
|
|
5
|
+
import os
|
|
6
|
+
import sys
|
|
7
|
+
|
|
8
|
+
import datasets
|
|
9
|
+
import huggingface_hub
|
|
10
|
+
|
|
11
|
+
from .constants import DEFAULT_SPLITS, HF_REPO_ID
|
|
12
|
+
from .export import export_coco
|
|
13
|
+
from .hf import list_configs
|
|
14
|
+
from . import __version__
|
|
15
|
+
|
|
16
|
+
|
|
17
|
+
def _setup_token(token: str | None) -> None:
    """Log in to the HF Hub when a token is given; otherwise hint at HF_TOKEN.

    Args:
        token: Explicit Hub token from the CLI, or ``None`` to fall back to
            any token already configured in the environment.
    """
    # The hub's auth helper emits a noisy "unauthenticated requests" warning;
    # silence that specific logger before touching token state.
    auth_logger = logging.getLogger("huggingface_hub.utils._auth")
    auth_logger.setLevel(logging.ERROR)

    if token:
        huggingface_hub.login(token=token, add_to_git_credential=False)
        return

    # No explicit token and none configured: nudge the user toward
    # authenticating, since anonymous requests get lower rate limits.
    if not huggingface_hub.utils.get_token():
        print(
            "Tip: set HF_TOKEN for higher rate limits "
            "(export HF_TOKEN=... or run: huggingface-cli login)",
            file=sys.stderr,
        )
|
|
30
|
+
|
|
31
|
+
|
|
32
|
+
def _doctor() -> int:
    """Print environment and version diagnostics for bug reports.

    Returns:
        ``0`` always (process exit code).
    """
    token = huggingface_hub.utils.get_token()
    # Build the report as one key-per-line list so the output is easy to
    # paste into an issue; then emit it line by line.
    report = [
        f"tree-distribution-shift version: {__version__}",
        f"datasets version: {datasets.__version__}",
        f"huggingface_hub version: {huggingface_hub.__version__}",
        f"HF_TOKEN: {'set' if token else 'not set'}",
        f"HF_HUB_ENABLE_HF_TRANSFER: {os.environ.get('HF_HUB_ENABLE_HF_TRANSFER', 'not set')}",
        f"HF_HOME: {os.environ.get('HF_HOME', 'not set (using default ~/.cache/huggingface)')}",
        f"Default repo id: {HF_REPO_ID}",
        f"Python version: {sys.version}",
    ]
    for line in report:
        print(line)
    return 0
|
|
44
|
+
|
|
45
|
+
|
|
46
|
+
def main() -> int:
    """CLI entry point: parse arguments and dispatch to list / export / doctor.

    Fix: the original was annotated ``-> None`` yet returned ``_doctor()``'s
    int from one branch and ``None`` from the others, so the console-script
    exit code was inconsistent. All branches now return an explicit int
    exit code (0 on success), matching the ``-> int`` annotation.

    Returns:
        Process exit code: 0 on success.
    """
    ap = argparse.ArgumentParser(prog="tree-shift")
    sub = ap.add_subparsers(dest="cmd", required=True)

    # ── list ──────────────────────────────────────────────────────────
    ls = sub.add_parser("list", help="List available configs from the Hub")
    ls.add_argument("--repo", default=HF_REPO_ID)
    ls.add_argument("--revision", default=None)
    ls.add_argument("--token", default=None, help="HF Hub token (or set HF_TOKEN env var)")

    # ── export ────────────────────────────────────────────────────────
    ex = sub.add_parser("export", help="Export a config to COCO on disk")
    ex.add_argument("--repo", default=HF_REPO_ID)
    ex.add_argument("--config", required=True)
    ex.add_argument("--out", required=True)
    ex.add_argument("--revision", default=None)
    ex.add_argument("--splits", nargs="*", default=DEFAULT_SPLITS)
    ex.add_argument("--no-streaming", action="store_true")
    ex.add_argument("--token", default=None, help="HF Hub token (or set HF_TOKEN env var)")

    # ── doctor ───────────────────────────────────────────────────────
    sub.add_parser("doctor", help="Print diagnostic info for debugging")

    args = ap.parse_args()

    if args.cmd == "doctor":
        # Diagnostics need no authentication; handle before token setup.
        return _doctor()

    _setup_token(getattr(args, "token", None))

    if args.cmd == "list":
        for c in list_configs(repo_id=args.repo, revision=args.revision):
            print(c)
        return 0

    if args.cmd == "export":
        export_coco(
            repo_id=args.repo,
            config=args.config,
            out_root=args.out,
            splits=args.splits,
            revision=args.revision,
            streaming=(not args.no_streaming),
        )
        return 0

    # Unreachable: argparse rejects unknown sub-commands (required=True).
    return 2
|
|
@@ -0,0 +1,139 @@
|
|
|
1
|
+
from __future__ import annotations
|
|
2
|
+
|
|
3
|
+
from datetime import datetime, timezone
|
|
4
|
+
from pathlib import Path
|
|
5
|
+
from typing import Any, Dict, Iterable, List, Optional, Union
|
|
6
|
+
|
|
7
|
+
import orjson
|
|
8
|
+
from datasets import load_dataset
|
|
9
|
+
from tqdm import tqdm
|
|
10
|
+
|
|
11
|
+
from .constants import DEFAULT_SPLITS, HF_REPO_ID
|
|
12
|
+
|
|
13
|
+
|
|
14
|
+
class _SplitStats:
    """Mutable counters accumulated while exporting one split."""

    def __init__(self) -> None:
        # images: examples seen; skipped: image files already on disk
        # (resume case); annotations: total COCO annotations emitted.
        self.images, self.skipped, self.annotations = 0, 0, 0
|
|
20
|
+
|
|
21
|
+
|
|
22
|
+
def _export_split_iter(ds: Iterable[dict], out_dir: Path, split_name: str) -> _SplitStats:
    """Stream a single split to COCO format on disk.

    Resume-safe: images that already exist on disk are skipped.
    The COCO annotation JSON is always regenerated so it stays
    consistent with the full set of images.

    Args:
        ds: Iterable of example dicts. Each example is read for
            ``image_id``, ``filename``, ``width``, ``height``,
            ``image_bytes``, ``coco_annotations`` and ``coco_categories``
            (the last two are JSON strings).
        out_dir: Config-level output directory; ``<out_dir>/<split_name>/``
            subfolders are created beneath it.
        split_name: Split being exported; used in folder names, the
            annotation filename, and progress messages.

    Returns:
        ``_SplitStats`` with examples seen, images skipped, and
        annotations written.
    """
    images_dir = out_dir / split_name / "images"
    ann_dir = out_dir / split_name / "annotations"
    images_dir.mkdir(parents=True, exist_ok=True)
    ann_dir.mkdir(parents=True, exist_ok=True)

    # COCO "images" and "annotations" sections are accumulated in memory
    # and written once at the end; annotation ids are renumbered from 1
    # so they are unique within this split's JSON.
    images: list[dict] = []
    annotations: list[dict] = []
    categories: list[dict] | None = None
    ann_id = 1

    stats = _SplitStats()

    for ex in ds:
        stats.images += 1
        # Periodic progress line (tqdm.write avoids clobbering any active bar).
        if stats.images % 500 == 0:
            tqdm.write(f"[{split_name}] processed {stats.images} images ({stats.skipped} skipped)...")

        image_id = int(ex["image_id"])
        filename = ex["filename"]
        width = int(ex["width"])
        height = int(ex["height"])

        # Resume-safe: skip writing if the image already exists on disk
        img_path = images_dir / filename
        if img_path.exists():
            stats.skipped += 1
        else:
            img_path.write_bytes(ex["image_bytes"])

        # The image entry is always recorded — even when the file write was
        # skipped — so the regenerated JSON covers the full split.
        images.append(
            {"id": image_id, "file_name": filename, "width": width, "height": height}
        )

        annos = orjson.loads(ex["coco_annotations"].encode("utf-8"))
        # Categories are captured from the first example only; later
        # examples' category lists are not compared against it.
        if categories is None:
            categories = orjson.loads(ex["coco_categories"].encode("utf-8"))

        for a in annos:
            # Copy before mutating so the parsed source dict stays untouched.
            a = dict(a)
            a["id"] = ann_id
            ann_id += 1
            annotations.append(a)

    stats.annotations = len(annotations)

    coco = {
        "images": images,
        "annotations": annotations,
        "categories": categories or [],
    }
    # Empty split (no examples) yields an empty-but-valid COCO file.
    (ann_dir / f"instances_{split_name}.json").write_bytes(orjson.dumps(coco))
    tqdm.write(
        f"[{split_name}] done — {stats.images} images ({stats.skipped} skipped), "
        f"{stats.annotations} annotations"
    )

    return stats
|
|
86
|
+
|
|
87
|
+
|
|
88
|
+
def export_coco(
    config: str,
    out_root: Union[str, Path],
    repo_id: str = HF_REPO_ID,
    splits: Optional[List[str]] = None,
    revision: Optional[str] = None,
    streaming: bool = True,
) -> Path:
    """
    Export a HF dataset config to COCO folders on disk.

    Layout: <out_root>/<config>/<split>/{images, annotations}, plus a
    <out_root>/<config>/manifest.json recording repo, revision, splits,
    timestamp, and per-split counts.

    Args:
        config: HF dataset config name.
        out_root: Root output directory (created if missing).
        repo_id: HF dataset repository id.
        splits: Splits to export; defaults to ``DEFAULT_SPLITS``.
        revision: Optional dataset revision / commit hash to pin.
        streaming: Fetch shard-by-shard when True (default).

    Returns:
        Path of the config output directory.
    """
    if splits is None:
        splits = DEFAULT_SPLITS

    out_dir = Path(out_root) / config
    out_dir.mkdir(parents=True, exist_ok=True)

    split_counts: Dict[str, Any] = {}
    for split in splits:
        tqdm.write(f"Exporting split '{split}' for config '{config}' ...")
        # One load_dataset call per split keeps Hub API traffic minimal.
        ds = load_dataset(
            repo_id,
            config,
            split=split,
            streaming=streaming,
            revision=revision,
        )
        result = _export_split_iter(ds, out_dir, split)
        split_counts[split] = {
            "images": result.images,
            "annotations": result.annotations,
        }

    # Record provenance so the export is reproducible and shareable.
    manifest = {
        "repo_id": repo_id,
        "config": config,
        "revision": revision,
        "splits": splits,
        "timestamp": datetime.now(timezone.utc).isoformat(),
        "counts": split_counts,
    }
    manifest_path = out_dir / "manifest.json"
    manifest_path.write_bytes(orjson.dumps(manifest, option=orjson.OPT_INDENT_2))
    tqdm.write(f"Manifest written to {manifest_path}")

    return out_dir
|
|
@@ -0,0 +1,14 @@
|
|
|
1
|
+
from __future__ import annotations
|
|
2
|
+
|
|
3
|
+
from typing import List, Optional
|
|
4
|
+
|
|
5
|
+
from datasets import get_dataset_config_names
|
|
6
|
+
|
|
7
|
+
from .constants import HF_REPO_ID
|
|
8
|
+
|
|
9
|
+
|
|
10
|
+
def list_configs(
    repo_id: str = HF_REPO_ID, revision: Optional[str] = None
) -> List[str]:
    """Return the config names published in the HF dataset repo.

    Only repo metadata is queried; no dataset files are downloaded.

    Args:
        repo_id: HF dataset repository id.
        revision: Optional revision / commit hash to query.
    """
    names = get_dataset_config_names(repo_id, revision=revision)
    return names
|