tree-distribution-shift 0.1.0__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,21 @@
1
+ MIT License
2
+
3
+ Copyright (c) 2026 Aaditya Nalawade
4
+
5
+ Permission is hereby granted, free of charge, to any person obtaining a copy
6
+ of this software and associated documentation files (the "Software"), to deal
7
+ in the Software without restriction, including without limitation the rights
8
+ to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
9
+ copies of the Software, and to permit persons to whom the Software is
10
+ furnished to do so, subject to the following conditions:
11
+
12
+ The above copyright notice and this permission notice shall be included in all
13
+ copies or substantial portions of the Software.
14
+
15
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16
+ IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17
+ FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
18
+ AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19
+ LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20
+ OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
21
+ SOFTWARE.
@@ -0,0 +1,281 @@
1
+ Metadata-Version: 2.4
2
+ Name: tree-distribution-shift
3
+ Version: 0.1.0
4
+ Summary: Tree Distribution Shift benchmark: export HF configs to COCO (train/id_test/ood_test + density buckets).
5
+ Author: Aaditya Nalawade
6
+ License-Expression: MIT
7
+ Project-URL: Homepage, https://huggingface.co/datasets/aadityabuilds/tree-distribution-shift
8
+ Requires-Python: >=3.9
9
+ Description-Content-Type: text/markdown
10
+ License-File: LICENSE
11
+ Requires-Dist: datasets>=2.19.0
12
+ Requires-Dist: huggingface_hub>=0.23.0
13
+ Requires-Dist: orjson>=3.9.0
14
+ Requires-Dist: tqdm>=4.66.0
15
+ Dynamic: license-file
16
+
17
+ # tree-distribution-shift
18
+
19
+ A pip-installable benchmark for **Tree Distribution Shift** — export HuggingFace dataset configs to COCO format on disk.
20
+
21
+ **No git. No git-lfs. Works on clusters.**
22
+
23
+ Data lives on the HF Hub: [`aadityabuilds/tree-distribution-shift`](https://huggingface.co/datasets/aadityabuilds/tree-distribution-shift)
24
+
25
+ ## Getting started
26
+
27
+ ```bash
28
+ pip install -U pip
29
+ pip install tree-distribution-shift
30
+
31
+ # See all available configs (queries HF Hub live)
32
+ tree-shift list
33
+
34
+ # Export a config to COCO format
35
+ tree-shift export --config intl_train_IN__ood_US --out ./data
36
+ ```
37
+
38
+ That's it. No cloning, no LFS, no large downloads up front — data streams directly from the Hub.
39
+
40
+ ### Authentication (recommended)
41
+
42
+ Set an HF token for higher rate limits and faster downloads:
43
+
44
+ ```bash
45
+ export HF_TOKEN=hf_... # recommended
46
+ ```
47
+
48
+ Or log in once with the CLI:
49
+
50
+ ```bash
51
+ huggingface-cli login
52
+ ```
53
+
54
+ You can also pass a token per-command:
55
+
56
+ ```bash
57
+ tree-shift list --token hf_...
58
+ tree-shift export --config intl_train_IN__ood_US --out ./data --token hf_...
59
+ ```
60
+
61
+ ### Export only the splits you need
62
+
63
+ ```bash
64
+ tree-shift export \
65
+ --config in_state_train_Karnataka__ood_Rajasthan \
66
+ --out ./data \
67
+ --splits train ood_test
68
+ ```
69
+
70
+ ### Pin a revision for reproducibility
71
+
72
+ For papers and exact replication, pin a commit hash:
73
+
74
+ ```bash
75
+ tree-shift export \
76
+ --config in_state_train_Karnataka__ood_Rajasthan \
77
+ --out ./data \
78
+ --revision abc123def456
79
+ ```
80
+
81
+ ### Python API
82
+
83
+ ```python
84
+ from tree_shift import list_configs, export_coco
85
+
86
+ print(list_configs())
87
+
88
+ export_coco(
89
+ config="in_state_train_Karnataka__ood_Rajasthan",
90
+ out_root="./data",
91
+ splits=["train", "id_test", "ood_test"],
92
+ streaming=True, # default; avoids OOM on low-mem machines
93
+ revision=None, # or pin a commit hash
94
+ )
95
+ ```
96
+
97
+ ## Output structure
98
+
99
+ ```
100
+ ./data/<config>/
101
+ train/
102
+ images/
103
+ *.png
104
+ annotations/
105
+ instances_train.json
106
+ id_test/
107
+ images/
108
+ annotations/
109
+ ood_test/
110
+ images/
111
+ annotations/
112
+ ood_same_density/
113
+ images/
114
+ annotations/
115
+ ood_diff_density/
116
+ images/
117
+ annotations/
118
+ ```
119
+
120
+ Annotation files follow the [COCO format](https://cocodataset.org/#format-data).
121
+
122
+ ## Dataset contract
123
+
124
+ Every example row in the HF dataset contains:
125
+
126
+ | Column | Type | Description |
127
+ |--------|------|-------------|
128
+ | `image_id` | int | Unique image identifier |
129
+ | `filename` | str | Image filename (e.g. `00001.png`) |
130
+ | `width` | int | Image width in pixels |
131
+ | `height` | int | Image height in pixels |
132
+ | `image_bytes` | bytes | Raw image file contents |
133
+ | `coco_annotations` | str | JSON string — list of COCO annotation dicts |
134
+ | `coco_categories` | str | JSON string — list of COCO category dicts |
135
+ | `country` | str | Country code (e.g. `IN`, `US`) |
136
+ | `state` | str | State name |
137
+ | `zone` | str | Geographic zone |
138
+ | `biome` | str | Biome classification |
139
+ | `density_category` | str | Tree density bucket |
140
+
141
+ ### Expected splits
142
+
143
+ - **`train`** — training set
144
+ - **`id_test`** — in-distribution test set
145
+ - **`ood_test`** — out-of-distribution test set
146
+ - **`ood_same_density`** — OOD test filtered to same density bucket as train
147
+ - **`ood_diff_density`** — OOD test filtered to different density bucket
148
+
149
+ > **Note:** Some older configs may not include the `ood_same_density` / `ood_diff_density` splits.
150
+
151
+ ## Resume support
152
+
153
+ Exports are **resume-safe**. If a job is preempted or interrupted:
154
+
155
+ ```bash
156
+ # Just re-run the same command — already-written images are skipped
157
+ tree-shift export --config intl_train_IN__ood_US --out ./data
158
+ ```
159
+
160
+ Images that already exist on disk are not re-downloaded. The COCO annotation JSON is always regenerated to stay consistent with the full split.
161
+
162
+ ## Rate limits & performance
163
+
164
+ - **Streaming is on by default** — data is fetched shard-by-shard, avoiding OOM and memory-map errors.
165
+ - **`--no-streaming`** downloads the full split into an Arrow table first. This can OOM on large splits or cause `mmap` errors on low-memory machines. Use only if you have plenty of RAM.
166
+ - For faster downloads (when available), set the environment variable:
167
+ ```bash
168
+ export HF_HUB_ENABLE_HF_TRANSFER=1
169
+ ```
170
+ - Exports call `load_dataset` once per split (not per example), keeping HF API calls minimal.
171
+ - If you hit 429 rate-limit errors, wait a few minutes and re-run — resume will skip already-written images.
172
+
173
+ ## Cache location (clusters/shared machines)
174
+
175
+ The exporter uses the standard HuggingFace cache. On clusters with limited home directory space, set a custom cache location on a large disk:
176
+
177
+ ```bash
178
+ export HF_HOME=/path/to/big/disk/hf_cache
179
+ ```
180
+
181
+ This prevents re-downloading and reduces load on the HF Hub.
182
+
183
+ ## Diagnostic command
184
+
185
+ If exports fail or behave unexpectedly, run the doctor command and share the output:
186
+
187
+ ```bash
188
+ tree-shift doctor
189
+ ```
190
+
191
+ Output includes package versions, environment variables, and Python version — helpful for debugging "works on my machine" issues.
192
+
193
+ ## Manifest file
194
+
195
+ Every export writes a `manifest.json` in the config output folder:
196
+
197
+ ```json
198
+ {
199
+ "repo_id": "aadityabuilds/tree-distribution-shift",
200
+ "config": "intl_train_IN__ood_US",
201
+ "revision": "abc123def456",
202
+ "splits": ["train", "ood_test"],
203
+ "timestamp": "2026-02-17T04:20:00+00:00",
204
+ "counts": {
205
+ "train": {"images": 11783, "annotations": 181208},
206
+ "ood_test": {"images": 2345, "annotations": 36000}
207
+ }
208
+ }
209
+ ```
210
+
211
+ This is essential for reproducibility: it records the exact revision, splits exported, and counts per split. Include `manifest.json` when sharing or publishing data exports.
212
+
213
+ ## API reference
214
+
215
+ ### `list_configs(repo_id=..., revision=None) -> list[str]`
216
+
217
+ Returns the list of available dataset configs from the Hub. No data is downloaded.
218
+
219
+ ### `export_coco(config, out_root, repo_id=..., splits=None, revision=None, streaming=True) -> Path`
220
+
221
+ Exports a config to COCO folders on disk.
222
+
223
+ | Parameter | Default | Description |
224
+ |-------------|----------------------------------|------------------------------------------------|
225
+ | `config` | *(required)* | Name of the HF dataset config |
226
+ | `out_root` | *(required)* | Root output directory |
227
+ | `splits` | all 5 splits | Which splits to export |
228
+ | `streaming` | `True` | Stream data (recommended for low-mem machines) |
229
+ | `revision` | `None` | HF dataset revision / commit hash |
230
+
231
+ ## Dependencies
232
+
233
+ This package is intentionally lightweight:
234
+
235
+ - `datasets` — HF dataset loading
236
+ - `huggingface_hub` — Hub API access
237
+ - `orjson` — fast JSON serialization
238
+ - `tqdm` — progress display
239
+
240
+ No `torch`, `pycocotools`, or `opencv` required.
241
+
242
+ ## Why this package?
243
+
244
+ - **No git / git-lfs** — works on clusters where git-lfs is painful.
245
+ - **Streaming by default** — avoids OOM and memory-map errors.
246
+ - **Resume-safe** — re-run after interruption without re-downloading.
247
+ - **Uses HF cache** — respects revisions, reproducible.
248
+ - **Thin client** — new configs on the Hub are visible immediately via `list_configs()` (no pip update needed).
249
+
250
+ ## Versioning policy
251
+
252
+ The pip package contains tools only. Data versions are pinned with `--revision`.
253
+
254
+ - **0.1.x** — exporter stability, backward compatible CLI/API.
255
+ - **Minor bump** — when on-disk COCO layout or CLI flags change.
256
+ - **Major bump** — if split semantics change (unlikely).
257
+
258
+ New configs on the HF Hub are visible immediately via `tree-shift list`; no pip update required to access new data.
259
+
260
+ ## Maintenance
261
+
262
+ - **Adding new configs/data** → push to the HF dataset repo only.
263
+ - **Package updates** → only needed for new CLI flags or exporter behavior changes.
264
+ - **Users see new configs automatically** via `tree-shift list`.
265
+
266
+ ## Citation
267
+
268
+ If you use this dataset in research, please cite:
269
+
270
+ ```bibtex
271
+ @misc{nalawade2025tree_distribution_shift,
272
+ title={Tree Distribution Shift: A Benchmark for Out-of-Distribution Tree Detection},
273
+ author={Nalawade, Aaditya},
274
+ year={2025},
275
+ howpublished={\url{https://huggingface.co/datasets/aadityabuilds/tree-distribution-shift}}
276
+ }
277
+ ```
278
+
279
+ ## License
280
+
281
+ MIT
@@ -0,0 +1,265 @@
1
+ # tree-distribution-shift
2
+
3
+ A pip-installable benchmark for **Tree Distribution Shift** — export HuggingFace dataset configs to COCO format on disk.
4
+
5
+ **No git. No git-lfs. Works on clusters.**
6
+
7
+ Data lives on the HF Hub: [`aadityabuilds/tree-distribution-shift`](https://huggingface.co/datasets/aadityabuilds/tree-distribution-shift)
8
+
9
+ ## Getting started
10
+
11
+ ```bash
12
+ pip install -U pip
13
+ pip install tree-distribution-shift
14
+
15
+ # See all available configs (queries HF Hub live)
16
+ tree-shift list
17
+
18
+ # Export a config to COCO format
19
+ tree-shift export --config intl_train_IN__ood_US --out ./data
20
+ ```
21
+
22
+ That's it. No cloning, no LFS, no large downloads up front — data streams directly from the Hub.
23
+
24
+ ### Authentication (recommended)
25
+
26
+ Set an HF token for higher rate limits and faster downloads:
27
+
28
+ ```bash
29
+ export HF_TOKEN=hf_... # recommended
30
+ ```
31
+
32
+ Or log in once with the CLI:
33
+
34
+ ```bash
35
+ huggingface-cli login
36
+ ```
37
+
38
+ You can also pass a token per-command:
39
+
40
+ ```bash
41
+ tree-shift list --token hf_...
42
+ tree-shift export --config intl_train_IN__ood_US --out ./data --token hf_...
43
+ ```
44
+
45
+ ### Export only the splits you need
46
+
47
+ ```bash
48
+ tree-shift export \
49
+ --config in_state_train_Karnataka__ood_Rajasthan \
50
+ --out ./data \
51
+ --splits train ood_test
52
+ ```
53
+
54
+ ### Pin a revision for reproducibility
55
+
56
+ For papers and exact replication, pin a commit hash:
57
+
58
+ ```bash
59
+ tree-shift export \
60
+ --config in_state_train_Karnataka__ood_Rajasthan \
61
+ --out ./data \
62
+ --revision abc123def456
63
+ ```
64
+
65
+ ### Python API
66
+
67
+ ```python
68
+ from tree_shift import list_configs, export_coco
69
+
70
+ print(list_configs())
71
+
72
+ export_coco(
73
+ config="in_state_train_Karnataka__ood_Rajasthan",
74
+ out_root="./data",
75
+ splits=["train", "id_test", "ood_test"],
76
+ streaming=True, # default; avoids OOM on low-mem machines
77
+ revision=None, # or pin a commit hash
78
+ )
79
+ ```
80
+
81
+ ## Output structure
82
+
83
+ ```
84
+ ./data/<config>/
85
+ train/
86
+ images/
87
+ *.png
88
+ annotations/
89
+ instances_train.json
90
+ id_test/
91
+ images/
92
+ annotations/
93
+ ood_test/
94
+ images/
95
+ annotations/
96
+ ood_same_density/
97
+ images/
98
+ annotations/
99
+ ood_diff_density/
100
+ images/
101
+ annotations/
102
+ ```
103
+
104
+ Annotation files follow the [COCO format](https://cocodataset.org/#format-data).
105
+
106
+ ## Dataset contract
107
+
108
+ Every example row in the HF dataset contains:
109
+
110
+ | Column | Type | Description |
111
+ |--------|------|-------------|
112
+ | `image_id` | int | Unique image identifier |
113
+ | `filename` | str | Image filename (e.g. `00001.png`) |
114
+ | `width` | int | Image width in pixels |
115
+ | `height` | int | Image height in pixels |
116
+ | `image_bytes` | bytes | Raw image file contents |
117
+ | `coco_annotations` | str | JSON string — list of COCO annotation dicts |
118
+ | `coco_categories` | str | JSON string — list of COCO category dicts |
119
+ | `country` | str | Country code (e.g. `IN`, `US`) |
120
+ | `state` | str | State name |
121
+ | `zone` | str | Geographic zone |
122
+ | `biome` | str | Biome classification |
123
+ | `density_category` | str | Tree density bucket |
124
+
125
+ ### Expected splits
126
+
127
+ - **`train`** — training set
128
+ - **`id_test`** — in-distribution test set
129
+ - **`ood_test`** — out-of-distribution test set
130
+ - **`ood_same_density`** — OOD test filtered to same density bucket as train
131
+ - **`ood_diff_density`** — OOD test filtered to different density bucket
132
+
133
+ > **Note:** Some older configs may not include the `ood_same_density` / `ood_diff_density` splits.
134
+
135
+ ## Resume support
136
+
137
+ Exports are **resume-safe**. If a job is preempted or interrupted:
138
+
139
+ ```bash
140
+ # Just re-run the same command — already-written images are skipped
141
+ tree-shift export --config intl_train_IN__ood_US --out ./data
142
+ ```
143
+
144
+ Images that already exist on disk are not re-downloaded. The COCO annotation JSON is always regenerated to stay consistent with the full split.
145
+
146
+ ## Rate limits & performance
147
+
148
+ - **Streaming is on by default** — data is fetched shard-by-shard, avoiding OOM and memory-map errors.
149
+ - **`--no-streaming`** downloads the full split into an Arrow table first. This can OOM on large splits or cause `mmap` errors on low-memory machines. Use only if you have plenty of RAM.
150
+ - For faster downloads (when available), set the environment variable:
151
+ ```bash
152
+ export HF_HUB_ENABLE_HF_TRANSFER=1
153
+ ```
154
+ - Exports call `load_dataset` once per split (not per example), keeping HF API calls minimal.
155
+ - If you hit 429 rate-limit errors, wait a few minutes and re-run — resume will skip already-written images.
156
+
157
+ ## Cache location (clusters/shared machines)
158
+
159
+ The exporter uses the standard HuggingFace cache. On clusters with limited home directory space, set a custom cache location on a large disk:
160
+
161
+ ```bash
162
+ export HF_HOME=/path/to/big/disk/hf_cache
163
+ ```
164
+
165
+ This prevents re-downloading and reduces load on the HF Hub.
166
+
167
+ ## Diagnostic command
168
+
169
+ If exports fail or behave unexpectedly, run the doctor command and share the output:
170
+
171
+ ```bash
172
+ tree-shift doctor
173
+ ```
174
+
175
+ Output includes package versions, environment variables, and Python version — helpful for debugging "works on my machine" issues.
176
+
177
+ ## Manifest file
178
+
179
+ Every export writes a `manifest.json` in the config output folder:
180
+
181
+ ```json
182
+ {
183
+ "repo_id": "aadityabuilds/tree-distribution-shift",
184
+ "config": "intl_train_IN__ood_US",
185
+ "revision": "abc123def456",
186
+ "splits": ["train", "ood_test"],
187
+ "timestamp": "2026-02-17T04:20:00+00:00",
188
+ "counts": {
189
+ "train": {"images": 11783, "annotations": 181208},
190
+ "ood_test": {"images": 2345, "annotations": 36000}
191
+ }
192
+ }
193
+ ```
194
+
195
+ This is essential for reproducibility: it records the exact revision, splits exported, and counts per split. Include `manifest.json` when sharing or publishing data exports.
196
+
197
+ ## API reference
198
+
199
+ ### `list_configs(repo_id=..., revision=None) -> list[str]`
200
+
201
+ Returns the list of available dataset configs from the Hub. No data is downloaded.
202
+
203
+ ### `export_coco(config, out_root, repo_id=..., splits=None, revision=None, streaming=True) -> Path`
204
+
205
+ Exports a config to COCO folders on disk.
206
+
207
+ | Parameter | Default | Description |
208
+ |-------------|----------------------------------|------------------------------------------------|
209
+ | `config` | *(required)* | Name of the HF dataset config |
210
+ | `out_root` | *(required)* | Root output directory |
211
+ | `splits` | all 5 splits | Which splits to export |
212
+ | `streaming` | `True` | Stream data (recommended for low-mem machines) |
213
+ | `revision` | `None` | HF dataset revision / commit hash |
214
+
215
+ ## Dependencies
216
+
217
+ This package is intentionally lightweight:
218
+
219
+ - `datasets` — HF dataset loading
220
+ - `huggingface_hub` — Hub API access
221
+ - `orjson` — fast JSON serialization
222
+ - `tqdm` — progress display
223
+
224
+ No `torch`, `pycocotools`, or `opencv` required.
225
+
226
+ ## Why this package?
227
+
228
+ - **No git / git-lfs** — works on clusters where git-lfs is painful.
229
+ - **Streaming by default** — avoids OOM and memory-map errors.
230
+ - **Resume-safe** — re-run after interruption without re-downloading.
231
+ - **Uses HF cache** — respects revisions, reproducible.
232
+ - **Thin client** — new configs on the Hub are visible immediately via `list_configs()` (no pip update needed).
233
+
234
+ ## Versioning policy
235
+
236
+ The pip package contains tools only. Data versions are pinned with `--revision`.
237
+
238
+ - **0.1.x** — exporter stability, backward compatible CLI/API.
239
+ - **Minor bump** — when on-disk COCO layout or CLI flags change.
240
+ - **Major bump** — if split semantics change (unlikely).
241
+
242
+ New configs on the HF Hub are visible immediately via `tree-shift list`; no pip update required to access new data.
243
+
244
+ ## Maintenance
245
+
246
+ - **Adding new configs/data** → push to the HF dataset repo only.
247
+ - **Package updates** → only needed for new CLI flags or exporter behavior changes.
248
+ - **Users see new configs automatically** via `tree-shift list`.
249
+
250
+ ## Citation
251
+
252
+ If you use this dataset in research, please cite:
253
+
254
+ ```bibtex
255
+ @misc{nalawade2025tree_distribution_shift,
256
+ title={Tree Distribution Shift: A Benchmark for Out-of-Distribution Tree Detection},
257
+ author={Nalawade, Aaditya},
258
+ year={2025},
259
+ howpublished={\url{https://huggingface.co/datasets/aadityabuilds/tree-distribution-shift}}
260
+ }
261
+ ```
262
+
263
+ ## License
264
+
265
+ MIT
@@ -0,0 +1,30 @@
1
+ [build-system]
2
+ requires = ["setuptools>=68", "wheel"]
3
+ build-backend = "setuptools.build_meta"
4
+
5
+ [project]
6
+ name = "tree-distribution-shift"
7
+ version = "0.1.0"
8
+ description = "Tree Distribution Shift benchmark: export HF configs to COCO (train/id_test/ood_test + density buckets)."
9
+ readme = "README.md"
10
+ requires-python = ">=3.9"
11
+ license = "MIT"
12
+ authors = [{name = "Aaditya Nalawade"}]
13
+ dependencies = [
14
+ "datasets>=2.19.0",
15
+ "huggingface_hub>=0.23.0",
16
+ "orjson>=3.9.0",
17
+ "tqdm>=4.66.0",
18
+ ]
19
+
20
+ [project.urls]
21
+ Homepage = "https://huggingface.co/datasets/aadityabuilds/tree-distribution-shift"
22
+
23
+ [project.scripts]
24
+ tree-shift = "tree_shift.cli:main"
25
+
26
+ [tool.setuptools]
27
+ package-dir = {"" = "src"}
28
+
29
+ [tool.setuptools.packages.find]
30
+ where = ["src"]
@@ -0,0 +1,4 @@
1
+ [egg_info]
2
+ tag_build =
3
+ tag_date = 0
4
+
@@ -0,0 +1,281 @@
1
+ Metadata-Version: 2.4
2
+ Name: tree-distribution-shift
3
+ Version: 0.1.0
4
+ Summary: Tree Distribution Shift benchmark: export HF configs to COCO (train/id_test/ood_test + density buckets).
5
+ Author: Aaditya Nalawade
6
+ License-Expression: MIT
7
+ Project-URL: Homepage, https://huggingface.co/datasets/aadityabuilds/tree-distribution-shift
8
+ Requires-Python: >=3.9
9
+ Description-Content-Type: text/markdown
10
+ License-File: LICENSE
11
+ Requires-Dist: datasets>=2.19.0
12
+ Requires-Dist: huggingface_hub>=0.23.0
13
+ Requires-Dist: orjson>=3.9.0
14
+ Requires-Dist: tqdm>=4.66.0
15
+ Dynamic: license-file
16
+
17
+ # tree-distribution-shift
18
+
19
+ A pip-installable benchmark for **Tree Distribution Shift** — export HuggingFace dataset configs to COCO format on disk.
20
+
21
+ **No git. No git-lfs. Works on clusters.**
22
+
23
+ Data lives on the HF Hub: [`aadityabuilds/tree-distribution-shift`](https://huggingface.co/datasets/aadityabuilds/tree-distribution-shift)
24
+
25
+ ## Getting started
26
+
27
+ ```bash
28
+ pip install -U pip
29
+ pip install tree-distribution-shift
30
+
31
+ # See all available configs (queries HF Hub live)
32
+ tree-shift list
33
+
34
+ # Export a config to COCO format
35
+ tree-shift export --config intl_train_IN__ood_US --out ./data
36
+ ```
37
+
38
+ That's it. No cloning, no LFS, no large downloads up front — data streams directly from the Hub.
39
+
40
+ ### Authentication (recommended)
41
+
42
+ Set an HF token for higher rate limits and faster downloads:
43
+
44
+ ```bash
45
+ export HF_TOKEN=hf_... # recommended
46
+ ```
47
+
48
+ Or log in once with the CLI:
49
+
50
+ ```bash
51
+ huggingface-cli login
52
+ ```
53
+
54
+ You can also pass a token per-command:
55
+
56
+ ```bash
57
+ tree-shift list --token hf_...
58
+ tree-shift export --config intl_train_IN__ood_US --out ./data --token hf_...
59
+ ```
60
+
61
+ ### Export only the splits you need
62
+
63
+ ```bash
64
+ tree-shift export \
65
+ --config in_state_train_Karnataka__ood_Rajasthan \
66
+ --out ./data \
67
+ --splits train ood_test
68
+ ```
69
+
70
+ ### Pin a revision for reproducibility
71
+
72
+ For papers and exact replication, pin a commit hash:
73
+
74
+ ```bash
75
+ tree-shift export \
76
+ --config in_state_train_Karnataka__ood_Rajasthan \
77
+ --out ./data \
78
+ --revision abc123def456
79
+ ```
80
+
81
+ ### Python API
82
+
83
+ ```python
84
+ from tree_shift import list_configs, export_coco
85
+
86
+ print(list_configs())
87
+
88
+ export_coco(
89
+ config="in_state_train_Karnataka__ood_Rajasthan",
90
+ out_root="./data",
91
+ splits=["train", "id_test", "ood_test"],
92
+ streaming=True, # default; avoids OOM on low-mem machines
93
+ revision=None, # or pin a commit hash
94
+ )
95
+ ```
96
+
97
+ ## Output structure
98
+
99
+ ```
100
+ ./data/<config>/
101
+ train/
102
+ images/
103
+ *.png
104
+ annotations/
105
+ instances_train.json
106
+ id_test/
107
+ images/
108
+ annotations/
109
+ ood_test/
110
+ images/
111
+ annotations/
112
+ ood_same_density/
113
+ images/
114
+ annotations/
115
+ ood_diff_density/
116
+ images/
117
+ annotations/
118
+ ```
119
+
120
+ Annotation files follow the [COCO format](https://cocodataset.org/#format-data).
121
+
122
+ ## Dataset contract
123
+
124
+ Every example row in the HF dataset contains:
125
+
126
+ | Column | Type | Description |
127
+ |--------|------|-------------|
128
+ | `image_id` | int | Unique image identifier |
129
+ | `filename` | str | Image filename (e.g. `00001.png`) |
130
+ | `width` | int | Image width in pixels |
131
+ | `height` | int | Image height in pixels |
132
+ | `image_bytes` | bytes | Raw image file contents |
133
+ | `coco_annotations` | str | JSON string — list of COCO annotation dicts |
134
+ | `coco_categories` | str | JSON string — list of COCO category dicts |
135
+ | `country` | str | Country code (e.g. `IN`, `US`) |
136
+ | `state` | str | State name |
137
+ | `zone` | str | Geographic zone |
138
+ | `biome` | str | Biome classification |
139
+ | `density_category` | str | Tree density bucket |
140
+
141
+ ### Expected splits
142
+
143
+ - **`train`** — training set
144
+ - **`id_test`** — in-distribution test set
145
+ - **`ood_test`** — out-of-distribution test set
146
+ - **`ood_same_density`** — OOD test filtered to same density bucket as train
147
+ - **`ood_diff_density`** — OOD test filtered to different density bucket
148
+
149
+ > **Note:** Some older configs may not include the `ood_same_density` / `ood_diff_density` splits.
150
+
151
+ ## Resume support
152
+
153
+ Exports are **resume-safe**. If a job is preempted or interrupted:
154
+
155
+ ```bash
156
+ # Just re-run the same command — already-written images are skipped
157
+ tree-shift export --config intl_train_IN__ood_US --out ./data
158
+ ```
159
+
160
+ Images that already exist on disk are not re-downloaded. The COCO annotation JSON is always regenerated to stay consistent with the full split.
161
+
162
+ ## Rate limits & performance
163
+
164
+ - **Streaming is on by default** — data is fetched shard-by-shard, avoiding OOM and memory-map errors.
165
+ - **`--no-streaming`** downloads the full split into an Arrow table first. This can OOM on large splits or cause `mmap` errors on low-memory machines. Use only if you have plenty of RAM.
166
+ - For faster downloads (when available), set the environment variable:
167
+ ```bash
168
+ export HF_HUB_ENABLE_HF_TRANSFER=1
169
+ ```
170
+ - Exports call `load_dataset` once per split (not per example), keeping HF API calls minimal.
171
+ - If you hit 429 rate-limit errors, wait a few minutes and re-run — resume will skip already-written images.
172
+
173
+ ## Cache location (clusters/shared machines)
174
+
175
+ The exporter uses the standard HuggingFace cache. On clusters with limited home directory space, set a custom cache location on a large disk:
176
+
177
+ ```bash
178
+ export HF_HOME=/path/to/big/disk/hf_cache
179
+ ```
180
+
181
+ This prevents re-downloading and reduces load on the HF Hub.
182
+
183
+ ## Diagnostic command
184
+
185
+ If exports fail or behave unexpectedly, run the doctor command and share the output:
186
+
187
+ ```bash
188
+ tree-shift doctor
189
+ ```
190
+
191
+ Output includes package versions, environment variables, and Python version — helpful for debugging "works on my machine" issues.
192
+
193
+ ## Manifest file
194
+
195
+ Every export writes a `manifest.json` in the config output folder:
196
+
197
+ ```json
198
+ {
199
+ "repo_id": "aadityabuilds/tree-distribution-shift",
200
+ "config": "intl_train_IN__ood_US",
201
+ "revision": "abc123def456",
202
+ "splits": ["train", "ood_test"],
203
+ "timestamp": "2026-02-17T04:20:00+00:00",
204
+ "counts": {
205
+ "train": {"images": 11783, "annotations": 181208},
206
+ "ood_test": {"images": 2345, "annotations": 36000}
207
+ }
208
+ }
209
+ ```
210
+
211
+ This is essential for reproducibility: it records the exact revision, splits exported, and counts per split. Include `manifest.json` when sharing or publishing data exports.
212
+
213
+ ## API reference
214
+
215
+ ### `list_configs(repo_id=..., revision=None) -> list[str]`
216
+
217
+ Returns the list of available dataset configs from the Hub. No data is downloaded.
218
+
219
+ ### `export_coco(config, out_root, repo_id=..., splits=None, revision=None, streaming=True) -> Path`
220
+
221
+ Exports a config to COCO folders on disk.
222
+
223
+ | Parameter | Default | Description |
224
+ |-------------|----------------------------------|------------------------------------------------|
225
+ | `config` | *(required)* | Name of the HF dataset config |
226
+ | `out_root` | *(required)* | Root output directory |
227
+ | `splits` | all 5 splits | Which splits to export |
228
+ | `streaming` | `True` | Stream data (recommended for low-mem machines) |
229
+ | `revision` | `None` | HF dataset revision / commit hash |
230
+
231
+ ## Dependencies
232
+
233
+ This package is intentionally lightweight:
234
+
235
+ - `datasets` — HF dataset loading
236
+ - `huggingface_hub` — Hub API access
237
+ - `orjson` — fast JSON serialization
238
+ - `tqdm` — progress display
239
+
240
+ No `torch`, `pycocotools`, or `opencv` required.
241
+
242
+ ## Why this package?
243
+
244
+ - **No git / git-lfs** — works on clusters where git-lfs is painful.
245
+ - **Streaming by default** — avoids OOM and memory-map errors.
246
+ - **Resume-safe** — re-run after interruption without re-downloading.
247
+ - **Uses HF cache** — respects revisions, reproducible.
248
+ - **Thin client** — new configs on the Hub are visible immediately via `list_configs()` (no pip update needed).
249
+
250
+ ## Versioning policy
251
+
252
+ The pip package contains tools only. Data versions are pinned with `--revision`.
253
+
254
+ - **0.1.x** — exporter stability, backward compatible CLI/API.
255
+ - **Minor bump** — when on-disk COCO layout or CLI flags change.
256
+ - **Major bump** — if split semantics change (unlikely).
257
+
258
+ New configs on the HF Hub are visible immediately via `tree-shift list`; no pip update required to access new data.
259
+
260
+ ## Maintenance
261
+
262
+ - **Adding new configs/data** → push to the HF dataset repo only.
263
+ - **Package updates** → only needed for new CLI flags or exporter behavior changes.
264
+ - **Users see new configs automatically** via `tree-shift list`.
265
+
266
+ ## Citation
267
+
268
+ If you use this dataset in research, please cite:
269
+
270
+ ```bibtex
271
+ @misc{nalawade2025tree_distribution_shift,
272
+ title={Tree Distribution Shift: A Benchmark for Out-of-Distribution Tree Detection},
273
+ author={Nalawade, Aaditya},
274
+ year={2025},
275
+ howpublished={\url{https://huggingface.co/datasets/aadityabuilds/tree-distribution-shift}}
276
+ }
277
+ ```
278
+
279
+ ## License
280
+
281
+ MIT
@@ -0,0 +1,14 @@
1
+ LICENSE
2
+ README.md
3
+ pyproject.toml
4
+ src/tree_distribution_shift.egg-info/PKG-INFO
5
+ src/tree_distribution_shift.egg-info/SOURCES.txt
6
+ src/tree_distribution_shift.egg-info/dependency_links.txt
7
+ src/tree_distribution_shift.egg-info/entry_points.txt
8
+ src/tree_distribution_shift.egg-info/requires.txt
9
+ src/tree_distribution_shift.egg-info/top_level.txt
10
+ src/tree_shift/__init__.py
11
+ src/tree_shift/cli.py
12
+ src/tree_shift/constants.py
13
+ src/tree_shift/export.py
14
+ src/tree_shift/hf.py
@@ -0,0 +1,2 @@
1
+ [console_scripts]
2
+ tree-shift = tree_shift.cli:main
@@ -0,0 +1,4 @@
1
+ datasets>=2.19.0
2
+ huggingface_hub>=0.23.0
3
+ orjson>=3.9.0
4
+ tqdm>=4.66.0
@@ -0,0 +1,5 @@
1
# Package-level public API for tree_shift.
from .export import export_coco
from .hf import list_configs

# Names intentionally re-exported at package level.
__all__ = ["list_configs", "export_coco"]
# NOTE(review): keep in sync with the version declared in pyproject.toml.
__version__ = "0.1.0"
@@ -0,0 +1,90 @@
1
+ from __future__ import annotations
2
+
3
+ import argparse
4
+ import logging
5
+ import os
6
+ import sys
7
+
8
+ import datasets
9
+ import huggingface_hub
10
+
11
+ from .constants import DEFAULT_SPLITS, HF_REPO_ID
12
+ from .export import export_coco
13
+ from .hf import list_configs
14
+ from . import __version__
15
+
16
+
17
def _setup_token(token: str | None) -> None:
    """Authenticate with HF Hub if a token is provided, otherwise print a hint."""
    # The hub's auth helper emits a noisy warning for anonymous requests;
    # raise its logger threshold so it stays quiet.
    logging.getLogger("huggingface_hub.utils._auth").setLevel(logging.ERROR)

    if token:
        # Explicit token wins; never touch the git credential store.
        huggingface_hub.login(token=token, add_to_git_credential=False)
        return

    if huggingface_hub.utils.get_token():
        # Already authenticated via env var or a previous login.
        return

    print(
        "Tip: set HF_TOKEN for higher rate limits "
        "(export HF_TOKEN=... or run: huggingface-cli login)",
        file=sys.stderr,
    )
30
+
31
+
32
def _doctor() -> int:
    """Print diagnostic information and return exit code 0."""
    token = huggingface_hub.utils.get_token()
    report = [
        f"tree-distribution-shift version: {__version__}",
        f"datasets version: {datasets.__version__}",
        f"huggingface_hub version: {huggingface_hub.__version__}",
        f"HF_TOKEN: {'set' if token else 'not set'}",
        f"HF_HUB_ENABLE_HF_TRANSFER: {os.environ.get('HF_HUB_ENABLE_HF_TRANSFER', 'not set')}",
        f"HF_HOME: {os.environ.get('HF_HOME', 'not set (using default ~/.cache/huggingface)')}",
        f"Default repo id: {HF_REPO_ID}",
        f"Python version: {sys.version}",
    ]
    for line in report:
        print(line)
    return 0
44
+
45
+
46
def main() -> int:
    """CLI entry point for the ``tree-shift`` console script.

    Subcommands:
        list    — list available configs from the Hub.
        export  — export a config to COCO format on disk.
        doctor  — print diagnostic info for debugging.

    Returns:
        A process exit code (0 on success). The original annotated ``-> None``
        yet ``return``-ed the int from ``_doctor()``; every branch now returns
        an explicit int, which console_scripts uses as the exit status.
    """
    ap = argparse.ArgumentParser(prog="tree-shift")
    sub = ap.add_subparsers(dest="cmd", required=True)

    # ── list ──────────────────────────────────────────────────────────
    ls = sub.add_parser("list", help="List available configs from the Hub")
    ls.add_argument("--repo", default=HF_REPO_ID)
    ls.add_argument("--revision", default=None)
    ls.add_argument("--token", default=None, help="HF Hub token (or set HF_TOKEN env var)")

    # ── export ────────────────────────────────────────────────────────
    ex = sub.add_parser("export", help="Export a config to COCO on disk")
    ex.add_argument("--repo", default=HF_REPO_ID)
    ex.add_argument("--config", required=True)
    ex.add_argument("--out", required=True)
    ex.add_argument("--revision", default=None)
    ex.add_argument("--splits", nargs="*", default=DEFAULT_SPLITS)
    ex.add_argument("--no-streaming", action="store_true")
    ex.add_argument("--token", default=None, help="HF Hub token (or set HF_TOKEN env var)")

    # ── doctor ───────────────────────────────────────────────────────
    sub.add_parser("doctor", help="Print diagnostic info for debugging")

    args = ap.parse_args()

    if args.cmd == "doctor":
        # doctor requires no auth; skip the token setup/hint entirely.
        return _doctor()

    # "doctor" has no --token flag, hence the defensive getattr.
    _setup_token(getattr(args, "token", None))

    if args.cmd == "list":
        for cfg in list_configs(repo_id=args.repo, revision=args.revision):
            print(cfg)
        return 0

    if args.cmd == "export":
        export_coco(
            repo_id=args.repo,
            config=args.config,
            out_root=args.out,
            splits=args.splits,
            revision=args.revision,
            streaming=(not args.no_streaming),
        )
        return 0

    # Unreachable: argparse with required=True rejects unknown subcommands.
    return 2
@@ -0,0 +1,2 @@
1
# Default Hugging Face dataset repository that the CLI and exporter read from.
HF_REPO_ID = "aadityabuilds/tree-distribution-shift"
# Splits exported when the caller does not pass an explicit list.
DEFAULT_SPLITS = ["train", "id_test", "ood_test", "ood_same_density", "ood_diff_density"]
@@ -0,0 +1,139 @@
1
+ from __future__ import annotations
2
+
3
+ from datetime import datetime, timezone
4
+ from pathlib import Path
5
+ from typing import Any, Dict, Iterable, List, Optional, Union
6
+
7
+ import orjson
8
+ from datasets import load_dataset
9
+ from tqdm import tqdm
10
+
11
+ from .constants import DEFAULT_SPLITS, HF_REPO_ID
12
+
13
+
14
+ class _SplitStats:
15
+ """Track statistics for a single split export."""
16
+ def __init__(self):
17
+ self.images = 0
18
+ self.skipped = 0
19
+ self.annotations = 0
20
+
21
+
22
def _export_split_iter(ds: Iterable[dict], out_dir: Path, split_name: str) -> _SplitStats:
    """Stream a single split to COCO format on disk.

    Resume-safe: images that already exist on disk are skipped.
    The COCO annotation JSON is always regenerated so it stays
    consistent with the full set of images.

    Args:
        ds: Iterable of examples. Each example is expected to carry
            ``image_id``, ``filename``, ``width``, ``height``,
            ``image_bytes`` (raw encoded image), ``coco_annotations``
            (JSON string) and ``coco_categories`` (JSON string).
        out_dir: Config-level output directory; the split subfolder is
            created beneath it.
        split_name: Name of the split, used for folder and file names.

    Returns:
        Counters for processed images, skipped image files, and
        total annotations written.
    """
    images_dir = out_dir / split_name / "images"
    ann_dir = out_dir / split_name / "annotations"
    images_dir.mkdir(parents=True, exist_ok=True)
    ann_dir.mkdir(parents=True, exist_ok=True)

    images: list[dict] = []
    annotations: list[dict] = []
    categories: list[dict] | None = None
    ann_id = 1  # COCO annotation ids are reassigned to be unique per split

    stats = _SplitStats()

    for ex in ds:
        stats.images += 1
        if stats.images % 500 == 0:
            tqdm.write(f"[{split_name}] processed {stats.images} images ({stats.skipped} skipped)...")

        image_id = int(ex["image_id"])
        filename = ex["filename"]
        width = int(ex["width"])
        height = int(ex["height"])

        # Resume-safe: skip writing if the image already exists on disk
        img_path = images_dir / filename
        if img_path.exists():
            stats.skipped += 1
        else:
            img_path.write_bytes(ex["image_bytes"])

        images.append(
            {"id": image_id, "file_name": filename, "width": width, "height": height}
        )

        # orjson.loads accepts str directly; the previous round-trip through
        # .encode("utf-8") was redundant work on every example.
        annos = orjson.loads(ex["coco_annotations"])
        if categories is None:
            # Categories are identical across examples; capture once.
            categories = orjson.loads(ex["coco_categories"])

        for a in annos:
            a = dict(a)  # copy before mutating — parsed dicts may be shared
            a["id"] = ann_id
            ann_id += 1
            annotations.append(a)

    stats.annotations = len(annotations)

    coco = {
        "images": images,
        "annotations": annotations,
        "categories": categories or [],
    }
    (ann_dir / f"instances_{split_name}.json").write_bytes(orjson.dumps(coco))
    tqdm.write(
        f"[{split_name}] done — {stats.images} images ({stats.skipped} skipped), "
        f"{stats.annotations} annotations"
    )

    return stats
86
+
87
+
88
def export_coco(
    config: str,
    out_root: Union[str, Path],
    repo_id: str = HF_REPO_ID,
    splits: Optional[List[str]] = None,
    revision: Optional[str] = None,
    streaming: bool = True,
) -> Path:
    """
    Export a HF config to COCO folders.

    Writes to: <out_root>/<config>/<split>/{images, annotations}
    Also writes <out_root>/<config>/manifest.json with export metadata.
    """
    if splits is None:
        splits = DEFAULT_SPLITS

    target_dir = Path(out_root) / config
    target_dir.mkdir(parents=True, exist_ok=True)

    counts: Dict[str, Any] = {}
    for split_name in splits:
        tqdm.write(f"Exporting split '{split_name}' for config '{config}' ...")
        stream = load_dataset(
            repo_id,
            config,
            split=split_name,
            streaming=streaming,
            revision=revision,
        )
        result = _export_split_iter(stream, target_dir, split_name)
        counts[split_name] = {
            "images": result.images,
            "annotations": result.annotations,
        }

    # Record export metadata so downstream tooling can audit what was produced.
    manifest_path = target_dir / "manifest.json"
    manifest_path.write_bytes(
        orjson.dumps(
            {
                "repo_id": repo_id,
                "config": config,
                "revision": revision,
                "splits": splits,
                "timestamp": datetime.now(timezone.utc).isoformat(),
                "counts": counts,
            },
            option=orjson.OPT_INDENT_2,
        )
    )
    tqdm.write(f"Manifest written to {manifest_path}")

    return target_dir
@@ -0,0 +1,14 @@
1
+ from __future__ import annotations
2
+
3
+ from typing import List, Optional
4
+
5
+ from datasets import get_dataset_config_names
6
+
7
+ from .constants import HF_REPO_ID
8
+
9
+
10
def list_configs(
    repo_id: str = HF_REPO_ID, revision: Optional[str] = None
) -> List[str]:
    """Return config names published in the HF dataset repo.

    Only repo metadata is queried; no data is downloaded.
    """
    names = get_dataset_config_names(repo_id, revision=revision)
    return names