nirs4all-datasets 0.2.0__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (58)
  1. nirs4all_datasets-0.2.0/Cargo.toml +47 -0
  2. nirs4all_datasets-0.2.0/LICENSE +21 -0
  3. nirs4all_datasets-0.2.0/PKG-INFO +192 -0
  4. nirs4all_datasets-0.2.0/README.md +145 -0
  5. nirs4all_datasets-0.2.0/bindings/python/Cargo.lock +1053 -0
  6. nirs4all_datasets-0.2.0/bindings/python/Cargo.toml +23 -0
  7. nirs4all_datasets-0.2.0/bindings/python/README.md +29 -0
  8. nirs4all_datasets-0.2.0/bindings/python/src/lib.rs +65 -0
  9. nirs4all_datasets-0.2.0/catalog/index.json +72546 -0
  10. nirs4all_datasets-0.2.0/crates/nirs4all-datasets-core/Cargo.toml +30 -0
  11. nirs4all_datasets-0.2.0/crates/nirs4all-datasets-core/src/cache.rs +119 -0
  12. nirs4all_datasets-0.2.0/crates/nirs4all-datasets-core/src/error.rs +53 -0
  13. nirs4all_datasets-0.2.0/crates/nirs4all-datasets-core/src/fetch.rs +448 -0
  14. nirs4all_datasets-0.2.0/crates/nirs4all-datasets-core/src/hash.rs +111 -0
  15. nirs4all_datasets-0.2.0/crates/nirs4all-datasets-core/src/http.rs +248 -0
  16. nirs4all_datasets-0.2.0/crates/nirs4all-datasets-core/src/lib.rs +62 -0
  17. nirs4all_datasets-0.2.0/crates/nirs4all-datasets-core/src/model.rs +161 -0
  18. nirs4all_datasets-0.2.0/crates/nirs4all-datasets-core/src/origins.rs +442 -0
  19. nirs4all_datasets-0.2.0/crates/nirs4all-datasets-core/src/resolve.rs +58 -0
  20. nirs4all_datasets-0.2.0/crates/nirs4all-datasets-core/src/verify.rs +115 -0
  21. nirs4all_datasets-0.2.0/pyproject.toml +139 -0
  22. nirs4all_datasets-0.2.0/src/nirs4all_datasets/__init__.py +81 -0
  23. nirs4all_datasets-0.2.0/src/nirs4all_datasets/_acquire.py +67 -0
  24. nirs4all_datasets-0.2.0/src/nirs4all_datasets/_n4ds.pyi +11 -0
  25. nirs4all_datasets-0.2.0/src/nirs4all_datasets/access.py +208 -0
  26. nirs4all_datasets-0.2.0/src/nirs4all_datasets/bootstrap.py +638 -0
  27. nirs4all_datasets-0.2.0/src/nirs4all_datasets/bulk.py +143 -0
  28. nirs4all_datasets-0.2.0/src/nirs4all_datasets/canonical.py +409 -0
  29. nirs4all_datasets-0.2.0/src/nirs4all_datasets/catalog.py +208 -0
  30. nirs4all_datasets-0.2.0/src/nirs4all_datasets/cli.py +354 -0
  31. nirs4all_datasets-0.2.0/src/nirs4all_datasets/config.py +145 -0
  32. nirs4all_datasets-0.2.0/src/nirs4all_datasets/dataset.py +349 -0
  33. nirs4all_datasets-0.2.0/src/nirs4all_datasets/dataverse.py +266 -0
  34. nirs4all_datasets-0.2.0/src/nirs4all_datasets/health.py +103 -0
  35. nirs4all_datasets-0.2.0/src/nirs4all_datasets/index.py +156 -0
  36. nirs4all_datasets-0.2.0/src/nirs4all_datasets/manifest.py +215 -0
  37. nirs4all_datasets-0.2.0/src/nirs4all_datasets/organize.py +112 -0
  38. nirs4all_datasets-0.2.0/src/nirs4all_datasets/publish.py +170 -0
  39. nirs4all_datasets-0.2.0/src/nirs4all_datasets/py.typed +0 -0
  40. nirs4all_datasets-0.2.0/src/nirs4all_datasets/qualify/__init__.py +21 -0
  41. nirs4all_datasets-0.2.0/src/nirs4all_datasets/qualify/anonymize.py +252 -0
  42. nirs4all_datasets-0.2.0/src/nirs4all_datasets/qualify/croissant.py +224 -0
  43. nirs4all_datasets-0.2.0/src/nirs4all_datasets/qualify/datasheet.py +223 -0
  44. nirs4all_datasets-0.2.0/src/nirs4all_datasets/qualify/metrics.py +231 -0
  45. nirs4all_datasets-0.2.0/src/nirs4all_datasets/qualify/profile.py +513 -0
  46. nirs4all_datasets-0.2.0/src/nirs4all_datasets/qualify/registry.py +131 -0
  47. nirs4all_datasets-0.2.0/src/nirs4all_datasets/reproduce.py +131 -0
  48. nirs4all_datasets-0.2.0/src/nirs4all_datasets/schema.py +663 -0
  49. nirs4all_datasets-0.2.0/src/nirs4all_datasets/site/__init__.py +18 -0
  50. nirs4all_datasets-0.2.0/src/nirs4all_datasets/site/build.py +51 -0
  51. nirs4all_datasets-0.2.0/src/nirs4all_datasets/site/charts.py +392 -0
  52. nirs4all_datasets-0.2.0/src/nirs4all_datasets/site/components.py +307 -0
  53. nirs4all_datasets-0.2.0/src/nirs4all_datasets/site/escape.py +73 -0
  54. nirs4all_datasets-0.2.0/src/nirs4all_datasets/site/icons.py +62 -0
  55. nirs4all_datasets-0.2.0/src/nirs4all_datasets/site/model.py +136 -0
  56. nirs4all_datasets-0.2.0/src/nirs4all_datasets/site/pages.py +684 -0
  57. nirs4all_datasets-0.2.0/src/nirs4all_datasets/site/theme.py +490 -0
  58. nirs4all_datasets-0.2.0/src/nirs4all_datasets/status.py +206 -0
@@ -0,0 +1,47 @@
1
+ # SPDX-License-Identifier: MIT
2
+ #
3
+ # The dataset-acquisition core of nirs4all-datasets: resolve a dataset id (from the
4
+ # distributable catalog/index.json) to its pinned origin, download the canonical
5
+ # Parquet (Dataverse / Zenodo / figshare / URL), SHA-256-verify against the index,
6
+ # and cache it. The scientific *analysis* layer (qualify/card/site/health) stays in
7
+ # Python; only acquisition lives here. See migration_ABI_C.md.
8
+ [workspace]
9
+ resolver = "2"
10
+ members = [
11
+ "crates/nirs4all-datasets-core",
12
+ "crates/nirs4all-datasets-capi",
13
+ "crates/nirs4all-datasets-cli",
14
+ ]
15
+ # Language bindings build with their own toolchains (maturin / wasm-pack) and are
16
+ # excluded from the cargo workspace so `cargo test --workspace` stays self-contained.
17
+ exclude = [
18
+ "bindings/python",
19
+ "bindings/wasm",
20
+ ]
21
+
22
+ [workspace.package]
23
+ version = "0.2.0"
24
+ edition = "2021"
25
+ license = "MIT"
26
+ authors = ["Gregory Beurier <beurier@cirad.fr>"]
27
+ repository = "https://github.com/GBeurier/nirs4all-datasets"
28
+ homepage = "https://github.com/GBeurier/nirs4all-datasets"
29
+
30
+ [workspace.dependencies]
31
+ # Internal crates (path deps; a `version` is REQUIRED for crates.io publishing and
32
+ # is kept in lock-step with [workspace.package] by scripts/bump_version.sh).
33
+ nirs4all-datasets-core = { path = "crates/nirs4all-datasets-core", version = "0.2.0" }
34
+ # Third-party.
35
+ serde = { version = "1.0", features = ["derive"] }
36
+ serde_json = { version = "1.0", features = ["preserve_order", "float_roundtrip"] }
37
+ thiserror = "2.0"
38
+ sha2 = "0.10"
39
+ hex = "0.4"
40
+ directories = "5"
41
+ ureq = "2"
42
+ clap = { version = "4.5", features = ["derive"] }
43
+ tempfile = "3"
44
+
45
+ [profile.release]
46
+ opt-level = 2
47
+ strip = "debuginfo"
@@ -0,0 +1,21 @@
1
+ MIT License
2
+
3
+ Copyright (c) 2026 CIRAD — Gregory Beurier and the nirs4all contributors
4
+
5
+ Permission is hereby granted, free of charge, to any person obtaining a copy
6
+ of this software and associated documentation files (the "Software"), to deal
7
+ in the Software without restriction, including without limitation the rights
8
+ to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
9
+ copies of the Software, and to permit persons to whom the Software is
10
+ furnished to do so, subject to the following conditions:
11
+
12
+ The above copyright notice and this permission notice shall be included in all
13
+ copies or substantial portions of the Software.
14
+
15
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16
+ IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17
+ FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
18
+ AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19
+ LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20
+ OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
21
+ SOFTWARE.
@@ -0,0 +1,192 @@
1
+ Metadata-Version: 2.4
2
+ Name: nirs4all-datasets
3
+ Version: 0.2.0
4
+ Classifier: Development Status :: 3 - Alpha
5
+ Classifier: Intended Audience :: Science/Research
6
+ Classifier: Operating System :: OS Independent
7
+ Classifier: Programming Language :: Python :: 3
8
+ Classifier: Programming Language :: Python :: 3.11
9
+ Classifier: Programming Language :: Python :: 3.12
10
+ Classifier: Topic :: Scientific/Engineering
11
+ Classifier: Typing :: Typed
12
+ Requires-Dist: numpy>=1.24.0
13
+ Requires-Dist: scipy>=1.10.0
14
+ Requires-Dist: pandas>=2.0.0
15
+ Requires-Dist: pyarrow>=14.0.0
16
+ Requires-Dist: pydantic>=2.0.0
17
+ Requires-Dist: pyyaml>=6.0.0
18
+ Requires-Dist: jsonschema>=4.17.0
19
+ Requires-Dist: python-dotenv>=1.0.0
20
+ Requires-Dist: matplotlib>=3.7.0
21
+ Requires-Dist: requests>=2.31.0
22
+ Requires-Dist: typer>=0.12.0
23
+ Requires-Dist: pytest>=7.4.0 ; extra == 'dev'
24
+ Requires-Dist: pytest-cov>=4.1.0 ; extra == 'dev'
25
+ Requires-Dist: ruff>=0.1.0 ; extra == 'dev'
26
+ Requires-Dist: mypy>=1.7.0 ; extra == 'dev'
27
+ Requires-Dist: types-requests ; extra == 'dev'
28
+ Requires-Dist: types-pyyaml ; extra == 'dev'
29
+ Requires-Dist: build>=1.0.0 ; extra == 'dev'
30
+ Requires-Dist: twine>=5.0.0 ; extra == 'dev'
31
+ Requires-Dist: nirs4all-io>=0.1 ; extra == 'io'
32
+ Requires-Dist: nirs4all-formats>=0.1 ; extra == 'io'
33
+ Requires-Dist: nirs4all>=0.9 ; extra == 'nirs4all'
34
+ Provides-Extra: dev
35
+ Provides-Extra: io
36
+ Provides-Extra: nirs4all
37
+ License-File: LICENSE
38
+ Summary: A citable, reproducible bank of raw NIRS reference datasets — multi-source/multi-target, tier-governed, with on-demand checksum-verified access from each dataset's origin.
39
+ Keywords: nirs,near-infrared,spectroscopy,dataset,registry,dataverse,FAIR,benchmark
40
+ Author-email: Gregory Beurier <beurier@cirad.fr>
41
+ Requires-Python: >=3.11
42
+ Description-Content-Type: text/markdown; charset=UTF-8; variant=GFM
43
+ Project-URL: Homepage, https://github.com/GBeurier/nirs4all-datasets
44
+ Project-URL: Issues, https://github.com/GBeurier/nirs4all-datasets/issues
45
+ Project-URL: Repository, https://github.com/GBeurier/nirs4all-datasets
46
+
47
+ # nirs4all-datasets
48
+
49
+ A **citable, reproducible bank of raw NIRS** (Near-Infrared Spectroscopy) **reference datasets** — for
50
+ benchmarking, exploring, and comparing models on a common, version-pinned, provenance-rich footing.
51
+
52
+ A dataset here is **raw measured reality, not a benchmark task**: one or more spectral **sources**
53
+ (instruments), any number of **variables** (every target *and* metadata column — nothing is invented, and
54
+ nothing is thrown away), the native splits if the source defined them, and full provenance back to the
55
+ **origin** that published the data. The *task* — which Y, which split, which metric — is a choice the
56
+ consumer makes; it is never baked into the dataset.
57
+
58
+ Three deliverables:
59
+
60
+ 1. a git-tracked **catalog** — one hand-checkable descriptor + a machine-generated *identity card* (stats,
61
+ per-source/per-variable dataviz, MLCommons Croissant, a Datasheet) per dataset. The heavy bytes never
62
+ enter git.
63
+ 2. a Python **plugin** — `get("name")` downloads a dataset on demand from its **origin**, verifies its
64
+ SHA-256, caches it, and returns a `NirsDataset`.
65
+ 3. a **static site** — a browsable, qualified catalog with whole-bank dataviz and per-dataset id-cards.
66
+
67
+ It reuses [`nirs4all`](../nirs4all) for qualification and [`nirs4all-io`](../nirs4all-io) /
68
+ [`nirs4all-formats`](../nirs4all-formats) for reading instrument files (OPUS, JCAMP-DX, SPC, ASD, …).
69
+ **It never re-implements NIRS/IO logic.**
70
+
71
+ > Status: **alpha (0.x), pre-1.0** — the on-disk and API contracts may still change.
72
+
73
+ ## The dataset model
74
+
75
+ - **Sources (X) — `1..n`, kept separate.** Multi-instrument / multi-block datasets keep each block as its
76
+ own source. Sources may even carry *different numbers of spectra* (asymmetric repetitions): they are
77
+ aligned by **sample identity** (`sample_id`), **never by row position**.
78
+ - **Variables (Y + metadata) — `0..n`.** There is no intrinsic Y/metadata distinction: every column is a
79
+ *potential* target. A dataset may declare no target at all (X-only / metadata-only is valid). Declared
80
+ targets are flagged; everything else is kept as metadata, with full per-variable dataviz either way.
81
+ - **Splits — documented, never auto-applied.** Native train/test/fold partitions are recorded so you can
82
+ reproduce a paper's split, but `get()` never silently applies one.
83
+ - **Tiers — how a dataset is shown and exported.** `public` (everything shown, openly fetchable from the
84
+ origin), `private` (everything shown; export needs a token), `anonymized` (variable names masked +
85
+ targets normalized; export needs a token). **Bytes are never served from git or the site** — the catalog
86
+ points at the origin DOI/URL; a personal Dataverse is only a *future* fallback for protected datasets.
87
+ - **Versions — two axes.** A **content** version (bumps when the dataset bytes change) and a
88
+ **metric-protocol** version (lets the cards be re-qualified under a new protocol without rebuilding the
89
+ data).
90
+
91
+ ## Install (development)
92
+
93
+ ```bash
94
+ uv venv && uv pip install -e ".[dev]" # maturin: builds the native acquisition core into the package
95
+ # (uses local editable nirs4all via [tool.uv.sources]; needs a Rust toolchain)
96
+ ```
97
+
98
+ ## Native acquisition core & language bindings
99
+
100
+ The **download** of a dataset — version-pinned DOI resolution, redirect-safe Dataverse / Zenodo /
101
+ figshare fetch, streaming SHA-256 verification and the pooch-style cache — lives in a small **Rust
102
+ core** (`crates/nirs4all-datasets-core`) behind a stable **C ABI** (`n4ds_`), and is published like the
103
+ rest of the ecosystem (`nirs4all-io` is the template). The scientific **analysis** layer (cards,
104
+ qualify, site, health) stays in pure Python. The cross-language contract is one distributable
105
+ `catalog/index.json`; the `n4ds` CLI is the parity oracle. Bindings (all over the same C ABI):
106
+
107
+ | Binding | Package | Status |
108
+ |---|---|---|
109
+ | Python | embedded in `nirs4all-datasets` (`nirs4all_datasets._n4ds`, pyo3) | built + tested |
110
+ | Rust | `nirs4all-datasets-core` / `-capi` (crates.io) | built + tested |
111
+ | WASM/JS | `@nirs4all/datasets-wasm` (npm) — metadata + small public datasets | built + tested |
112
+ | R | `nirs4alldatasets` (C shim, r-universe / Release) | built + tested |
113
+ | Octave/MATLAB | MEX (GitHub Release zip) | built + tested |
114
+
115
+ See [`bindings/SPEC.md`](bindings/SPEC.md) (the binding contract) and
116
+ [`docs/dev/release_process.md`](docs/dev/release_process.md).
117
+
118
+ ## Quickstart
119
+
120
+ ```python
121
+ import nirs4all_datasets as n4ad
122
+
123
+ n4ad.list() # the catalog index
124
+ n4ad.card("corn_eigenvector_nir") # the identity card (dict): sources, variables, stats, provenance
125
+
126
+ ds = n4ad.get("corn_eigenvector_nir") # -> NirsDataset (fetched from origin, checksum-verified, cached)
127
+ ds.sources() # ['X1', 'X2', 'X3'] — the same corn measured on three NIR instruments
128
+ ds.x("X1") # one source's spectra as a 2D numpy array
129
+ ds.x(concat=False) # {source_id: array} for every source (sample-aligned, not row-aligned)
130
+ ds.y() # all declared targets, per sample
131
+ ds.metadata() # the metadata columns (each a potential target)
132
+ ds.split("original") # the native split labels, if the source defined one
133
+ ds.to_nirs4all() # hand off to nirs4all for modelling
134
+ ```
135
+
136
+ Private / anonymized datasets need a Dataverse token: `n4ad.get("name", token=...)`.
137
+
138
+ ## CLI (`n4a-datasets`)
139
+
140
+ ```text
141
+ bootstrap <tree>                     author schema-2.0 descriptors from <tree>/v2.0/* (--prune to re-base)
142
+ build-all --source-tree <tree>       organize + qualify every dataset in parallel (--protocol-refresh, --site)
143
+ add <raw_source> <id>                one raw source -> canonical + card + index
144
+ qualify <id>                         (re)build a dataset's card (--anonymize -> card.anon.json)
145
+ health-check                         probe each dataset's open origins -> catalog/health.json
146
+ catalog | list | card | get          regenerate the index / inspect / load a dataset
147
+ publish | grant | revoke | restrict  personal-Dataverse governance for protected data (future)
148
+ ```
149
+
150
+ `n4a-datasets <command> --help` documents every flag.
151
+
152
+ ## What lives where (3-tier storage)
153
+
154
+ - **git** (small, tracked): `catalog/datasets/<id>.yaml` (descriptor), `catalog/datasets.yaml` (index +
155
+ whole-bank summary), and per-dataset `card.json` / `card.md` / `croissant.json` / `manifest.json`.
156
+ - **the origin** (Zenodo, a data Dataverse, a vendor archive, …): the raw + canonical **bytes**, fetched
157
+ on demand and never re-hosted by this project.
158
+ - **local cache** (downloaded on demand): the verified canonical Parquet, stored in the pooch-style per-user OS cache directory.
159
+
160
+ ## API token — where to put it
161
+
162
+ A Dataverse API token is **only** needed to fetch **private/anonymized** datasets or to publish to a
163
+ personal Dataverse; public datasets need none. Resolution order:
164
+
165
+ 1. Environment variable `NIRS4ALL_DATAVERSE_TOKEN` (recommended; required in CI).
166
+ 2. `~/.config/nirs4all-datasets/config.toml` (`chmod 600`):
167
+ ```toml
168
+ [dataverse]
169
+ instance = "https://entrepot.recherche.data.gouv.fr"
170
+ token = "xxxxxxxx-xxxx-xxxx-xxxx-xxxxxxxxxxxx"
171
+ ```
172
+ 3. A project `.env` (gitignored) — see `.env.example`.
173
+
174
+ The token travels only in the `X-Dataverse-key` header, is never logged, and is never sent on a redirect
175
+ to signed object storage. **Never commit it** (`.env`, `config.toml`, `*.token` are gitignored).
176
+
177
+ ## Contributing
178
+
179
+ Full walkthrough in **[CONTRIBUTING.md](CONTRIBUTING.md)**; the design is in
180
+ **[docs/DESIGN.md](docs/DESIGN.md)**. The green gate (run before every commit) mirrors CI:
181
+
182
+ ```bash
183
+ ruff check . && mypy --config-file pyproject.toml src
184
+ python catalog/scripts/validate.py # every descriptor is schema-valid
185
+ pytest -q
186
+ ```
187
+
188
+ ## License
189
+
190
+ Code: MIT (see [`LICENSE`](LICENSE)). Each dataset carries its **own** SPDX license in its descriptor, and
191
+ is only ever linked to its origin — open data is never re-hosted under a different license.
192
+
@@ -0,0 +1,145 @@
1
+ # nirs4all-datasets
2
+
3
+ A **citable, reproducible bank of raw NIRS** (Near-Infrared Spectroscopy) **reference datasets** — for
4
+ benchmarking, exploring, and comparing models on a common, version-pinned, provenance-rich footing.
5
+
6
+ A dataset here is **raw measured reality, not a benchmark task**: one or more spectral **sources**
7
+ (instruments), any number of **variables** (every target *and* metadata column — nothing is invented, and
8
+ nothing is thrown away), the native splits if the source defined them, and full provenance back to the
9
+ **origin** that published the data. The *task* — which Y, which split, which metric — is a choice the
10
+ consumer makes; it is never baked into the dataset.
11
+
12
+ Three deliverables:
13
+
14
+ 1. a git-tracked **catalog** — one hand-checkable descriptor + a machine-generated *identity card* (stats,
15
+ per-source/per-variable dataviz, MLCommons Croissant, a Datasheet) per dataset. The heavy bytes never
16
+ enter git.
17
+ 2. a Python **plugin** — `get("name")` downloads a dataset on demand from its **origin**, verifies its
18
+ SHA-256, caches it, and returns a `NirsDataset`.
19
+ 3. a **static site** — a browsable, qualified catalog with whole-bank dataviz and per-dataset id-cards.
20
+
21
+ It reuses [`nirs4all`](../nirs4all) for qualification and [`nirs4all-io`](../nirs4all-io) /
22
+ [`nirs4all-formats`](../nirs4all-formats) for reading instrument files (OPUS, JCAMP-DX, SPC, ASD, …).
23
+ **It never re-implements NIRS/IO logic.**
24
+
25
+ > Status: **alpha (0.x), pre-1.0** — the on-disk and API contracts may still change.
26
+
27
+ ## The dataset model
28
+
29
+ - **Sources (X) — `1..n`, kept separate.** Multi-instrument / multi-block datasets keep each block as its
30
+ own source. Sources may even carry *different numbers of spectra* (asymmetric repetitions): they are
31
+ aligned by **sample identity** (`sample_id`), **never by row position**.
32
+ - **Variables (Y + metadata) — `0..n`.** There is no intrinsic Y/metadata distinction: every column is a
33
+ *potential* target. A dataset may declare no target at all (X-only / metadata-only is valid). Declared
34
+ targets are flagged; everything else is kept as metadata, with full per-variable dataviz either way.
35
+ - **Splits — documented, never auto-applied.** Native train/test/fold partitions are recorded so you can
36
+ reproduce a paper's split, but `get()` never silently applies one.
37
+ - **Tiers — how a dataset is shown and exported.** `public` (everything shown, openly fetchable from the
38
+ origin), `private` (everything shown; export needs a token), `anonymized` (variable names masked +
39
+ targets normalized; export needs a token). **Bytes are never served from git or the site** — the catalog
40
+ points at the origin DOI/URL; a personal Dataverse is only a *future* fallback for protected datasets.
41
+ - **Versions — two axes.** A **content** version (bumps when the dataset bytes change) and a
42
+ **metric-protocol** version (lets the cards be re-qualified under a new protocol without rebuilding the
43
+ data).
44
+
45
+ ## Install (development)
46
+
47
+ ```bash
48
+ uv venv && uv pip install -e ".[dev]" # maturin: builds the native acquisition core into the package
49
+ # (uses local editable nirs4all via [tool.uv.sources]; needs a Rust toolchain)
50
+ ```
51
+
52
+ ## Native acquisition core & language bindings
53
+
54
+ The **download** of a dataset — version-pinned DOI resolution, redirect-safe Dataverse / Zenodo /
55
+ figshare fetch, streaming SHA-256 verification and the pooch-style cache — lives in a small **Rust
56
+ core** (`crates/nirs4all-datasets-core`) behind a stable **C ABI** (`n4ds_`), and is published like the
57
+ rest of the ecosystem (`nirs4all-io` is the template). The scientific **analysis** layer (cards,
58
+ qualify, site, health) stays in pure Python. The cross-language contract is one distributable
59
+ `catalog/index.json`; the `n4ds` CLI is the parity oracle. Bindings (all over the same C ABI):
60
+
61
+ | Binding | Package | Status |
62
+ |---|---|---|
63
+ | Python | embedded in `nirs4all-datasets` (`nirs4all_datasets._n4ds`, pyo3) | built + tested |
64
+ | Rust | `nirs4all-datasets-core` / `-capi` (crates.io) | built + tested |
65
+ | WASM/JS | `@nirs4all/datasets-wasm` (npm) — metadata + small public datasets | built + tested |
66
+ | R | `nirs4alldatasets` (C shim, r-universe / Release) | built + tested |
67
+ | Octave/MATLAB | MEX (GitHub Release zip) | built + tested |
68
+
69
+ See [`bindings/SPEC.md`](bindings/SPEC.md) (the binding contract) and
70
+ [`docs/dev/release_process.md`](docs/dev/release_process.md).
71
+
72
+ ## Quickstart
73
+
74
+ ```python
75
+ import nirs4all_datasets as n4ad
76
+
77
+ n4ad.list() # the catalog index
78
+ n4ad.card("corn_eigenvector_nir") # the identity card (dict): sources, variables, stats, provenance
79
+
80
+ ds = n4ad.get("corn_eigenvector_nir") # -> NirsDataset (fetched from origin, checksum-verified, cached)
81
+ ds.sources() # ['X1', 'X2', 'X3'] — the same corn measured on three NIR instruments
82
+ ds.x("X1") # one source's spectra as a 2D numpy array
83
+ ds.x(concat=False) # {source_id: array} for every source (sample-aligned, not row-aligned)
84
+ ds.y() # all declared targets, per sample
85
+ ds.metadata() # the metadata columns (each a potential target)
86
+ ds.split("original") # the native split labels, if the source defined one
87
+ ds.to_nirs4all() # hand off to nirs4all for modelling
88
+ ```
89
+
90
+ Private / anonymized datasets need a Dataverse token: `n4ad.get("name", token=...)`.
91
+
92
+ ## CLI (`n4a-datasets`)
93
+
94
+ ```text
95
+ bootstrap <tree>                     author schema-2.0 descriptors from <tree>/v2.0/* (--prune to re-base)
96
+ build-all --source-tree <tree>       organize + qualify every dataset in parallel (--protocol-refresh, --site)
97
+ add <raw_source> <id>                one raw source -> canonical + card + index
98
+ qualify <id>                         (re)build a dataset's card (--anonymize -> card.anon.json)
99
+ health-check                         probe each dataset's open origins -> catalog/health.json
100
+ catalog | list | card | get          regenerate the index / inspect / load a dataset
101
+ publish | grant | revoke | restrict  personal-Dataverse governance for protected data (future)
102
+ ```
103
+
104
+ `n4a-datasets <command> --help` documents every flag.
105
+
106
+ ## What lives where (3-tier storage)
107
+
108
+ - **git** (small, tracked): `catalog/datasets/<id>.yaml` (descriptor), `catalog/datasets.yaml` (index +
109
+ whole-bank summary), and per-dataset `card.json` / `card.md` / `croissant.json` / `manifest.json`.
110
+ - **the origin** (Zenodo, a data Dataverse, a vendor archive, …): the raw + canonical **bytes**, fetched
111
+ on demand and never re-hosted by this project.
112
+ - **local cache** (downloaded on demand): the verified canonical Parquet, stored in the pooch-style per-user OS cache directory.
113
+
114
+ ## API token — where to put it
115
+
116
+ A Dataverse API token is **only** needed to fetch **private/anonymized** datasets or to publish to a
117
+ personal Dataverse; public datasets need none. Resolution order:
118
+
119
+ 1. Environment variable `NIRS4ALL_DATAVERSE_TOKEN` (recommended; required in CI).
120
+ 2. `~/.config/nirs4all-datasets/config.toml` (`chmod 600`):
121
+ ```toml
122
+ [dataverse]
123
+ instance = "https://entrepot.recherche.data.gouv.fr"
124
+ token = "xxxxxxxx-xxxx-xxxx-xxxx-xxxxxxxxxxxx"
125
+ ```
126
+ 3. A project `.env` (gitignored) — see `.env.example`.
127
+
128
+ The token travels only in the `X-Dataverse-key` header, is never logged, and is never sent on a redirect
129
+ to signed object storage. **Never commit it** (`.env`, `config.toml`, `*.token` are gitignored).
130
+
131
+ ## Contributing
132
+
133
+ Full walkthrough in **[CONTRIBUTING.md](CONTRIBUTING.md)**; the design is in
134
+ **[docs/DESIGN.md](docs/DESIGN.md)**. The green gate (run before every commit) mirrors CI:
135
+
136
+ ```bash
137
+ ruff check . && mypy --config-file pyproject.toml src
138
+ python catalog/scripts/validate.py # every descriptor is schema-valid
139
+ pytest -q
140
+ ```
141
+
142
+ ## License
143
+
144
+ Code: MIT (see [`LICENSE`](LICENSE)). Each dataset carries its **own** SPDX license in its descriptor, and
145
+ is only ever linked to its origin — open data is never re-hosted under a different license.