nirs4all-datasets 0.2.0__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- nirs4all_datasets-0.2.0/Cargo.toml +47 -0
- nirs4all_datasets-0.2.0/LICENSE +21 -0
- nirs4all_datasets-0.2.0/PKG-INFO +192 -0
- nirs4all_datasets-0.2.0/README.md +145 -0
- nirs4all_datasets-0.2.0/bindings/python/Cargo.lock +1053 -0
- nirs4all_datasets-0.2.0/bindings/python/Cargo.toml +23 -0
- nirs4all_datasets-0.2.0/bindings/python/README.md +29 -0
- nirs4all_datasets-0.2.0/bindings/python/src/lib.rs +65 -0
- nirs4all_datasets-0.2.0/catalog/index.json +72546 -0
- nirs4all_datasets-0.2.0/crates/nirs4all-datasets-core/Cargo.toml +30 -0
- nirs4all_datasets-0.2.0/crates/nirs4all-datasets-core/src/cache.rs +119 -0
- nirs4all_datasets-0.2.0/crates/nirs4all-datasets-core/src/error.rs +53 -0
- nirs4all_datasets-0.2.0/crates/nirs4all-datasets-core/src/fetch.rs +448 -0
- nirs4all_datasets-0.2.0/crates/nirs4all-datasets-core/src/hash.rs +111 -0
- nirs4all_datasets-0.2.0/crates/nirs4all-datasets-core/src/http.rs +248 -0
- nirs4all_datasets-0.2.0/crates/nirs4all-datasets-core/src/lib.rs +62 -0
- nirs4all_datasets-0.2.0/crates/nirs4all-datasets-core/src/model.rs +161 -0
- nirs4all_datasets-0.2.0/crates/nirs4all-datasets-core/src/origins.rs +442 -0
- nirs4all_datasets-0.2.0/crates/nirs4all-datasets-core/src/resolve.rs +58 -0
- nirs4all_datasets-0.2.0/crates/nirs4all-datasets-core/src/verify.rs +115 -0
- nirs4all_datasets-0.2.0/pyproject.toml +139 -0
- nirs4all_datasets-0.2.0/src/nirs4all_datasets/__init__.py +81 -0
- nirs4all_datasets-0.2.0/src/nirs4all_datasets/_acquire.py +67 -0
- nirs4all_datasets-0.2.0/src/nirs4all_datasets/_n4ds.pyi +11 -0
- nirs4all_datasets-0.2.0/src/nirs4all_datasets/access.py +208 -0
- nirs4all_datasets-0.2.0/src/nirs4all_datasets/bootstrap.py +638 -0
- nirs4all_datasets-0.2.0/src/nirs4all_datasets/bulk.py +143 -0
- nirs4all_datasets-0.2.0/src/nirs4all_datasets/canonical.py +409 -0
- nirs4all_datasets-0.2.0/src/nirs4all_datasets/catalog.py +208 -0
- nirs4all_datasets-0.2.0/src/nirs4all_datasets/cli.py +354 -0
- nirs4all_datasets-0.2.0/src/nirs4all_datasets/config.py +145 -0
- nirs4all_datasets-0.2.0/src/nirs4all_datasets/dataset.py +349 -0
- nirs4all_datasets-0.2.0/src/nirs4all_datasets/dataverse.py +266 -0
- nirs4all_datasets-0.2.0/src/nirs4all_datasets/health.py +103 -0
- nirs4all_datasets-0.2.0/src/nirs4all_datasets/index.py +156 -0
- nirs4all_datasets-0.2.0/src/nirs4all_datasets/manifest.py +215 -0
- nirs4all_datasets-0.2.0/src/nirs4all_datasets/organize.py +112 -0
- nirs4all_datasets-0.2.0/src/nirs4all_datasets/publish.py +170 -0
- nirs4all_datasets-0.2.0/src/nirs4all_datasets/py.typed +0 -0
- nirs4all_datasets-0.2.0/src/nirs4all_datasets/qualify/__init__.py +21 -0
- nirs4all_datasets-0.2.0/src/nirs4all_datasets/qualify/anonymize.py +252 -0
- nirs4all_datasets-0.2.0/src/nirs4all_datasets/qualify/croissant.py +224 -0
- nirs4all_datasets-0.2.0/src/nirs4all_datasets/qualify/datasheet.py +223 -0
- nirs4all_datasets-0.2.0/src/nirs4all_datasets/qualify/metrics.py +231 -0
- nirs4all_datasets-0.2.0/src/nirs4all_datasets/qualify/profile.py +513 -0
- nirs4all_datasets-0.2.0/src/nirs4all_datasets/qualify/registry.py +131 -0
- nirs4all_datasets-0.2.0/src/nirs4all_datasets/reproduce.py +131 -0
- nirs4all_datasets-0.2.0/src/nirs4all_datasets/schema.py +663 -0
- nirs4all_datasets-0.2.0/src/nirs4all_datasets/site/__init__.py +18 -0
- nirs4all_datasets-0.2.0/src/nirs4all_datasets/site/build.py +51 -0
- nirs4all_datasets-0.2.0/src/nirs4all_datasets/site/charts.py +392 -0
- nirs4all_datasets-0.2.0/src/nirs4all_datasets/site/components.py +307 -0
- nirs4all_datasets-0.2.0/src/nirs4all_datasets/site/escape.py +73 -0
- nirs4all_datasets-0.2.0/src/nirs4all_datasets/site/icons.py +62 -0
- nirs4all_datasets-0.2.0/src/nirs4all_datasets/site/model.py +136 -0
- nirs4all_datasets-0.2.0/src/nirs4all_datasets/site/pages.py +684 -0
- nirs4all_datasets-0.2.0/src/nirs4all_datasets/site/theme.py +490 -0
- nirs4all_datasets-0.2.0/src/nirs4all_datasets/status.py +206 -0
|
@@ -0,0 +1,47 @@
|
|
|
1
|
+
# SPDX-License-Identifier: MIT
|
|
2
|
+
#
|
|
3
|
+
# The dataset-acquisition core of nirs4all-datasets: resolve a dataset id (from the
|
|
4
|
+
# distributable catalog/index.json) to its pinned origin, download the canonical
|
|
5
|
+
# Parquet (Dataverse / Zenodo / figshare / URL), SHA-256-verify against the index,
|
|
6
|
+
# and cache it. The scientific *analysis* layer (qualify/card/site/health) stays in
|
|
7
|
+
# Python; only acquisition lives here. See migration_ABI_C.md.
|
|
8
|
+
[workspace]
|
|
9
|
+
resolver = "2"
|
|
10
|
+
members = [
|
|
11
|
+
"crates/nirs4all-datasets-core",
|
|
12
|
+
"crates/nirs4all-datasets-capi",
|
|
13
|
+
"crates/nirs4all-datasets-cli",
|
|
14
|
+
]
|
|
15
|
+
# Language bindings build with their own toolchains (maturin / wasm-pack) and are
|
|
16
|
+
# excluded from the cargo workspace so `cargo test --workspace` stays self-contained.
|
|
17
|
+
exclude = [
|
|
18
|
+
"bindings/python",
|
|
19
|
+
"bindings/wasm",
|
|
20
|
+
]
|
|
21
|
+
|
|
22
|
+
[workspace.package]
|
|
23
|
+
version = "0.2.0"
|
|
24
|
+
edition = "2021"
|
|
25
|
+
license = "MIT"
|
|
26
|
+
authors = ["Gregory Beurier <beurier@cirad.fr>"]
|
|
27
|
+
repository = "https://github.com/GBeurier/nirs4all-datasets"
|
|
28
|
+
homepage = "https://github.com/GBeurier/nirs4all-datasets"
|
|
29
|
+
|
|
30
|
+
[workspace.dependencies]
|
|
31
|
+
# Internal crates (path deps; a `version` is REQUIRED for crates.io publishing and
|
|
32
|
+
# is kept in lock-step with [workspace.package] by scripts/bump_version.sh).
|
|
33
|
+
nirs4all-datasets-core = { path = "crates/nirs4all-datasets-core", version = "0.2.0" }
|
|
34
|
+
# Third-party.
|
|
35
|
+
serde = { version = "1.0", features = ["derive"] }
|
|
36
|
+
serde_json = { version = "1.0", features = ["preserve_order", "float_roundtrip"] }
|
|
37
|
+
thiserror = "2.0"
|
|
38
|
+
sha2 = "0.10"
|
|
39
|
+
hex = "0.4"
|
|
40
|
+
directories = "5"
|
|
41
|
+
ureq = "2"
|
|
42
|
+
clap = { version = "4.5", features = ["derive"] }
|
|
43
|
+
tempfile = "3"
|
|
44
|
+
|
|
45
|
+
[profile.release]
|
|
46
|
+
opt-level = 2
|
|
47
|
+
strip = "debuginfo"
|
|
@@ -0,0 +1,21 @@
|
|
|
1
|
+
MIT License
|
|
2
|
+
|
|
3
|
+
Copyright (c) 2026 CIRAD — Gregory Beurier and the nirs4all contributors
|
|
4
|
+
|
|
5
|
+
Permission is hereby granted, free of charge, to any person obtaining a copy
|
|
6
|
+
of this software and associated documentation files (the "Software"), to deal
|
|
7
|
+
in the Software without restriction, including without limitation the rights
|
|
8
|
+
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
|
|
9
|
+
copies of the Software, and to permit persons to whom the Software is
|
|
10
|
+
furnished to do so, subject to the following conditions:
|
|
11
|
+
|
|
12
|
+
The above copyright notice and this permission notice shall be included in all
|
|
13
|
+
copies or substantial portions of the Software.
|
|
14
|
+
|
|
15
|
+
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
|
|
16
|
+
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
|
|
17
|
+
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
|
|
18
|
+
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
|
|
19
|
+
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
|
|
20
|
+
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
|
|
21
|
+
SOFTWARE.
|
|
@@ -0,0 +1,192 @@
|
|
|
1
|
+
Metadata-Version: 2.4
|
|
2
|
+
Name: nirs4all-datasets
|
|
3
|
+
Version: 0.2.0
|
|
4
|
+
Classifier: Development Status :: 3 - Alpha
|
|
5
|
+
Classifier: Intended Audience :: Science/Research
|
|
6
|
+
Classifier: Operating System :: OS Independent
|
|
7
|
+
Classifier: Programming Language :: Python :: 3
|
|
8
|
+
Classifier: Programming Language :: Python :: 3.11
|
|
9
|
+
Classifier: Programming Language :: Python :: 3.12
|
|
10
|
+
Classifier: Topic :: Scientific/Engineering
|
|
11
|
+
Classifier: Typing :: Typed
|
|
12
|
+
Requires-Dist: numpy>=1.24.0
|
|
13
|
+
Requires-Dist: scipy>=1.10.0
|
|
14
|
+
Requires-Dist: pandas>=2.0.0
|
|
15
|
+
Requires-Dist: pyarrow>=14.0.0
|
|
16
|
+
Requires-Dist: pydantic>=2.0.0
|
|
17
|
+
Requires-Dist: pyyaml>=6.0.0
|
|
18
|
+
Requires-Dist: jsonschema>=4.17.0
|
|
19
|
+
Requires-Dist: python-dotenv>=1.0.0
|
|
20
|
+
Requires-Dist: matplotlib>=3.7.0
|
|
21
|
+
Requires-Dist: requests>=2.31.0
|
|
22
|
+
Requires-Dist: typer>=0.12.0
|
|
23
|
+
Requires-Dist: pytest>=7.4.0 ; extra == 'dev'
|
|
24
|
+
Requires-Dist: pytest-cov>=4.1.0 ; extra == 'dev'
|
|
25
|
+
Requires-Dist: ruff>=0.1.0 ; extra == 'dev'
|
|
26
|
+
Requires-Dist: mypy>=1.7.0 ; extra == 'dev'
|
|
27
|
+
Requires-Dist: types-requests ; extra == 'dev'
|
|
28
|
+
Requires-Dist: types-pyyaml ; extra == 'dev'
|
|
29
|
+
Requires-Dist: build>=1.0.0 ; extra == 'dev'
|
|
30
|
+
Requires-Dist: twine>=5.0.0 ; extra == 'dev'
|
|
31
|
+
Requires-Dist: nirs4all-io>=0.1 ; extra == 'io'
|
|
32
|
+
Requires-Dist: nirs4all-formats>=0.1 ; extra == 'io'
|
|
33
|
+
Requires-Dist: nirs4all>=0.9 ; extra == 'nirs4all'
|
|
34
|
+
Provides-Extra: dev
|
|
35
|
+
Provides-Extra: io
|
|
36
|
+
Provides-Extra: nirs4all
|
|
37
|
+
License-File: LICENSE
|
|
38
|
+
Summary: A citable, reproducible bank of raw NIRS reference datasets — multi-source/multi-target, tier-governed, with on-demand checksum-verified access from each dataset's origin.
|
|
39
|
+
Keywords: nirs,near-infrared,spectroscopy,dataset,registry,dataverse,FAIR,benchmark
|
|
40
|
+
Author-email: Gregory Beurier <beurier@cirad.fr>
|
|
41
|
+
Requires-Python: >=3.11
|
|
42
|
+
Description-Content-Type: text/markdown; charset=UTF-8; variant=GFM
|
|
43
|
+
Project-URL: Homepage, https://github.com/GBeurier/nirs4all-datasets
|
|
44
|
+
Project-URL: Issues, https://github.com/GBeurier/nirs4all-datasets/issues
|
|
45
|
+
Project-URL: Repository, https://github.com/GBeurier/nirs4all-datasets
|
|
46
|
+
|
|
47
|
+
# nirs4all-datasets
|
|
48
|
+
|
|
49
|
+
A **citable, reproducible bank of raw NIRS** (Near-Infrared Spectroscopy) **reference datasets** — for
|
|
50
|
+
benchmarking, exploring, and comparing models on a common, version-pinned, provenance-rich footing.
|
|
51
|
+
|
|
52
|
+
A dataset here is **raw measured reality, not a benchmark task**: one or more spectral **sources**
|
|
53
|
+
(instruments), any number of **variables** (every target *and* metadata column — nothing is invented, and
|
|
54
|
+
nothing is thrown away), the native splits if the source defined them, and full provenance back to the
|
|
55
|
+
**origin** that published the data. The *task* — which Y, which split, which metric — is a choice the
|
|
56
|
+
consumer makes; it is never baked into the dataset.
|
|
57
|
+
|
|
58
|
+
Three deliverables:
|
|
59
|
+
|
|
60
|
+
1. a git-tracked **catalog** — one hand-checkable descriptor + a machine-generated *identity card* (stats,
|
|
61
|
+
per-source/per-variable dataviz, MLCommons Croissant, a Datasheet) per dataset. The heavy bytes never
|
|
62
|
+
enter git.
|
|
63
|
+
2. a Python **plugin** — `get("name")` downloads a dataset on demand from its **origin**, verifies its
|
|
64
|
+
SHA-256, caches it, and returns a `NirsDataset`.
|
|
65
|
+
3. a **static site** — a browsable, qualified catalog with whole-bank dataviz and per-dataset id-cards.
|
|
66
|
+
|
|
67
|
+
It reuses [`nirs4all`](../nirs4all) for qualification and [`nirs4all-io`](../nirs4all-io) /
|
|
68
|
+
[`nirs4all-formats`](../nirs4all-formats) for reading instrument files (OPUS, JCAMP-DX, SPC, ASD, …).
|
|
69
|
+
**It never re-implements NIRS/IO logic.**
|
|
70
|
+
|
|
71
|
+
> Status: **alpha (0.x), pre-1.0** — the on-disk and API contracts may still change.
|
|
72
|
+
|
|
73
|
+
## The dataset model
|
|
74
|
+
|
|
75
|
+
- **Sources (X) — `1..n`, kept separate.** Multi-instrument / multi-block datasets keep each block as its
|
|
76
|
+
own source. Sources may even carry *different numbers of spectra* (asymmetric repetitions): they are
|
|
77
|
+
aligned by **sample identity** (`sample_id`), **never by row position**.
|
|
78
|
+
- **Variables (Y + metadata) — `0..n`.** There is no intrinsic Y/metadata distinction: every column is a
|
|
79
|
+
*potential* target. A dataset may declare no target at all (X-only / metadata-only is valid). Declared
|
|
80
|
+
targets are flagged; everything else is kept as metadata, with full per-variable dataviz either way.
|
|
81
|
+
- **Splits — documented, never auto-applied.** Native train/test/fold partitions are recorded so you can
|
|
82
|
+
reproduce a paper's split, but `get()` never silently applies one.
|
|
83
|
+
- **Tiers — how a dataset is shown and exported.** `public` (everything shown, openly fetchable from the
|
|
84
|
+
origin), `private` (everything shown; export needs a token), `anonymized` (variable names masked +
|
|
85
|
+
targets normalized; export needs a token). **Bytes are never served from git or the site** — the catalog
|
|
86
|
+
points at the origin DOI/URL; a personal Dataverse is only a *future* fallback for protected datasets.
|
|
87
|
+
- **Versions — two axes.** A **content** version (bumps when the dataset bytes change) and a
|
|
88
|
+
**metric-protocol** version (lets the cards be re-qualified under a new protocol without rebuilding the
|
|
89
|
+
data).
|
|
90
|
+
|
|
91
|
+
## Install (development)
|
|
92
|
+
|
|
93
|
+
```bash
|
|
94
|
+
uv venv && uv pip install -e ".[dev]" # maturin: builds the native acquisition core into the package
|
|
95
|
+
# (uses local editable nirs4all via [tool.uv.sources]; needs a Rust toolchain)
|
|
96
|
+
```
|
|
97
|
+
|
|
98
|
+
## Native acquisition core & language bindings
|
|
99
|
+
|
|
100
|
+
The **download** of a dataset — version-pinned DOI resolution, redirect-safe Dataverse / Zenodo /
|
|
101
|
+
figshare fetch, streaming SHA-256 verification and the pooch-style cache — lives in a small **Rust
|
|
102
|
+
core** (`crates/nirs4all-datasets-core`) behind a stable **C ABI** (`n4ds_`), and is published like the
|
|
103
|
+
rest of the ecosystem (`nirs4all-io` is the template). The scientific **analysis** layer (cards,
|
|
104
|
+
qualify, site, health) stays in pure Python. The cross-language contract is one distributable
|
|
105
|
+
`catalog/index.json`; the native `n4ds` CLI (distinct from the Python `n4a-datasets` CLI described below) is the parity oracle. Bindings (all over the same C ABI):
|
|
106
|
+
|
|
107
|
+
| Binding | Package | Status |
|
|
108
|
+
|---|---|---|
|
|
109
|
+
| Python | embedded in `nirs4all-datasets` (`nirs4all_datasets._n4ds`, pyo3) | built + tested |
|
|
110
|
+
| Rust | `nirs4all-datasets-core` / `-capi` (crates.io) | built + tested |
|
|
111
|
+
| WASM/JS | `@nirs4all/datasets-wasm` (npm) — metadata + small public datasets | built + tested |
|
|
112
|
+
| R | `nirs4alldatasets` (C shim, r-universe / Release) | built + tested |
|
|
113
|
+
| Octave/MATLAB | MEX (GitHub Release zip) | built + tested |
|
|
114
|
+
|
|
115
|
+
See [`bindings/SPEC.md`](bindings/SPEC.md) (the binding contract) and
|
|
116
|
+
[`docs/dev/release_process.md`](docs/dev/release_process.md).
|
|
117
|
+
|
|
118
|
+
## Quickstart
|
|
119
|
+
|
|
120
|
+
```python
|
|
121
|
+
import nirs4all_datasets as n4ad
|
|
122
|
+
|
|
123
|
+
n4ad.list() # the catalog index
|
|
124
|
+
n4ad.card("corn_eigenvector_nir") # the identity card (dict): sources, variables, stats, provenance
|
|
125
|
+
|
|
126
|
+
ds = n4ad.get("corn_eigenvector_nir") # -> NirsDataset (fetched from origin, checksum-verified, cached)
|
|
127
|
+
ds.sources() # ['X1', 'X2', 'X3'] — the same corn measured on three NIR instruments
|
|
128
|
+
ds.x("X1") # one source's spectra as a 2D numpy array
|
|
129
|
+
ds.x(concat=False) # {source_id: array} for every source (sample-aligned, not row-aligned)
|
|
130
|
+
ds.y() # all declared targets, per sample
|
|
131
|
+
ds.metadata() # the metadata columns (each a potential target)
|
|
132
|
+
ds.split("original") # the native split labels, if the source defined one
|
|
133
|
+
ds.to_nirs4all() # hand off to nirs4all for modelling
|
|
134
|
+
```
|
|
135
|
+
|
|
136
|
+
Private / anonymized datasets need a Dataverse token: `n4ad.get("name", token=...)`.
|
|
137
|
+
|
|
138
|
+
## CLI (`n4a-datasets`)
|
|
139
|
+
|
|
140
|
+
```text
|
|
141
|
+
bootstrap <tree> author schema-2.0 descriptors from <tree>/v2.0/* (--prune to re-base)
|
|
142
|
+
build-all --source-tree <tree> organize + qualify every dataset in parallel (--protocol-refresh, --site)
|
|
143
|
+
add <raw_source> <id> one raw source -> canonical + card + index
|
|
144
|
+
qualify <id> (re)build a dataset's card (--anonymize -> card.anon.json)
|
|
145
|
+
health-check probe each dataset's open origins -> catalog/health.json
|
|
146
|
+
catalog | list | card | get regenerate the index / inspect / load a dataset
|
|
147
|
+
publish | grant | revoke | restrict personal-Dataverse governance for protected data (future)
|
|
148
|
+
```
|
|
149
|
+
|
|
150
|
+
`n4a-datasets <command> --help` documents every flag.
|
|
151
|
+
|
|
152
|
+
## What lives where (3-tier storage)
|
|
153
|
+
|
|
154
|
+
- **git** (small, tracked): `catalog/datasets/<id>.yaml` (descriptor), `catalog/datasets.yaml` (index +
|
|
155
|
+
whole-bank summary), and per-dataset `card.json` / `card.md` / `croissant.json` / `manifest.json`.
|
|
156
|
+
- **the origin** (Zenodo, a data Dataverse, a vendor archive, …): the raw + canonical **bytes**, fetched
|
|
157
|
+
on demand and never re-hosted by this project.
|
|
158
|
+
- **local cache** (downloaded on demand): the verified canonical Parquet, kept in the native core's pooch-style cache directory.
|
|
159
|
+
|
|
160
|
+
## API token — where to put it
|
|
161
|
+
|
|
162
|
+
A Dataverse API token is **only** needed to fetch **private/anonymized** datasets or to publish to a
|
|
163
|
+
personal Dataverse; public datasets need none. Resolution order:
|
|
164
|
+
|
|
165
|
+
1. Environment variable `NIRS4ALL_DATAVERSE_TOKEN` (recommended; required in CI).
|
|
166
|
+
2. `~/.config/nirs4all-datasets/config.toml` (`chmod 600`):
|
|
167
|
+
```toml
|
|
168
|
+
[dataverse]
|
|
169
|
+
instance = "https://entrepot.recherche.data.gouv.fr"
|
|
170
|
+
token = "xxxxxxxx-xxxx-xxxx-xxxx-xxxxxxxxxxxx"
|
|
171
|
+
```
|
|
172
|
+
3. A project `.env` (gitignored) — see `.env.example`.
|
|
173
|
+
|
|
174
|
+
The token travels only in the `X-Dataverse-key` header, is never logged, and is never sent on a redirect
|
|
175
|
+
to signed object storage. **Never commit it** (`.env`, `config.toml`, `*.token` are gitignored).
|
|
176
|
+
|
|
177
|
+
## Contributing
|
|
178
|
+
|
|
179
|
+
Full walkthrough in **[CONTRIBUTING.md](CONTRIBUTING.md)**; the design is in
|
|
180
|
+
**[docs/DESIGN.md](docs/DESIGN.md)**. The green gate (run before every commit) mirrors CI:
|
|
181
|
+
|
|
182
|
+
```bash
|
|
183
|
+
ruff check . && mypy --config-file pyproject.toml src
|
|
184
|
+
python catalog/scripts/validate.py # every descriptor is schema-valid
|
|
185
|
+
pytest -q
|
|
186
|
+
```
|
|
187
|
+
|
|
188
|
+
## License
|
|
189
|
+
|
|
190
|
+
Code: MIT (see [`LICENSE`](LICENSE)). Each dataset carries its **own** SPDX license in its descriptor, and
|
|
191
|
+
is only ever linked to its origin — open data is never re-hosted under a different license.
|
|
192
|
+
|
|
@@ -0,0 +1,145 @@
|
|
|
1
|
+
# nirs4all-datasets
|
|
2
|
+
|
|
3
|
+
A **citable, reproducible bank of raw NIRS** (Near-Infrared Spectroscopy) **reference datasets** — for
|
|
4
|
+
benchmarking, exploring, and comparing models on a common, version-pinned, provenance-rich footing.
|
|
5
|
+
|
|
6
|
+
A dataset here is **raw measured reality, not a benchmark task**: one or more spectral **sources**
|
|
7
|
+
(instruments), any number of **variables** (every target *and* metadata column — nothing is invented, and
|
|
8
|
+
nothing is thrown away), the native splits if the source defined them, and full provenance back to the
|
|
9
|
+
**origin** that published the data. The *task* — which Y, which split, which metric — is a choice the
|
|
10
|
+
consumer makes; it is never baked into the dataset.
|
|
11
|
+
|
|
12
|
+
Three deliverables:
|
|
13
|
+
|
|
14
|
+
1. a git-tracked **catalog** — one hand-checkable descriptor + a machine-generated *identity card* (stats,
|
|
15
|
+
per-source/per-variable dataviz, MLCommons Croissant, a Datasheet) per dataset. The heavy bytes never
|
|
16
|
+
enter git.
|
|
17
|
+
2. a Python **plugin** — `get("name")` downloads a dataset on demand from its **origin**, verifies its
|
|
18
|
+
SHA-256, caches it, and returns a `NirsDataset`.
|
|
19
|
+
3. a **static site** — a browsable, qualified catalog with whole-bank dataviz and per-dataset id-cards.
|
|
20
|
+
|
|
21
|
+
It reuses [`nirs4all`](../nirs4all) for qualification and [`nirs4all-io`](../nirs4all-io) /
|
|
22
|
+
[`nirs4all-formats`](../nirs4all-formats) for reading instrument files (OPUS, JCAMP-DX, SPC, ASD, …).
|
|
23
|
+
**It never re-implements NIRS/IO logic.**
|
|
24
|
+
|
|
25
|
+
> Status: **alpha (0.x), pre-1.0** — the on-disk and API contracts may still change.
|
|
26
|
+
|
|
27
|
+
## The dataset model
|
|
28
|
+
|
|
29
|
+
- **Sources (X) — `1..n`, kept separate.** Multi-instrument / multi-block datasets keep each block as its
|
|
30
|
+
own source. Sources may even carry *different numbers of spectra* (asymmetric repetitions): they are
|
|
31
|
+
aligned by **sample identity** (`sample_id`), **never by row position**.
|
|
32
|
+
- **Variables (Y + metadata) — `0..n`.** There is no intrinsic Y/metadata distinction: every column is a
|
|
33
|
+
*potential* target. A dataset may declare no target at all (X-only / metadata-only is valid). Declared
|
|
34
|
+
targets are flagged; everything else is kept as metadata, with full per-variable dataviz either way.
|
|
35
|
+
- **Splits — documented, never auto-applied.** Native train/test/fold partitions are recorded so you can
|
|
36
|
+
reproduce a paper's split, but `get()` never silently applies one.
|
|
37
|
+
- **Tiers — how a dataset is shown and exported.** `public` (everything shown, openly fetchable from the
|
|
38
|
+
origin), `private` (everything shown; export needs a token), `anonymized` (variable names masked +
|
|
39
|
+
targets normalized; export needs a token). **Bytes are never served from git or the site** — the catalog
|
|
40
|
+
points at the origin DOI/URL; a personal Dataverse is only a *future* fallback for protected datasets.
|
|
41
|
+
- **Versions — two axes.** A **content** version (bumps when the dataset bytes change) and a
|
|
42
|
+
**metric-protocol** version (lets the cards be re-qualified under a new protocol without rebuilding the
|
|
43
|
+
data).
|
|
44
|
+
|
|
45
|
+
## Install (development)
|
|
46
|
+
|
|
47
|
+
```bash
|
|
48
|
+
uv venv && uv pip install -e ".[dev]" # maturin: builds the native acquisition core into the package
|
|
49
|
+
# (uses local editable nirs4all via [tool.uv.sources]; needs a Rust toolchain)
|
|
50
|
+
```
|
|
51
|
+
|
|
52
|
+
## Native acquisition core & language bindings
|
|
53
|
+
|
|
54
|
+
The **download** of a dataset — version-pinned DOI resolution, redirect-safe Dataverse / Zenodo /
|
|
55
|
+
figshare fetch, streaming SHA-256 verification and the pooch-style cache — lives in a small **Rust
|
|
56
|
+
core** (`crates/nirs4all-datasets-core`) behind a stable **C ABI** (`n4ds_`), and is published like the
|
|
57
|
+
rest of the ecosystem (`nirs4all-io` is the template). The scientific **analysis** layer (cards,
|
|
58
|
+
qualify, site, health) stays in pure Python. The cross-language contract is one distributable
|
|
59
|
+
`catalog/index.json`; the native `n4ds` CLI (distinct from the Python `n4a-datasets` CLI described below) is the parity oracle. Bindings (all over the same C ABI):
|
|
60
|
+
|
|
61
|
+
| Binding | Package | Status |
|
|
62
|
+
|---|---|---|
|
|
63
|
+
| Python | embedded in `nirs4all-datasets` (`nirs4all_datasets._n4ds`, pyo3) | built + tested |
|
|
64
|
+
| Rust | `nirs4all-datasets-core` / `-capi` (crates.io) | built + tested |
|
|
65
|
+
| WASM/JS | `@nirs4all/datasets-wasm` (npm) — metadata + small public datasets | built + tested |
|
|
66
|
+
| R | `nirs4alldatasets` (C shim, r-universe / Release) | built + tested |
|
|
67
|
+
| Octave/MATLAB | MEX (GitHub Release zip) | built + tested |
|
|
68
|
+
|
|
69
|
+
See [`bindings/SPEC.md`](bindings/SPEC.md) (the binding contract) and
|
|
70
|
+
[`docs/dev/release_process.md`](docs/dev/release_process.md).
|
|
71
|
+
|
|
72
|
+
## Quickstart
|
|
73
|
+
|
|
74
|
+
```python
|
|
75
|
+
import nirs4all_datasets as n4ad
|
|
76
|
+
|
|
77
|
+
n4ad.list() # the catalog index
|
|
78
|
+
n4ad.card("corn_eigenvector_nir") # the identity card (dict): sources, variables, stats, provenance
|
|
79
|
+
|
|
80
|
+
ds = n4ad.get("corn_eigenvector_nir") # -> NirsDataset (fetched from origin, checksum-verified, cached)
|
|
81
|
+
ds.sources() # ['X1', 'X2', 'X3'] — the same corn measured on three NIR instruments
|
|
82
|
+
ds.x("X1") # one source's spectra as a 2D numpy array
|
|
83
|
+
ds.x(concat=False) # {source_id: array} for every source (sample-aligned, not row-aligned)
|
|
84
|
+
ds.y() # all declared targets, per sample
|
|
85
|
+
ds.metadata() # the metadata columns (each a potential target)
|
|
86
|
+
ds.split("original") # the native split labels, if the source defined one
|
|
87
|
+
ds.to_nirs4all() # hand off to nirs4all for modelling
|
|
88
|
+
```
|
|
89
|
+
|
|
90
|
+
Private / anonymized datasets need a Dataverse token: `n4ad.get("name", token=...)`.
|
|
91
|
+
|
|
92
|
+
## CLI (`n4a-datasets`)
|
|
93
|
+
|
|
94
|
+
```text
|
|
95
|
+
bootstrap <tree> author schema-2.0 descriptors from <tree>/v2.0/* (--prune to re-base)
|
|
96
|
+
build-all --source-tree <tree> organize + qualify every dataset in parallel (--protocol-refresh, --site)
|
|
97
|
+
add <raw_source> <id> one raw source -> canonical + card + index
|
|
98
|
+
qualify <id> (re)build a dataset's card (--anonymize -> card.anon.json)
|
|
99
|
+
health-check probe each dataset's open origins -> catalog/health.json
|
|
100
|
+
catalog | list | card | get regenerate the index / inspect / load a dataset
|
|
101
|
+
publish | grant | revoke | restrict personal-Dataverse governance for protected data (future)
|
|
102
|
+
```
|
|
103
|
+
|
|
104
|
+
`n4a-datasets <command> --help` documents every flag.
|
|
105
|
+
|
|
106
|
+
## What lives where (3-tier storage)
|
|
107
|
+
|
|
108
|
+
- **git** (small, tracked): `catalog/datasets/<id>.yaml` (descriptor), `catalog/datasets.yaml` (index +
|
|
109
|
+
whole-bank summary), and per-dataset `card.json` / `card.md` / `croissant.json` / `manifest.json`.
|
|
110
|
+
- **the origin** (Zenodo, a data Dataverse, a vendor archive, …): the raw + canonical **bytes**, fetched
|
|
111
|
+
on demand and never re-hosted by this project.
|
|
112
|
+
- **local cache** (downloaded on demand): the verified canonical Parquet, kept in the native core's pooch-style cache directory.
|
|
113
|
+
|
|
114
|
+
## API token — where to put it
|
|
115
|
+
|
|
116
|
+
A Dataverse API token is **only** needed to fetch **private/anonymized** datasets or to publish to a
|
|
117
|
+
personal Dataverse; public datasets need none. Resolution order:
|
|
118
|
+
|
|
119
|
+
1. Environment variable `NIRS4ALL_DATAVERSE_TOKEN` (recommended; required in CI).
|
|
120
|
+
2. `~/.config/nirs4all-datasets/config.toml` (`chmod 600`):
|
|
121
|
+
```toml
|
|
122
|
+
[dataverse]
|
|
123
|
+
instance = "https://entrepot.recherche.data.gouv.fr"
|
|
124
|
+
token = "xxxxxxxx-xxxx-xxxx-xxxx-xxxxxxxxxxxx"
|
|
125
|
+
```
|
|
126
|
+
3. A project `.env` (gitignored) — see `.env.example`.
|
|
127
|
+
|
|
128
|
+
The token travels only in the `X-Dataverse-key` header, is never logged, and is never sent on a redirect
|
|
129
|
+
to signed object storage. **Never commit it** (`.env`, `config.toml`, `*.token` are gitignored).
|
|
130
|
+
|
|
131
|
+
## Contributing
|
|
132
|
+
|
|
133
|
+
Full walkthrough in **[CONTRIBUTING.md](CONTRIBUTING.md)**; the design is in
|
|
134
|
+
**[docs/DESIGN.md](docs/DESIGN.md)**. The green gate (run before every commit) mirrors CI:
|
|
135
|
+
|
|
136
|
+
```bash
|
|
137
|
+
ruff check . && mypy --config-file pyproject.toml src
|
|
138
|
+
python catalog/scripts/validate.py # every descriptor is schema-valid
|
|
139
|
+
pytest -q
|
|
140
|
+
```
|
|
141
|
+
|
|
142
|
+
## License
|
|
143
|
+
|
|
144
|
+
Code: MIT (see [`LICENSE`](LICENSE)). Each dataset carries its **own** SPDX license in its descriptor, and
|
|
145
|
+
is only ever linked to its origin — open data is never re-hosted under a different license.
|