nirs4all-lite 0.1.0__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,16 @@
1
+ .DS_Store
2
+ *.py[cod]
3
+ __pycache__/
4
+ .pytest_cache/
5
+ .mypy_cache/
6
+ .ruff_cache/
7
+ target/
8
+ node_modules/
9
+ dist/
10
+ build/
11
+ *.egg-info/
12
+ *.Rcheck/
13
+ .r-parity-lib/
14
+ *.tar.gz
15
+ *.mex*
16
+ *.oct
@@ -0,0 +1,21 @@
1
+ MIT License
2
+
3
+ Copyright (c) 2026 nirs4all contributors
4
+
5
+ Permission is hereby granted, free of charge, to any person obtaining a copy
6
+ of this software and associated documentation files (the "Software"), to deal
7
+ in the Software without restriction, including without limitation the rights
8
+ to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
9
+ copies of the Software, and to permit persons to whom the Software is
10
+ furnished to do so, subject to the following conditions:
11
+
12
+ The above copyright notice and this permission notice shall be included in all
13
+ copies or substantial portions of the Software.
14
+
15
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16
+ IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17
+ FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
18
+ AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19
+ LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20
+ OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
21
+ SOFTWARE.
@@ -0,0 +1,90 @@
1
+ Metadata-Version: 2.4
2
+ Name: nirs4all-lite
3
+ Version: 0.1.0
4
+ Summary: Portable nirs4all aggregate binding over formats, IO, methods, dag-ml, and dag-ml-data (datasets optional)
5
+ Project-URL: Homepage, https://nirs4all.org
6
+ Project-URL: Repository, https://github.com/GBeurier/nirs4all-lite
7
+ Project-URL: Issues, https://github.com/GBeurier/nirs4all-lite/issues
8
+ Author: nirs4all contributors
9
+ License-Expression: MIT
10
+ License-File: LICENSE
11
+ Keywords: bindings,chemometrics,nirs,spectroscopy,wasm
12
+ Classifier: Development Status :: 3 - Alpha
13
+ Classifier: Intended Audience :: Science/Research
14
+ Classifier: License :: OSI Approved :: MIT License
15
+ Classifier: Programming Language :: Python :: 3
16
+ Classifier: Programming Language :: Python :: 3.11
17
+ Classifier: Programming Language :: Python :: 3.12
18
+ Classifier: Topic :: Scientific/Engineering
19
+ Requires-Python: >=3.11
20
+ Requires-Dist: pyyaml>=6
21
+ Provides-Extra: all
22
+ Requires-Dist: dag-ml-data>=0.1.0a0; extra == 'all'
23
+ Requires-Dist: dag-ml>=0.1.0a0; extra == 'all'
24
+ Requires-Dist: nirs4all-formats>=0.1.0; extra == 'all'
25
+ Requires-Dist: nirs4all-io>=0.1.0; extra == 'all'
26
+ Requires-Dist: nirs4all-methods>=0.98.0; extra == 'all'
27
+ Requires-Dist: scikit-learn>=1.3; extra == 'all'
28
+ Provides-Extra: dag-ml
29
+ Requires-Dist: dag-ml>=0.1.0a0; extra == 'dag-ml'
30
+ Provides-Extra: dag-ml-data
31
+ Requires-Dist: dag-ml-data>=0.1.0a0; extra == 'dag-ml-data'
32
+ Provides-Extra: datasets
33
+ Requires-Dist: nirs4all-datasets>=0.2.0a1; extra == 'datasets'
34
+ Provides-Extra: everything
35
+ Requires-Dist: dag-ml-data>=0.1.0a0; extra == 'everything'
36
+ Requires-Dist: dag-ml>=0.1.0a0; extra == 'everything'
37
+ Requires-Dist: nirs4all-datasets>=0.2.0a1; extra == 'everything'
38
+ Requires-Dist: nirs4all-formats>=0.1.0; extra == 'everything'
39
+ Requires-Dist: nirs4all-io>=0.1.0; extra == 'everything'
40
+ Requires-Dist: nirs4all-methods>=0.98.0; extra == 'everything'
41
+ Requires-Dist: scikit-learn>=1.3; extra == 'everything'
42
+ Provides-Extra: formats
43
+ Requires-Dist: nirs4all-formats>=0.1.0; extra == 'formats'
44
+ Provides-Extra: io
45
+ Requires-Dist: nirs4all-io>=0.1.0; extra == 'io'
46
+ Provides-Extra: methods
47
+ Requires-Dist: nirs4all-methods>=0.98.0; extra == 'methods'
48
+ Requires-Dist: scikit-learn>=1.3; extra == 'methods'
49
+ Description-Content-Type: text/markdown
50
+
51
+ # Python Binding
52
+
53
+ Distribution name: `nirs4all-lite`
54
+
55
+ Import name: `nirs4all_lite`
56
+
57
+ This binding intentionally avoids the `nirs4all` import name so it can be
58
+ installed next to the full Python `nirs4all` package during parity checks.
59
+
60
+ ## Portable Execution
61
+
62
+ `nirs4all_lite.run_portable_pipeline(source, dataset)` executes the shared
63
+ portable JSON/YAML subset through the `nirs4all-methods` Python bindings:
64
+
65
+ - `KennardStoneSplitter`
66
+ - `StandardNormalVariate` / `SNV`
67
+ - `SavitzkyGolay`
68
+ - `sklearn.cross_decomposition.PLSRegression`
69
+ - `_range_` sweeps over `n_components`
70
+
71
+ Savitzky-Golay defaults to `mode="interp"` for full Python nirs4all parity and
72
+ preserves explicit methods-backed modes (`mirror`, `constant`, `nearest`,
73
+ `wrap`, `interp`) plus `cval`.
74
+
75
+ The aggregate does not implement numerical kernels. Install the optional
76
+ methods extra, or make `n4m` and `pls4all` importable, before calling it:
77
+
78
+ ```bash
79
+ python -m pip install "nirs4all-lite[methods]"
80
+ ```
81
+
82
+ The strict local parity gate compares all shared fixtures against the full
83
+ Python `nirs4all` oracle and reports max prediction/RMSE deltas on failure:
84
+
85
+ ```bash
86
+ PYTHONPATH=bindings/python/src:/path/to/nirs4all-methods/bindings/python/src \
87
+ PLS4ALL_LIB_PATH=/path/to/libn4m.so \
88
+ NIRS4ALL_LITE_REQUIRE_METHODS_PARITY=1 \
89
+ python -m unittest bindings/python/tests/test_execution_parity.py -v
90
+ ```
@@ -0,0 +1,40 @@
1
+ # Python Binding
2
+
3
+ Distribution name: `nirs4all-lite`
4
+
5
+ Import name: `nirs4all_lite`
6
+
7
+ This binding intentionally avoids the `nirs4all` import name so it can be
8
+ installed next to the full Python `nirs4all` package during parity checks.
9
+
10
+ ## Portable Execution
11
+
12
+ `nirs4all_lite.run_portable_pipeline(source, dataset)` executes the shared
13
+ portable JSON/YAML subset through the `nirs4all-methods` Python bindings:
14
+
15
+ - `KennardStoneSplitter`
16
+ - `StandardNormalVariate` / `SNV`
17
+ - `SavitzkyGolay`
18
+ - `sklearn.cross_decomposition.PLSRegression`
19
+ - `_range_` sweeps over `n_components`
20
+
21
+ Savitzky-Golay defaults to `mode="interp"` for full Python nirs4all parity and
22
+ preserves explicit methods-backed modes (`mirror`, `constant`, `nearest`,
23
+ `wrap`, `interp`) plus `cval`.
24
+
25
+ The aggregate does not implement numerical kernels. Install the optional
26
+ methods extra, or make `n4m` and `pls4all` importable, before calling it:
27
+
28
+ ```bash
29
+ python -m pip install "nirs4all-lite[methods]"
30
+ ```
31
+
32
+ The strict local parity gate compares all shared fixtures against the full
33
+ Python `nirs4all` oracle and reports max prediction/RMSE deltas on failure:
34
+
35
+ ```bash
36
+ PYTHONPATH=bindings/python/src:/path/to/nirs4all-methods/bindings/python/src \
37
+ PLS4ALL_LIB_PATH=/path/to/libn4m.so \
38
+ NIRS4ALL_LITE_REQUIRE_METHODS_PARITY=1 \
39
+ python -m unittest bindings/python/tests/test_execution_parity.py -v
40
+ ```
@@ -0,0 +1,54 @@
1
+ [build-system]
2
+ requires = ["hatchling"]
3
+ build-backend = "hatchling.build"
4
+
5
+ [project]
6
+ name = "nirs4all-lite"
7
+ version = "0.1.0"
8
+ description = "Portable nirs4all aggregate binding over formats, IO, methods, dag-ml, and dag-ml-data (datasets optional)"
9
+ readme = "README.md"
10
+ requires-python = ">=3.11"
11
+ license = "MIT"
12
+ license-files = ["LICENSE"]
13
+ authors = [{ name = "nirs4all contributors" }]
14
+ keywords = ["nirs", "spectroscopy", "chemometrics", "wasm", "bindings"]
15
+ classifiers = [
16
+ "Development Status :: 3 - Alpha",
17
+ "Intended Audience :: Science/Research",
18
+ "License :: OSI Approved :: MIT License",
19
+ "Programming Language :: Python :: 3",
20
+ "Programming Language :: Python :: 3.11",
21
+ "Programming Language :: Python :: 3.12",
22
+ "Topic :: Scientific/Engineering",
23
+ ]
24
+ dependencies = ["PyYAML>=6"]
25
+
26
+ [project.optional-dependencies]
27
+ dag-ml = ["dag-ml>=0.1.0a0"]
28
+ dag-ml-data = ["dag-ml-data>=0.1.0a0"]
29
+ formats = ["nirs4all-formats>=0.1.0"]
30
+ io = ["nirs4all-io>=0.1.0"]
31
+ # nirs4all-datasets is kept EXTERNAL/OPTIONAL (to avoid bloat): it is its own
32
+ # extra and is intentionally NOT part of the bundled `all` aggregate below.
33
+ datasets = ["nirs4all-datasets>=0.2.0a1"]
34
+ methods = ["nirs4all-methods>=0.98.0", "scikit-learn>=1.3"]
35
+ # The bundled aggregate = methods + formats + io (+ dag-ml / dag-ml-data).
36
+ # Datasets is deliberately excluded; install nirs4all-lite[datasets] (or
37
+ # [everything]) to opt in.
38
+ all = [
39
+ "dag-ml>=0.1.0a0",
40
+ "dag-ml-data>=0.1.0a0",
41
+ "nirs4all-formats>=0.1.0",
42
+ "nirs4all-io>=0.1.0",
43
+ "nirs4all-methods>=0.98.0",
44
+ "scikit-learn>=1.3",
45
+ ]
46
+ everything = ["nirs4all-lite[all,datasets]"]
47
+
48
+ [project.urls]
49
+ Homepage = "https://nirs4all.org"
50
+ Repository = "https://github.com/GBeurier/nirs4all-lite"
51
+ Issues = "https://github.com/GBeurier/nirs4all-lite/issues"
52
+
53
+ [tool.hatch.build.targets.wheel]
54
+ packages = ["src/nirs4all_lite"]
@@ -0,0 +1,48 @@
1
+ """Python surface for the nirs4all-lite aggregate distribution."""
2
+
3
+ from ._execution import PortableDataset, parse_execution_plan, run_portable_pipeline
4
+ from ._pipeline import (
5
+ PORTABLE_OPERATOR_CLASSES,
6
+ PipelineDefinition,
7
+ load_pipeline_definition,
8
+ portable_class_names,
9
+ )
10
+ from ._upstreams import (
11
+ LazyUpstream,
12
+ Upstream,
13
+ available_upstreams,
14
+ import_upstream,
15
+ require_upstream,
16
+ upstream_status,
17
+ upstreams,
18
+ )
19
+
20
# Module-level lazy proxies, one per registered upstream key. Building a
# LazyUpstream only checks that the key is registered; the upstream package
# itself is imported when an attribute is read through the proxy.
dag_ml = LazyUpstream("dag_ml")
dag_ml_data = LazyUpstream("dag_ml_data")
datasets = LazyUpstream("datasets")
formats = LazyUpstream("formats")
io = LazyUpstream("io")
methods = LazyUpstream("methods")

# Public surface of the package: the portable runner, the pipeline loader,
# the upstream registry helpers, and the lazy proxies defined above.
__all__ = [
    "LazyUpstream",
    "PORTABLE_OPERATOR_CLASSES",
    "PortableDataset",
    "PipelineDefinition",
    "Upstream",
    "available_upstreams",
    "dag_ml",
    "dag_ml_data",
    "datasets",
    "formats",
    "import_upstream",
    "io",
    "load_pipeline_definition",
    "methods",
    "parse_execution_plan",
    "portable_class_names",
    "require_upstream",
    "run_portable_pipeline",
    "upstream_status",
    "upstreams",
]
@@ -0,0 +1,291 @@
1
+ """Portable pipeline execution backed by the nirs4all-methods bindings."""
2
+
3
+ from __future__ import annotations
4
+
5
+ from dataclasses import dataclass
6
+ from typing import Any
7
+
8
+ from ._pipeline import PipelineDefinition, load_pipeline_definition
9
+
10
# Fully-qualified class paths that select the Kennard-Stone splitter.
KENNARD_STONE_CLASSES: frozenset[str] = frozenset(
    (
        "nirs4all.operators.splitters.KennardStoneSplitter",
        "nirs4all.operators.splitters.splitters.KennardStoneSplitter",
    )
)

# Fully-qualified class paths that select standard normal variate scaling.
SNV_CLASSES: frozenset[str] = frozenset(
    (
        "nirs4all.operators.transforms.SNV",
        "nirs4all.operators.transforms.StandardNormalVariate",
        "nirs4all.operators.transforms.scalers.StandardNormalVariate",
    )
)

# Fully-qualified class paths that select Savitzky-Golay filtering.
SAVGOL_CLASSES: frozenset[str] = frozenset(
    (
        "nirs4all.operators.transforms.SavitzkyGolay",
        "nirs4all.operators.transforms.nirs.SavitzkyGolay",
    )
)

# Fully-qualified class paths accepted for the single PLS model step.
PLS_CLASSES: frozenset[str] = frozenset(
    (
        "sklearn.cross_decomposition.PLSRegression",
        "sklearn.cross_decomposition._pls.PLSRegression",
    )
)

# Savitzky-Golay boundary modes. A mode's integer code is its position in
# this tuple, so the name -> code table below is derived from it.
SAVGOL_MODE_NAMES: tuple[str, ...] = ("mirror", "constant", "nearest", "wrap", "interp")
SAVGOL_MODES: dict[str, int] = {
    mode_name: mode_code for mode_code, mode_name in enumerate(SAVGOL_MODE_NAMES)
}
43
+
44
+
45
@dataclass(frozen=True)
class PortableDataset:
    """Immutable bundle of the feature matrix and target handed to the runner.

    The values are stored exactly as given; conversion to arrays happens when
    the dataset is coerced by the runner.
    """

    # Feature values, either nested (2-D) or flat.
    X: Any
    # Target values.
    y: Any
    # Matrix dimensions; the runner requires both when ``X`` is flat.
    rows: int | None = None
    cols: int | None = None
53
+
54
+
55
def run_portable_pipeline(
    source: str | dict[str, Any] | list[Any] | PipelineDefinition,
    dataset: PortableDataset | dict[str, Any],
) -> dict[str, Any]:
    """Execute a portable nirs4all pipeline through `nirs4all-methods`.

    The aggregate does not implement numerical kernels itself. This function
    translates the shared nirs4all JSON/YAML syntax to the idiomatic Python
    wrappers exposed by `nirs4all-methods` (`n4m.sklearn` and
    `pls4all.sklearn`) and returns the same result contract as the npm/WASM
    binding.

    Args:
        source: A loaded ``PipelineDefinition`` or anything accepted by
            ``load_pipeline_definition``.
        dataset: A ``PortableDataset`` or a mapping with ``X``/``y`` (and
            ``rows``/``cols`` when ``X`` is flat).

    Returns:
        A dict with ``name``, ``rows``, ``cols``, ``split``,
        ``preprocessing``, ``variants`` (one entry per ``n_components``
        value), ``selected`` (the lowest-RMSE variant) and ``targets``.

    Raises:
        ImportError: If the methods bindings cannot be imported.
        ValueError: If the pipeline is outside the executable subset or the
            ``n_components`` sweep contains no values.
        TypeError: If the dataset or pipeline steps have the wrong shape.
    """

    np, KennardStoneSplitter, SNV, SavitzkyGolay, PLSRegression = _load_methods_surface()
    definition = source if isinstance(source, PipelineDefinition) else load_pipeline_definition(source)
    input_data = _coerce_dataset(dataset, np)
    plan = parse_execution_plan(definition)

    # An empty sweep (e.g. a `_range_` whose stop is below its start) would
    # otherwise only surface as "min() arg is an empty sequence" after all the
    # splitting and preprocessing work; reject it up front with a clear message.
    component_grid = [int(value) for value in plan["nComponents"]]
    if not component_grid:
        raise ValueError(
            "Portable execution requires at least one n_components value; "
            "the configured sweep is empty."
        )

    split = _compute_split(plan["splitter"], input_data, KennardStoneSplitter, np)
    train_indices = np.asarray(split["trainIndices"], dtype=np.int64)
    test_indices = np.asarray(split["testIndices"], dtype=np.int64)
    x_train = input_data["X"][train_indices].copy()
    x_test = input_data["X"][test_indices].copy()
    y_train = input_data["y"][train_indices].copy()
    y_test = input_data["y"][test_indices].copy()

    # Each transformer is fitted on the training rows only and then applied to
    # both partitions.
    preprocessing: list[dict[str, Any]] = []
    for step in plan["preprocessing"]:
        transformer = _make_transformer(step, SNV, SavitzkyGolay)
        transformer.fit(x_train, y_train)
        x_train = transformer.transform(x_train)
        x_test = transformer.transform(x_test)
        preprocessing.append({"type": step["type"], "params": step["params"]})

    flat_targets = y_test.reshape(-1)
    variants = []
    for n_components in component_grid:
        model = PLSRegression(
            n_components=n_components,
            solver="simpls",
            center_x=True,
            scale_x=True,
            center_y=True,
            scale_y=True,
        )
        model.fit(x_train, y_train)
        predictions = np.asarray(model.predict(x_test), dtype=np.float64).reshape(-1)
        diff = predictions - flat_targets
        variants.append(
            {
                "n_components": n_components,
                "rmse": float(np.sqrt(np.mean(diff * diff))),
                "predictions": predictions.tolist(),
            }
        )

    # The first variant with the smallest RMSE wins (min() keeps the earliest tie).
    selected = min(variants, key=lambda item: item["rmse"])
    return {
        "name": definition.name,
        "rows": int(input_data["rows"]),
        "cols": int(input_data["cols"]),
        "split": split,
        "preprocessing": preprocessing,
        "variants": variants,
        "selected": selected,
        "targets": flat_targets.tolist(),
    }
121
+
122
+
123
def parse_execution_plan(
    source: str | dict[str, Any] | list[Any] | PipelineDefinition,
) -> dict[str, Any]:
    """Reduce a portable pipeline to the pieces the runner can execute.

    Returns a dict with ``splitter`` (``None`` or a Kennard-Stone
    description), ``preprocessing`` (ordered transform descriptions) and
    ``nComponents`` (the PLS component counts to evaluate).

    Raises:
        TypeError: If a step is not a mapping.
        ValueError: If a step or model class is unsupported, or the pipeline
            does not contain exactly one model step.
    """

    if isinstance(source, PipelineDefinition):
        definition = source
    else:
        definition = load_pipeline_definition(source)

    chosen_splitter: dict[str, Any] | None = None
    transforms: list[dict[str, Any]] = []
    model_entry: dict[str, Any] | None = None

    for entry in definition.pipeline:
        if not isinstance(entry, dict):
            raise TypeError("Portable pipeline steps must be mapping objects.")

        operator = entry.get("class")
        if isinstance(operator, str):
            # Operator step: splitter or preprocessing transform.
            if operator in KENNARD_STONE_CLASSES:
                # A later splitter step replaces an earlier one.
                chosen_splitter = {"type": "KennardStone", "params": dict(entry.get("params") or {})}
            elif operator in SNV_CLASSES:
                transforms.append({"type": "StandardNormalVariate", "params": []})
            elif operator in SAVGOL_CLASSES:
                transforms.append(
                    {"type": "SavitzkyGolay", "params": _savgol_params(entry.get("params") or {})}
                )
            else:
                raise ValueError(f"Portable execution does not support step class '{operator}'.")
            continue

        # Anything without a string "class" must be the model step.
        if not isinstance(entry.get("model"), dict):
            raise ValueError(f"Portable execution does not support pipeline step: {entry!r}")
        if model_entry is not None:
            raise ValueError("Portable execution supports exactly one model step.")
        model_entry = entry

    if model_entry is None:
        raise ValueError("Portable execution requires a PLSRegression model step.")

    # model_entry is only ever assigned when its "model" value is a dict.
    model_class = model_entry["model"].get("class")
    if model_class not in PLS_CLASSES:
        raise ValueError(f"Portable execution does not support model class '{model_class}'.")

    return {
        "splitter": chosen_splitter,
        "preprocessing": transforms,
        "nComponents": _component_values(model_entry),
    }
170
+
171
+
172
+ def _load_methods_surface():
173
+ try:
174
+ import numpy as np
175
+ from n4m.sklearn.preprocessing import SNV, SavitzkyGolay
176
+ from n4m.sklearn.splitters import KennardStoneSplitter
177
+ from pls4all.sklearn import PLSRegression
178
+ except ImportError as exc: # pragma: no cover - exercised by optional installs
179
+ raise ImportError(
180
+ "Portable execution requires the nirs4all-methods Python bindings. "
181
+ "Install `nirs4all-lite[methods]` or make `n4m`/`pls4all` available."
182
+ ) from exc
183
+ return np, KennardStoneSplitter, SNV, SavitzkyGolay, PLSRegression
184
+
185
+
186
+ def _coerce_dataset(dataset: PortableDataset | dict[str, Any], np):
187
+ if isinstance(dataset, PortableDataset):
188
+ raw = {"X": dataset.X, "y": dataset.y, "rows": dataset.rows, "cols": dataset.cols}
189
+ elif isinstance(dataset, dict):
190
+ raw = dataset
191
+ else:
192
+ raise TypeError("Portable execution requires a PortableDataset or mapping.")
193
+
194
+ rows = raw.get("rows", raw.get("n_samples"))
195
+ cols = raw.get("cols", raw.get("n_features"))
196
+ x = np.asarray(raw["X"], dtype=np.float64)
197
+ if x.ndim == 1:
198
+ if rows is None or cols is None:
199
+ raise TypeError("Flat X requires rows/cols or n_samples/n_features.")
200
+ x = x.reshape((int(rows), int(cols)))
201
+ elif x.ndim == 2:
202
+ rows, cols = x.shape
203
+ else:
204
+ raise TypeError("Dataset X must be a flat or 2-D numeric matrix.")
205
+
206
+ y = np.asarray(raw["y"], dtype=np.float64)
207
+ if y.ndim == 2 and y.shape[1] == 1:
208
+ y = y.reshape(-1)
209
+ elif y.ndim != 1:
210
+ raise TypeError("Portable execution currently supports a single numeric target.")
211
+
212
+ if x.shape[0] != y.shape[0]:
213
+ raise ValueError(f"X rows ({x.shape[0]}) must match y rows ({y.shape[0]}).")
214
+ return {
215
+ "X": np.ascontiguousarray(x, dtype=np.float64),
216
+ "y": np.ascontiguousarray(y, dtype=np.float64),
217
+ "rows": int(x.shape[0]),
218
+ "cols": int(x.shape[1]),
219
+ }
220
+
221
+
222
+ def _compute_split(splitter: dict[str, Any] | None, data: dict[str, Any], splitter_cls, np) -> dict[str, Any]:
223
+ if splitter is None:
224
+ indices = list(range(data["rows"]))
225
+ return {"kind": "all", "trainIndices": indices, "testIndices": indices}
226
+
227
+ split = splitter_cls(test_size=float(splitter["params"].get("test_size", 0.25)))
228
+ train, test = split.split(data["X"])
229
+ return {
230
+ "kind": "KennardStone",
231
+ "trainIndices": np.asarray(train, dtype=np.int64).astype(int).tolist(),
232
+ "testIndices": np.asarray(test, dtype=np.int64).astype(int).tolist(),
233
+ }
234
+
235
+
236
+ def _make_transformer(step: dict[str, Any], snv_cls, savgol_cls):
237
+ if step["type"] == "StandardNormalVariate":
238
+ return snv_cls()
239
+ if step["type"] == "SavitzkyGolay":
240
+ window_length, polyorder, deriv, mode, cval = step["params"]
241
+ return savgol_cls(
242
+ window_length=int(window_length),
243
+ polyorder=int(polyorder),
244
+ deriv=int(deriv),
245
+ delta=1.0,
246
+ mode=_savgol_mode_name(mode),
247
+ cval=float(cval),
248
+ )
249
+ raise ValueError(f"Unsupported portable preprocessing step: {step['type']}")
250
+
251
+
252
+ def _savgol_params(params: dict[str, Any]) -> list[float | int]:
253
+ delta = float(params.get("delta", 1.0))
254
+ if delta != 1.0:
255
+ raise ValueError("Portable Savitzky-Golay execution currently supports delta=1 only.")
256
+ return [
257
+ int(params.get("window_length", params.get("window", 11))),
258
+ int(params.get("polyorder", 3)),
259
+ int(params.get("deriv", 0)),
260
+ _savgol_mode(params.get("mode", "interp")),
261
+ float(params.get("cval", 0.0)),
262
+ ]
263
+
264
+
265
+ def _savgol_mode(value: Any) -> int:
266
+ if isinstance(value, str):
267
+ key = value.lower()
268
+ if key in SAVGOL_MODES:
269
+ return SAVGOL_MODES[key]
270
+ raise ValueError(f"Unsupported Savitzky-Golay mode: {value!r}")
271
+ mode = int(value)
272
+ if 0 <= mode < len(SAVGOL_MODE_NAMES):
273
+ return mode
274
+ raise ValueError(f"Unsupported Savitzky-Golay mode: {value!r}")
275
+
276
+
277
def _savgol_mode_name(value: Any) -> str:
    """Return the canonical mode name for a Savitzky-Golay mode name or code."""
    return SAVGOL_MODE_NAMES[_savgol_mode(value)]
280
+
281
+
282
+ def _component_values(step: dict[str, Any]) -> list[int]:
283
+ if "_range_" in step:
284
+ if step.get("param") != "n_components":
285
+ raise ValueError("Portable execution only supports _range_ sweeps over 'n_components'.")
286
+ start, stop, stride = [int(value) for value in step["_range_"]]
287
+ if stride <= 0:
288
+ raise ValueError("Invalid n_components _range_; expected [start, stop, positive_step].")
289
+ return list(range(start, stop + 1, stride))
290
+ params = step.get("model", {}).get("params", {})
291
+ return [max(1, int(params.get("n_components", 2)))]
@@ -0,0 +1,185 @@
1
+ """Pipeline definition loading and validation for nirs4all-compatible syntax."""
2
+
3
+ from __future__ import annotations
4
+
5
+ import json
6
+ from copy import deepcopy
7
+ from dataclasses import dataclass
8
+ from pathlib import Path
9
+ from typing import Any
10
+
11
+ import yaml
12
+
13
# Every fully-qualified class path a portable pipeline may reference.
PORTABLE_OPERATOR_CLASSES: frozenset[str] = frozenset(
    (
        # Splitters
        "nirs4all.operators.splitters.KennardStoneSplitter",
        "nirs4all.operators.splitters.splitters.KennardStoneSplitter",
        # Standard normal variate
        "nirs4all.operators.transforms.SNV",
        "nirs4all.operators.transforms.StandardNormalVariate",
        "nirs4all.operators.transforms.scalers.StandardNormalVariate",
        # Savitzky-Golay
        "nirs4all.operators.transforms.SavitzkyGolay",
        "nirs4all.operators.transforms.nirs.SavitzkyGolay",
        # Models
        "sklearn.cross_decomposition.PLSRegression",
        "sklearn.cross_decomposition._pls.PLSRegression",
    )
)
26
+
27
+
28
@dataclass(frozen=True)
class PipelineDefinition:
    """Validated nirs4all-style pipeline limited to the portable operators."""

    name: str
    description: str
    random_state: int | None
    pipeline: list[Any]

    def as_dict(self) -> dict[str, Any]:
        """Return a plain-dict form of the definition.

        The step list is deep-copied so the result can be mutated freely.
        ``random_state`` is only included when it is set.
        """
        payload: dict[str, Any] = dict(
            name=self.name,
            description=self.description,
            pipeline=deepcopy(self.pipeline),
        )
        if self.random_state is None:
            return payload
        payload["random_state"] = self.random_state
        return payload
46
+
47
+
48
def load_pipeline_definition(source: str | Path | dict[str, Any] | list[Any]) -> PipelineDefinition:
    """Build a validated ``PipelineDefinition`` from a path, text or object.

    ``source`` may be a ``Path``, an already-parsed dict/list (deep-copied),
    or a string. A string that looks like a file path is read from disk;
    any other string is parsed as inline JSON with a YAML fallback.

    Raises:
        FileNotFoundError: If a path source does not point to a file.
        TypeError: If the root is neither list nor mapping, or
            ``random_state`` is not an integer.
        ValueError: If the step list is missing, not a list, or uses
            operators outside the portable subset.
    """

    if isinstance(source, Path):
        raw = _parse_file(source)
    elif isinstance(source, (dict, list)):
        raw = deepcopy(source)
    else:
        file_path = _path_like_source(source)
        raw = _parse_text(source, suffix="") if file_path is None else _parse_file(file_path)

    data = _normalize_pipeline_root(raw)

    steps = data["pipeline"]
    if not isinstance(steps, list):
        raise ValueError("Pipeline definition key 'pipeline' or 'steps' must contain a list of steps.")

    cleaned_steps = _strip_comments(steps)
    _validate_portable_classes(cleaned_steps)

    seed = data.get("random_state")
    if isinstance(seed, bool):
        # bool is an int subclass; a boolean seed is discarded, not kept as 0/1.
        seed = None
    elif seed is not None and not isinstance(seed, int):
        raise TypeError("'random_state' must be an integer when provided.")

    return PipelineDefinition(
        name=str(data.get("name") or "pipeline"),
        description=str(data.get("description") or ""),
        random_state=seed,
        pipeline=cleaned_steps,
    )
83
+
84
+
85
def portable_class_names(definition: PipelineDefinition | dict[str, Any] | list[Any]) -> list[str]:
    """List every ``class`` string found in a pipeline, in traversal order.

    Accepts a ``PipelineDefinition``, a mapping with a ``pipeline`` key, or a
    bare step structure. Duplicates are kept.
    """

    if isinstance(definition, PipelineDefinition):
        root: Any = definition.pipeline
    else:
        wrapped = isinstance(definition, dict) and "pipeline" in definition
        root = definition["pipeline"] if wrapped else definition

    found: list[str] = []
    _collect_classes(root, found)
    return found
98
+
99
+
100
+ def _parse_text(text: str, *, suffix: str) -> Any:
101
+ if suffix == ".json":
102
+ return json.loads(text)
103
+ if suffix in {".yaml", ".yml"}:
104
+ return yaml.safe_load(text)
105
+
106
+ try:
107
+ return json.loads(text)
108
+ except json.JSONDecodeError:
109
+ return yaml.safe_load(text)
110
+
111
+
112
+ def _parse_file(path: Path) -> Any:
113
+ if not path.is_file():
114
+ raise FileNotFoundError(f"Configuration file does not exist: {path}")
115
+ return _parse_text(path.read_text(encoding="utf-8"), suffix=path.suffix.lower())
116
+
117
+
118
+ def _path_like_source(source: str) -> Path | None:
119
+ if "\n" in source or "\r" in source:
120
+ return None
121
+
122
+ path = Path(source)
123
+ if path.suffix.lower() in {".json", ".yaml", ".yml"}:
124
+ return path
125
+ if path.exists():
126
+ return path
127
+ return None
128
+
129
+
130
+ def _normalize_pipeline_root(data: Any) -> dict[str, Any]:
131
+ if isinstance(data, list):
132
+ return {"pipeline": data}
133
+
134
+ if not isinstance(data, dict):
135
+ raise TypeError("Pipeline definition must be a list or mapping with a 'pipeline'/'steps' key.")
136
+
137
+ normalized = deepcopy(data)
138
+ if "pipeline" not in normalized:
139
+ if "steps" not in normalized:
140
+ raise ValueError(
141
+ "Invalid pipeline definition format. Expected a list or a mapping "
142
+ "with a 'pipeline' or 'steps' key."
143
+ )
144
+ normalized["pipeline"] = normalized["steps"]
145
+ return normalized
146
+
147
+
148
+ def _strip_comments(value: Any) -> Any:
149
+ if isinstance(value, list):
150
+ return [_strip_comments(item) for item in value if not _is_comment_step(item)]
151
+ if isinstance(value, dict):
152
+ return {
153
+ key: _strip_comments(item)
154
+ for key, item in value.items()
155
+ if key != "_comment"
156
+ }
157
+ return value
158
+
159
+
160
+ def _is_comment_step(value: Any) -> bool:
161
+ return isinstance(value, dict) and set(value) == {"_comment"}
162
+
163
+
164
def _validate_portable_classes(value: Any) -> None:
    """Reject pipelines that reference classes outside the portable subset.

    Raises:
        ValueError: Listing each unsupported class once, in first-seen order.
    """
    rejected = [
        name
        for name in portable_class_names(value)
        if name not in PORTABLE_OPERATOR_CLASSES
    ]
    if not rejected:
        return

    # dict.fromkeys removes duplicates while keeping first-seen order.
    listing = ", ".join(dict.fromkeys(rejected))
    raise ValueError(
        "Pipeline uses operators outside the current nirs4all-lite portable subset: "
        + listing
    )
172
+
173
+
174
+ def _collect_classes(value: Any, output: list[str]) -> None:
175
+ if isinstance(value, list):
176
+ for item in value:
177
+ _collect_classes(item, output)
178
+ return
179
+
180
+ if isinstance(value, dict):
181
+ class_name = value.get("class")
182
+ if isinstance(class_name, str):
183
+ output.append(class_name)
184
+ for item in value.values():
185
+ _collect_classes(item, output)
@@ -0,0 +1,129 @@
1
+ """Upstream registry and lazy import helpers for nirs4all-lite."""
2
+
3
+ from __future__ import annotations
4
+
5
+ from dataclasses import dataclass
6
+ import importlib
7
+ import importlib.util
8
+ from types import ModuleType
9
+ from typing import Mapping
10
+
11
+
12
@dataclass(frozen=True)
class Upstream:
    """Immutable registry entry describing one optional upstream package."""

    # Registry key, e.g. "formats".
    key: str
    # Importable module names to try, in order of preference.
    candidates: tuple[str, ...]
    # Short human-readable description used in status output.
    role: str
17
+
18
+
19
# Registry of the optional upstream packages, keyed by each entry's own
# ``key`` so the name is written only once per entry. Insertion order is the
# order used by the status/availability helpers.
upstreams: Mapping[str, Upstream] = {
    entry.key: entry
    for entry in (
        Upstream(
            key="dag_ml",
            candidates=("dag_ml",),
            role="Leakage-safe DAG/ML execution coordinator",
        ),
        Upstream(
            key="dag_ml_data",
            candidates=("dag_ml_data",),
            role="Sample-aligned data contracts for DAG/ML runtimes",
        ),
        Upstream(
            key="formats",
            candidates=("nirs4all_formats", "nirs4all.formats"),
            role="Spectroscopy/NIRS vendor file readers",
        ),
        Upstream(
            key="io",
            candidates=("nirs4all_io", "nirs4all.io"),
            role="Dataset assembly bridge",
        ),
        Upstream(
            key="datasets",
            candidates=("nirs4all_datasets", "nirs4all.datasets"),
            role="DOI-pinned NIRS dataset catalog",
        ),
        Upstream(
            key="methods",
            candidates=("nirs4all_methods", "pls4all", "n4m", "nirs4all.methods"),
            role="Portable C ABI PLS/NIRS numerical engine",
        ),
    )
}
51
+
52
+
53
+ def _find_candidate(candidate: str) -> bool:
54
+ try:
55
+ return importlib.util.find_spec(candidate) is not None
56
+ except ModuleNotFoundError:
57
+ return False
58
+
59
+
60
def import_upstream(name: str) -> ModuleType | None:
    """Import the first locatable candidate module of a registered upstream.

    Returns ``None`` when none of the candidates can be located.

    Raises:
        KeyError: If ``name`` is not a registered upstream key.
    """

    entry = upstreams.get(name)
    if entry is None:
        raise KeyError(f"Unknown nirs4all-lite upstream: {name}")

    # The generator stops probing at the first candidate that is found.
    located = next(
        (candidate for candidate in entry.candidates if _find_candidate(candidate)),
        None,
    )
    if located is None:
        return None
    return importlib.import_module(located)
71
+
72
+
73
def require_upstream(name: str) -> ModuleType:
    """Return the imported upstream module, failing loudly when it is absent.

    Raises:
        KeyError: If ``name`` is not a registered upstream key.
        ImportError: If no candidate module is installed; the message lists
            every candidate that was tried.
    """

    module = import_upstream(name)
    if module is None:
        tried = ", ".join(upstreams[name].candidates)
        raise ImportError(
            f"nirs4all-lite upstream '{name}' is not installed. "
            f"Tried import candidates: {tried}."
        )
    return module
86
+
87
+
88
def available_upstreams() -> dict[str, bool]:
    """Return availability for every registered upstream.

    An upstream counts as available when at least one of its candidate
    modules can be located. The modules are only probed, never imported, so
    the call does not load the optional packages and does not fail when one
    of them is installed but broken. This matches ``upstream_status``.
    """

    return {
        name: any(_find_candidate(candidate) for candidate in item.candidates)
        for name, item in upstreams.items()
    }
92
+
93
+
94
def upstream_status() -> list[dict[str, object]]:
    """Build one diagnostic row per registered upstream.

    Each row holds ``key``, ``available``, ``candidate`` (the first
    locatable module name, or ``None``) and ``role``. Modules are probed
    but not imported.
    """

    rows: list[dict[str, object]] = []
    for key, entry in upstreams.items():
        located = None
        for candidate in entry.candidates:
            if _find_candidate(candidate):
                located = candidate
                break
        rows.append(
            {
                "key": key,
                "available": located is not None,
                "candidate": located,
                "role": entry.role,
            }
        )
    return rows
112
+
113
+
114
class LazyUpstream:
    """Proxy that resolves an upstream module on attribute access.

    Constructing the proxy only checks that the key is registered. The
    upstream is imported when an attribute that the proxy itself does not
    define is read.
    """

    def __init__(self, name: str) -> None:
        """Store the upstream key.

        Raises:
            KeyError: If ``name`` is not a registered upstream key.
        """
        if name not in upstreams:
            raise KeyError(f"Unknown nirs4all-lite upstream: {name}")
        self.name = name

    def module(self) -> ModuleType:
        """Import and return the upstream module (ImportError if absent)."""
        return require_upstream(self.name)

    def __getattr__(self, attribute: str) -> object:
        """Forward attributes the proxy lacks to the upstream module.

        Python only calls this after normal lookup has failed. If the missing
        attribute is ``name`` itself, the instance was created without
        ``__init__`` (as ``copy`` and ``pickle`` do before restoring state).
        Resolving the module would read ``self.name`` again and recurse until
        ``RecursionError``, so that case raises ``AttributeError`` instead.
        """
        if attribute == "name":
            raise AttributeError(attribute)
        return getattr(self.module(), attribute)

    def __repr__(self) -> str:
        return f"LazyUpstream(name={self.name!r})"
@@ -0,0 +1,80 @@
1
+ import json
2
+ import os
3
+ import unittest
4
+ from pathlib import Path
5
+
6
+ import nirs4all_lite as n4lite
7
+
8
+
9
# Directory three levels above the one that contains this (resolved) file.
ROOT = Path(__file__).resolve().parents[3]
# Fixture directory under ROOT/tests/parity, used to locate per-case JSON files.
FIXTURE_DIR = ROOT / "tests" / "parity" / "fixtures"
# JSON oracle file that the execution parity test reads its expectations from.
ORACLE_PATH = ROOT / "tests" / "parity" / "expected" / "portable_python_oracle.json"
12
+
13
+
14
+ def _max_abs_diff(actual, expected):
15
+ if len(actual) != len(expected):
16
+ raise AssertionError(f"length mismatch: {len(actual)} != {len(expected)}")
17
+ return max((abs(float(a) - float(e)) for a, e in zip(actual, expected)), default=0.0)
18
+
19
+
20
class ExecutionParityTests(unittest.TestCase):
    """Compare ``run_portable_pipeline`` results with the stored oracle file."""

    def setUp(self) -> None:
        """Skip when the optional bindings are missing, unless parity is required.

        With ``NIRS4ALL_LITE_REQUIRE_METHODS_PARITY`` set to ``"1"`` the
        original ImportError is re-raised instead of skipping.
        """
        try:
            import numpy  # noqa: F401
            import n4m  # noqa: F401
            import pls4all  # noqa: F401
        except ImportError as exc:
            parity_required = (
                os.environ.get("NIRS4ALL_LITE_REQUIRE_METHODS_PARITY") == "1"
            )
            if not parity_required:
                raise unittest.SkipTest(
                    "nirs4all-methods Python bindings are not available"
                ) from exc
            raise

    def test_python_binding_execution_matches_full_nirs4all_oracle(self) -> None:
        """Run every oracle case and check it against the recorded values."""
        oracle = json.loads(ORACLE_PATH.read_text(encoding="utf-8"))
        dataset = {
            field: oracle["dataset"][field] for field in ("X", "y", "rows", "cols")
        }
        tolerances = oracle["metadata"]["tolerances"]
        cases = oracle["cases"]

        self.assertGreaterEqual(len(cases), 4)
        for expected in cases:
            case_name = expected["name"]
            with self.subTest(case=case_name):
                fixture = FIXTURE_DIR / f"{case_name}.json"
                actual = n4lite.run_portable_pipeline(fixture, dataset)

                self.assertEqual(actual["split"], expected["split"])
                target_gap = _max_abs_diff(actual["targets"], expected["targets"])
                self.assertLessEqual(target_gap, tolerances["targets_abs"])

                got_variants = actual["variants"]
                want_variants = expected["variants"]
                self.assertEqual(len(got_variants), len(want_variants))
                for got, want in zip(got_variants, want_variants):
                    self.assertEqual(got["n_components"], want["n_components"])

                    rmse_gap = abs(got["rmse"] - want["rmse"])
                    self.assertLessEqual(
                        rmse_gap,
                        tolerances["rmse_abs"],
                        msg=f"{case_name} n_components={want['n_components']} RMSE diff",
                    )

                    prediction_gap = _max_abs_diff(
                        got["predictions"], want["predictions"]
                    )
                    self.assertLessEqual(
                        prediction_gap,
                        tolerances["predictions_abs"],
                        msg=(
                            f"{case_name} n_components={want['n_components']}"
                            " prediction diff"
                        ),
                    )

                self.assertEqual(
                    actual["selected"]["n_components"],
                    expected["selected"]["n_components"],
                )
77
+
78
+
79
# Run this module's tests when the file is executed directly as a script.
if __name__ == "__main__":
    unittest.main()
@@ -0,0 +1,110 @@
1
+ import unittest
2
+ from pathlib import Path
3
+
4
+ import nirs4all_lite as n4lite
5
+
6
+
7
# tests/parity/fixtures under the directory three levels above this file's own directory.
FIXTURE_DIR = Path(__file__).resolve().parents[3] / "tests" / "parity" / "fixtures"
8
+
9
+
10
class PipelineContractTests(unittest.TestCase):
    """Contract checks for pipeline loading and execution-plan parsing."""

    # Stem shared by the JSON and YAML reference fixtures.
    _METHODS_STEM = "portable_methods_pipeline"

    @staticmethod
    def _load_fixture(filename):
        """Load a pipeline definition from a file in the fixture directory."""
        return n4lite.load_pipeline_definition(FIXTURE_DIR / filename)

    @staticmethod
    def _savgol_plan(savgol_params):
        """Parse a two-step SavitzkyGolay + PLSRegression pipeline."""
        return n4lite.parse_execution_plan(
            {
                "pipeline": [
                    {
                        "class": "nirs4all.operators.transforms.SavitzkyGolay",
                        "params": savgol_params,
                    },
                    {
                        "model": {
                            "class": "sklearn.cross_decomposition.PLSRegression",
                            "params": {"n_components": 2},
                        }
                    },
                ]
            }
        )

    def test_all_shared_json_and_yaml_fixtures_normalize_to_same_pipeline(self) -> None:
        stems = sorted(
            json_path.stem for json_path in FIXTURE_DIR.glob("portable_*.json")
        )

        self.assertGreaterEqual(len(stems), 4)
        for name in stems:
            with self.subTest(name=name):
                from_json = self._load_fixture(f"{name}.json")
                from_yaml = self._load_fixture(f"{name}.yaml")

                self.assertEqual(from_json.as_dict(), from_yaml.as_dict())
                class_names = n4lite.portable_class_names(from_json)
                self.assertGreater(len(class_names), 0)

    def test_json_and_yaml_fixtures_normalize_to_same_pipeline(self) -> None:
        from_json = self._load_fixture(f"{self._METHODS_STEM}.json")
        from_yaml = self._load_fixture(f"{self._METHODS_STEM}.yaml")
        expected_classes = [
            "nirs4all.operators.splitters.KennardStoneSplitter",
            "nirs4all.operators.transforms.StandardNormalVariate",
            "nirs4all.operators.transforms.SavitzkyGolay",
            "sklearn.cross_decomposition.PLSRegression",
        ]

        self.assertEqual(from_json.as_dict(), from_yaml.as_dict())
        self.assertEqual(from_json.random_state, 42)
        self.assertEqual(n4lite.portable_class_names(from_json), expected_classes)

    def test_pls_sweep_uses_nirs4all_range_syntax(self) -> None:
        definition = self._load_fixture(f"{self._METHODS_STEM}.json")
        last_step = definition.pipeline[-1]

        self.assertEqual(last_step["param"], "n_components")
        self.assertEqual(last_step["_range_"], [2, 11, 2])
        self.assertEqual(
            last_step["model"]["class"],
            "sklearn.cross_decomposition.PLSRegression",
        )

    def test_savgol_default_polyorder_matches_full_python_nirs4all(self) -> None:
        plan = self._savgol_plan({"window_length": 11})

        first_step = plan["preprocessing"][0]
        self.assertEqual(first_step["params"], [11, 3, 0, 4, 0.0])

    def test_savgol_mode_and_cval_are_preserved(self) -> None:
        plan = self._savgol_plan(
            {"window_length": 11, "mode": "constant", "cval": 7.25}
        )

        first_step = plan["preprocessing"][0]
        self.assertEqual(first_step["params"], [11, 3, 0, 1, 7.25])

    def test_steps_alias_and_direct_list_match_nirs4all_loader_surface(self) -> None:
        definition = self._load_fixture(f"{self._METHODS_STEM}.json")

        from_steps = n4lite.load_pipeline_definition({"steps": definition.pipeline})
        from_list = n4lite.load_pipeline_definition(definition.pipeline)
        reloaded = (from_steps, from_list)

        # Pipelines are compared first, then names, for both reloads.
        for variant in reloaded:
            self.assertEqual(variant.pipeline, definition.pipeline)
        for variant in reloaded:
            self.assertEqual(variant.name, "pipeline")

    def test_unsupported_operator_is_rejected(self) -> None:
        unsupported = {
            "pipeline": [
                {"class": "sklearn.ensemble.RandomForestRegressor"},
            ]
        }
        with self.assertRaisesRegex(
            ValueError, "outside the current nirs4all-lite portable subset"
        ):
            n4lite.load_pipeline_definition(unsupported)
107
+
108
+
109
# Run this module's tests when the file is executed directly as a script.
if __name__ == "__main__":
    unittest.main()
@@ -0,0 +1,32 @@
1
+ import unittest
2
+
3
+ import nirs4all_lite as n4lite
4
+
5
+
6
class UpstreamRegistryTests(unittest.TestCase):
    """Checks on the upstream registry exposed by ``nirs4all_lite``."""

    def test_expected_upstream_keys_are_registered(self) -> None:
        expected_keys = ["dag_ml", "dag_ml_data", "formats", "io", "datasets", "methods"]
        registered = list(n4lite.upstreams)
        self.assertEqual(registered, expected_keys)

    def test_status_is_serializable(self) -> None:
        rows = n4lite.upstream_status()
        self.assertEqual(len(rows), 6)
        first_row = rows[0]
        for field in ("available", "role"):
            self.assertIn(field, first_row)

    def test_lazy_proxy_points_to_registered_upstream(self) -> None:
        shown = repr(n4lite.methods)
        self.assertEqual(shown, "LazyUpstream(name='methods')")

    def test_methods_candidates_include_current_python_bindings(self) -> None:
        for binding in ("nirs4all_methods", "pls4all"):
            self.assertIn(binding, n4lite.upstreams["methods"].candidates)

    def test_unknown_upstream_is_rejected(self) -> None:
        self.assertRaises(KeyError, n4lite.import_upstream, "unknown")
29
+
30
+
31
# Run this module's tests when the file is executed directly as a script.
if __name__ == "__main__":
    unittest.main()