PyPI - nirs4all-lite - Versions diffs - 0.1.0__tar.gz - Mend

nirs4all-lite 0.1.0__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (13) hide show

nirs4all_lite-0.1.0/.gitignore +16 -0
nirs4all_lite-0.1.0/LICENSE +21 -0
nirs4all_lite-0.1.0/PKG-INFO +90 -0
nirs4all_lite-0.1.0/README.md +40 -0
nirs4all_lite-0.1.0/pyproject.toml +54 -0
nirs4all_lite-0.1.0/src/nirs4all_lite/__init__.py +48 -0
nirs4all_lite-0.1.0/src/nirs4all_lite/_execution.py +291 -0
nirs4all_lite-0.1.0/src/nirs4all_lite/_pipeline.py +185 -0
nirs4all_lite-0.1.0/src/nirs4all_lite/_upstreams.py +129 -0
nirs4all_lite-0.1.0/src/nirs4all_lite/py.typed +1 -0
nirs4all_lite-0.1.0/tests/test_execution_parity.py +80 -0
nirs4all_lite-0.1.0/tests/test_pipeline_contract.py +110 -0
nirs4all_lite-0.1.0/tests/test_upstreams.py +32 -0

nirs4all_lite-0.1.0/.gitignore ADDED Viewed

@@ -0,0 +1,16 @@
+.DS_Store
+*.py[cod]
+__pycache__/
+.pytest_cache/
+.mypy_cache/
+.ruff_cache/
+target/
+node_modules/
+dist/
+build/
+*.egg-info/
+*.Rcheck/
+.r-parity-lib/
+*.tar.gz
+*.mex*
+*.oct

nirs4all_lite-0.1.0/LICENSE ADDED Viewed

@@ -0,0 +1,21 @@
+MIT License
+Copyright (c) 2026 nirs4all contributors
+Permission is hereby granted, free of charge, to any person obtaining a copy
+of this software and associated documentation files (the "Software"), to deal
+in the Software without restriction, including without limitation the rights
+to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+copies of the Software, and to permit persons to whom the Software is
+furnished to do so, subject to the following conditions:
+The above copyright notice and this permission notice shall be included in all
+copies or substantial portions of the Software.
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+SOFTWARE.

nirs4all_lite-0.1.0/PKG-INFO ADDED Viewed

@@ -0,0 +1,90 @@
+Metadata-Version: 2.4
+Name: nirs4all-lite
+Version: 0.1.0
+Summary: Portable nirs4all aggregate binding over formats, IO, methods, dag-ml, and dag-ml-data (datasets optional)
+Project-URL: Homepage, https://nirs4all.org
+Project-URL: Repository, https://github.com/GBeurier/nirs4all-lite
+Project-URL: Issues, https://github.com/GBeurier/nirs4all-lite/issues
+Author: nirs4all contributors
+License-Expression: MIT
+License-File: LICENSE
+Keywords: bindings,chemometrics,nirs,spectroscopy,wasm
+Classifier: Development Status :: 3 - Alpha
+Classifier: Intended Audience :: Science/Research
+Classifier: License :: OSI Approved :: MIT License
+Classifier: Programming Language :: Python :: 3
+Classifier: Programming Language :: Python :: 3.11
+Classifier: Programming Language :: Python :: 3.12
+Classifier: Topic :: Scientific/Engineering
+Requires-Python: >=3.11
+Requires-Dist: pyyaml>=6
+Provides-Extra: all
+Requires-Dist: dag-ml-data>=0.1.0a0; extra == 'all'
+Requires-Dist: dag-ml>=0.1.0a0; extra == 'all'
+Requires-Dist: nirs4all-formats>=0.1.0; extra == 'all'
+Requires-Dist: nirs4all-io>=0.1.0; extra == 'all'
+Requires-Dist: nirs4all-methods>=0.98.0; extra == 'all'
+Requires-Dist: scikit-learn>=1.3; extra == 'all'
+Provides-Extra: dag-ml
+Requires-Dist: dag-ml>=0.1.0a0; extra == 'dag-ml'
+Provides-Extra: dag-ml-data
+Requires-Dist: dag-ml-data>=0.1.0a0; extra == 'dag-ml-data'
+Provides-Extra: datasets
+Requires-Dist: nirs4all-datasets>=0.2.0a1; extra == 'datasets'
+Provides-Extra: everything
+Requires-Dist: dag-ml-data>=0.1.0a0; extra == 'everything'
+Requires-Dist: dag-ml>=0.1.0a0; extra == 'everything'
+Requires-Dist: nirs4all-datasets>=0.2.0a1; extra == 'everything'
+Requires-Dist: nirs4all-formats>=0.1.0; extra == 'everything'
+Requires-Dist: nirs4all-io>=0.1.0; extra == 'everything'
+Requires-Dist: nirs4all-methods>=0.98.0; extra == 'everything'
+Requires-Dist: scikit-learn>=1.3; extra == 'everything'
+Provides-Extra: formats
+Requires-Dist: nirs4all-formats>=0.1.0; extra == 'formats'
+Provides-Extra: io
+Requires-Dist: nirs4all-io>=0.1.0; extra == 'io'
+Provides-Extra: methods
+Requires-Dist: nirs4all-methods>=0.98.0; extra == 'methods'
+Requires-Dist: scikit-learn>=1.3; extra == 'methods'
+Description-Content-Type: text/markdown
+# Python Binding
+Distribution name: `nirs4all-lite`
+Import name: `nirs4all_lite`
+This binding intentionally avoids the `nirs4all` import name so it can be
+installed next to the full Python `nirs4all` package during parity checks.
+## Portable Execution
+`nirs4all_lite.run_portable_pipeline(source, dataset)` executes the shared
+portable JSON/YAML subset through the `nirs4all-methods` Python bindings:
+- `KennardStoneSplitter`
+- `StandardNormalVariate` / `SNV`
+- `SavitzkyGolay`
+- `sklearn.cross_decomposition.PLSRegression`
+- `_range_` sweeps over `n_components`
+Savitzky-Golay defaults to `mode="interp"` for full Python nirs4all parity and
+preserves explicit methods-backed modes (`mirror`, `constant`, `nearest`,
+`wrap`, `interp`) plus `cval`.
+The aggregate does not implement numerical kernels. Install the optional
+methods extra, or make `n4m` and `pls4all` importable, before calling it:
+```bash
+python -m pip install "nirs4all-lite[methods]"
+```
+The strict local parity gate compares all shared fixtures against the full
+Python `nirs4all` oracle and reports max prediction/RMSE deltas on failure:
+```bash
+PYTHONPATH=bindings/python/src:/path/to/nirs4all-methods/bindings/python/src \
+PLS4ALL_LIB_PATH=/path/to/libn4m.so \
+NIRS4ALL_LITE_REQUIRE_METHODS_PARITY=1 \
+python -m unittest bindings/python/tests/test_execution_parity.py -v
+```

nirs4all_lite-0.1.0/README.md ADDED Viewed

@@ -0,0 +1,40 @@
+# Python Binding
+Distribution name: `nirs4all-lite`
+Import name: `nirs4all_lite`
+This binding intentionally avoids the `nirs4all` import name so it can be
+installed next to the full Python `nirs4all` package during parity checks.
+## Portable Execution
+`nirs4all_lite.run_portable_pipeline(source, dataset)` executes the shared
+portable JSON/YAML subset through the `nirs4all-methods` Python bindings:
+- `KennardStoneSplitter`
+- `StandardNormalVariate` / `SNV`
+- `SavitzkyGolay`
+- `sklearn.cross_decomposition.PLSRegression`
+- `_range_` sweeps over `n_components`
+Savitzky-Golay defaults to `mode="interp"` for full Python nirs4all parity and
+preserves explicit methods-backed modes (`mirror`, `constant`, `nearest`,
+`wrap`, `interp`) plus `cval`.
+The aggregate does not implement numerical kernels. Install the optional
+methods extra, or make `n4m` and `pls4all` importable, before calling it:
+```bash
+python -m pip install "nirs4all-lite[methods]"
+```
+The strict local parity gate compares all shared fixtures against the full
+Python `nirs4all` oracle and reports max prediction/RMSE deltas on failure:
+```bash
+PYTHONPATH=bindings/python/src:/path/to/nirs4all-methods/bindings/python/src \
+PLS4ALL_LIB_PATH=/path/to/libn4m.so \
+NIRS4ALL_LITE_REQUIRE_METHODS_PARITY=1 \
+python -m unittest bindings/python/tests/test_execution_parity.py -v
+```

nirs4all_lite-0.1.0/pyproject.toml ADDED Viewed

@@ -0,0 +1,54 @@
+[build-system]
+requires = ["hatchling"]
+build-backend = "hatchling.build"
+[project]
+name = "nirs4all-lite"
+version = "0.1.0"
+description = "Portable nirs4all aggregate binding over formats, IO, methods, dag-ml, and dag-ml-data (datasets optional)"
+readme = "README.md"
+requires-python = ">=3.11"
+license = "MIT"
+license-files = ["LICENSE"]
+authors = [{ name = "nirs4all contributors" }]
+keywords = ["nirs", "spectroscopy", "chemometrics", "wasm", "bindings"]
+classifiers = [
+    "Development Status :: 3 - Alpha",
+    "Intended Audience :: Science/Research",
+    "License :: OSI Approved :: MIT License",
+    "Programming Language :: Python :: 3",
+    "Programming Language :: Python :: 3.11",
+    "Programming Language :: Python :: 3.12",
+    "Topic :: Scientific/Engineering",
+]
+dependencies = ["PyYAML>=6"]
+[project.optional-dependencies]
+dag-ml = ["dag-ml>=0.1.0a0"]
+dag-ml-data = ["dag-ml-data>=0.1.0a0"]
+formats = ["nirs4all-formats>=0.1.0"]
+io = ["nirs4all-io>=0.1.0"]
+# nirs4all-datasets is kept EXTERNAL/OPTIONAL (to avoid bloat): it is its own
+# extra and is intentionally NOT part of the bundled `all` aggregate below.
+datasets = ["nirs4all-datasets>=0.2.0a1"]
+methods = ["nirs4all-methods>=0.98.0", "scikit-learn>=1.3"]
+# The bundled aggregate = methods + formats + io (+ dag-ml / dag-ml-data).
+# Datasets is deliberately excluded; install nirs4all-lite[datasets] (or
+# [everything]) to opt in.
+all = [
+    "dag-ml>=0.1.0a0",
+    "dag-ml-data>=0.1.0a0",
+    "nirs4all-formats>=0.1.0",
+    "nirs4all-io>=0.1.0",
+    "nirs4all-methods>=0.98.0",
+    "scikit-learn>=1.3",
+]
+everything = ["nirs4all-lite[all,datasets]"]
+[project.urls]
+Homepage = "https://nirs4all.org"
+Repository = "https://github.com/GBeurier/nirs4all-lite"
+Issues = "https://github.com/GBeurier/nirs4all-lite/issues"
+[tool.hatch.build.targets.wheel]
+packages = ["src/nirs4all_lite"]

nirs4all_lite-0.1.0/src/nirs4all_lite/__init__.py ADDED Viewed

@@ -0,0 +1,48 @@
+"""Python surface for the nirs4all-lite aggregate distribution."""
+from ._execution import PortableDataset, parse_execution_plan, run_portable_pipeline
+from ._pipeline import (
+    PORTABLE_OPERATOR_CLASSES,
+    PipelineDefinition,
+    load_pipeline_definition,
+    portable_class_names,
+)
+from ._upstreams import (
+    LazyUpstream,
+    Upstream,
+    available_upstreams,
+    import_upstream,
+    require_upstream,
+    upstream_status,
+    upstreams,
+)
+# Module-level proxies, one per key registered in `_upstreams.upstreams`.
+# Constructing a LazyUpstream only validates the key; the upstream module is
+# resolved when an attribute is accessed on the proxy.
+dag_ml = LazyUpstream("dag_ml")
+dag_ml_data = LazyUpstream("dag_ml_data")
+datasets = LazyUpstream("datasets")
+formats = LazyUpstream("formats")
+io = LazyUpstream("io")
+methods = LazyUpstream("methods")
+__all__ = [
+    "LazyUpstream",
+    "PORTABLE_OPERATOR_CLASSES",
+    "PortableDataset",
+    "PipelineDefinition",
+    "Upstream",
+    "available_upstreams",
+    "dag_ml",
+    "dag_ml_data",
+    "datasets",
+    "formats",
+    "import_upstream",
+    "io",
+    "load_pipeline_definition",
+    "methods",
+    "parse_execution_plan",
+    "portable_class_names",
+    "require_upstream",
+    "run_portable_pipeline",
+    "upstream_status",
+    "upstreams",
+]

nirs4all_lite-0.1.0/src/nirs4all_lite/_execution.py ADDED Viewed

@@ -0,0 +1,291 @@
+"""Portable pipeline execution backed by the nirs4all-methods bindings."""
+from __future__ import annotations
+from dataclasses import dataclass
+from typing import Any
+from ._pipeline import PipelineDefinition, load_pipeline_definition
+# Fully qualified class names, as written in pipeline definitions, that
+# parse_execution_plan maps to the Kennard-Stone splitter.
+KENNARD_STONE_CLASSES: frozenset[str] = frozenset(
+    {
+        "nirs4all.operators.splitters.KennardStoneSplitter",
+        "nirs4all.operators.splitters.splitters.KennardStoneSplitter",
+    }
+)
+# Class names mapped to the "StandardNormalVariate" preprocessing step.
+SNV_CLASSES: frozenset[str] = frozenset(
+    {
+        "nirs4all.operators.transforms.SNV",
+        "nirs4all.operators.transforms.StandardNormalVariate",
+        "nirs4all.operators.transforms.scalers.StandardNormalVariate",
+    }
+)
+# Class names mapped to the "SavitzkyGolay" preprocessing step.
+SAVGOL_CLASSES: frozenset[str] = frozenset(
+    {
+        "nirs4all.operators.transforms.SavitzkyGolay",
+        "nirs4all.operators.transforms.nirs.SavitzkyGolay",
+    }
+)
+# Model class names accepted for the single model step.
+PLS_CLASSES: frozenset[str] = frozenset(
+    {
+        "sklearn.cross_decomposition.PLSRegression",
+        "sklearn.cross_decomposition._pls.PLSRegression",
+    }
+)
+# Savitzky-Golay boundary mode name -> integer code stored in the parsed plan.
+SAVGOL_MODES: dict[str, int] = {
+    "mirror": 0,
+    "constant": 1,
+    "nearest": 2,
+    "wrap": 3,
+    "interp": 4,
+}
+# Inverse lookup: the tuple index is the integer code from SAVGOL_MODES, so the
+# two tables must list the modes in the same order.
+SAVGOL_MODE_NAMES: tuple[str, ...] = ("mirror", "constant", "nearest", "wrap", "interp")
+@dataclass(frozen=True)
+class PortableDataset:
+    """Dense matrix dataset accepted by the portable runner.
+
+    The runner converts ``X`` and ``y`` to float64 arrays. ``rows`` and
+    ``cols`` are only consulted when ``X`` is supplied as a flat sequence.
+    """
+    # Feature matrix: either 2-D, or flat together with rows/cols.
+    X: Any
+    # Single numeric target: 1-D, or 2-D with exactly one column.
+    y: Any
+    # Number of samples; required only to reshape a flat X.
+    rows: int | None = None
+    # Number of features; required only to reshape a flat X.
+    cols: int | None = None
+def run_portable_pipeline(
+    source: str | dict[str, Any] | list[Any] | PipelineDefinition,
+    dataset: PortableDataset | dict[str, Any],
+) -> dict[str, Any]:
+    """Execute a portable nirs4all pipeline through `nirs4all-methods`.
+
+    The aggregate does not implement numerical kernels itself. This function
+    translates the shared nirs4all JSON/YAML syntax to the idiomatic Python
+    wrappers exposed by `nirs4all-methods` (`n4m.sklearn` and
+    `pls4all.sklearn`) and returns the same result contract as the npm/WASM
+    binding.
+
+    Args:
+        source: A ``PipelineDefinition``, or anything accepted by
+            ``load_pipeline_definition``.
+        dataset: A ``PortableDataset`` or a mapping with ``X``/``y`` and
+            optional ``rows``/``cols`` (or ``n_samples``/``n_features``).
+
+    Returns:
+        A dict with keys ``name``, ``rows``, ``cols``, ``split``,
+        ``preprocessing``, ``variants`` (one entry per ``n_components`` value
+        holding ``n_components``, ``rmse`` and ``predictions``), ``selected``
+        (the variant with the lowest RMSE) and ``targets`` (the test-row ``y``
+        values).
+
+    Raises:
+        ImportError: If the nirs4all-methods bindings cannot be imported.
+        TypeError: If the dataset or a pipeline step has an unsupported shape.
+        ValueError: If the pipeline uses unsupported steps or parameters.
+    """
+    np, KennardStoneSplitter, SNV, SavitzkyGolay, PLSRegression = _load_methods_surface()
+    definition = source if isinstance(source, PipelineDefinition) else load_pipeline_definition(source)
+    input_data = _coerce_dataset(dataset, np)
+    plan = parse_execution_plan(definition)
+    # Without a splitter step, train and test indices both cover every row.
+    split = _compute_split(plan["splitter"], input_data, KennardStoneSplitter, np)
+    train_indices = np.asarray(split["trainIndices"], dtype=np.int64)
+    test_indices = np.asarray(split["testIndices"], dtype=np.int64)
+    x_train = input_data["X"][train_indices].copy()
+    x_test = input_data["X"][test_indices].copy()
+    y_train = input_data["y"][train_indices].copy()
+    y_test = input_data["y"][test_indices].copy()
+    preprocessing: list[dict[str, Any]] = []
+    # Each transformer is fitted on the training rows only, then applied to
+    # both partitions, in pipeline order.
+    for step in plan["preprocessing"]:
+        transformer = _make_transformer(step, SNV, SavitzkyGolay)
+        transformer.fit(x_train, y_train)
+        x_train = transformer.transform(x_train)
+        x_test = transformer.transform(x_test)
+        preprocessing.append({"type": step["type"], "params": step["params"]})
+    variants = []
+    # Fit one PLS model per requested component count and score it on the
+    # test rows.
+    for n_components in plan["nComponents"]:
+        model = PLSRegression(
+            n_components=int(n_components),
+            solver="simpls",
+            center_x=True,
+            scale_x=True,
+            center_y=True,
+            scale_y=True,
+        )
+        model.fit(x_train, y_train)
+        predictions = np.asarray(model.predict(x_test), dtype=np.float64).reshape(-1)
+        diff = predictions - y_test.reshape(-1)
+        variants.append(
+            {
+                "n_components": int(n_components),
+                "rmse": float(np.sqrt(np.mean(diff * diff))),
+                "predictions": predictions.tolist(),
+            }
+        )
+    # Lowest test RMSE wins; min() keeps the earliest variant on ties.
+    selected = min(variants, key=lambda item: item["rmse"])
+    return {
+        "name": definition.name,
+        "rows": int(input_data["rows"]),
+        "cols": int(input_data["cols"]),
+        "split": split,
+        "preprocessing": preprocessing,
+        "variants": variants,
+        "selected": selected,
+        "targets": y_test.reshape(-1).tolist(),
+    }
+def parse_execution_plan(
+    source: str | dict[str, Any] | list[Any] | PipelineDefinition,
+) -> dict[str, Any]:
+    """Parse the executable subset of the portable pipeline contract.
+
+    Args:
+        source: A ``PipelineDefinition``, or anything accepted by
+            ``load_pipeline_definition``.
+
+    Returns:
+        A dict with ``splitter`` (``None`` or a ``KennardStone`` descriptor),
+        ``preprocessing`` (ordered step descriptors) and ``nComponents`` (the
+        component counts to evaluate).
+
+    Raises:
+        TypeError: If a pipeline step is not a mapping.
+        ValueError: If a step or model class is unsupported, if more than one
+            model step is present, or if no model step is present.
+    """
+    definition = source if isinstance(source, PipelineDefinition) else load_pipeline_definition(source)
+    splitter: dict[str, Any] | None = None
+    preprocessing: list[dict[str, Any]] = []
+    model_step: dict[str, Any] | None = None
+    for step in definition.pipeline:
+        if not isinstance(step, dict):
+            raise TypeError("Portable pipeline steps must be mapping objects.")
+        # A string "class" marks a splitter or preprocessing operator; it is
+        # checked before "model", so a step carrying both is read as an operator.
+        class_name = step.get("class")
+        if isinstance(class_name, str):
+            if class_name in KENNARD_STONE_CLASSES:
+                # A later splitter step replaces an earlier one.
+                splitter = {"type": "KennardStone", "params": dict(step.get("params") or {})}
+            elif class_name in SNV_CLASSES:
+                preprocessing.append({"type": "StandardNormalVariate", "params": []})
+            elif class_name in SAVGOL_CLASSES:
+                preprocessing.append({"type": "SavitzkyGolay", "params": _savgol_params(step.get("params") or {})})
+            else:
+                raise ValueError(f"Portable execution does not support step class '{class_name}'.")
+            continue
+        # A "model" mapping marks the model step; exactly one is allowed.
+        model = step.get("model")
+        if isinstance(model, dict):
+            if model_step is not None:
+                raise ValueError("Portable execution supports exactly one model step.")
+            model_step = step
+            continue
+        raise ValueError(f"Portable execution does not support pipeline step: {step!r}")
+    if model_step is None:
+        raise ValueError("Portable execution requires a PLSRegression model step.")
+    model = model_step.get("model")
+    model_class = model.get("class") if isinstance(model, dict) else None
+    if model_class not in PLS_CLASSES:
+        raise ValueError(f"Portable execution does not support model class '{model_class}'.")
+    return {
+        "splitter": splitter,
+        "preprocessing": preprocessing,
+        "nComponents": _component_values(model_step),
+    }
+def _load_methods_surface():
+    """Import numpy and the methods wrappers needed by the portable runner.
+
+    Imports are deferred to call time so that importing this module does not
+    require the optional bindings.
+
+    Returns:
+        The tuple ``(np, KennardStoneSplitter, SNV, SavitzkyGolay, PLSRegression)``.
+
+    Raises:
+        ImportError: If any of the imports fails; the original error is chained.
+    """
+    try:
+        import numpy as np
+        from n4m.sklearn.preprocessing import SNV, SavitzkyGolay
+        from n4m.sklearn.splitters import KennardStoneSplitter
+        from pls4all.sklearn import PLSRegression
+    except ImportError as exc:  # pragma: no cover - exercised by optional installs
+        raise ImportError(
+            "Portable execution requires the nirs4all-methods Python bindings. "
+            "Install `nirs4all-lite[methods]` or make `n4m`/`pls4all` available."
+        ) from exc
+    return np, KennardStoneSplitter, SNV, SavitzkyGolay, PLSRegression
+def _coerce_dataset(dataset: PortableDataset | dict[str, Any], np):
+    if isinstance(dataset, PortableDataset):
+        raw = {"X": dataset.X, "y": dataset.y, "rows": dataset.rows, "cols": dataset.cols}
+    elif isinstance(dataset, dict):
+        raw = dataset
+    else:
+        raise TypeError("Portable execution requires a PortableDataset or mapping.")
+    rows = raw.get("rows", raw.get("n_samples"))
+    cols = raw.get("cols", raw.get("n_features"))
+    x = np.asarray(raw["X"], dtype=np.float64)
+    if x.ndim == 1:
+        if rows is None or cols is None:
+            raise TypeError("Flat X requires rows/cols or n_samples/n_features.")
+        x = x.reshape((int(rows), int(cols)))
+    elif x.ndim == 2:
+        rows, cols = x.shape
+    else:
+        raise TypeError("Dataset X must be a flat or 2-D numeric matrix.")
+    y = np.asarray(raw["y"], dtype=np.float64)
+    if y.ndim == 2 and y.shape[1] == 1:
+        y = y.reshape(-1)
+    elif y.ndim != 1:
+        raise TypeError("Portable execution currently supports a single numeric target.")
+    if x.shape[0] != y.shape[0]:
+        raise ValueError(f"X rows ({x.shape[0]}) must match y rows ({y.shape[0]}).")
+    return {
+        "X": np.ascontiguousarray(x, dtype=np.float64),
+        "y": np.ascontiguousarray(y, dtype=np.float64),
+        "rows": int(x.shape[0]),
+        "cols": int(x.shape[1]),
+    }
+def _compute_split(splitter: dict[str, Any] | None, data: dict[str, Any], splitter_cls, np) -> dict[str, Any]:
+    if splitter is None:
+        indices = list(range(data["rows"]))
+        return {"kind": "all", "trainIndices": indices, "testIndices": indices}
+    split = splitter_cls(test_size=float(splitter["params"].get("test_size", 0.25)))
+    train, test = split.split(data["X"])
+    return {
+        "kind": "KennardStone",
+        "trainIndices": np.asarray(train, dtype=np.int64).astype(int).tolist(),
+        "testIndices": np.asarray(test, dtype=np.int64).astype(int).tolist(),
+    }
+def _make_transformer(step: dict[str, Any], snv_cls, savgol_cls):
+    if step["type"] == "StandardNormalVariate":
+        return snv_cls()
+    if step["type"] == "SavitzkyGolay":
+        window_length, polyorder, deriv, mode, cval = step["params"]
+        return savgol_cls(
+            window_length=int(window_length),
+            polyorder=int(polyorder),
+            deriv=int(deriv),
+            delta=1.0,
+            mode=_savgol_mode_name(mode),
+            cval=float(cval),
+        )
+    raise ValueError(f"Unsupported portable preprocessing step: {step['type']}")
+def _savgol_params(params: dict[str, Any]) -> list[float | int]:
+    delta = float(params.get("delta", 1.0))
+    if delta != 1.0:
+        raise ValueError("Portable Savitzky-Golay execution currently supports delta=1 only.")
+    return [
+        int(params.get("window_length", params.get("window", 11))),
+        int(params.get("polyorder", 3)),
+        int(params.get("deriv", 0)),
+        _savgol_mode(params.get("mode", "interp")),
+        float(params.get("cval", 0.0)),
+    ]
+def _savgol_mode(value: Any) -> int:
+    if isinstance(value, str):
+        key = value.lower()
+        if key in SAVGOL_MODES:
+            return SAVGOL_MODES[key]
+        raise ValueError(f"Unsupported Savitzky-Golay mode: {value!r}")
+    mode = int(value)
+    if 0 <= mode < len(SAVGOL_MODE_NAMES):
+        return mode
+    raise ValueError(f"Unsupported Savitzky-Golay mode: {value!r}")
+def _savgol_mode_name(value: Any) -> str:
+    mode = _savgol_mode(value)
+    return SAVGOL_MODE_NAMES[mode]
+def _component_values(step: dict[str, Any]) -> list[int]:
+    if "_range_" in step:
+        if step.get("param") != "n_components":
+            raise ValueError("Portable execution only supports _range_ sweeps over 'n_components'.")
+        start, stop, stride = [int(value) for value in step["_range_"]]
+        if stride <= 0:
+            raise ValueError("Invalid n_components _range_; expected [start, stop, positive_step].")
+        return list(range(start, stop + 1, stride))
+    params = step.get("model", {}).get("params", {})
+    return [max(1, int(params.get("n_components", 2)))]

nirs4all_lite-0.1.0/src/nirs4all_lite/_pipeline.py ADDED Viewed

@@ -0,0 +1,185 @@
+"""Pipeline definition loading and validation for nirs4all-compatible syntax."""
+from __future__ import annotations
+import json
+from copy import deepcopy
+from dataclasses import dataclass
+from pathlib import Path
+from typing import Any
+import yaml
+# Every fully qualified class name a pipeline definition may reference;
+# load_pipeline_definition rejects any other "class" value.
+PORTABLE_OPERATOR_CLASSES: frozenset[str] = frozenset(
+    {
+        "nirs4all.operators.splitters.KennardStoneSplitter",
+        "nirs4all.operators.splitters.splitters.KennardStoneSplitter",
+        "nirs4all.operators.transforms.SNV",
+        "nirs4all.operators.transforms.StandardNormalVariate",
+        "nirs4all.operators.transforms.scalers.StandardNormalVariate",
+        "nirs4all.operators.transforms.SavitzkyGolay",
+        "nirs4all.operators.transforms.nirs.SavitzkyGolay",
+        "sklearn.cross_decomposition.PLSRegression",
+        "sklearn.cross_decomposition._pls.PLSRegression",
+    }
+)
+@dataclass(frozen=True)
+class PipelineDefinition:
+    """A nirs4all-style pipeline definition restricted to portable operators."""
+    name: str
+    description: str
+    random_state: int | None
+    pipeline: list[Any]
+    def as_dict(self) -> dict[str, Any]:
+        result: dict[str, Any] = {
+            "name": self.name,
+            "description": self.description,
+            "pipeline": deepcopy(self.pipeline),
+        }
+        if self.random_state is not None:
+            result["random_state"] = self.random_state
+        return result
+def load_pipeline_definition(source: str | Path | dict[str, Any] | list[Any]) -> PipelineDefinition:
+    """Load a nirs4all JSON/YAML pipeline definition and validate portable classes.
+
+    Args:
+        source: A mapping or list (deep-copied), a ``Path`` read from disk, or
+            a string that is either a file path or inline JSON/YAML text.
+
+    Returns:
+        The validated definition, with ``_comment`` entries stripped from the
+        steps.
+
+    Raises:
+        FileNotFoundError: If a path-like source does not point to a file.
+        TypeError: If the root is neither list nor mapping, or ``random_state``
+            is not an integer.
+        ValueError: If no ``pipeline``/``steps`` list is present or a step
+            references a class outside the portable subset.
+    """
+    if isinstance(source, (dict, list)):
+        data = deepcopy(source)
+    elif isinstance(source, Path):
+        data = _parse_file(source)
+    else:
+        # Strings are ambiguous: try them as a path first, else as inline text.
+        path = _path_like_source(source)
+        if path is not None:
+            data = _parse_file(path)
+        else:
+            data = _parse_text(source, suffix="")
+    data = _normalize_pipeline_root(data)
+    pipeline = data["pipeline"]
+    if not isinstance(pipeline, list):
+        raise ValueError("Pipeline definition key 'pipeline' or 'steps' must contain a list of steps.")
+    cleaned_pipeline = _strip_comments(pipeline)
+    _validate_portable_classes(cleaned_pipeline)
+    random_state = data.get("random_state")
+    # bool is a subclass of int; a boolean random_state is treated as unset
+    # rather than accepted as 0/1.
+    if isinstance(random_state, bool):
+        random_state = None
+    if random_state is not None and not isinstance(random_state, int):
+        raise TypeError("'random_state' must be an integer when provided.")
+    return PipelineDefinition(
+        name=str(data.get("name") or "pipeline"),
+        description=str(data.get("description") or ""),
+        random_state=random_state,
+        pipeline=cleaned_pipeline,
+    )
+def portable_class_names(definition: PipelineDefinition | dict[str, Any] | list[Any]) -> list[str]:
+    """Return class names used by a portable pipeline definition, preserving order."""
+    if isinstance(definition, PipelineDefinition):
+        root: Any = definition.pipeline
+    elif isinstance(definition, dict) and "pipeline" in definition:
+        root = definition["pipeline"]
+    else:
+        root = definition
+    classes: list[str] = []
+    _collect_classes(root, classes)
+    return classes
+def _parse_text(text: str, *, suffix: str) -> Any:
+    if suffix == ".json":
+        return json.loads(text)
+    if suffix in {".yaml", ".yml"}:
+        return yaml.safe_load(text)
+    try:
+        return json.loads(text)
+    except json.JSONDecodeError:
+        return yaml.safe_load(text)
+def _parse_file(path: Path) -> Any:
+    if not path.is_file():
+        raise FileNotFoundError(f"Configuration file does not exist: {path}")
+    return _parse_text(path.read_text(encoding="utf-8"), suffix=path.suffix.lower())
+def _path_like_source(source: str) -> Path | None:
+    if "\n" in source or "\r" in source:
+        return None
+    path = Path(source)
+    if path.suffix.lower() in {".json", ".yaml", ".yml"}:
+        return path
+    if path.exists():
+        return path
+    return None
+def _normalize_pipeline_root(data: Any) -> dict[str, Any]:
+    if isinstance(data, list):
+        return {"pipeline": data}
+    if not isinstance(data, dict):
+        raise TypeError("Pipeline definition must be a list or mapping with a 'pipeline'/'steps' key.")
+    normalized = deepcopy(data)
+    if "pipeline" not in normalized:
+        if "steps" not in normalized:
+            raise ValueError(
+                "Invalid pipeline definition format. Expected a list or a mapping "
+                "with a 'pipeline' or 'steps' key."
+            )
+        normalized["pipeline"] = normalized["steps"]
+    return normalized
+def _strip_comments(value: Any) -> Any:
+    if isinstance(value, list):
+        return [_strip_comments(item) for item in value if not _is_comment_step(item)]
+    if isinstance(value, dict):
+        return {
+            key: _strip_comments(item)
+            for key, item in value.items()
+            if key != "_comment"
+        }
+    return value
+def _is_comment_step(value: Any) -> bool:
+    return isinstance(value, dict) and set(value) == {"_comment"}
+def _validate_portable_classes(value: Any) -> None:
+    classes = portable_class_names(value)
+    unsupported = [name for name in classes if name not in PORTABLE_OPERATOR_CLASSES]
+    if unsupported:
+        raise ValueError(
+            "Pipeline uses operators outside the current nirs4all-lite portable subset: "
+            + ", ".join(dict.fromkeys(unsupported))
+        )
+def _collect_classes(value: Any, output: list[str]) -> None:
+    if isinstance(value, list):
+        for item in value:
+            _collect_classes(item, output)
+        return
+    if isinstance(value, dict):
+        class_name = value.get("class")
+        if isinstance(class_name, str):
+            output.append(class_name)
+        for item in value.values():
+            _collect_classes(item, output)

nirs4all_lite-0.1.0/src/nirs4all_lite/_upstreams.py ADDED Viewed

@@ -0,0 +1,129 @@
+"""Upstream registry and lazy import helpers for nirs4all-lite."""
+from __future__ import annotations
+from dataclasses import dataclass
+import importlib
+import importlib.util
+from types import ModuleType
+from typing import Mapping
+@dataclass(frozen=True)
+class Upstream:
+    """Registry entry describing one optional upstream package."""
+    # Registry key, matching the key used in the `upstreams` mapping.
+    key: str
+    # Importable module names, tried in order by import_upstream.
+    candidates: tuple[str, ...]
+    # Human-readable description reported by upstream_status.
+    role: str
+# Registry of known upstreams keyed by short name. For each entry the
+# candidates are probed in the listed order and the first one found is used.
+upstreams: Mapping[str, Upstream] = {
+    "dag_ml": Upstream(
+        key="dag_ml",
+        candidates=("dag_ml",),
+        role="Leakage-safe DAG/ML execution coordinator",
+    ),
+    "dag_ml_data": Upstream(
+        key="dag_ml_data",
+        candidates=("dag_ml_data",),
+        role="Sample-aligned data contracts for DAG/ML runtimes",
+    ),
+    "formats": Upstream(
+        key="formats",
+        candidates=("nirs4all_formats", "nirs4all.formats"),
+        role="Spectroscopy/NIRS vendor file readers",
+    ),
+    "io": Upstream(
+        key="io",
+        candidates=("nirs4all_io", "nirs4all.io"),
+        role="Dataset assembly bridge",
+    ),
+    "datasets": Upstream(
+        key="datasets",
+        candidates=("nirs4all_datasets", "nirs4all.datasets"),
+        role="DOI-pinned NIRS dataset catalog",
+    ),
+    "methods": Upstream(
+        key="methods",
+        candidates=("nirs4all_methods", "pls4all", "n4m", "nirs4all.methods"),
+        role="Portable C ABI PLS/NIRS numerical engine",
+    ),
+}
+def _find_candidate(candidate: str) -> bool:
+    try:
+        return importlib.util.find_spec(candidate) is not None
+    except ModuleNotFoundError:
+        return False
+def import_upstream(name: str) -> ModuleType | None:
+    """Import an upstream module if one of its known candidates is installed."""
+    item = upstreams.get(name)
+    if item is None:
+        raise KeyError(f"Unknown nirs4all-lite upstream: {name}")
+    for candidate in item.candidates:
+        if _find_candidate(candidate):
+            return importlib.import_module(candidate)
+    return None
+def require_upstream(name: str) -> ModuleType:
+    """Import an upstream module or raise a clear error."""
+    module = import_upstream(name)
+    if module is not None:
+        return module
+    item = upstreams[name]
+    candidates = ", ".join(item.candidates)
+    raise ImportError(
+        f"nirs4all-lite upstream '{name}' is not installed. "
+        f"Tried import candidates: {candidates}."
+    )
+def available_upstreams() -> dict[str, bool]:
+    """Return availability for every registered upstream."""
+    return {name: import_upstream(name) is not None for name in upstreams}
+def upstream_status() -> list[dict[str, object]]:
+    """Return a serializable status table for diagnostics."""
+    status = []
+    for name, item in upstreams.items():
+        available_candidate = next(
+            (candidate for candidate in item.candidates if _find_candidate(candidate)),
+            None,
+        )
+        status.append(
+            {
+                "key": name,
+                "available": available_candidate is not None,
+                "candidate": available_candidate,
+                "role": item.role,
+            }
+        )
+    return status
+class LazyUpstream:
+    """Proxy that resolves an upstream module on first attribute access."""
+    def __init__(self, name: str) -> None:
+        if name not in upstreams:
+            raise KeyError(f"Unknown nirs4all-lite upstream: {name}")
+        self.name = name
+    def module(self) -> ModuleType:
+        return require_upstream(self.name)
+    def __getattr__(self, attribute: str) -> object:
+        return getattr(self.module(), attribute)
+    def __repr__(self) -> str:
+        return f"LazyUpstream(name={self.name!r})"

nirs4all_lite-0.1.0/src/nirs4all_lite/py.typed ADDED Viewed

	@@ -0,0 +1 @@
1	+

nirs4all_lite-0.1.0/tests/test_execution_parity.py ADDED Viewed

@@ -0,0 +1,80 @@
+import json
+import os
+import unittest
+from pathlib import Path
+import nirs4all_lite as n4lite
+# Fourth ancestor directory of this test file (``parents[3]``).
+# NOTE(review): that is above the directory containing ``tests/`` — presumably the
+# root of a surrounding repository that holds the shared parity data; confirm the
+# paths below exist when the tests are run from an unpacked source distribution.
+ROOT = Path(__file__).resolve().parents[3]
+# Directory from which the parity test reads one ``<case name>.json`` pipeline fixture per case.
+FIXTURE_DIR = ROOT / "tests" / "parity" / "fixtures"
+# JSON oracle providing the dataset, the tolerances and the expected result of every case.
+ORACLE_PATH = ROOT / "tests" / "parity" / "expected" / "portable_python_oracle.json"
+def _max_abs_diff(actual, expected):
+    if len(actual) != len(expected):
+        raise AssertionError(f"length mismatch: {len(actual)} != {len(expected)}")
+    return max((abs(float(a) - float(e)) for a, e in zip(actual, expected)), default=0.0)
+class ExecutionParityTests(unittest.TestCase):
+    def setUp(self) -> None:
+        try:
+            import numpy  # noqa: F401
+            import n4m  # noqa: F401
+            import pls4all  # noqa: F401
+        except ImportError as exc:
+            if os.environ.get("NIRS4ALL_LITE_REQUIRE_METHODS_PARITY") == "1":
+                raise
+            raise unittest.SkipTest(
+                "nirs4all-methods Python bindings are not available"
+            ) from exc
+    def test_python_binding_execution_matches_full_nirs4all_oracle(self) -> None:
+        oracle = json.loads(ORACLE_PATH.read_text(encoding="utf-8"))
+        dataset = {
+            "X": oracle["dataset"]["X"],
+            "y": oracle["dataset"]["y"],
+            "rows": oracle["dataset"]["rows"],
+            "cols": oracle["dataset"]["cols"],
+        }
+        tolerances = oracle["metadata"]["tolerances"]
+        self.assertGreaterEqual(len(oracle["cases"]), 4)
+        for expected in oracle["cases"]:
+            with self.subTest(case=expected["name"]):
+                fixture = FIXTURE_DIR / f"{expected['name']}.json"
+                actual = n4lite.run_portable_pipeline(fixture, dataset)
+                self.assertEqual(actual["split"], expected["split"])
+                self.assertLessEqual(
+                    _max_abs_diff(actual["targets"], expected["targets"]),
+                    tolerances["targets_abs"],
+                )
+                self.assertEqual(len(actual["variants"]), len(expected["variants"]))
+                for actual_variant, expected_variant in zip(actual["variants"], expected["variants"]):
+                    self.assertEqual(actual_variant["n_components"], expected_variant["n_components"])
+                    self.assertLessEqual(
+                        abs(actual_variant["rmse"] - expected_variant["rmse"]),
+                        tolerances["rmse_abs"],
+                        msg=(
+                            f"{expected['name']} n_components="
+                            f"{expected_variant['n_components']} RMSE diff"
+                        ),
+                    )
+                    self.assertLessEqual(
+                        _max_abs_diff(actual_variant["predictions"], expected_variant["predictions"]),
+                        tolerances["predictions_abs"],
+                        msg=(
+                            f"{expected['name']} n_components="
+                            f"{expected_variant['n_components']} prediction diff"
+                        ),
+                    )
+                self.assertEqual(
+                    actual["selected"]["n_components"],
+                    expected["selected"]["n_components"],
+                )
+# Run this module's tests when the file is executed directly as a script.
+if __name__ == "__main__":
+    unittest.main()

nirs4all_lite-0.1.0/tests/test_pipeline_contract.py ADDED Viewed

@@ -0,0 +1,110 @@
+import unittest
+from pathlib import Path
+import nirs4all_lite as n4lite
+# Fixture directory, resolved relative to the fourth ancestor of this file (``parents[3]``).
+# NOTE(review): that ancestor is above the directory containing ``tests/`` — presumably
+# a surrounding repository root; confirm it exists when running from an unpacked sdist.
+FIXTURE_DIR = Path(__file__).resolve().parents[3] / "tests" / "parity" / "fixtures"
+class PipelineContractTests(unittest.TestCase):
+    def test_all_shared_json_and_yaml_fixtures_normalize_to_same_pipeline(self) -> None:
+        fixture_names = sorted(path.stem for path in FIXTURE_DIR.glob("portable_*.json"))
+        self.assertGreaterEqual(len(fixture_names), 4)
+        for name in fixture_names:
+            with self.subTest(name=name):
+                json_pipeline = n4lite.load_pipeline_definition(FIXTURE_DIR / f"{name}.json")
+                yaml_pipeline = n4lite.load_pipeline_definition(FIXTURE_DIR / f"{name}.yaml")
+                self.assertEqual(json_pipeline.as_dict(), yaml_pipeline.as_dict())
+                self.assertGreater(len(n4lite.portable_class_names(json_pipeline)), 0)
+    def test_json_and_yaml_fixtures_normalize_to_same_pipeline(self) -> None:
+        json_pipeline = n4lite.load_pipeline_definition(FIXTURE_DIR / "portable_methods_pipeline.json")
+        yaml_pipeline = n4lite.load_pipeline_definition(FIXTURE_DIR / "portable_methods_pipeline.yaml")
+        self.assertEqual(json_pipeline.as_dict(), yaml_pipeline.as_dict())
+        self.assertEqual(json_pipeline.random_state, 42)
+        self.assertEqual(
+            n4lite.portable_class_names(json_pipeline),
+            [
+                "nirs4all.operators.splitters.KennardStoneSplitter",
+                "nirs4all.operators.transforms.StandardNormalVariate",
+                "nirs4all.operators.transforms.SavitzkyGolay",
+                "sklearn.cross_decomposition.PLSRegression",
+            ],
+        )
+    def test_pls_sweep_uses_nirs4all_range_syntax(self) -> None:
+        definition = n4lite.load_pipeline_definition(FIXTURE_DIR / "portable_methods_pipeline.json")
+        sweep = definition.pipeline[-1]
+        self.assertEqual(sweep["param"], "n_components")
+        self.assertEqual(sweep["_range_"], [2, 11, 2])
+        self.assertEqual(sweep["model"]["class"], "sklearn.cross_decomposition.PLSRegression")
+    def test_savgol_default_polyorder_matches_full_python_nirs4all(self) -> None:
+        plan = n4lite.parse_execution_plan(
+            {
+                "pipeline": [
+                    {
+                        "class": "nirs4all.operators.transforms.SavitzkyGolay",
+                        "params": {"window_length": 11},
+                    },
+                    {
+                        "model": {
+                            "class": "sklearn.cross_decomposition.PLSRegression",
+                            "params": {"n_components": 2},
+                        }
+                    },
+                ]
+            }
+        )
+        self.assertEqual(plan["preprocessing"][0]["params"], [11, 3, 0, 4, 0.0])
+    def test_savgol_mode_and_cval_are_preserved(self) -> None:
+        plan = n4lite.parse_execution_plan(
+            {
+                "pipeline": [
+                    {
+                        "class": "nirs4all.operators.transforms.SavitzkyGolay",
+                        "params": {"window_length": 11, "mode": "constant", "cval": 7.25},
+                    },
+                    {
+                        "model": {
+                            "class": "sklearn.cross_decomposition.PLSRegression",
+                            "params": {"n_components": 2},
+                        }
+                    },
+                ]
+            }
+        )
+        self.assertEqual(plan["preprocessing"][0]["params"], [11, 3, 0, 1, 7.25])
+    def test_steps_alias_and_direct_list_match_nirs4all_loader_surface(self) -> None:
+        definition = n4lite.load_pipeline_definition(FIXTURE_DIR / "portable_methods_pipeline.json")
+        from_steps = n4lite.load_pipeline_definition({"steps": definition.pipeline})
+        from_list = n4lite.load_pipeline_definition(definition.pipeline)
+        self.assertEqual(from_steps.pipeline, definition.pipeline)
+        self.assertEqual(from_list.pipeline, definition.pipeline)
+        self.assertEqual(from_steps.name, "pipeline")
+        self.assertEqual(from_list.name, "pipeline")
+    def test_unsupported_operator_is_rejected(self) -> None:
+        with self.assertRaisesRegex(ValueError, "outside the current nirs4all-lite portable subset"):
+            n4lite.load_pipeline_definition(
+                {
+                    "pipeline": [
+                        {"class": "sklearn.ensemble.RandomForestRegressor"},
+                    ]
+                }
+            )
+# Run this module's tests when the file is executed directly as a script.
+if __name__ == "__main__":
+    unittest.main()

nirs4all_lite-0.1.0/tests/test_upstreams.py ADDED Viewed

@@ -0,0 +1,32 @@
+import unittest
+import nirs4all_lite as n4lite
+class UpstreamRegistryTests(unittest.TestCase):
+    def test_expected_upstream_keys_are_registered(self) -> None:
+        self.assertEqual(
+            list(n4lite.upstreams),
+            ["dag_ml", "dag_ml_data", "formats", "io", "datasets", "methods"],
+        )
+    def test_status_is_serializable(self) -> None:
+        status = n4lite.upstream_status()
+        self.assertEqual(len(status), 6)
+        self.assertIn("available", status[0])
+        self.assertIn("role", status[0])
+    def test_lazy_proxy_points_to_registered_upstream(self) -> None:
+        self.assertEqual(repr(n4lite.methods), "LazyUpstream(name='methods')")
+    def test_methods_candidates_include_current_python_bindings(self) -> None:
+        self.assertIn("nirs4all_methods", n4lite.upstreams["methods"].candidates)
+        self.assertIn("pls4all", n4lite.upstreams["methods"].candidates)
+    def test_unknown_upstream_is_rejected(self) -> None:
+        with self.assertRaises(KeyError):
+            n4lite.import_upstream("unknown")
+# Run this module's tests when the file is executed directly as a script.
+if __name__ == "__main__":
+    unittest.main()