PyPI - r-scikit-learn - Versions diffs - 0.1.1__tar.gz → 0.1.2__tar.gz - Mend

r-scikit-learn 0.1.1__tar.gz → 0.1.2__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (94) hide show

{r_scikit_learn-0.1.1 → r_scikit_learn-0.1.2}/CHANGELOG.md RENAMED Viewed

@@ -5,6 +5,18 @@ published package versions are immutable.
 ## Unreleased
+## 0.1.2 - 2026-06-24
+- Added dense brute-force `KNeighborsClassifier` with Rust-backed neighbor
+  search, class voting, `predict`, `predict_proba`, and `kneighbors`.
+- Added scikit-learn parity tests and benchmarks for nearest-neighbor
+  classification.
+- Optimized the dense Euclidean neighbor search path with blocked dot products,
+  reusable work buffers, and macOS Accelerate/CBLAS acceleration with a portable
+  `matrixmultiply` fallback.
+- Added sparse `StandardScaler(with_mean=False)` and `MaxAbsScaler` with
+  Rust-backed CSR/CSC reductions and column scaling.
 ## 0.1.1 - 2026-06-15
 - Added wheel and source-distribution installation testing across supported

{r_scikit_learn-0.1.1 → r_scikit_learn-0.1.2}/Cargo.lock RENAMED Viewed

@@ -998,9 +998,10 @@ dependencies = [
 [[package]]
 name = "r-scikit-learn-core"
-version = "0.1.1"
+version = "0.1.2"
 dependencies = [
  "faer",
+ "matrixmultiply",
  "nalgebra",
  "numpy",
  "pyo3",

{r_scikit_learn-0.1.1 → r_scikit_learn-0.1.2}/Cargo.toml RENAMED Viewed

@@ -1,6 +1,6 @@
 [package]
 name = "r-scikit-learn-core"
-version = "0.1.1"
+version = "0.1.2"
 edition = "2021"
 license = "MIT"
 description = "Rust computational core for r-scikit-learn"
@@ -29,6 +29,7 @@ crate-type = ["cdylib", "rlib"]
 [dependencies]
 faer = { version = "0.24", default-features = false, features = ["std", "rayon", "linalg"] }
+matrixmultiply = "0.3"
 nalgebra = { version = "0.34", default-features = false, features = ["std"] }
 numpy = "0.28"
 pyo3 = "0.28"

{r_scikit_learn-0.1.1 → r_scikit_learn-0.1.2}/PKG-INFO RENAMED Viewed

@@ -1,6 +1,6 @@
 Metadata-Version: 2.4
 Name: r-scikit-learn
-Version: 0.1.1
+Version: 0.1.2
 Classifier: Development Status :: 3 - Alpha
 Classifier: License :: OSI Approved :: MIT License
 Classifier: Programming Language :: Python :: 3
@@ -126,6 +126,13 @@ encoder = OneHotEncoder(handle_unknown="ignore")
 X_one_hot = encoder.fit_transform([["small"], ["large"], ["small"]])
 ```
+```python
+from rsklearn.preprocessing import MaxAbsScaler, StandardScaler
+X_sparse_scaled = StandardScaler(with_mean=False).fit_transform(X_one_hot)
+X_sparse_maxabs = MaxAbsScaler().fit_transform(X_one_hot)
+```
 ```python
 import numpy as np
 from rsklearn.impute import SimpleImputer
@@ -195,7 +202,10 @@ probabilities = classifier.predict_proba(X_test)
 - Uses float64 fitted statistics and native float32 kernels where supported.
 - Ignores NaNs while fitting, preserves them while transforming, and rejects
   infinity.
-- Supports incremental `partial_fit` for `StandardScaler` and `MinMaxScaler`.
+- Supports incremental `partial_fit` for `StandardScaler`, `MaxAbsScaler`, and
+  `MinMaxScaler`.
+- Supports CSR/CSC sparse `StandardScaler(with_mean=False)` and `MaxAbsScaler`
+  without densifying input.
 - Supports L1, L2, and max row normalization.
 - Provides quantile-based `RobustScaler` fitting and inverse transforms.
@@ -276,8 +286,6 @@ The core implemented behavior is tested and packaged across Linux, macOS, and
 Windows, but the project remains alpha software. Before a stable 1.0 release,
 the following compatibility and operational work remains:
-- Sparse-aware estimator behavior, including non-centering `StandardScaler`
-  operation. Shared CSR/CSC validation and Rust kernels are implemented.
 - `sample_weight` support for `StandardScaler.partial_fit`.
 - Comprehensive `get_feature_names_out` support and configurable output
   containers across estimators.

{r_scikit_learn-0.1.1 → r_scikit_learn-0.1.2}/README.md RENAMED Viewed

@@ -93,6 +93,13 @@ encoder = OneHotEncoder(handle_unknown="ignore")
 X_one_hot = encoder.fit_transform([["small"], ["large"], ["small"]])
 ```
+```python
+from rsklearn.preprocessing import MaxAbsScaler, StandardScaler
+X_sparse_scaled = StandardScaler(with_mean=False).fit_transform(X_one_hot)
+X_sparse_maxabs = MaxAbsScaler().fit_transform(X_one_hot)
+```
 ```python
 import numpy as np
 from rsklearn.impute import SimpleImputer
@@ -162,7 +169,10 @@ probabilities = classifier.predict_proba(X_test)
 - Uses float64 fitted statistics and native float32 kernels where supported.
 - Ignores NaNs while fitting, preserves them while transforming, and rejects
   infinity.
-- Supports incremental `partial_fit` for `StandardScaler` and `MinMaxScaler`.
+- Supports incremental `partial_fit` for `StandardScaler`, `MaxAbsScaler`, and
+  `MinMaxScaler`.
+- Supports CSR/CSC sparse `StandardScaler(with_mean=False)` and `MaxAbsScaler`
+  without densifying input.
 - Supports L1, L2, and max row normalization.
 - Provides quantile-based `RobustScaler` fitting and inverse transforms.
@@ -243,8 +253,6 @@ The core implemented behavior is tested and packaged across Linux, macOS, and
 Windows, but the project remains alpha software. Before a stable 1.0 release,
 the following compatibility and operational work remains:
-- Sparse-aware estimator behavior, including non-centering `StandardScaler`
-  operation. Shared CSR/CSC validation and Rust kernels are implemented.
 - `sample_weight` support for `StandardScaler.partial_fit`.
 - Comprehensive `get_feature_names_out` support and configurable output
   containers across estimators.

r_scikit_learn-0.1.2/benches/benchmark_neighbors.py ADDED Viewed

@@ -0,0 +1,124 @@
+"""Compare r-scikit-learn and scikit-learn nearest-neighbor performance."""
+from __future__ import annotations
+import argparse
+import statistics
+import sys
+import time
+from collections.abc import Callable
+import numpy as np
+import rsklearn.neighbors as rneighbors
+import scipy
+import sklearn
+import sklearn.neighbors as sneighbors
+from rsklearn import _core
+def measure(
+    function: Callable[[], object], repetitions: int, warmups: int
+) -> tuple[float, float]:
+    for _ in range(warmups):
+        function()
+    values = []
+    for _ in range(repetitions):
+        started = time.perf_counter()
+        function()
+        values.append(time.perf_counter() - started)
+    return statistics.mean(values), statistics.stdev(values) if repetitions > 1 else 0
+def report(
+    name: str,
+    ours: Callable[[], object],
+    theirs: Callable[[], object],
+    repetitions: int,
+    warmups: int,
+) -> None:
+    ours_mean, ours_stdev = measure(ours, repetitions, warmups)
+    theirs_mean, theirs_stdev = measure(theirs, repetitions, warmups)
+    improvement = (theirs_mean - ours_mean) / theirs_mean * 100
+    print(
+        f"{name:<32} r-scikit-learn {ours_mean:9.6f}s ± {ours_stdev:9.6f}s  "
+        f"scikit-learn {theirs_mean:9.6f}s ± {theirs_stdev:9.6f}s  "
+        f"impr. {improvement:+7.2f}%"
+    )
+def main() -> None:
+    parser = argparse.ArgumentParser()
+    parser.add_argument("--train-samples", type=int, default=20_000)
+    parser.add_argument("--query-samples", type=int, default=1_000)
+    parser.add_argument("--features", type=int, default=20)
+    parser.add_argument("--classes", type=int, default=5)
+    parser.add_argument("--neighbors", type=int, default=5)
+    parser.add_argument("--repetitions", type=int, default=5)
+    parser.add_argument("--warmups", type=int, default=2)
+    parser.add_argument(
+        "--allow-debug",
+        action="store_true",
+        help="run even when r-scikit-learn's Rust extension is a debug build",
+    )
+    args = parser.parse_args()
+    profile = _core.build_profile()
+    if profile != "release" and not args.allow_debug:
+        raise SystemExit(
+            "Refusing to benchmark a debug Rust extension. Install a release build "
+            "with `maturin develop --release`, then rerun. Pass --allow-debug only "
+            "when intentionally measuring debug code."
+        )
+    print(f"Python: {sys.executable}")
+    print(f"Rust extension: {_core.__file__} ({profile})")
+    print(
+        f"Dependencies: numpy {np.__version__}, scipy {scipy.__version__}, "
+        f"scikit-learn {sklearn.__version__}"
+    )
+    rng = np.random.default_rng(20260616)
+    X_train = rng.normal(size=(args.train_samples, args.features))
+    X_query = rng.normal(size=(args.query_samples, args.features))
+    y = rng.integers(0, args.classes, size=args.train_samples, dtype=np.int64)
+    options = {
+        "n_neighbors": args.neighbors,
+        "weights": "uniform",
+        "algorithm": "brute",
+        "metric": "euclidean",
+    }
+    print(
+        f"Train matrix: {args.train_samples:,} x {args.features:,}; "
+        f"query matrix: {args.query_samples:,} x {args.features:,}"
+    )
+    report(
+        "KNeighborsClassifier fit",
+        lambda: rneighbors.KNeighborsClassifier(**options).fit(X_train, y),
+        lambda: sneighbors.KNeighborsClassifier(**options).fit(X_train, y),
+        args.repetitions,
+        args.warmups,
+    )
+    ours = rneighbors.KNeighborsClassifier(**options).fit(X_train, y)
+    theirs = sneighbors.KNeighborsClassifier(**options).fit(X_train, y)
+    report(
+        "KNeighborsClassifier kneighbors",
+        lambda: ours.kneighbors(X_query),
+        lambda: theirs.kneighbors(X_query),
+        args.repetitions,
+        args.warmups,
+    )
+    report(
+        "KNeighborsClassifier predict",
+        lambda: ours.predict(X_query),
+        lambda: theirs.predict(X_query),
+        args.repetitions,
+        args.warmups,
+    )
+    report(
+        "KNeighborsClassifier proba",
+        lambda: ours.predict_proba(X_query),
+        lambda: theirs.predict_proba(X_query),
+        args.repetitions,
+        args.warmups,
+    )
+if __name__ == "__main__":
+    main()

{r_scikit_learn-0.1.1 → r_scikit_learn-0.1.2}/benches/benchmark_preprocessing.py RENAMED Viewed

@@ -13,6 +13,7 @@ from rsklearn.base import BaseEstimator
 from rsklearn.impute import SimpleImputer
 from rsklearn.preprocessing import (
     LabelEncoder,
+    MaxAbsScaler,
     MinMaxScaler,
     Normalizer,
     OneHotEncoder,
@@ -27,6 +28,7 @@ from sklearn.impute import SimpleImputer as ScikitSimpleImputer
 # The scikit-learn distribution intentionally exposes the `sklearn` import package.
 from sklearn.preprocessing import LabelEncoder as ScikitLabelEncoder
+from sklearn.preprocessing import MaxAbsScaler as ScikitMaxAbsScaler
 from sklearn.preprocessing import MinMaxScaler as ScikitMinMaxScaler
 from sklearn.preprocessing import Normalizer as ScikitNormalizer
 from sklearn.preprocessing import OneHotEncoder as ScikitOneHotEncoder
@@ -88,6 +90,7 @@ def benchmark_matrix(rows: int, columns: int, repetitions: int) -> None:
     )
     for name, ours, theirs in [
         ("StandardScaler", StandardScaler, ScikitStandardScaler),
+        ("MaxAbsScaler", MaxAbsScaler, ScikitMaxAbsScaler),
         ("MinMaxScaler", MinMaxScaler, ScikitMinMaxScaler),
         ("Normalizer", Normalizer, ScikitNormalizer),
         ("RobustScaler", RobustScaler, ScikitRobustScaler),
@@ -294,6 +297,34 @@ def benchmark_sparse(repetitions: int) -> None:
         scikit_scale,
         repetitions,
     )
+    ours_standard = StandardScaler(with_mean=False).fit(matrix)
+    theirs_standard = ScikitStandardScaler(with_mean=False).fit(matrix)
+    report_comparison(
+        "Sparse StandardScaler fit",
+        lambda: StandardScaler(with_mean=False).fit(matrix),
+        lambda: ScikitStandardScaler(with_mean=False).fit(matrix),
+        repetitions,
+    )
+    report_comparison(
+        "Sparse StandardScaler transform",
+        lambda: ours_standard.transform(matrix),
+        lambda: theirs_standard.transform(matrix),
+        repetitions,
+    )
+    ours_maxabs = MaxAbsScaler().fit(matrix)
+    theirs_maxabs = ScikitMaxAbsScaler().fit(matrix)
+    report_comparison(
+        "Sparse MaxAbsScaler fit",
+        lambda: MaxAbsScaler().fit(matrix),
+        lambda: ScikitMaxAbsScaler().fit(matrix),
+        repetitions,
+    )
+    report_comparison(
+        "Sparse MaxAbsScaler transform",
+        lambda: ours_maxabs.transform(matrix),
+        lambda: theirs_maxabs.transform(matrix),
+        repetitions,
+    )
 def main() -> None:

{r_scikit_learn-0.1.1 → r_scikit_learn-0.1.2}/pyproject.toml RENAMED Viewed

@@ -4,7 +4,7 @@ build-backend = "maturin"
 [project]
 name = "r-scikit-learn"
-version = "0.1.1"
+version = "0.1.2"
 description = "High-performance scikit-learn-style machine learning powered by safe Rust"
 readme = "README.md"
 requires-python = ">=3.10"

{r_scikit_learn-0.1.1 → r_scikit_learn-0.1.2}/python/rsklearn/__init__.py RENAMED Viewed

@@ -10,6 +10,7 @@ from .base import (
 from .compose import ColumnTransformer, make_column_transformer
 from .impute import SimpleImputer
 from .linear_model import ElasticNet, Lasso, LinearRegression, LogisticRegression, Ridge
+from .neighbors import KNeighborsClassifier
 from .pipeline import Pipeline, make_pipeline
 from .preprocessing import (
     LabelEncoder,
@@ -26,6 +27,7 @@ __all__ = [
     "ClassifierMixin",
     "ColumnTransformer",
     "ElasticNet",
+    "KNeighborsClassifier",
     "LabelEncoder",
     "Lasso",
     "LinearRegression",
@@ -45,4 +47,4 @@ __all__ = [
     "make_column_transformer",
     "make_pipeline",
 ]
-__version__ = "0.1.1"
+__version__ = "0.1.2"

r_scikit_learn-0.1.2/python/rsklearn/neighbors/__init__.py ADDED Viewed

@@ -0,0 +1,5 @@
+"""Nearest-neighbor estimators."""
+from ._classification import KNeighborsClassifier
+__all__ = ["KNeighborsClassifier"]

r_scikit_learn-0.1.2/python/rsklearn/neighbors/_classification.py ADDED Viewed

@@ -0,0 +1,237 @@
+"""K-nearest-neighbors classification."""
+from __future__ import annotations
+import warnings
+from typing import Any
+import numpy as np
+from numpy.typing import NDArray
+from rsklearn import _core
+from rsklearn._validation import validate_labels
+from rsklearn.base import BaseEstimator, ClassifierMixin
+from rsklearn.preprocessing import LabelEncoder
+from rsklearn.utils.validation import check_is_fitted, validate_data
+try:
+    from sklearn.exceptions import DataConversionWarning
+except ImportError:
+    DataConversionWarning = UserWarning
+class KNeighborsClassifier(ClassifierMixin, BaseEstimator):
+    """Classifier implementing dense brute-force k-nearest-neighbor voting."""
+    _rsklearn_target_tags = {"required": True}
+    def __init__(
+        self,
+        n_neighbors: int = 5,
+        *,
+        weights: str = "uniform",
+        algorithm: str = "auto",
+        leaf_size: int = 30,
+        p: int = 2,
+        metric: str = "minkowski",
+        metric_params: dict[str, Any] | None = None,
+        n_jobs: int | None = None,
+    ) -> None:
+        self.n_neighbors = n_neighbors
+        self.weights = weights
+        self.algorithm = algorithm
+        self.leaf_size = leaf_size
+        self.p = p
+        self.metric = metric
+        self.metric_params = metric_params
+        self.n_jobs = n_jobs
+    def _validate_params(self) -> None:
+        if (
+            isinstance(self.n_neighbors, (bool, np.bool_))
+            or not isinstance(self.n_neighbors, (int, np.integer))
+            or self.n_neighbors <= 0
+        ):
+            raise ValueError("n_neighbors must be a positive integer")
+        if self.weights not in ("uniform", "distance"):
+            raise NotImplementedError(
+                "KNeighborsClassifier currently supports weights='uniform' "
+                "or weights='distance'"
+            )
+        if self.algorithm not in ("auto", "brute"):
+            raise NotImplementedError(
+                "KNeighborsClassifier currently supports algorithm='auto' or 'brute'"
+            )
+        if (
+            isinstance(self.leaf_size, (bool, np.bool_))
+            or not isinstance(self.leaf_size, (int, np.integer))
+            or self.leaf_size <= 0
+        ):
+            raise ValueError("leaf_size must be a positive integer")
+        if self.metric_params not in (None, {}):
+            raise NotImplementedError("metric_params are not implemented")
+        if self.n_jobs not in (None, 1):
+            raise NotImplementedError(
+                "n_jobs parallel execution is not implemented at the Python API level"
+            )
+        self._resolve_metric()
+    def _resolve_metric(self) -> tuple[str, int]:
+        if self.metric == "euclidean":
+            if self.p not in (2, 2.0):
+                raise ValueError("p is only used with metric='minkowski'")
+            return "euclidean", 0
+        if self.metric == "manhattan":
+            if self.p not in (1, 1.0):
+                raise ValueError("p is only used with metric='minkowski'")
+            return "manhattan", 1
+        if self.metric == "minkowski":
+            if self.p in (2, 2.0):
+                return "euclidean", 0
+            if self.p in (1, 1.0):
+                return "manhattan", 1
+            raise NotImplementedError(
+                "KNeighborsClassifier currently supports Minkowski p=1 or p=2"
+            )
+        raise NotImplementedError(
+            "KNeighborsClassifier currently supports metric='minkowski', "
+            "'euclidean', or 'manhattan'"
+        )
+    def _weights_code(self) -> int:
+        return 0 if self.weights == "uniform" else 1
+    def _validate_neighbor_count(
+        self, n_neighbors: int | None, *, training: bool
+    ) -> int:
+        check_is_fitted(self, ("_fit_X", "_fit_norms", "_y_encoded", "classes_"))
+        k = self.n_neighbors if n_neighbors is None else n_neighbors
+        if (
+            isinstance(k, (bool, np.bool_))
+            or not isinstance(k, (int, np.integer))
+            or k <= 0
+        ):
+            raise ValueError("n_neighbors must be a positive integer")
+        maximum = self.n_samples_fit_ - int(training)
+        if int(k) > maximum:
+            raise ValueError(
+                f"Expected n_neighbors <= n_samples_fit, but n_neighbors = {int(k)}, "
+                f"n_samples_fit = {maximum}"
+            )
+        return int(k)
+    def _validate_X(self, X: Any) -> NDArray[np.float64]:
+        array = validate_data(
+            self,
+            X,
+            reset=False,
+            dtype=np.float64,
+            order="C",
+            ensure_all_finite=True,
+        )
+        return np.ascontiguousarray(array, dtype=np.float64)
+    def fit(self, X: Any, y: Any) -> KNeighborsClassifier:
+        """Store the training set and encoded target labels."""
+        self._validate_params()
+        if y is None:
+            raise ValueError(
+                "KNeighborsClassifier requires y to be passed, but the target y is None"
+            )
+        target = np.asarray(y)
+        if target.ndim == 2 and target.shape[1] == 1:
+            warnings.warn(
+                "A column-vector y was passed when a 1d array was expected.",
+                DataConversionWarning,
+                stacklevel=2,
+            )
+            y = target.ravel()
+        X_array, y_array = validate_data(
+            self,
+            X,
+            y,
+            reset=True,
+            dtype=np.float64,
+            order="C",
+            ensure_all_finite=True,
+        )
+        if y_array.dtype.kind in "fc" and np.any(y_array != np.floor(y_array)):
+            raise ValueError("Unknown label type: continuous")
+        validate_labels(y_array)
+        encoder = LabelEncoder()
+        labels = encoder.fit_transform(y_array)
+        self.classes_ = encoder.classes_
+        if self.classes_.size < 2:
+            raise ValueError(
+                "KNeighborsClassifier requires at least two classes; got 1 class"
+            )
+        metric_name, metric_code = self._resolve_metric()
+        self._fit_X = np.ascontiguousarray(X_array, dtype=np.float64)
+        self._y_encoded = np.ascontiguousarray(labels, dtype=np.int64)
+        self._fit_norms = (
+            _core.knn_row_norms(self._fit_X)
+            if metric_code == 0
+            else np.asarray([], dtype=np.float64)
+        )
+        self.n_samples_fit_ = self._fit_X.shape[0]
+        self.effective_metric_ = metric_name
+        self.effective_metric_params_ = (
+            {} if self.metric_params is None else dict(self.metric_params)
+        )
+        self._metric_code = metric_code
+        return self
+    def kneighbors(
+        self,
+        X: Any = None,
+        n_neighbors: int | None = None,
+        return_distance: bool = True,
+    ) -> tuple[NDArray[np.float64], NDArray[np.int64]] | NDArray[np.int64]:
+        """Return nearest-neighbor distances and indices."""
+        training_query = X is None
+        k = self._validate_neighbor_count(n_neighbors, training=training_query)
+        query = self._fit_X if training_query else self._validate_X(X)
+        distances, indices = _core.knn_kneighbors(
+            query,
+            self._fit_X,
+            self._fit_norms,
+            k,
+            self._metric_code,
+            training_query,
+        )
+        if return_distance:
+            return distances, indices
+        return indices
+    def predict_proba(self, X: Any) -> NDArray[np.float64]:
+        """Return class probabilities for query samples."""
+        check_is_fitted(self, ("_fit_X", "_fit_norms", "_y_encoded", "classes_"))
+        k = self._validate_neighbor_count(None, training=False)
+        query = self._validate_X(X)
+        return _core.knn_predict_proba(
+            query,
+            self._fit_X,
+            self._fit_norms,
+            self._y_encoded,
+            k,
+            self.classes_.size,
+            self._metric_code,
+            self._weights_code(),
+        )
+    def predict(self, X: Any) -> NDArray[Any]:
+        """Predict class labels for query samples."""
+        check_is_fitted(self, ("_fit_X", "_fit_norms", "_y_encoded", "classes_"))
+        k = self._validate_neighbor_count(None, training=False)
+        query = self._validate_X(X)
+        indices = _core.knn_predict(
+            query,
+            self._fit_X,
+            self._fit_norms,
+            self._y_encoded,
+            k,
+            self.classes_.size,
+            self._metric_code,
+            self._weights_code(),
+        )
+        return self.classes_[indices]

{r_scikit_learn-0.1.1 → r_scikit_learn-0.1.2}/python/rsklearn/preprocessing/__init__.py RENAMED Viewed

@@ -1,6 +1,7 @@
 """Preprocessing estimators."""
 from ._label_encoder import LabelEncoder
+from ._maxabs_scaler import MaxAbsScaler
 from ._minmax_scaler import MinMaxScaler
 from ._normalizer import Normalizer
 from ._one_hot_encoder import OneHotEncoder
@@ -10,6 +11,7 @@ from ._standard_scaler import StandardScaler
 __all__ = [
     "LabelEncoder",
+    "MaxAbsScaler",
     "MinMaxScaler",
     "Normalizer",
     "OneHotEncoder",

r-scikit-learn 0.1.1__tar.gz → 0.1.2__tar.gz

r-scikit-learn 0.1.1__tar.gz → 0.1.2__tar.gz