r-scikit-learn 0.1.1__tar.gz → 0.1.2__tar.gz

This diff compares the contents of two publicly available versions of a package, each released to one of the supported registries. It is provided for informational purposes only and reflects the changes between those versions as they appear in their respective public registries.
Files changed (94)
  1. {r_scikit_learn-0.1.1 → r_scikit_learn-0.1.2}/CHANGELOG.md +12 -0
  2. {r_scikit_learn-0.1.1 → r_scikit_learn-0.1.2}/Cargo.lock +2 -1
  3. {r_scikit_learn-0.1.1 → r_scikit_learn-0.1.2}/Cargo.toml +2 -1
  4. {r_scikit_learn-0.1.1 → r_scikit_learn-0.1.2}/PKG-INFO +12 -4
  5. {r_scikit_learn-0.1.1 → r_scikit_learn-0.1.2}/README.md +11 -3
  6. r_scikit_learn-0.1.2/benches/benchmark_neighbors.py +124 -0
  7. {r_scikit_learn-0.1.1 → r_scikit_learn-0.1.2}/benches/benchmark_preprocessing.py +31 -0
  8. {r_scikit_learn-0.1.1 → r_scikit_learn-0.1.2}/pyproject.toml +1 -1
  9. {r_scikit_learn-0.1.1 → r_scikit_learn-0.1.2}/python/rsklearn/__init__.py +3 -1
  10. r_scikit_learn-0.1.2/python/rsklearn/neighbors/__init__.py +5 -0
  11. r_scikit_learn-0.1.2/python/rsklearn/neighbors/_classification.py +237 -0
  12. {r_scikit_learn-0.1.1 → r_scikit_learn-0.1.2}/python/rsklearn/preprocessing/__init__.py +2 -0
  13. r_scikit_learn-0.1.2/python/rsklearn/preprocessing/_maxabs_scaler.py +138 -0
  14. {r_scikit_learn-0.1.1 → r_scikit_learn-0.1.2}/python/rsklearn/preprocessing/_standard_scaler.py +75 -1
  15. {r_scikit_learn-0.1.1 → r_scikit_learn-0.1.2}/python/rsklearn/utils/__init__.py +8 -0
  16. {r_scikit_learn-0.1.1 → r_scikit_learn-0.1.2}/python/rsklearn/utils/sparse.py +86 -2
  17. {r_scikit_learn-0.1.1 → r_scikit_learn-0.1.2}/src/lib.rs +391 -1
  18. r_scikit_learn-0.1.2/src/maxabs_scaler.rs +86 -0
  19. r_scikit_learn-0.1.2/src/neighbors.rs +921 -0
  20. r_scikit_learn-0.1.2/src/sparse.rs +302 -0
  21. {r_scikit_learn-0.1.1 → r_scikit_learn-0.1.2}/tests/test_estimator_compliance.py +4 -0
  22. r_scikit_learn-0.1.2/tests/test_maxabs_scaler.py +60 -0
  23. r_scikit_learn-0.1.2/tests/test_neighbors.py +68 -0
  24. r_scikit_learn-0.1.2/tests/test_neighbors_parity.py +50 -0
  25. {r_scikit_learn-0.1.1 → r_scikit_learn-0.1.2}/tests/test_scikit_learn_parity.py +48 -0
  26. {r_scikit_learn-0.1.1 → r_scikit_learn-0.1.2}/tests/test_sparse_infrastructure.py +2 -2
  27. {r_scikit_learn-0.1.1 → r_scikit_learn-0.1.2}/tests/test_standard_scaler.py +41 -0
  28. r_scikit_learn-0.1.1/src/sparse.rs +0 -133
  29. {r_scikit_learn-0.1.1 → r_scikit_learn-0.1.2}/LICENSE +0 -0
  30. {r_scikit_learn-0.1.1 → r_scikit_learn-0.1.2}/benches/benchmark_linear_models.py +0 -0
  31. {r_scikit_learn-0.1.1 → r_scikit_learn-0.1.2}/benches/benchmark_metrics.py +0 -0
  32. {r_scikit_learn-0.1.1 → r_scikit_learn-0.1.2}/python/rsklearn/_validation.py +0 -0
  33. {r_scikit_learn-0.1.1 → r_scikit_learn-0.1.2}/python/rsklearn/base.py +0 -0
  34. {r_scikit_learn-0.1.1 → r_scikit_learn-0.1.2}/python/rsklearn/compose/__init__.py +0 -0
  35. {r_scikit_learn-0.1.1 → r_scikit_learn-0.1.2}/python/rsklearn/compose/_column_transformer.py +0 -0
  36. {r_scikit_learn-0.1.1 → r_scikit_learn-0.1.2}/python/rsklearn/impute/__init__.py +0 -0
  37. {r_scikit_learn-0.1.1 → r_scikit_learn-0.1.2}/python/rsklearn/impute/_simple_imputer.py +0 -0
  38. {r_scikit_learn-0.1.1 → r_scikit_learn-0.1.2}/python/rsklearn/linear_model/__init__.py +0 -0
  39. {r_scikit_learn-0.1.1 → r_scikit_learn-0.1.2}/python/rsklearn/linear_model/_base.py +0 -0
  40. {r_scikit_learn-0.1.1 → r_scikit_learn-0.1.2}/python/rsklearn/linear_model/_coordinate_descent.py +0 -0
  41. {r_scikit_learn-0.1.1 → r_scikit_learn-0.1.2}/python/rsklearn/linear_model/_least_squares.py +0 -0
  42. {r_scikit_learn-0.1.1 → r_scikit_learn-0.1.2}/python/rsklearn/linear_model/_logistic.py +0 -0
  43. {r_scikit_learn-0.1.1 → r_scikit_learn-0.1.2}/python/rsklearn/linear_model/_warnings.py +0 -0
  44. {r_scikit_learn-0.1.1 → r_scikit_learn-0.1.2}/python/rsklearn/metrics/__init__.py +0 -0
  45. {r_scikit_learn-0.1.1 → r_scikit_learn-0.1.2}/python/rsklearn/metrics/_classification.py +0 -0
  46. {r_scikit_learn-0.1.1 → r_scikit_learn-0.1.2}/python/rsklearn/metrics/_regression.py +0 -0
  47. {r_scikit_learn-0.1.1 → r_scikit_learn-0.1.2}/python/rsklearn/metrics/_validation.py +0 -0
  48. {r_scikit_learn-0.1.1 → r_scikit_learn-0.1.2}/python/rsklearn/model_selection/__init__.py +0 -0
  49. {r_scikit_learn-0.1.1 → r_scikit_learn-0.1.2}/python/rsklearn/model_selection/_split.py +0 -0
  50. {r_scikit_learn-0.1.1 → r_scikit_learn-0.1.2}/python/rsklearn/model_selection/_utils.py +0 -0
  51. {r_scikit_learn-0.1.1 → r_scikit_learn-0.1.2}/python/rsklearn/model_selection/_validation.py +0 -0
  52. {r_scikit_learn-0.1.1 → r_scikit_learn-0.1.2}/python/rsklearn/pipeline.py +0 -0
  53. {r_scikit_learn-0.1.1 → r_scikit_learn-0.1.2}/python/rsklearn/preprocessing/_base.py +0 -0
  54. {r_scikit_learn-0.1.1 → r_scikit_learn-0.1.2}/python/rsklearn/preprocessing/_categorical.py +0 -0
  55. {r_scikit_learn-0.1.1 → r_scikit_learn-0.1.2}/python/rsklearn/preprocessing/_label_encoder.py +0 -0
  56. {r_scikit_learn-0.1.1 → r_scikit_learn-0.1.2}/python/rsklearn/preprocessing/_minmax_scaler.py +0 -0
  57. {r_scikit_learn-0.1.1 → r_scikit_learn-0.1.2}/python/rsklearn/preprocessing/_normalizer.py +0 -0
  58. {r_scikit_learn-0.1.1 → r_scikit_learn-0.1.2}/python/rsklearn/preprocessing/_one_hot_encoder.py +0 -0
  59. {r_scikit_learn-0.1.1 → r_scikit_learn-0.1.2}/python/rsklearn/preprocessing/_ordinal_encoder.py +0 -0
  60. {r_scikit_learn-0.1.1 → r_scikit_learn-0.1.2}/python/rsklearn/preprocessing/_robust_scaler.py +0 -0
  61. {r_scikit_learn-0.1.1 → r_scikit_learn-0.1.2}/python/rsklearn/py.typed +0 -0
  62. {r_scikit_learn-0.1.1 → r_scikit_learn-0.1.2}/python/rsklearn/utils/validation.py +0 -0
  63. {r_scikit_learn-0.1.1 → r_scikit_learn-0.1.2}/src/categorical.rs +0 -0
  64. {r_scikit_learn-0.1.1 → r_scikit_learn-0.1.2}/src/error.rs +0 -0
  65. {r_scikit_learn-0.1.1 → r_scikit_learn-0.1.2}/src/label_encoder.rs +0 -0
  66. {r_scikit_learn-0.1.1 → r_scikit_learn-0.1.2}/src/linear_model.rs +0 -0
  67. {r_scikit_learn-0.1.1 → r_scikit_learn-0.1.2}/src/metrics.rs +0 -0
  68. {r_scikit_learn-0.1.1 → r_scikit_learn-0.1.2}/src/minmax_scaler.rs +0 -0
  69. {r_scikit_learn-0.1.1 → r_scikit_learn-0.1.2}/src/normalizer.rs +0 -0
  70. {r_scikit_learn-0.1.1 → r_scikit_learn-0.1.2}/src/robust_scaler.rs +0 -0
  71. {r_scikit_learn-0.1.1 → r_scikit_learn-0.1.2}/src/simple_imputer.rs +0 -0
  72. {r_scikit_learn-0.1.1 → r_scikit_learn-0.1.2}/src/standard_scaler.rs +0 -0
  73. {r_scikit_learn-0.1.1 → r_scikit_learn-0.1.2}/tests/release_smoke.py +0 -0
  74. {r_scikit_learn-0.1.1 → r_scikit_learn-0.1.2}/tests/test_base.py +0 -0
  75. {r_scikit_learn-0.1.1 → r_scikit_learn-0.1.2}/tests/test_categorical_infrastructure.py +0 -0
  76. {r_scikit_learn-0.1.1 → r_scikit_learn-0.1.2}/tests/test_column_transformer.py +0 -0
  77. {r_scikit_learn-0.1.1 → r_scikit_learn-0.1.2}/tests/test_column_transformer_parity.py +0 -0
  78. {r_scikit_learn-0.1.1 → r_scikit_learn-0.1.2}/tests/test_label_encoder.py +0 -0
  79. {r_scikit_learn-0.1.1 → r_scikit_learn-0.1.2}/tests/test_linear_model.py +0 -0
  80. {r_scikit_learn-0.1.1 → r_scikit_learn-0.1.2}/tests/test_linear_model_parity.py +0 -0
  81. {r_scikit_learn-0.1.1 → r_scikit_learn-0.1.2}/tests/test_metrics.py +0 -0
  82. {r_scikit_learn-0.1.1 → r_scikit_learn-0.1.2}/tests/test_metrics_parity.py +0 -0
  83. {r_scikit_learn-0.1.1 → r_scikit_learn-0.1.2}/tests/test_minmax_scaler.py +0 -0
  84. {r_scikit_learn-0.1.1 → r_scikit_learn-0.1.2}/tests/test_model_selection.py +0 -0
  85. {r_scikit_learn-0.1.1 → r_scikit_learn-0.1.2}/tests/test_model_selection_parity.py +0 -0
  86. {r_scikit_learn-0.1.1 → r_scikit_learn-0.1.2}/tests/test_normalizer.py +0 -0
  87. {r_scikit_learn-0.1.1 → r_scikit_learn-0.1.2}/tests/test_one_hot_encoder.py +0 -0
  88. {r_scikit_learn-0.1.1 → r_scikit_learn-0.1.2}/tests/test_ordinal_encoder.py +0 -0
  89. {r_scikit_learn-0.1.1 → r_scikit_learn-0.1.2}/tests/test_pipeline.py +0 -0
  90. {r_scikit_learn-0.1.1 → r_scikit_learn-0.1.2}/tests/test_pipeline_parity.py +0 -0
  91. {r_scikit_learn-0.1.1 → r_scikit_learn-0.1.2}/tests/test_public_validation.py +0 -0
  92. {r_scikit_learn-0.1.1 → r_scikit_learn-0.1.2}/tests/test_robust_scaler.py +0 -0
  93. {r_scikit_learn-0.1.1 → r_scikit_learn-0.1.2}/tests/test_simple_imputer.py +0 -0
  94. {r_scikit_learn-0.1.1 → r_scikit_learn-0.1.2}/tests/test_validation.py +0 -0
@@ -5,6 +5,18 @@ published package versions are immutable.
5
5
 
6
6
  ## Unreleased
7
7
 
8
+ ## 0.1.2 - 2026-06-24
9
+
10
+ - Added dense brute-force `KNeighborsClassifier` with Rust-backed neighbor
11
+ search, class voting, `predict`, `predict_proba`, and `kneighbors`.
12
+ - Added scikit-learn parity tests and benchmarks for nearest-neighbor
13
+ classification.
14
+ - Optimized the dense Euclidean neighbor search path with blocked dot products,
15
+ reusable work buffers, and macOS Accelerate/CBLAS acceleration with a portable
16
+ `matrixmultiply` fallback.
17
+ - Added sparse `StandardScaler(with_mean=False)` and `MaxAbsScaler` with
18
+ Rust-backed CSR/CSC reductions and column scaling.
19
+
8
20
  ## 0.1.1 - 2026-06-15
9
21
 
10
22
  - Added wheel and source-distribution installation testing across supported
@@ -998,9 +998,10 @@ dependencies = [
998
998
 
999
999
  [[package]]
1000
1000
  name = "r-scikit-learn-core"
1001
- version = "0.1.1"
1001
+ version = "0.1.2"
1002
1002
  dependencies = [
1003
1003
  "faer",
1004
+ "matrixmultiply",
1004
1005
  "nalgebra",
1005
1006
  "numpy",
1006
1007
  "pyo3",
@@ -1,6 +1,6 @@
1
1
  [package]
2
2
  name = "r-scikit-learn-core"
3
- version = "0.1.1"
3
+ version = "0.1.2"
4
4
  edition = "2021"
5
5
  license = "MIT"
6
6
  description = "Rust computational core for r-scikit-learn"
@@ -29,6 +29,7 @@ crate-type = ["cdylib", "rlib"]
29
29
 
30
30
  [dependencies]
31
31
  faer = { version = "0.24", default-features = false, features = ["std", "rayon", "linalg"] }
32
+ matrixmultiply = "0.3"
32
33
  nalgebra = { version = "0.34", default-features = false, features = ["std"] }
33
34
  numpy = "0.28"
34
35
  pyo3 = "0.28"
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.4
2
2
  Name: r-scikit-learn
3
- Version: 0.1.1
3
+ Version: 0.1.2
4
4
  Classifier: Development Status :: 3 - Alpha
5
5
  Classifier: License :: OSI Approved :: MIT License
6
6
  Classifier: Programming Language :: Python :: 3
@@ -126,6 +126,13 @@ encoder = OneHotEncoder(handle_unknown="ignore")
126
126
  X_one_hot = encoder.fit_transform([["small"], ["large"], ["small"]])
127
127
  ```
128
128
 
129
+ ```python
130
+ from rsklearn.preprocessing import MaxAbsScaler, StandardScaler
131
+
132
+ X_sparse_scaled = StandardScaler(with_mean=False).fit_transform(X_one_hot)
133
+ X_sparse_maxabs = MaxAbsScaler().fit_transform(X_one_hot)
134
+ ```
135
+
129
136
  ```python
130
137
  import numpy as np
131
138
  from rsklearn.impute import SimpleImputer
@@ -195,7 +202,10 @@ probabilities = classifier.predict_proba(X_test)
195
202
  - Uses float64 fitted statistics and native float32 kernels where supported.
196
203
  - Ignores NaNs while fitting, preserves them while transforming, and rejects
197
204
  infinity.
198
- - Supports incremental `partial_fit` for `StandardScaler` and `MinMaxScaler`.
205
+ - Supports incremental `partial_fit` for `StandardScaler`, `MaxAbsScaler`, and
206
+ `MinMaxScaler`.
207
+ - Supports CSR/CSC sparse `StandardScaler(with_mean=False)` and `MaxAbsScaler`
208
+ without densifying input.
199
209
  - Supports L1, L2, and max row normalization.
200
210
  - Provides quantile-based `RobustScaler` fitting and inverse transforms.
201
211
 
@@ -276,8 +286,6 @@ The core implemented behavior is tested and packaged across Linux, macOS, and
276
286
  Windows, but the project remains alpha software. Before a stable 1.0 release,
277
287
  the following compatibility and operational work remains:
278
288
 
279
- - Sparse-aware estimator behavior, including non-centering `StandardScaler`
280
- operation. Shared CSR/CSC validation and Rust kernels are implemented.
281
289
  - `sample_weight` support for `StandardScaler.partial_fit`.
282
290
  - Comprehensive `get_feature_names_out` support and configurable output
283
291
  containers across estimators.
@@ -93,6 +93,13 @@ encoder = OneHotEncoder(handle_unknown="ignore")
93
93
  X_one_hot = encoder.fit_transform([["small"], ["large"], ["small"]])
94
94
  ```
95
95
 
96
+ ```python
97
+ from rsklearn.preprocessing import MaxAbsScaler, StandardScaler
98
+
99
+ X_sparse_scaled = StandardScaler(with_mean=False).fit_transform(X_one_hot)
100
+ X_sparse_maxabs = MaxAbsScaler().fit_transform(X_one_hot)
101
+ ```
102
+
96
103
  ```python
97
104
  import numpy as np
98
105
  from rsklearn.impute import SimpleImputer
@@ -162,7 +169,10 @@ probabilities = classifier.predict_proba(X_test)
162
169
  - Uses float64 fitted statistics and native float32 kernels where supported.
163
170
  - Ignores NaNs while fitting, preserves them while transforming, and rejects
164
171
  infinity.
165
- - Supports incremental `partial_fit` for `StandardScaler` and `MinMaxScaler`.
172
+ - Supports incremental `partial_fit` for `StandardScaler`, `MaxAbsScaler`, and
173
+ `MinMaxScaler`.
174
+ - Supports CSR/CSC sparse `StandardScaler(with_mean=False)` and `MaxAbsScaler`
175
+ without densifying input.
166
176
  - Supports L1, L2, and max row normalization.
167
177
  - Provides quantile-based `RobustScaler` fitting and inverse transforms.
168
178
 
@@ -243,8 +253,6 @@ The core implemented behavior is tested and packaged across Linux, macOS, and
243
253
  Windows, but the project remains alpha software. Before a stable 1.0 release,
244
254
  the following compatibility and operational work remains:
245
255
 
246
- - Sparse-aware estimator behavior, including non-centering `StandardScaler`
247
- operation. Shared CSR/CSC validation and Rust kernels are implemented.
248
256
  - `sample_weight` support for `StandardScaler.partial_fit`.
249
257
  - Comprehensive `get_feature_names_out` support and configurable output
250
258
  containers across estimators.
@@ -0,0 +1,124 @@
1
+ """Compare r-scikit-learn and scikit-learn nearest-neighbor performance."""
2
+
3
+ from __future__ import annotations
4
+
5
+ import argparse
6
+ import statistics
7
+ import sys
8
+ import time
9
+ from collections.abc import Callable
10
+
11
+ import numpy as np
12
+ import rsklearn.neighbors as rneighbors
13
+ import scipy
14
+ import sklearn
15
+ import sklearn.neighbors as sneighbors
16
+ from rsklearn import _core
17
+
18
+
19
+ def measure(
20
+ function: Callable[[], object], repetitions: int, warmups: int
21
+ ) -> tuple[float, float]:
22
+ for _ in range(warmups):
23
+ function()
24
+ values = []
25
+ for _ in range(repetitions):
26
+ started = time.perf_counter()
27
+ function()
28
+ values.append(time.perf_counter() - started)
29
+ return statistics.mean(values), statistics.stdev(values) if repetitions > 1 else 0
30
+
31
+
32
+ def report(
33
+ name: str,
34
+ ours: Callable[[], object],
35
+ theirs: Callable[[], object],
36
+ repetitions: int,
37
+ warmups: int,
38
+ ) -> None:
39
+ ours_mean, ours_stdev = measure(ours, repetitions, warmups)
40
+ theirs_mean, theirs_stdev = measure(theirs, repetitions, warmups)
41
+ improvement = (theirs_mean - ours_mean) / theirs_mean * 100
42
+ print(
43
+ f"{name:<32} r-scikit-learn {ours_mean:9.6f}s ± {ours_stdev:9.6f}s "
44
+ f"scikit-learn {theirs_mean:9.6f}s ± {theirs_stdev:9.6f}s "
45
+ f"impr. {improvement:+7.2f}%"
46
+ )
47
+
48
+
49
+ def main() -> None:
50
+ parser = argparse.ArgumentParser()
51
+ parser.add_argument("--train-samples", type=int, default=20_000)
52
+ parser.add_argument("--query-samples", type=int, default=1_000)
53
+ parser.add_argument("--features", type=int, default=20)
54
+ parser.add_argument("--classes", type=int, default=5)
55
+ parser.add_argument("--neighbors", type=int, default=5)
56
+ parser.add_argument("--repetitions", type=int, default=5)
57
+ parser.add_argument("--warmups", type=int, default=2)
58
+ parser.add_argument(
59
+ "--allow-debug",
60
+ action="store_true",
61
+ help="run even when r-scikit-learn's Rust extension is a debug build",
62
+ )
63
+ args = parser.parse_args()
64
+ profile = _core.build_profile()
65
+ if profile != "release" and not args.allow_debug:
66
+ raise SystemExit(
67
+ "Refusing to benchmark a debug Rust extension. Install a release build "
68
+ "with `maturin develop --release`, then rerun. Pass --allow-debug only "
69
+ "when intentionally measuring debug code."
70
+ )
71
+ print(f"Python: {sys.executable}")
72
+ print(f"Rust extension: {_core.__file__} ({profile})")
73
+ print(
74
+ f"Dependencies: numpy {np.__version__}, scipy {scipy.__version__}, "
75
+ f"scikit-learn {sklearn.__version__}"
76
+ )
77
+ rng = np.random.default_rng(20260616)
78
+ X_train = rng.normal(size=(args.train_samples, args.features))
79
+ X_query = rng.normal(size=(args.query_samples, args.features))
80
+ y = rng.integers(0, args.classes, size=args.train_samples, dtype=np.int64)
81
+ options = {
82
+ "n_neighbors": args.neighbors,
83
+ "weights": "uniform",
84
+ "algorithm": "brute",
85
+ "metric": "euclidean",
86
+ }
87
+ print(
88
+ f"Train matrix: {args.train_samples:,} x {args.features:,}; "
89
+ f"query matrix: {args.query_samples:,} x {args.features:,}"
90
+ )
91
+ report(
92
+ "KNeighborsClassifier fit",
93
+ lambda: rneighbors.KNeighborsClassifier(**options).fit(X_train, y),
94
+ lambda: sneighbors.KNeighborsClassifier(**options).fit(X_train, y),
95
+ args.repetitions,
96
+ args.warmups,
97
+ )
98
+ ours = rneighbors.KNeighborsClassifier(**options).fit(X_train, y)
99
+ theirs = sneighbors.KNeighborsClassifier(**options).fit(X_train, y)
100
+ report(
101
+ "KNeighborsClassifier kneighbors",
102
+ lambda: ours.kneighbors(X_query),
103
+ lambda: theirs.kneighbors(X_query),
104
+ args.repetitions,
105
+ args.warmups,
106
+ )
107
+ report(
108
+ "KNeighborsClassifier predict",
109
+ lambda: ours.predict(X_query),
110
+ lambda: theirs.predict(X_query),
111
+ args.repetitions,
112
+ args.warmups,
113
+ )
114
+ report(
115
+ "KNeighborsClassifier proba",
116
+ lambda: ours.predict_proba(X_query),
117
+ lambda: theirs.predict_proba(X_query),
118
+ args.repetitions,
119
+ args.warmups,
120
+ )
121
+
122
+
123
+ if __name__ == "__main__":
124
+ main()
@@ -13,6 +13,7 @@ from rsklearn.base import BaseEstimator
13
13
  from rsklearn.impute import SimpleImputer
14
14
  from rsklearn.preprocessing import (
15
15
  LabelEncoder,
16
+ MaxAbsScaler,
16
17
  MinMaxScaler,
17
18
  Normalizer,
18
19
  OneHotEncoder,
@@ -27,6 +28,7 @@ from sklearn.impute import SimpleImputer as ScikitSimpleImputer
27
28
 
28
29
  # The scikit-learn distribution intentionally exposes the `sklearn` import package.
29
30
  from sklearn.preprocessing import LabelEncoder as ScikitLabelEncoder
31
+ from sklearn.preprocessing import MaxAbsScaler as ScikitMaxAbsScaler
30
32
  from sklearn.preprocessing import MinMaxScaler as ScikitMinMaxScaler
31
33
  from sklearn.preprocessing import Normalizer as ScikitNormalizer
32
34
  from sklearn.preprocessing import OneHotEncoder as ScikitOneHotEncoder
@@ -88,6 +90,7 @@ def benchmark_matrix(rows: int, columns: int, repetitions: int) -> None:
88
90
  )
89
91
  for name, ours, theirs in [
90
92
  ("StandardScaler", StandardScaler, ScikitStandardScaler),
93
+ ("MaxAbsScaler", MaxAbsScaler, ScikitMaxAbsScaler),
91
94
  ("MinMaxScaler", MinMaxScaler, ScikitMinMaxScaler),
92
95
  ("Normalizer", Normalizer, ScikitNormalizer),
93
96
  ("RobustScaler", RobustScaler, ScikitRobustScaler),
@@ -294,6 +297,34 @@ def benchmark_sparse(repetitions: int) -> None:
294
297
  scikit_scale,
295
298
  repetitions,
296
299
  )
300
+ ours_standard = StandardScaler(with_mean=False).fit(matrix)
301
+ theirs_standard = ScikitStandardScaler(with_mean=False).fit(matrix)
302
+ report_comparison(
303
+ "Sparse StandardScaler fit",
304
+ lambda: StandardScaler(with_mean=False).fit(matrix),
305
+ lambda: ScikitStandardScaler(with_mean=False).fit(matrix),
306
+ repetitions,
307
+ )
308
+ report_comparison(
309
+ "Sparse StandardScaler transform",
310
+ lambda: ours_standard.transform(matrix),
311
+ lambda: theirs_standard.transform(matrix),
312
+ repetitions,
313
+ )
314
+ ours_maxabs = MaxAbsScaler().fit(matrix)
315
+ theirs_maxabs = ScikitMaxAbsScaler().fit(matrix)
316
+ report_comparison(
317
+ "Sparse MaxAbsScaler fit",
318
+ lambda: MaxAbsScaler().fit(matrix),
319
+ lambda: ScikitMaxAbsScaler().fit(matrix),
320
+ repetitions,
321
+ )
322
+ report_comparison(
323
+ "Sparse MaxAbsScaler transform",
324
+ lambda: ours_maxabs.transform(matrix),
325
+ lambda: theirs_maxabs.transform(matrix),
326
+ repetitions,
327
+ )
297
328
 
298
329
 
299
330
  def main() -> None:
@@ -4,7 +4,7 @@ build-backend = "maturin"
4
4
 
5
5
  [project]
6
6
  name = "r-scikit-learn"
7
- version = "0.1.1"
7
+ version = "0.1.2"
8
8
  description = "High-performance scikit-learn-style machine learning powered by safe Rust"
9
9
  readme = "README.md"
10
10
  requires-python = ">=3.10"
@@ -10,6 +10,7 @@ from .base import (
10
10
  from .compose import ColumnTransformer, make_column_transformer
11
11
  from .impute import SimpleImputer
12
12
  from .linear_model import ElasticNet, Lasso, LinearRegression, LogisticRegression, Ridge
13
+ from .neighbors import KNeighborsClassifier
13
14
  from .pipeline import Pipeline, make_pipeline
14
15
  from .preprocessing import (
15
16
  LabelEncoder,
@@ -26,6 +27,7 @@ __all__ = [
26
27
  "ClassifierMixin",
27
28
  "ColumnTransformer",
28
29
  "ElasticNet",
30
+ "KNeighborsClassifier",
29
31
  "LabelEncoder",
30
32
  "Lasso",
31
33
  "LinearRegression",
@@ -45,4 +47,4 @@ __all__ = [
45
47
  "make_column_transformer",
46
48
  "make_pipeline",
47
49
  ]
48
- __version__ = "0.1.1"
50
+ __version__ = "0.1.2"
@@ -0,0 +1,5 @@
1
+ """Nearest-neighbor estimators."""
2
+
3
+ from ._classification import KNeighborsClassifier
4
+
5
+ __all__ = ["KNeighborsClassifier"]
@@ -0,0 +1,237 @@
1
+ """K-nearest-neighbors classification."""
2
+
3
+ from __future__ import annotations
4
+
5
+ import warnings
6
+ from typing import Any
7
+
8
+ import numpy as np
9
+ from numpy.typing import NDArray
10
+
11
+ from rsklearn import _core
12
+ from rsklearn._validation import validate_labels
13
+ from rsklearn.base import BaseEstimator, ClassifierMixin
14
+ from rsklearn.preprocessing import LabelEncoder
15
+ from rsklearn.utils.validation import check_is_fitted, validate_data
16
+
17
+ try:
18
+ from sklearn.exceptions import DataConversionWarning
19
+ except ImportError:
20
+ DataConversionWarning = UserWarning
21
+
22
+
23
+ class KNeighborsClassifier(ClassifierMixin, BaseEstimator):
24
+ """Classifier implementing dense brute-force k-nearest-neighbor voting."""
25
+
26
+ _rsklearn_target_tags = {"required": True}
27
+
28
+ def __init__(
29
+ self,
30
+ n_neighbors: int = 5,
31
+ *,
32
+ weights: str = "uniform",
33
+ algorithm: str = "auto",
34
+ leaf_size: int = 30,
35
+ p: int = 2,
36
+ metric: str = "minkowski",
37
+ metric_params: dict[str, Any] | None = None,
38
+ n_jobs: int | None = None,
39
+ ) -> None:
40
+ self.n_neighbors = n_neighbors
41
+ self.weights = weights
42
+ self.algorithm = algorithm
43
+ self.leaf_size = leaf_size
44
+ self.p = p
45
+ self.metric = metric
46
+ self.metric_params = metric_params
47
+ self.n_jobs = n_jobs
48
+
49
+ def _validate_params(self) -> None:
50
+ if (
51
+ isinstance(self.n_neighbors, (bool, np.bool_))
52
+ or not isinstance(self.n_neighbors, (int, np.integer))
53
+ or self.n_neighbors <= 0
54
+ ):
55
+ raise ValueError("n_neighbors must be a positive integer")
56
+ if self.weights not in ("uniform", "distance"):
57
+ raise NotImplementedError(
58
+ "KNeighborsClassifier currently supports weights='uniform' "
59
+ "or weights='distance'"
60
+ )
61
+ if self.algorithm not in ("auto", "brute"):
62
+ raise NotImplementedError(
63
+ "KNeighborsClassifier currently supports algorithm='auto' or 'brute'"
64
+ )
65
+ if (
66
+ isinstance(self.leaf_size, (bool, np.bool_))
67
+ or not isinstance(self.leaf_size, (int, np.integer))
68
+ or self.leaf_size <= 0
69
+ ):
70
+ raise ValueError("leaf_size must be a positive integer")
71
+ if self.metric_params not in (None, {}):
72
+ raise NotImplementedError("metric_params are not implemented")
73
+ if self.n_jobs not in (None, 1):
74
+ raise NotImplementedError(
75
+ "n_jobs parallel execution is not implemented at the Python API level"
76
+ )
77
+ self._resolve_metric()
78
+
79
+ def _resolve_metric(self) -> tuple[str, int]:
80
+ if self.metric == "euclidean":
81
+ if self.p not in (2, 2.0):
82
+ raise ValueError("p is only used with metric='minkowski'")
83
+ return "euclidean", 0
84
+ if self.metric == "manhattan":
85
+ if self.p not in (1, 1.0):
86
+ raise ValueError("p is only used with metric='minkowski'")
87
+ return "manhattan", 1
88
+ if self.metric == "minkowski":
89
+ if self.p in (2, 2.0):
90
+ return "euclidean", 0
91
+ if self.p in (1, 1.0):
92
+ return "manhattan", 1
93
+ raise NotImplementedError(
94
+ "KNeighborsClassifier currently supports Minkowski p=1 or p=2"
95
+ )
96
+ raise NotImplementedError(
97
+ "KNeighborsClassifier currently supports metric='minkowski', "
98
+ "'euclidean', or 'manhattan'"
99
+ )
100
+
101
+ def _weights_code(self) -> int:
102
+ return 0 if self.weights == "uniform" else 1
103
+
104
+ def _validate_neighbor_count(
105
+ self, n_neighbors: int | None, *, training: bool
106
+ ) -> int:
107
+ check_is_fitted(self, ("_fit_X", "_fit_norms", "_y_encoded", "classes_"))
108
+ k = self.n_neighbors if n_neighbors is None else n_neighbors
109
+ if (
110
+ isinstance(k, (bool, np.bool_))
111
+ or not isinstance(k, (int, np.integer))
112
+ or k <= 0
113
+ ):
114
+ raise ValueError("n_neighbors must be a positive integer")
115
+ maximum = self.n_samples_fit_ - int(training)
116
+ if int(k) > maximum:
117
+ raise ValueError(
118
+ f"Expected n_neighbors <= n_samples_fit, but n_neighbors = {int(k)}, "
119
+ f"n_samples_fit = {maximum}"
120
+ )
121
+ return int(k)
122
+
123
+ def _validate_X(self, X: Any) -> NDArray[np.float64]:
124
+ array = validate_data(
125
+ self,
126
+ X,
127
+ reset=False,
128
+ dtype=np.float64,
129
+ order="C",
130
+ ensure_all_finite=True,
131
+ )
132
+ return np.ascontiguousarray(array, dtype=np.float64)
133
+
134
+ def fit(self, X: Any, y: Any) -> KNeighborsClassifier:
135
+ """Store the training set and encoded target labels."""
136
+ self._validate_params()
137
+ if y is None:
138
+ raise ValueError(
139
+ "KNeighborsClassifier requires y to be passed, but the target y is None"
140
+ )
141
+ target = np.asarray(y)
142
+ if target.ndim == 2 and target.shape[1] == 1:
143
+ warnings.warn(
144
+ "A column-vector y was passed when a 1d array was expected.",
145
+ DataConversionWarning,
146
+ stacklevel=2,
147
+ )
148
+ y = target.ravel()
149
+ X_array, y_array = validate_data(
150
+ self,
151
+ X,
152
+ y,
153
+ reset=True,
154
+ dtype=np.float64,
155
+ order="C",
156
+ ensure_all_finite=True,
157
+ )
158
+ if y_array.dtype.kind in "fc" and np.any(y_array != np.floor(y_array)):
159
+ raise ValueError("Unknown label type: continuous")
160
+ validate_labels(y_array)
161
+ encoder = LabelEncoder()
162
+ labels = encoder.fit_transform(y_array)
163
+ self.classes_ = encoder.classes_
164
+ if self.classes_.size < 2:
165
+ raise ValueError(
166
+ "KNeighborsClassifier requires at least two classes; got 1 class"
167
+ )
168
+ metric_name, metric_code = self._resolve_metric()
169
+ self._fit_X = np.ascontiguousarray(X_array, dtype=np.float64)
170
+ self._y_encoded = np.ascontiguousarray(labels, dtype=np.int64)
171
+ self._fit_norms = (
172
+ _core.knn_row_norms(self._fit_X)
173
+ if metric_code == 0
174
+ else np.asarray([], dtype=np.float64)
175
+ )
176
+ self.n_samples_fit_ = self._fit_X.shape[0]
177
+ self.effective_metric_ = metric_name
178
+ self.effective_metric_params_ = (
179
+ {} if self.metric_params is None else dict(self.metric_params)
180
+ )
181
+ self._metric_code = metric_code
182
+ return self
183
+
184
+ def kneighbors(
185
+ self,
186
+ X: Any = None,
187
+ n_neighbors: int | None = None,
188
+ return_distance: bool = True,
189
+ ) -> tuple[NDArray[np.float64], NDArray[np.int64]] | NDArray[np.int64]:
190
+ """Return nearest-neighbor distances and indices."""
191
+ training_query = X is None
192
+ k = self._validate_neighbor_count(n_neighbors, training=training_query)
193
+ query = self._fit_X if training_query else self._validate_X(X)
194
+ distances, indices = _core.knn_kneighbors(
195
+ query,
196
+ self._fit_X,
197
+ self._fit_norms,
198
+ k,
199
+ self._metric_code,
200
+ training_query,
201
+ )
202
+ if return_distance:
203
+ return distances, indices
204
+ return indices
205
+
206
+ def predict_proba(self, X: Any) -> NDArray[np.float64]:
207
+ """Return class probabilities for query samples."""
208
+ check_is_fitted(self, ("_fit_X", "_fit_norms", "_y_encoded", "classes_"))
209
+ k = self._validate_neighbor_count(None, training=False)
210
+ query = self._validate_X(X)
211
+ return _core.knn_predict_proba(
212
+ query,
213
+ self._fit_X,
214
+ self._fit_norms,
215
+ self._y_encoded,
216
+ k,
217
+ self.classes_.size,
218
+ self._metric_code,
219
+ self._weights_code(),
220
+ )
221
+
222
+ def predict(self, X: Any) -> NDArray[Any]:
223
+ """Predict class labels for query samples."""
224
+ check_is_fitted(self, ("_fit_X", "_fit_norms", "_y_encoded", "classes_"))
225
+ k = self._validate_neighbor_count(None, training=False)
226
+ query = self._validate_X(X)
227
+ indices = _core.knn_predict(
228
+ query,
229
+ self._fit_X,
230
+ self._fit_norms,
231
+ self._y_encoded,
232
+ k,
233
+ self.classes_.size,
234
+ self._metric_code,
235
+ self._weights_code(),
236
+ )
237
+ return self.classes_[indices]
@@ -1,6 +1,7 @@
1
1
  """Preprocessing estimators."""
2
2
 
3
3
  from ._label_encoder import LabelEncoder
4
+ from ._maxabs_scaler import MaxAbsScaler
4
5
  from ._minmax_scaler import MinMaxScaler
5
6
  from ._normalizer import Normalizer
6
7
  from ._one_hot_encoder import OneHotEncoder
@@ -10,6 +11,7 @@ from ._standard_scaler import StandardScaler
10
11
 
11
12
  __all__ = [
12
13
  "LabelEncoder",
14
+ "MaxAbsScaler",
13
15
  "MinMaxScaler",
14
16
  "Normalizer",
15
17
  "OneHotEncoder",