r-scikit-learn 0.1.1__tar.gz → 0.1.2__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- {r_scikit_learn-0.1.1 → r_scikit_learn-0.1.2}/CHANGELOG.md +12 -0
- {r_scikit_learn-0.1.1 → r_scikit_learn-0.1.2}/Cargo.lock +2 -1
- {r_scikit_learn-0.1.1 → r_scikit_learn-0.1.2}/Cargo.toml +2 -1
- {r_scikit_learn-0.1.1 → r_scikit_learn-0.1.2}/PKG-INFO +12 -4
- {r_scikit_learn-0.1.1 → r_scikit_learn-0.1.2}/README.md +11 -3
- r_scikit_learn-0.1.2/benches/benchmark_neighbors.py +124 -0
- {r_scikit_learn-0.1.1 → r_scikit_learn-0.1.2}/benches/benchmark_preprocessing.py +31 -0
- {r_scikit_learn-0.1.1 → r_scikit_learn-0.1.2}/pyproject.toml +1 -1
- {r_scikit_learn-0.1.1 → r_scikit_learn-0.1.2}/python/rsklearn/__init__.py +3 -1
- r_scikit_learn-0.1.2/python/rsklearn/neighbors/__init__.py +5 -0
- r_scikit_learn-0.1.2/python/rsklearn/neighbors/_classification.py +237 -0
- {r_scikit_learn-0.1.1 → r_scikit_learn-0.1.2}/python/rsklearn/preprocessing/__init__.py +2 -0
- r_scikit_learn-0.1.2/python/rsklearn/preprocessing/_maxabs_scaler.py +138 -0
- {r_scikit_learn-0.1.1 → r_scikit_learn-0.1.2}/python/rsklearn/preprocessing/_standard_scaler.py +75 -1
- {r_scikit_learn-0.1.1 → r_scikit_learn-0.1.2}/python/rsklearn/utils/__init__.py +8 -0
- {r_scikit_learn-0.1.1 → r_scikit_learn-0.1.2}/python/rsklearn/utils/sparse.py +86 -2
- {r_scikit_learn-0.1.1 → r_scikit_learn-0.1.2}/src/lib.rs +391 -1
- r_scikit_learn-0.1.2/src/maxabs_scaler.rs +86 -0
- r_scikit_learn-0.1.2/src/neighbors.rs +921 -0
- r_scikit_learn-0.1.2/src/sparse.rs +302 -0
- {r_scikit_learn-0.1.1 → r_scikit_learn-0.1.2}/tests/test_estimator_compliance.py +4 -0
- r_scikit_learn-0.1.2/tests/test_maxabs_scaler.py +60 -0
- r_scikit_learn-0.1.2/tests/test_neighbors.py +68 -0
- r_scikit_learn-0.1.2/tests/test_neighbors_parity.py +50 -0
- {r_scikit_learn-0.1.1 → r_scikit_learn-0.1.2}/tests/test_scikit_learn_parity.py +48 -0
- {r_scikit_learn-0.1.1 → r_scikit_learn-0.1.2}/tests/test_sparse_infrastructure.py +2 -2
- {r_scikit_learn-0.1.1 → r_scikit_learn-0.1.2}/tests/test_standard_scaler.py +41 -0
- r_scikit_learn-0.1.1/src/sparse.rs +0 -133
- {r_scikit_learn-0.1.1 → r_scikit_learn-0.1.2}/LICENSE +0 -0
- {r_scikit_learn-0.1.1 → r_scikit_learn-0.1.2}/benches/benchmark_linear_models.py +0 -0
- {r_scikit_learn-0.1.1 → r_scikit_learn-0.1.2}/benches/benchmark_metrics.py +0 -0
- {r_scikit_learn-0.1.1 → r_scikit_learn-0.1.2}/python/rsklearn/_validation.py +0 -0
- {r_scikit_learn-0.1.1 → r_scikit_learn-0.1.2}/python/rsklearn/base.py +0 -0
- {r_scikit_learn-0.1.1 → r_scikit_learn-0.1.2}/python/rsklearn/compose/__init__.py +0 -0
- {r_scikit_learn-0.1.1 → r_scikit_learn-0.1.2}/python/rsklearn/compose/_column_transformer.py +0 -0
- {r_scikit_learn-0.1.1 → r_scikit_learn-0.1.2}/python/rsklearn/impute/__init__.py +0 -0
- {r_scikit_learn-0.1.1 → r_scikit_learn-0.1.2}/python/rsklearn/impute/_simple_imputer.py +0 -0
- {r_scikit_learn-0.1.1 → r_scikit_learn-0.1.2}/python/rsklearn/linear_model/__init__.py +0 -0
- {r_scikit_learn-0.1.1 → r_scikit_learn-0.1.2}/python/rsklearn/linear_model/_base.py +0 -0
- {r_scikit_learn-0.1.1 → r_scikit_learn-0.1.2}/python/rsklearn/linear_model/_coordinate_descent.py +0 -0
- {r_scikit_learn-0.1.1 → r_scikit_learn-0.1.2}/python/rsklearn/linear_model/_least_squares.py +0 -0
- {r_scikit_learn-0.1.1 → r_scikit_learn-0.1.2}/python/rsklearn/linear_model/_logistic.py +0 -0
- {r_scikit_learn-0.1.1 → r_scikit_learn-0.1.2}/python/rsklearn/linear_model/_warnings.py +0 -0
- {r_scikit_learn-0.1.1 → r_scikit_learn-0.1.2}/python/rsklearn/metrics/__init__.py +0 -0
- {r_scikit_learn-0.1.1 → r_scikit_learn-0.1.2}/python/rsklearn/metrics/_classification.py +0 -0
- {r_scikit_learn-0.1.1 → r_scikit_learn-0.1.2}/python/rsklearn/metrics/_regression.py +0 -0
- {r_scikit_learn-0.1.1 → r_scikit_learn-0.1.2}/python/rsklearn/metrics/_validation.py +0 -0
- {r_scikit_learn-0.1.1 → r_scikit_learn-0.1.2}/python/rsklearn/model_selection/__init__.py +0 -0
- {r_scikit_learn-0.1.1 → r_scikit_learn-0.1.2}/python/rsklearn/model_selection/_split.py +0 -0
- {r_scikit_learn-0.1.1 → r_scikit_learn-0.1.2}/python/rsklearn/model_selection/_utils.py +0 -0
- {r_scikit_learn-0.1.1 → r_scikit_learn-0.1.2}/python/rsklearn/model_selection/_validation.py +0 -0
- {r_scikit_learn-0.1.1 → r_scikit_learn-0.1.2}/python/rsklearn/pipeline.py +0 -0
- {r_scikit_learn-0.1.1 → r_scikit_learn-0.1.2}/python/rsklearn/preprocessing/_base.py +0 -0
- {r_scikit_learn-0.1.1 → r_scikit_learn-0.1.2}/python/rsklearn/preprocessing/_categorical.py +0 -0
- {r_scikit_learn-0.1.1 → r_scikit_learn-0.1.2}/python/rsklearn/preprocessing/_label_encoder.py +0 -0
- {r_scikit_learn-0.1.1 → r_scikit_learn-0.1.2}/python/rsklearn/preprocessing/_minmax_scaler.py +0 -0
- {r_scikit_learn-0.1.1 → r_scikit_learn-0.1.2}/python/rsklearn/preprocessing/_normalizer.py +0 -0
- {r_scikit_learn-0.1.1 → r_scikit_learn-0.1.2}/python/rsklearn/preprocessing/_one_hot_encoder.py +0 -0
- {r_scikit_learn-0.1.1 → r_scikit_learn-0.1.2}/python/rsklearn/preprocessing/_ordinal_encoder.py +0 -0
- {r_scikit_learn-0.1.1 → r_scikit_learn-0.1.2}/python/rsklearn/preprocessing/_robust_scaler.py +0 -0
- {r_scikit_learn-0.1.1 → r_scikit_learn-0.1.2}/python/rsklearn/py.typed +0 -0
- {r_scikit_learn-0.1.1 → r_scikit_learn-0.1.2}/python/rsklearn/utils/validation.py +0 -0
- {r_scikit_learn-0.1.1 → r_scikit_learn-0.1.2}/src/categorical.rs +0 -0
- {r_scikit_learn-0.1.1 → r_scikit_learn-0.1.2}/src/error.rs +0 -0
- {r_scikit_learn-0.1.1 → r_scikit_learn-0.1.2}/src/label_encoder.rs +0 -0
- {r_scikit_learn-0.1.1 → r_scikit_learn-0.1.2}/src/linear_model.rs +0 -0
- {r_scikit_learn-0.1.1 → r_scikit_learn-0.1.2}/src/metrics.rs +0 -0
- {r_scikit_learn-0.1.1 → r_scikit_learn-0.1.2}/src/minmax_scaler.rs +0 -0
- {r_scikit_learn-0.1.1 → r_scikit_learn-0.1.2}/src/normalizer.rs +0 -0
- {r_scikit_learn-0.1.1 → r_scikit_learn-0.1.2}/src/robust_scaler.rs +0 -0
- {r_scikit_learn-0.1.1 → r_scikit_learn-0.1.2}/src/simple_imputer.rs +0 -0
- {r_scikit_learn-0.1.1 → r_scikit_learn-0.1.2}/src/standard_scaler.rs +0 -0
- {r_scikit_learn-0.1.1 → r_scikit_learn-0.1.2}/tests/release_smoke.py +0 -0
- {r_scikit_learn-0.1.1 → r_scikit_learn-0.1.2}/tests/test_base.py +0 -0
- {r_scikit_learn-0.1.1 → r_scikit_learn-0.1.2}/tests/test_categorical_infrastructure.py +0 -0
- {r_scikit_learn-0.1.1 → r_scikit_learn-0.1.2}/tests/test_column_transformer.py +0 -0
- {r_scikit_learn-0.1.1 → r_scikit_learn-0.1.2}/tests/test_column_transformer_parity.py +0 -0
- {r_scikit_learn-0.1.1 → r_scikit_learn-0.1.2}/tests/test_label_encoder.py +0 -0
- {r_scikit_learn-0.1.1 → r_scikit_learn-0.1.2}/tests/test_linear_model.py +0 -0
- {r_scikit_learn-0.1.1 → r_scikit_learn-0.1.2}/tests/test_linear_model_parity.py +0 -0
- {r_scikit_learn-0.1.1 → r_scikit_learn-0.1.2}/tests/test_metrics.py +0 -0
- {r_scikit_learn-0.1.1 → r_scikit_learn-0.1.2}/tests/test_metrics_parity.py +0 -0
- {r_scikit_learn-0.1.1 → r_scikit_learn-0.1.2}/tests/test_minmax_scaler.py +0 -0
- {r_scikit_learn-0.1.1 → r_scikit_learn-0.1.2}/tests/test_model_selection.py +0 -0
- {r_scikit_learn-0.1.1 → r_scikit_learn-0.1.2}/tests/test_model_selection_parity.py +0 -0
- {r_scikit_learn-0.1.1 → r_scikit_learn-0.1.2}/tests/test_normalizer.py +0 -0
- {r_scikit_learn-0.1.1 → r_scikit_learn-0.1.2}/tests/test_one_hot_encoder.py +0 -0
- {r_scikit_learn-0.1.1 → r_scikit_learn-0.1.2}/tests/test_ordinal_encoder.py +0 -0
- {r_scikit_learn-0.1.1 → r_scikit_learn-0.1.2}/tests/test_pipeline.py +0 -0
- {r_scikit_learn-0.1.1 → r_scikit_learn-0.1.2}/tests/test_pipeline_parity.py +0 -0
- {r_scikit_learn-0.1.1 → r_scikit_learn-0.1.2}/tests/test_public_validation.py +0 -0
- {r_scikit_learn-0.1.1 → r_scikit_learn-0.1.2}/tests/test_robust_scaler.py +0 -0
- {r_scikit_learn-0.1.1 → r_scikit_learn-0.1.2}/tests/test_simple_imputer.py +0 -0
- {r_scikit_learn-0.1.1 → r_scikit_learn-0.1.2}/tests/test_validation.py +0 -0
|
@@ -5,6 +5,18 @@ published package versions are immutable.
|
|
|
5
5
|
|
|
6
6
|
## Unreleased
|
|
7
7
|
|
|
8
|
+
## 0.1.2 - 2026-06-24
|
|
9
|
+
|
|
10
|
+
- Added dense brute-force `KNeighborsClassifier` with Rust-backed neighbor
|
|
11
|
+
search, class voting, `predict`, `predict_proba`, and `kneighbors`.
|
|
12
|
+
- Added scikit-learn parity tests and benchmarks for nearest-neighbor
|
|
13
|
+
classification.
|
|
14
|
+
- Optimized the dense Euclidean neighbor search path with blocked dot products,
|
|
15
|
+
reusable work buffers, and macOS Accelerate/CBLAS acceleration with a portable
|
|
16
|
+
`matrixmultiply` fallback.
|
|
17
|
+
- Added sparse `StandardScaler(with_mean=False)` and `MaxAbsScaler` with
|
|
18
|
+
Rust-backed CSR/CSC reductions and column scaling.
|
|
19
|
+
|
|
8
20
|
## 0.1.1 - 2026-06-15
|
|
9
21
|
|
|
10
22
|
- Added wheel and source-distribution installation testing across supported
|
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
[package]
|
|
2
2
|
name = "r-scikit-learn-core"
|
|
3
|
-
version = "0.1.1"
|
|
3
|
+
version = "0.1.2"
|
|
4
4
|
edition = "2021"
|
|
5
5
|
license = "MIT"
|
|
6
6
|
description = "Rust computational core for r-scikit-learn"
|
|
@@ -29,6 +29,7 @@ crate-type = ["cdylib", "rlib"]
|
|
|
29
29
|
|
|
30
30
|
[dependencies]
|
|
31
31
|
faer = { version = "0.24", default-features = false, features = ["std", "rayon", "linalg"] }
|
|
32
|
+
matrixmultiply = "0.3"
|
|
32
33
|
nalgebra = { version = "0.34", default-features = false, features = ["std"] }
|
|
33
34
|
numpy = "0.28"
|
|
34
35
|
pyo3 = "0.28"
|
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
Metadata-Version: 2.4
|
|
2
2
|
Name: r-scikit-learn
|
|
3
|
-
Version: 0.1.1
|
|
3
|
+
Version: 0.1.2
|
|
4
4
|
Classifier: Development Status :: 3 - Alpha
|
|
5
5
|
Classifier: License :: OSI Approved :: MIT License
|
|
6
6
|
Classifier: Programming Language :: Python :: 3
|
|
@@ -126,6 +126,13 @@ encoder = OneHotEncoder(handle_unknown="ignore")
|
|
|
126
126
|
X_one_hot = encoder.fit_transform([["small"], ["large"], ["small"]])
|
|
127
127
|
```
|
|
128
128
|
|
|
129
|
+
```python
|
|
130
|
+
from rsklearn.preprocessing import MaxAbsScaler, StandardScaler
|
|
131
|
+
|
|
132
|
+
X_sparse_scaled = StandardScaler(with_mean=False).fit_transform(X_one_hot)
|
|
133
|
+
X_sparse_maxabs = MaxAbsScaler().fit_transform(X_one_hot)
|
|
134
|
+
```
|
|
135
|
+
|
|
129
136
|
```python
|
|
130
137
|
import numpy as np
|
|
131
138
|
from rsklearn.impute import SimpleImputer
|
|
@@ -195,7 +202,10 @@ probabilities = classifier.predict_proba(X_test)
|
|
|
195
202
|
- Uses float64 fitted statistics and native float32 kernels where supported.
|
|
196
203
|
- Ignores NaNs while fitting, preserves them while transforming, and rejects
|
|
197
204
|
infinity.
|
|
198
|
-
- Supports incremental `partial_fit` for `StandardScaler` and
|
|
205
|
+
- Supports incremental `partial_fit` for `StandardScaler`, `MaxAbsScaler`, and
|
|
206
|
+
`MinMaxScaler`.
|
|
207
|
+
- Supports CSR/CSC sparse `StandardScaler(with_mean=False)` and `MaxAbsScaler`
|
|
208
|
+
without densifying input.
|
|
199
209
|
- Supports L1, L2, and max row normalization.
|
|
200
210
|
- Provides quantile-based `RobustScaler` fitting and inverse transforms.
|
|
201
211
|
|
|
@@ -276,8 +286,6 @@ The core implemented behavior is tested and packaged across Linux, macOS, and
|
|
|
276
286
|
Windows, but the project remains alpha software. Before a stable 1.0 release,
|
|
277
287
|
the following compatibility and operational work remains:
|
|
278
288
|
|
|
279
|
-
- Sparse-aware estimator behavior, including non-centering `StandardScaler`
|
|
280
|
-
operation. Shared CSR/CSC validation and Rust kernels are implemented.
|
|
281
289
|
- `sample_weight` support for `StandardScaler.partial_fit`.
|
|
282
290
|
- Comprehensive `get_feature_names_out` support and configurable output
|
|
283
291
|
containers across estimators.
|
|
@@ -93,6 +93,13 @@ encoder = OneHotEncoder(handle_unknown="ignore")
|
|
|
93
93
|
X_one_hot = encoder.fit_transform([["small"], ["large"], ["small"]])
|
|
94
94
|
```
|
|
95
95
|
|
|
96
|
+
```python
|
|
97
|
+
from rsklearn.preprocessing import MaxAbsScaler, StandardScaler
|
|
98
|
+
|
|
99
|
+
X_sparse_scaled = StandardScaler(with_mean=False).fit_transform(X_one_hot)
|
|
100
|
+
X_sparse_maxabs = MaxAbsScaler().fit_transform(X_one_hot)
|
|
101
|
+
```
|
|
102
|
+
|
|
96
103
|
```python
|
|
97
104
|
import numpy as np
|
|
98
105
|
from rsklearn.impute import SimpleImputer
|
|
@@ -162,7 +169,10 @@ probabilities = classifier.predict_proba(X_test)
|
|
|
162
169
|
- Uses float64 fitted statistics and native float32 kernels where supported.
|
|
163
170
|
- Ignores NaNs while fitting, preserves them while transforming, and rejects
|
|
164
171
|
infinity.
|
|
165
|
-
- Supports incremental `partial_fit` for `StandardScaler` and
|
|
172
|
+
- Supports incremental `partial_fit` for `StandardScaler`, `MaxAbsScaler`, and
|
|
173
|
+
`MinMaxScaler`.
|
|
174
|
+
- Supports CSR/CSC sparse `StandardScaler(with_mean=False)` and `MaxAbsScaler`
|
|
175
|
+
without densifying input.
|
|
166
176
|
- Supports L1, L2, and max row normalization.
|
|
167
177
|
- Provides quantile-based `RobustScaler` fitting and inverse transforms.
|
|
168
178
|
|
|
@@ -243,8 +253,6 @@ The core implemented behavior is tested and packaged across Linux, macOS, and
|
|
|
243
253
|
Windows, but the project remains alpha software. Before a stable 1.0 release,
|
|
244
254
|
the following compatibility and operational work remains:
|
|
245
255
|
|
|
246
|
-
- Sparse-aware estimator behavior, including non-centering `StandardScaler`
|
|
247
|
-
operation. Shared CSR/CSC validation and Rust kernels are implemented.
|
|
248
256
|
- `sample_weight` support for `StandardScaler.partial_fit`.
|
|
249
257
|
- Comprehensive `get_feature_names_out` support and configurable output
|
|
250
258
|
containers across estimators.
|
|
@@ -0,0 +1,124 @@
|
|
|
1
|
+
"""Compare r-scikit-learn and scikit-learn nearest-neighbor performance."""
|
|
2
|
+
|
|
3
|
+
from __future__ import annotations
|
|
4
|
+
|
|
5
|
+
import argparse
|
|
6
|
+
import statistics
|
|
7
|
+
import sys
|
|
8
|
+
import time
|
|
9
|
+
from collections.abc import Callable
|
|
10
|
+
|
|
11
|
+
import numpy as np
|
|
12
|
+
import rsklearn.neighbors as rneighbors
|
|
13
|
+
import scipy
|
|
14
|
+
import sklearn
|
|
15
|
+
import sklearn.neighbors as sneighbors
|
|
16
|
+
from rsklearn import _core
|
|
17
|
+
|
|
18
|
+
|
|
19
|
+
def measure(
    function: Callable[[], object], repetitions: int, warmups: int
) -> tuple[float, float]:
    """Time repeated calls of *function* and summarize the durations.

    Parameters
    ----------
    function:
        Zero-argument callable to time; its return value is discarded.
    repetitions:
        Number of timed calls. Must be at least 1.
    warmups:
        Number of untimed calls made before timing starts.

    Returns
    -------
    tuple[float, float]
        Mean and sample standard deviation of the timed durations in seconds.
        The standard deviation is ``0.0`` when only one call was timed.

    Raises
    ------
    statistics.StatisticsError
        If ``repetitions`` is smaller than 1.
    """
    if repetitions < 1:
        # Averaging an empty sample would raise this same exception type, but
        # only after every warmup call had already been executed for nothing.
        raise statistics.StatisticsError(
            f"repetitions must be at least 1, got {repetitions}"
        )
    for _ in range(warmups):
        function()
    durations = []
    for _ in range(repetitions):
        started = time.perf_counter()
        function()
        durations.append(time.perf_counter() - started)
    # statistics.stdev needs at least two data points; report a float zero
    # otherwise so both elements of the result always have the same type.
    spread = statistics.stdev(durations) if len(durations) > 1 else 0.0
    return statistics.mean(durations), spread
|
|
30
|
+
|
|
31
|
+
|
|
32
|
+
def report(
    name: str,
    ours: Callable[[], object],
    theirs: Callable[[], object],
    repetitions: int,
    warmups: int,
) -> None:
    """Time two callables with ``measure`` and print a one-line comparison.

    Parameters
    ----------
    name:
        Label printed at the start of the line (left-aligned, padded to 32).
    ours:
        Callable exercising the r-scikit-learn implementation.
    theirs:
        Callable exercising the scikit-learn implementation.
    repetitions, warmups:
        Forwarded unchanged to ``measure`` for both callables.

    The improvement figure is the baseline mean minus our mean, expressed as
    a percentage of the baseline mean, so positive values mean ``ours`` was
    faster.
    """
    ours_mean, ours_stdev = measure(ours, repetitions, warmups)
    theirs_mean, theirs_stdev = measure(theirs, repetitions, warmups)
    if theirs_mean != 0:
        improvement = (theirs_mean - ours_mean) / theirs_mean * 100
    else:
        # A zero baseline (e.g. a timer too coarse to resolve the call) makes
        # the ratio undefined; print "nan" instead of aborting the whole run.
        improvement = float("nan")
    print(
        f"{name:<32} r-scikit-learn {ours_mean:9.6f}s ± {ours_stdev:9.6f}s "
        f"scikit-learn {theirs_mean:9.6f}s ± {theirs_stdev:9.6f}s "
        f"impr. {improvement:+7.2f}%"
    )
|
|
47
|
+
|
|
48
|
+
|
|
49
|
+
def _bounded_int(minimum: int) -> Callable[[str], int]:
    """Build an argparse ``type=`` converter rejecting integers below *minimum*."""

    def convert(text: str) -> int:
        try:
            value = int(text)
        except ValueError:
            raise argparse.ArgumentTypeError(f"invalid int value: {text!r}") from None
        if value < minimum:
            raise argparse.ArgumentTypeError(
                f"must be an integer >= {minimum}, got {value}"
            )
        return value

    return convert


def main() -> None:
    """Run the nearest-neighbor benchmark from the command line.

    Parses the benchmark dimensions, refuses to run against a non-release
    Rust extension unless ``--allow-debug`` is given, prints the environment,
    generates seeded random data, and reports timings for ``fit``,
    ``kneighbors``, ``predict`` and ``predict_proba`` of both
    ``KNeighborsClassifier`` implementations.

    Invalid sizes are rejected by argparse (usage message, exit status 2)
    instead of surfacing later as tracebacks from the timing or estimator
    code.
    """
    parser = argparse.ArgumentParser()
    parser.add_argument("--train-samples", type=_bounded_int(1), default=20_000)
    parser.add_argument("--query-samples", type=_bounded_int(1), default=1_000)
    parser.add_argument("--features", type=_bounded_int(1), default=20)
    # The r-scikit-learn classifier rejects single-class targets at fit time.
    parser.add_argument("--classes", type=_bounded_int(2), default=5)
    parser.add_argument("--neighbors", type=_bounded_int(1), default=5)
    # At least one timed repetition is needed to compute a mean.
    parser.add_argument("--repetitions", type=_bounded_int(1), default=5)
    parser.add_argument("--warmups", type=_bounded_int(0), default=2)
    parser.add_argument(
        "--allow-debug",
        action="store_true",
        help="run even when r-scikit-learn's Rust extension is a debug build",
    )
    args = parser.parse_args()
    if args.neighbors > args.train_samples:
        # Both estimators require n_neighbors <= number of fitted samples.
        parser.error("--neighbors must not exceed --train-samples")

    profile = _core.build_profile()
    if profile != "release" and not args.allow_debug:
        raise SystemExit(
            "Refusing to benchmark a debug Rust extension. Install a release build "
            "with `maturin develop --release`, then rerun. Pass --allow-debug only "
            "when intentionally measuring debug code."
        )

    print(f"Python: {sys.executable}")
    print(f"Rust extension: {_core.__file__} ({profile})")
    print(
        f"Dependencies: numpy {np.__version__}, scipy {scipy.__version__}, "
        f"scikit-learn {sklearn.__version__}"
    )

    # Fixed seed so repeated runs benchmark identical data.
    rng = np.random.default_rng(20260616)
    X_train = rng.normal(size=(args.train_samples, args.features))
    X_query = rng.normal(size=(args.query_samples, args.features))
    y = rng.integers(0, args.classes, size=args.train_samples, dtype=np.int64)
    # Identical constructor options are passed to both implementations.
    options = {
        "n_neighbors": args.neighbors,
        "weights": "uniform",
        "algorithm": "brute",
        "metric": "euclidean",
    }
    print(
        f"Train matrix: {args.train_samples:,} x {args.features:,}; "
        f"query matrix: {args.query_samples:,} x {args.features:,}"
    )

    report(
        "KNeighborsClassifier fit",
        lambda: rneighbors.KNeighborsClassifier(**options).fit(X_train, y),
        lambda: sneighbors.KNeighborsClassifier(**options).fit(X_train, y),
        args.repetitions,
        args.warmups,
    )

    # Fit once up front so the query benchmarks below time only the queries.
    ours = rneighbors.KNeighborsClassifier(**options).fit(X_train, y)
    theirs = sneighbors.KNeighborsClassifier(**options).fit(X_train, y)
    report(
        "KNeighborsClassifier kneighbors",
        lambda: ours.kneighbors(X_query),
        lambda: theirs.kneighbors(X_query),
        args.repetitions,
        args.warmups,
    )
    report(
        "KNeighborsClassifier predict",
        lambda: ours.predict(X_query),
        lambda: theirs.predict(X_query),
        args.repetitions,
        args.warmups,
    )
    report(
        "KNeighborsClassifier proba",
        lambda: ours.predict_proba(X_query),
        lambda: theirs.predict_proba(X_query),
        args.repetitions,
        args.warmups,
    )
|
|
121
|
+
|
|
122
|
+
|
|
123
|
+
if __name__ == "__main__":
|
|
124
|
+
main()
|
|
@@ -13,6 +13,7 @@ from rsklearn.base import BaseEstimator
|
|
|
13
13
|
from rsklearn.impute import SimpleImputer
|
|
14
14
|
from rsklearn.preprocessing import (
|
|
15
15
|
LabelEncoder,
|
|
16
|
+
MaxAbsScaler,
|
|
16
17
|
MinMaxScaler,
|
|
17
18
|
Normalizer,
|
|
18
19
|
OneHotEncoder,
|
|
@@ -27,6 +28,7 @@ from sklearn.impute import SimpleImputer as ScikitSimpleImputer
|
|
|
27
28
|
|
|
28
29
|
# The scikit-learn distribution intentionally exposes the `sklearn` import package.
|
|
29
30
|
from sklearn.preprocessing import LabelEncoder as ScikitLabelEncoder
|
|
31
|
+
from sklearn.preprocessing import MaxAbsScaler as ScikitMaxAbsScaler
|
|
30
32
|
from sklearn.preprocessing import MinMaxScaler as ScikitMinMaxScaler
|
|
31
33
|
from sklearn.preprocessing import Normalizer as ScikitNormalizer
|
|
32
34
|
from sklearn.preprocessing import OneHotEncoder as ScikitOneHotEncoder
|
|
@@ -88,6 +90,7 @@ def benchmark_matrix(rows: int, columns: int, repetitions: int) -> None:
|
|
|
88
90
|
)
|
|
89
91
|
for name, ours, theirs in [
|
|
90
92
|
("StandardScaler", StandardScaler, ScikitStandardScaler),
|
|
93
|
+
("MaxAbsScaler", MaxAbsScaler, ScikitMaxAbsScaler),
|
|
91
94
|
("MinMaxScaler", MinMaxScaler, ScikitMinMaxScaler),
|
|
92
95
|
("Normalizer", Normalizer, ScikitNormalizer),
|
|
93
96
|
("RobustScaler", RobustScaler, ScikitRobustScaler),
|
|
@@ -294,6 +297,34 @@ def benchmark_sparse(repetitions: int) -> None:
|
|
|
294
297
|
scikit_scale,
|
|
295
298
|
repetitions,
|
|
296
299
|
)
|
|
300
|
+
ours_standard = StandardScaler(with_mean=False).fit(matrix)
|
|
301
|
+
theirs_standard = ScikitStandardScaler(with_mean=False).fit(matrix)
|
|
302
|
+
report_comparison(
|
|
303
|
+
"Sparse StandardScaler fit",
|
|
304
|
+
lambda: StandardScaler(with_mean=False).fit(matrix),
|
|
305
|
+
lambda: ScikitStandardScaler(with_mean=False).fit(matrix),
|
|
306
|
+
repetitions,
|
|
307
|
+
)
|
|
308
|
+
report_comparison(
|
|
309
|
+
"Sparse StandardScaler transform",
|
|
310
|
+
lambda: ours_standard.transform(matrix),
|
|
311
|
+
lambda: theirs_standard.transform(matrix),
|
|
312
|
+
repetitions,
|
|
313
|
+
)
|
|
314
|
+
ours_maxabs = MaxAbsScaler().fit(matrix)
|
|
315
|
+
theirs_maxabs = ScikitMaxAbsScaler().fit(matrix)
|
|
316
|
+
report_comparison(
|
|
317
|
+
"Sparse MaxAbsScaler fit",
|
|
318
|
+
lambda: MaxAbsScaler().fit(matrix),
|
|
319
|
+
lambda: ScikitMaxAbsScaler().fit(matrix),
|
|
320
|
+
repetitions,
|
|
321
|
+
)
|
|
322
|
+
report_comparison(
|
|
323
|
+
"Sparse MaxAbsScaler transform",
|
|
324
|
+
lambda: ours_maxabs.transform(matrix),
|
|
325
|
+
lambda: theirs_maxabs.transform(matrix),
|
|
326
|
+
repetitions,
|
|
327
|
+
)
|
|
297
328
|
|
|
298
329
|
|
|
299
330
|
def main() -> None:
|
|
@@ -10,6 +10,7 @@ from .base import (
|
|
|
10
10
|
from .compose import ColumnTransformer, make_column_transformer
|
|
11
11
|
from .impute import SimpleImputer
|
|
12
12
|
from .linear_model import ElasticNet, Lasso, LinearRegression, LogisticRegression, Ridge
|
|
13
|
+
from .neighbors import KNeighborsClassifier
|
|
13
14
|
from .pipeline import Pipeline, make_pipeline
|
|
14
15
|
from .preprocessing import (
|
|
15
16
|
LabelEncoder,
|
|
@@ -26,6 +27,7 @@ __all__ = [
|
|
|
26
27
|
"ClassifierMixin",
|
|
27
28
|
"ColumnTransformer",
|
|
28
29
|
"ElasticNet",
|
|
30
|
+
"KNeighborsClassifier",
|
|
29
31
|
"LabelEncoder",
|
|
30
32
|
"Lasso",
|
|
31
33
|
"LinearRegression",
|
|
@@ -45,4 +47,4 @@ __all__ = [
|
|
|
45
47
|
"make_column_transformer",
|
|
46
48
|
"make_pipeline",
|
|
47
49
|
]
|
|
48
|
-
__version__ = "0.1.1"
|
|
50
|
+
__version__ = "0.1.2"
|
|
@@ -0,0 +1,237 @@
|
|
|
1
|
+
"""K-nearest-neighbors classification."""
|
|
2
|
+
|
|
3
|
+
from __future__ import annotations
|
|
4
|
+
|
|
5
|
+
import warnings
|
|
6
|
+
from typing import Any
|
|
7
|
+
|
|
8
|
+
import numpy as np
|
|
9
|
+
from numpy.typing import NDArray
|
|
10
|
+
|
|
11
|
+
from rsklearn import _core
|
|
12
|
+
from rsklearn._validation import validate_labels
|
|
13
|
+
from rsklearn.base import BaseEstimator, ClassifierMixin
|
|
14
|
+
from rsklearn.preprocessing import LabelEncoder
|
|
15
|
+
from rsklearn.utils.validation import check_is_fitted, validate_data
|
|
16
|
+
|
|
17
|
+
try:
|
|
18
|
+
from sklearn.exceptions import DataConversionWarning
|
|
19
|
+
except ImportError:
|
|
20
|
+
DataConversionWarning = UserWarning
|
|
21
|
+
|
|
22
|
+
|
|
23
|
+
class KNeighborsClassifier(ClassifierMixin, BaseEstimator):
    """Classifier implementing dense brute-force k-nearest-neighbor voting.

    The constructor mirrors the scikit-learn signature, but only a subset of
    values is implemented: ``weights`` in {"uniform", "distance"},
    ``algorithm`` in {"auto", "brute"}, ``metric`` in {"minkowski",
    "euclidean", "manhattan"} (Minkowski only with ``p`` equal to 1 or 2),
    no ``metric_params`` and ``n_jobs`` of ``None`` or 1. Unsupported values
    are reported when ``fit`` is called, not at construction time.

    Neighbor search, voting and probability estimation are delegated to the
    ``rsklearn._core`` extension module.
    """

    _rsklearn_target_tags = {"required": True}

    def __init__(
        self,
        n_neighbors: int = 5,
        *,
        weights: str = "uniform",
        algorithm: str = "auto",
        leaf_size: int = 30,
        p: int = 2,
        metric: str = "minkowski",
        metric_params: dict[str, Any] | None = None,
        n_jobs: int | None = None,
    ) -> None:
        # Parameters are stored untouched; validation is deferred to ``fit``.
        self.n_neighbors = n_neighbors
        self.weights = weights
        self.algorithm = algorithm
        self.leaf_size = leaf_size
        self.p = p
        self.metric = metric
        self.metric_params = metric_params
        self.n_jobs = n_jobs

    @staticmethod
    def _is_positive_integer(value: Any) -> bool:
        """Return whether *value* is a Python or NumPy integer greater than zero.

        Booleans are rejected explicitly because ``bool`` is a subclass of
        ``int`` and would otherwise pass the integer check.
        """
        if isinstance(value, (bool, np.bool_)):
            return False
        return isinstance(value, (int, np.integer)) and value > 0

    def _validate_params(self) -> None:
        """Check the constructor parameters.

        Raises
        ------
        ValueError
            If ``n_neighbors`` or ``leaf_size`` is not a positive integer, or
            if ``p`` conflicts with an explicitly named metric.
        NotImplementedError
            If ``weights``, ``algorithm``, ``metric_params``, ``n_jobs``,
            ``metric`` or the Minkowski ``p`` selects an unsupported option.
        """
        if not self._is_positive_integer(self.n_neighbors):
            raise ValueError("n_neighbors must be a positive integer")
        if self.weights not in ("uniform", "distance"):
            raise NotImplementedError(
                "KNeighborsClassifier currently supports weights='uniform' "
                "or weights='distance'"
            )
        if self.algorithm not in ("auto", "brute"):
            raise NotImplementedError(
                "KNeighborsClassifier currently supports algorithm='auto' or 'brute'"
            )
        if not self._is_positive_integer(self.leaf_size):
            raise ValueError("leaf_size must be a positive integer")
        if self.metric_params not in (None, {}):
            raise NotImplementedError("metric_params are not implemented")
        if self.n_jobs not in (None, 1):
            raise NotImplementedError(
                "n_jobs parallel execution is not implemented at the Python API level"
            )
        # Called only for its exceptions; the resolved metric is discarded here.
        self._resolve_metric()

    def _resolve_metric(self) -> tuple[str, int]:
        """Map ``metric`` and ``p`` to an effective metric name and integer code.

        Returns
        -------
        tuple[str, int]
            ``("euclidean", 0)`` or ``("manhattan", 1)``.

        Raises
        ------
        ValueError
            If an explicitly named metric is combined with a conflicting ``p``.
        NotImplementedError
            If the metric, or the Minkowski ``p``, is not supported.
        """
        if self.metric == "euclidean":
            if self.p not in (2, 2.0):
                raise ValueError("p is only used with metric='minkowski'")
            return "euclidean", 0
        if self.metric == "manhattan":
            # ``p=2`` is the constructor default and therefore cannot be told
            # apart from "p was not passed". Rejecting it would make
            # ``KNeighborsClassifier(metric="manhattan")`` unusable unless the
            # caller also passed ``p=1``, so only other values are rejected.
            if self.p not in (1, 1.0, 2, 2.0):
                raise ValueError("p is only used with metric='minkowski'")
            return "manhattan", 1
        if self.metric == "minkowski":
            if self.p in (2, 2.0):
                return "euclidean", 0
            if self.p in (1, 1.0):
                return "manhattan", 1
            raise NotImplementedError(
                "KNeighborsClassifier currently supports Minkowski p=1 or p=2"
            )
        raise NotImplementedError(
            "KNeighborsClassifier currently supports metric='minkowski', "
            "'euclidean', or 'manhattan'"
        )

    def _weights_code(self) -> int:
        """Return the integer weighting code: 0 for "uniform", 1 otherwise."""
        return 0 if self.weights == "uniform" else 1

    def _validate_neighbor_count(
        self, n_neighbors: int | None, *, training: bool
    ) -> int:
        """Check fitted state and return the neighbor count to use as ``int``.

        Parameters
        ----------
        n_neighbors:
            Requested count; ``None`` falls back to ``self.n_neighbors``.
        training:
            Whether the query is the training set itself. The allowed
            maximum is then one less than the number of fitted samples.

        Raises
        ------
        ValueError
            If the count is not a positive integer or exceeds the maximum.
        """
        check_is_fitted(self, ("_fit_X", "_fit_norms", "_y_encoded", "classes_"))
        k = self.n_neighbors if n_neighbors is None else n_neighbors
        if not self._is_positive_integer(k):
            raise ValueError("n_neighbors must be a positive integer")
        maximum = self.n_samples_fit_ - int(training)
        if int(k) > maximum:
            # NOTE: for training queries the number printed as n_samples_fit
            # is the reduced maximum, not the raw fitted sample count.
            raise ValueError(
                f"Expected n_neighbors <= n_samples_fit, but n_neighbors = {int(k)}, "
                f"n_samples_fit = {maximum}"
            )
        return int(k)

    def _validate_X(self, X: Any) -> NDArray[np.float64]:
        """Validate query data and return it as a C-contiguous float64 array.

        Validation runs with ``reset=False`` and ``ensure_all_finite=True``
        (presumably checking against the fitted feature count and rejecting
        NaN/infinity; see ``validate_data`` for the exact contract).
        """
        array = validate_data(
            self,
            X,
            reset=False,
            dtype=np.float64,
            order="C",
            ensure_all_finite=True,
        )
        return np.ascontiguousarray(array, dtype=np.float64)

    def fit(self, X: Any, y: Any) -> KNeighborsClassifier:
        """Store the training set and encoded target labels.

        Parameters
        ----------
        X:
            Training data, validated and stored as C-contiguous float64.
        y:
            Class labels. A single-column 2-D target is flattened with a
            ``DataConversionWarning``.

        Returns
        -------
        KNeighborsClassifier
            The fitted estimator itself.

        Raises
        ------
        ValueError
            If ``y`` is ``None``, contains non-integral floating-point
            values, or holds fewer than two distinct classes; also for
            invalid parameters (see ``_validate_params``).
        NotImplementedError
            For unsupported parameter values (see ``_validate_params``).
        """
        self._validate_params()
        if y is None:
            raise ValueError(
                "KNeighborsClassifier requires y to be passed, but the target y is None"
            )
        target = np.asarray(y)
        if target.ndim == 2 and target.shape[1] == 1:
            # stacklevel=2 attributes the warning to the caller of fit().
            warnings.warn(
                "A column-vector y was passed when a 1d array was expected.",
                DataConversionWarning,
                stacklevel=2,
            )
            y = target.ravel()
        X_array, y_array = validate_data(
            self,
            X,
            y,
            reset=True,
            dtype=np.float64,
            order="C",
            ensure_all_finite=True,
        )
        # Float targets are accepted only when every value is integral.
        if y_array.dtype.kind in "fc" and np.any(y_array != np.floor(y_array)):
            raise ValueError("Unknown label type: continuous")
        validate_labels(y_array)
        encoder = LabelEncoder()
        labels = encoder.fit_transform(y_array)
        classes = encoder.classes_
        if classes.size < 2:
            # Checked before touching any fitted attribute so that a failed
            # refit does not leave new classes_ next to stale training data.
            raise ValueError(
                "KNeighborsClassifier requires at least two classes; got 1 class"
            )
        metric_name, metric_code = self._resolve_metric()
        self.classes_ = classes
        self._fit_X = np.ascontiguousarray(X_array, dtype=np.float64)
        self._y_encoded = np.ascontiguousarray(labels, dtype=np.int64)
        # Row norms are precomputed only for the Euclidean code path; the
        # Manhattan path receives an empty placeholder array instead.
        self._fit_norms = (
            _core.knn_row_norms(self._fit_X)
            if metric_code == 0
            else np.asarray([], dtype=np.float64)
        )
        self.n_samples_fit_ = self._fit_X.shape[0]
        self.effective_metric_ = metric_name
        self.effective_metric_params_ = (
            {} if self.metric_params is None else dict(self.metric_params)
        )
        self._metric_code = metric_code
        return self

    def kneighbors(
        self,
        X: Any = None,
        n_neighbors: int | None = None,
        return_distance: bool = True,
    ) -> tuple[NDArray[np.float64], NDArray[np.int64]] | NDArray[np.int64]:
        """Return nearest-neighbor distances and indices.

        Parameters
        ----------
        X:
            Query samples. ``None`` queries the training set itself; the
            native routine is told so through its last argument (presumably
            to exclude each sample from its own neighbors - confirm in
            ``_core``).
        n_neighbors:
            Number of neighbors; ``None`` uses the constructor value.
        return_distance:
            When false, only the index array is returned.

        Returns
        -------
        ``(distances, indices)`` or just ``indices``.
        """
        training_query = X is None
        k = self._validate_neighbor_count(n_neighbors, training=training_query)
        query = self._fit_X if training_query else self._validate_X(X)
        distances, indices = _core.knn_kneighbors(
            query,
            self._fit_X,
            self._fit_norms,
            k,
            self._metric_code,
            training_query,
        )
        if return_distance:
            return distances, indices
        return indices

    def predict_proba(self, X: Any) -> NDArray[np.float64]:
        """Return class probabilities for query samples.

        The fitted-state check happens inside ``_validate_neighbor_count``.
        """
        k = self._validate_neighbor_count(None, training=False)
        query = self._validate_X(X)
        return _core.knn_predict_proba(
            query,
            self._fit_X,
            self._fit_norms,
            self._y_encoded,
            k,
            self.classes_.size,
            self._metric_code,
            self._weights_code(),
        )

    def predict(self, X: Any) -> NDArray[Any]:
        """Predict class labels for query samples.

        The native routine returns one index into ``classes_`` per query
        sample; those indices are mapped back to the original labels here.
        The fitted-state check happens inside ``_validate_neighbor_count``.
        """
        k = self._validate_neighbor_count(None, training=False)
        query = self._validate_X(X)
        indices = _core.knn_predict(
            query,
            self._fit_X,
            self._fit_norms,
            self._y_encoded,
            k,
            self.classes_.size,
            self._metric_code,
            self._weights_code(),
        )
        return self.classes_[indices]
|
|
@@ -1,6 +1,7 @@
|
|
|
1
1
|
"""Preprocessing estimators."""
|
|
2
2
|
|
|
3
3
|
from ._label_encoder import LabelEncoder
|
|
4
|
+
from ._maxabs_scaler import MaxAbsScaler
|
|
4
5
|
from ._minmax_scaler import MinMaxScaler
|
|
5
6
|
from ._normalizer import Normalizer
|
|
6
7
|
from ._one_hot_encoder import OneHotEncoder
|
|
@@ -10,6 +11,7 @@ from ._standard_scaler import StandardScaler
|
|
|
10
11
|
|
|
11
12
|
__all__ = [
|
|
12
13
|
"LabelEncoder",
|
|
14
|
+
"MaxAbsScaler",
|
|
13
15
|
"MinMaxScaler",
|
|
14
16
|
"Normalizer",
|
|
15
17
|
"OneHotEncoder",
|