scratchkit 0.2.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- mlscratch/__init__.py +56 -0
- mlscratch/__main__.py +118 -0
- mlscratch/bayesian/__init__.py +53 -0
- mlscratch/bayesian/bayesian_linear_regression.py +171 -0
- mlscratch/bayesian/bayesian_network.py +248 -0
- mlscratch/bayesian/bayesian_nn.py +315 -0
- mlscratch/bayesian/gaussian_process.py +207 -0
- mlscratch/bayesian/hmm.py +277 -0
- mlscratch/bayesian/init.py +52 -0
- mlscratch/bayesian/kalman_filter.py +182 -0
- mlscratch/bayesian/naive_bayes.py +209 -0
- mlscratch/metrics/__init__.py +59 -0
- mlscratch/metrics/classification.py +365 -0
- mlscratch/metrics/regression.py +79 -0
- mlscratch/neural/__init__.py +121 -0
- mlscratch/neural/attention.py +420 -0
- mlscratch/neural/autoencoder.py +543 -0
- mlscratch/neural/boltzmann.py +231 -0
- mlscratch/neural/cnn.py +593 -0
- mlscratch/neural/cvnn.py +322 -0
- mlscratch/neural/gan.py +364 -0
- mlscratch/neural/hopfield.py +193 -0
- mlscratch/neural/perceptron.py +398 -0
- mlscratch/neural/rbf_network.py +230 -0
- mlscratch/neural/recurrent.py +569 -0
- mlscratch/preprocessing/__init__.py +38 -0
- mlscratch/preprocessing/encoders.py +140 -0
- mlscratch/preprocessing/model_selection.py +119 -0
- mlscratch/preprocessing/polynomial.py +105 -0
- mlscratch/preprocessing/scalers.py +220 -0
- mlscratch/py.typed +0 -0
- mlscratch/reinforcement/__init__.py +59 -0
- mlscratch/reinforcement/ddpg.py +363 -0
- mlscratch/reinforcement/dqn.py +319 -0
- mlscratch/reinforcement/ppo.py +452 -0
- mlscratch/reinforcement/q_learning.py +352 -0
- mlscratch/reinforcement/sac.py +382 -0
- mlscratch/reinforcement/utils.py +594 -0
- mlscratch/supervised/__init__.py +76 -0
- mlscratch/supervised/_validation.py +50 -0
- mlscratch/supervised/adaboost.py +255 -0
- mlscratch/supervised/decision_tree.py +495 -0
- mlscratch/supervised/gradient_boosting.py +354 -0
- mlscratch/supervised/knn.py +234 -0
- mlscratch/supervised/lasso_regression.py +125 -0
- mlscratch/supervised/linear_models.py +459 -0
- mlscratch/supervised/linear_regression.py +197 -0
- mlscratch/supervised/logistic_regression.py +119 -0
- mlscratch/supervised/naive_bayes.py +113 -0
- mlscratch/supervised/random_forest.py +321 -0
- mlscratch/supervised/ridge_regression.py +93 -0
- mlscratch/supervised/svm.py +356 -0
- mlscratch/unsupervised/__init__.py +39 -0
- mlscratch/unsupervised/apriori.py +178 -0
- mlscratch/unsupervised/dbscan.py +141 -0
- mlscratch/unsupervised/gmm.py +204 -0
- mlscratch/unsupervised/hierarchical_clustering.py +137 -0
- mlscratch/unsupervised/ica.py +167 -0
- mlscratch/unsupervised/kmeans.py +135 -0
- mlscratch/unsupervised/kmedoids.py +133 -0
- mlscratch/unsupervised/pca.py +103 -0
- mlscratch/unsupervised/tsne.py +200 -0
- scratchkit-0.2.0.dist-info/METADATA +241 -0
- scratchkit-0.2.0.dist-info/RECORD +68 -0
- scratchkit-0.2.0.dist-info/WHEEL +5 -0
- scratchkit-0.2.0.dist-info/entry_points.txt +2 -0
- scratchkit-0.2.0.dist-info/licenses/LICENSE +201 -0
- scratchkit-0.2.0.dist-info/top_level.txt +1 -0
|
@@ -0,0 +1,354 @@
|
|
|
1
|
+
r"""
|
|
2
|
+
Gradient Boosting
|
|
3
|
+
==================
|
|
4
|
+
Friedman's TreeBoost: an additive ensemble of shallow regression trees,
|
|
5
|
+
each fit to the negative gradient ("pseudo-residual") of a loss
|
|
6
|
+
function evaluated at the current ensemble prediction, then shrunk by
|
|
7
|
+
a learning rate and added to the running prediction.
|
|
8
|
+
|
|
9
|
+
GradientBoostingRegressor
|
|
10
|
+
--------------------------
|
|
11
|
+
``loss='squared_error'``:
|
|
12
|
+
|
|
13
|
+
.. math::
|
|
14
|
+
F_m(x) = F_{m-1}(x) + \eta\, h_m(x), \qquad h_m \text{ fit to } y - F_{m-1}
|
|
15
|
+
|
|
16
|
+
A regression tree's leaf-mean is already the exact minimiser of
|
|
17
|
+
squared error, so no further leaf adjustment is needed — residual
|
|
18
|
+
fitting alone implements gradient descent in function space.
|
|
19
|
+
|
|
20
|
+
``loss='absolute_error'`` (LAD-TreeBoost):
|
|
21
|
+
:math:`h_m` is fit to :math:`\mathrm{sign}(y-F_{m-1})` to choose split
|
|
22
|
+
*structure*, then every leaf value is replaced by the **median**
|
|
23
|
+
residual of the samples routed there — the closed-form minimiser of
|
|
24
|
+
absolute error within a leaf.
|
|
25
|
+
|
|
26
|
+
GradientBoostingClassifier (binary)
|
|
27
|
+
-------------------------------------
|
|
28
|
+
Minimises binomial deviance. :math:`F_0 = \mathrm{logit}(\bar y)`.
|
|
29
|
+
At stage *m*:
|
|
30
|
+
|
|
31
|
+
.. math::
|
|
32
|
+
p_i = \sigma(F_{m-1}(x_i)), \qquad r_i = y_i - p_i
|
|
33
|
+
|
|
34
|
+
:math:`h_m` is fit to :math:`r_i` to choose split structure, then each
|
|
35
|
+
leaf is replaced by a single Newton-Raphson step (Friedman, 2001):
|
|
36
|
+
|
|
37
|
+
.. math::
|
|
38
|
+
\gamma_{\text{leaf}} = \frac{\sum_{i \in \text{leaf}} r_i}
|
|
39
|
+
{\sum_{i \in \text{leaf}} p_i(1-p_i)}
|
|
40
|
+
|
|
41
|
+
Predictions: :math:`\sigma(F_M(x)) \ge 0.5 \Rightarrow` positive class.
|
|
42
|
+
|
|
43
|
+
Design note
|
|
44
|
+
-----------
|
|
45
|
+
Both models choose tree *structure* with the plain weighted-MSE
|
|
46
|
+
criterion (cheap, already implemented by ``DecisionTreeRegressor``)
|
|
47
|
+
rather than the more elaborate "Friedman MSE" split-quality score.
|
|
48
|
+
Only the leaf *values* use the loss-specific closed-form update. This
|
|
49
|
+
is a standard, well-documented simplification that keeps the tree
|
|
50
|
+
code shared and dependency-free while still giving each loss its
|
|
51
|
+
correct, optimal leaf prediction.
|
|
52
|
+
|
|
53
|
+
Complexity
|
|
54
|
+
----------
|
|
55
|
+
- Training : O(n_estimators * n log n * d)
|
|
56
|
+
- Inference: O(n_estimators * depth)
|
|
57
|
+
"""
|
|
58
|
+
|
|
59
|
+
from __future__ import annotations
|
|
60
|
+
|
|
61
|
+
import numpy as np
|
|
62
|
+
from numpy.typing import ArrayLike, NDArray
|
|
63
|
+
|
|
64
|
+
from ._validation import validate_x, validate_xy
|
|
65
|
+
from .decision_tree import DecisionTreeRegressor, group_by_leaf
|
|
66
|
+
|
|
67
|
+
# Array type aliases used in this module's annotations.
FloatArray = NDArray[np.float64]
IntArray = NDArray[np.int64]

# Threshold below which a denominator (leaf Hessian sum, total sum of
# squares) is treated as zero to avoid dividing by a vanishing quantity.
_EPS = 1e-12
|
|
71
|
+
|
|
72
|
+
|
|
73
|
+
# ──────────────────────────────────────────────────────────────────────────
|
|
74
|
+
# Helpers
|
|
75
|
+
# ──────────────────────────────────────────────────────────────────────────
|
|
76
|
+
|
|
77
|
+
|
|
78
|
+
def _sigmoid(z: FloatArray) -> FloatArray:
|
|
79
|
+
return 1.0 / (1.0 + np.exp(-np.clip(z, -500, 500)))
|
|
80
|
+
|
|
81
|
+
|
|
82
|
+
def _newton_leaf_refit(
    tree: DecisionTreeRegressor, X: FloatArray, numerator: FloatArray, denominator: FloatArray
) -> None:
    """Replace every leaf value of ``tree`` with a Newton-step estimate.

    The rows of ``X`` are routed through ``tree``; for each leaf reached the
    new value is ``sum(numerator) / sum(denominator)`` taken over the rows
    that landed in it.  A leaf whose denominator sum does not exceed
    ``_EPS`` is set to ``0.0`` instead of dividing by (almost) zero.

    The tree is modified in place; nothing is returned.
    """
    groups = group_by_leaf(tree.apply(X))
    # Each value is unpacked as (leaf node, indices of the rows routed to it).
    for node, members in groups.values():
        rows = np.asarray(members)
        denom_sum = float(denominator[rows].sum())
        if denom_sum > _EPS:
            node.value = float(numerator[rows].sum() / denom_sum)
        else:
            node.value = 0.0
|
|
92
|
+
|
|
93
|
+
|
|
94
|
+
def _median_leaf_refit(tree: DecisionTreeRegressor, X: FloatArray, residual: FloatArray) -> None:
    """Replace every leaf value of ``tree`` with a median residual.

    The rows of ``X`` are routed through ``tree``; each leaf reached gets
    the median of ``residual`` over the rows that landed in it (the
    LAD-TreeBoost leaf update).  The tree is modified in place.
    """
    groups = group_by_leaf(tree.apply(X))
    # Each value is unpacked as (leaf node, indices of the rows routed to it).
    for node, members in groups.values():
        rows = np.asarray(members)
        node.value = float(np.median(residual[rows]))
|
|
100
|
+
|
|
101
|
+
|
|
102
|
+
def _check_common_params(n_estimators: int, learning_rate: float, subsample: float) -> None:
|
|
103
|
+
if int(n_estimators) < 1:
|
|
104
|
+
raise ValueError("n_estimators must be >= 1.")
|
|
105
|
+
if learning_rate <= 0:
|
|
106
|
+
raise ValueError("learning_rate must be positive.")
|
|
107
|
+
if not (0.0 < subsample <= 1.0):
|
|
108
|
+
raise ValueError("subsample must be in (0, 1].")
|
|
109
|
+
|
|
110
|
+
|
|
111
|
+
# ──────────────────────────────────────────────────────────────────────────
|
|
112
|
+
# GradientBoostingRegressor
|
|
113
|
+
# ──────────────────────────────────────────────────────────────────────────
|
|
114
|
+
|
|
115
|
+
|
|
116
|
+
class GradientBoostingRegressor:
    """Gradient-boosted ensemble of regression trees.

    Parameters
    ----------
    n_estimators : int, default=100
        Number of boosting stages (one tree is fitted per stage).
    learning_rate : float, default=0.1
        Shrinkage applied to every tree's contribution.
    max_depth : int, default=3
        Trees are deliberately shallow ("weak learners").
    min_samples_split : int, default=2
    min_samples_leaf : int, default=1
    subsample : float, default=1.0
        Fraction of rows (sampled without replacement) used to fit
        each tree. ``< 1.0`` gives stochastic gradient boosting.
    loss : str, default='squared_error'
        ``'squared_error'`` or ``'absolute_error'``.
    random_state : int | None, default=None
        Seed for the generator that draws the per-stage row subsample.

    Attributes
    ----------
    estimators_ : the fitted sequence of trees
    init_ : the constant initial prediction (mean of y)
    train_score_ : value of the configured ``loss`` on the full training
        set after each boosting stage (mean squared error for
        ``'squared_error'``, mean absolute error for ``'absolute_error'``)
    feature_importances_ : mean impurity-decrease importance across trees
    n_features_in_ : number of columns of ``X`` seen by ``fit``

    Raises
    ------
    ValueError
        If ``loss`` is not recognised or a shared hyper-parameter is
        rejected by ``_check_common_params``.
    """

    def __init__(
        self,
        n_estimators: int = 100,
        learning_rate: float = 0.1,
        max_depth: int = 3,
        min_samples_split: int = 2,
        min_samples_leaf: int = 1,
        subsample: float = 1.0,
        loss: str = "squared_error",
        random_state: int | None = None,
    ) -> None:
        if loss not in ("squared_error", "absolute_error"):
            raise ValueError("loss must be 'squared_error' or 'absolute_error'.")
        _check_common_params(n_estimators, learning_rate, subsample)
        self.n_estimators = int(n_estimators)
        self.learning_rate = float(learning_rate)
        self.max_depth = max_depth
        self.min_samples_split = int(min_samples_split)
        self.min_samples_leaf = int(min_samples_leaf)
        self.subsample = float(subsample)
        self.loss = loss
        self.random_state = random_state

        # Fitted state; populated by fit().
        self.estimators_: list[DecisionTreeRegressor] = []
        self.init_: float | None = None
        self.train_score_: FloatArray | None = None
        self.feature_importances_: FloatArray | None = None
        self.n_features_in_: int | None = None

    def _training_loss(self, y: FloatArray, F: FloatArray) -> float:
        """Return the configured loss of predictions ``F`` against ``y``.

        Mean absolute error for ``loss='absolute_error'``, mean squared
        error otherwise.
        """
        err = y - F
        if self.loss == "absolute_error":
            return float(np.mean(np.abs(err)))
        return float(np.mean(err**2))

    def fit(self, X: ArrayLike, y: ArrayLike) -> GradientBoostingRegressor:
        """Fit ``n_estimators`` trees sequentially and return ``self``.

        Each stage fits a tree to the residual (squared error) or to the
        sign of the residual (absolute error), optionally on a random row
        subsample, then adds its shrunken prediction to the running
        ensemble output.  Any previously fitted state is discarded.
        """
        X_arr, y_arr = validate_xy(X, y)
        y_arr = y_arr.astype(np.float64)
        n = X_arr.shape[0]
        self.n_features_in_ = X_arr.shape[1]
        rng = np.random.default_rng(self.random_state)

        # NOTE(review): Friedman's LAD-TreeBoost starts from the median of y;
        # the mean is kept for both losses so existing predictions do not change.
        self.init_ = float(np.mean(y_arr))
        F = np.full(n, self.init_, dtype=np.float64)
        self.estimators_ = []
        self.train_score_ = np.empty(self.n_estimators, dtype=np.float64)

        for m in range(self.n_estimators):
            residual = y_arr - F
            # Negative gradient of the loss: the residual itself for squared
            # error, its sign for absolute error.
            target = residual if self.loss == "squared_error" else np.sign(residual)

            # Rows used to grow this stage's tree (all rows unless subsampling).
            idx = (
                rng.choice(n, size=max(1, int(round(self.subsample * n))), replace=False)
                if self.subsample < 1.0
                else np.arange(n)
            )

            tree = DecisionTreeRegressor(
                max_depth=self.max_depth,
                min_samples_split=self.min_samples_split,
                min_samples_leaf=self.min_samples_leaf,
            )
            tree.fit(X_arr[idx], target[idx])
            if self.loss == "absolute_error":
                # The tree was grown on signs; its leaves must predict the
                # median of the actual residuals.
                _median_leaf_refit(tree, X_arr[idx], residual[idx])

            # The update is applied to every training row, not only the subsample.
            F = F + self.learning_rate * tree.predict(X_arr)
            self.estimators_.append(tree)
            # Track the loss actually being optimised (previously always MSE).
            self.train_score_[m] = self._training_loss(y_arr, F)

        self.feature_importances_ = np.mean(
            [t.feature_importances_ for t in self.estimators_], axis=0
        )
        return self

    def predict(self, X: ArrayLike) -> FloatArray:
        """Return the ensemble prediction for every row of ``X``.

        Raises
        ------
        RuntimeError
            If the model has not been fitted.
        """
        if not self.estimators_:
            raise RuntimeError("Call fit() before predict().")
        X_arr = validate_x(X)
        F = np.full(X_arr.shape[0], self.init_, dtype=np.float64)
        for tree in self.estimators_:
            F += self.learning_rate * tree.predict(X_arr)
        return F

    def staged_predict(self, X: ArrayLike):
        """Yield the running prediction after each boosting stage.

        This is a generator, so the not-fitted ``RuntimeError`` is raised
        when iteration starts rather than when the method is called.  Each
        yielded array is an independent copy.
        """
        if not self.estimators_:
            raise RuntimeError("Call fit() before staged_predict().")
        X_arr = validate_x(X)
        F = np.full(X_arr.shape[0], self.init_, dtype=np.float64)
        for tree in self.estimators_:
            F = F + self.learning_rate * tree.predict(X_arr)
            yield F.copy()

    def score(self, X: ArrayLike, y: ArrayLike) -> float:
        """Return the coefficient of determination R^2.

        Returns ``0.0`` when the total sum of squares of ``y`` does not
        exceed ``_EPS`` (constant target), where R^2 is undefined.
        """
        X_arr, y_arr = validate_xy(X, y)
        preds = self.predict(X_arr)
        ss_res = np.sum((y_arr - preds) ** 2)
        ss_tot = np.sum((y_arr - y_arr.mean()) ** 2)
        return float(1.0 - ss_res / ss_tot) if ss_tot > _EPS else 0.0
|
|
238
|
+
|
|
239
|
+
|
|
240
|
+
# ──────────────────────────────────────────────────────────────────────────
|
|
241
|
+
# GradientBoostingClassifier (binary)
|
|
242
|
+
# ──────────────────────────────────────────────────────────────────────────
|
|
243
|
+
|
|
244
|
+
|
|
245
|
+
class GradientBoostingClassifier:
    """Gradient-boosted ensemble for binary classification (binomial deviance).

    Parameters mirror :class:`GradientBoostingRegressor` minus ``loss``
    (binomial deviance is the only supported objective).

    Attributes
    ----------
    estimators_, init_, train_score_, feature_importances_ — see
        :class:`GradientBoostingRegressor`.  Here ``init_`` is the logit of
        the positive-class frequency and ``train_score_`` is the mean
        negative log-likelihood on the training set after each stage.
    classes_ : sorted unique labels seen during fit (exactly 2)
    n_features_in_ : number of columns of ``X`` seen by ``fit``
    """

    def __init__(
        self,
        n_estimators: int = 100,
        learning_rate: float = 0.1,
        max_depth: int = 3,
        min_samples_split: int = 2,
        min_samples_leaf: int = 1,
        subsample: float = 1.0,
        random_state: int | None = None,
    ) -> None:
        _check_common_params(n_estimators, learning_rate, subsample)
        self.n_estimators = int(n_estimators)
        self.learning_rate = float(learning_rate)
        self.max_depth = max_depth
        self.min_samples_split = int(min_samples_split)
        self.min_samples_leaf = int(min_samples_leaf)
        self.subsample = float(subsample)
        self.random_state = random_state

        # Fitted state; populated by fit().
        self.estimators_: list[DecisionTreeRegressor] = []
        self.init_: float | None = None
        self.train_score_: FloatArray | None = None
        self.feature_importances_: FloatArray | None = None
        self.classes_: NDArray | None = None
        self.n_features_in_: int | None = None

    def fit(self, X: ArrayLike, y: ArrayLike) -> GradientBoostingClassifier:
        """Fit the boosted ensemble on a two-class problem and return ``self``.

        Raises
        ------
        ValueError
            If ``y`` does not contain exactly two distinct labels.
        """
        X_arr, y_raw = validate_xy(X, y)
        self.classes_ = np.unique(y_raw)
        if self.classes_.size != 2:
            raise ValueError("GradientBoostingClassifier supports only binary classification.")
        # The larger of the two sorted labels is the positive class (encoded 1.0).
        y_bin = (y_raw == self.classes_[1]).astype(np.float64)

        n = X_arr.shape[0]
        self.n_features_in_ = X_arr.shape[1]
        rng = np.random.default_rng(self.random_state)

        # Start from the log-odds of the positive class; the clip keeps the
        # logit finite under extreme class imbalance.
        p0 = float(np.clip(y_bin.mean(), 1e-6, 1.0 - 1e-6))
        self.init_ = float(np.log(p0 / (1.0 - p0)))
        F = np.full(n, self.init_, dtype=np.float64)
        self.estimators_ = []
        self.train_score_ = np.empty(self.n_estimators, dtype=np.float64)

        for m in range(self.n_estimators):
            p = _sigmoid(F)
            residual = y_bin - p  # negative gradient of the deviance
            denom = p * (1.0 - p)  # second derivative, used by the Newton step

            # Rows used to grow this stage's tree (all rows unless subsampling).
            idx = (
                rng.choice(n, size=max(1, int(round(self.subsample * n))), replace=False)
                if self.subsample < 1.0
                else np.arange(n)
            )

            tree = DecisionTreeRegressor(
                max_depth=self.max_depth,
                min_samples_split=self.min_samples_split,
                min_samples_leaf=self.min_samples_leaf,
            )
            tree.fit(X_arr[idx], residual[idx])
            # Keep the split structure, replace leaf means with Newton steps.
            _newton_leaf_refit(tree, X_arr[idx], residual[idx], denom[idx])

            F = F + self.learning_rate * tree.predict(X_arr)
            self.estimators_.append(tree)

            # Clip so that log() never sees exactly 0 or 1.
            p_now = np.clip(_sigmoid(F), 1e-12, 1.0 - 1e-12)
            self.train_score_[m] = float(
                -np.mean(y_bin * np.log(p_now) + (1 - y_bin) * np.log(1 - p_now))
            )

        self.feature_importances_ = np.mean(
            [t.feature_importances_ for t in self.estimators_], axis=0
        )
        return self

    def decision_function(self, X: ArrayLike) -> FloatArray:
        """Return the raw (pre-sigmoid) ensemble score.

        Raises
        ------
        RuntimeError
            If the model has not been fitted.
        """
        if not self.estimators_:
            raise RuntimeError("Call fit() before decision_function().")
        X_arr = validate_x(X)
        F = np.full(X_arr.shape[0], self.init_, dtype=np.float64)
        for tree in self.estimators_:
            F += self.learning_rate * tree.predict(X_arr)
        return F

    def predict_proba(self, X: ArrayLike) -> FloatArray:
        """Return class probabilities, columns ordered as ``classes_``."""
        p1 = _sigmoid(self.decision_function(X))
        return np.column_stack([1.0 - p1, p1])

    def predict(self, X: ArrayLike) -> NDArray:
        """Return the predicted label for every row of ``X``.

        A row is assigned the positive class (``classes_[1]``) when its
        positive-class probability is at least 0.5, otherwise
        ``classes_[0]``.
        """
        p1 = self.predict_proba(X)[:, 1]
        # An explicit ``>=`` threshold (rather than argmax over the two
        # columns) sends an exact 0.5 tie to the positive class.
        return self.classes_[(p1 >= 0.5).astype(np.intp)]

    def score(self, X: ArrayLike, y: ArrayLike) -> float:
        """Return the classification accuracy on ``(X, y)``."""
        X_arr, y_arr = validate_xy(X, y)
        return float(np.mean(self.predict(X_arr) == y_arr))
|
|
@@ -0,0 +1,234 @@
|
|
|
1
|
+
"""
|
|
2
|
+
K-Nearest Neighbours (KNN)
|
|
3
|
+
===========================
|
|
4
|
+
Instance-based (lazy) learning — no training phase; prediction queries
|
|
5
|
+
the k most similar training samples.
|
|
6
|
+
|
|
7
|
+
Distance metrics supported: euclidean, manhattan, minkowski
|
|
8
|
+
Weighting: 'uniform' (vote equally) or 'distance' (weight by 1/d)
|
|
9
|
+
|
|
10
|
+
KNeighboursClassifier — majority-vote (weighted) classification
|
|
11
|
+
KNeighboursRegressor — mean (weighted) of k nearest target values
|
|
12
|
+
|
|
13
|
+
Time complexity: O(n·d) per prediction (brute-force).
|
|
14
|
+
Only numpy and Python stdlib are used.
|
|
15
|
+
"""
|
|
16
|
+
|
|
17
|
+
from __future__ import annotations
|
|
18
|
+
|
|
19
|
+
from collections import Counter
|
|
20
|
+
|
|
21
|
+
import numpy as np
|
|
22
|
+
|
|
23
|
+
|
|
24
|
+
def _validate_x(X: np.ndarray) -> np.ndarray:
|
|
25
|
+
X_arr = np.asarray(X, dtype=float)
|
|
26
|
+
if X_arr.ndim != 2:
|
|
27
|
+
raise ValueError("X must be a 2D array of shape (n_samples, n_features).")
|
|
28
|
+
return X_arr
|
|
29
|
+
|
|
30
|
+
|
|
31
|
+
def _validate_xy(X: np.ndarray, y: np.ndarray) -> tuple[np.ndarray, np.ndarray]:
    """Validate a feature matrix together with its targets.

    ``X`` goes through ``_validate_x``; ``y`` is converted to an array and
    flattened to one dimension.

    Raises
    ------
    ValueError
        If ``X`` is not 2-D, or if the number of rows of ``X`` differs from
        the number of elements of the flattened ``y``.
    """
    features = _validate_x(X)
    targets = np.asarray(y).flatten()
    n_rows, n_targets = features.shape[0], targets.shape[0]
    if n_rows != n_targets:
        raise ValueError(f"X has {n_rows} samples but y has {n_targets}.")
    return features, targets
|
|
37
|
+
|
|
38
|
+
|
|
39
|
+
def _pairwise_distances(
|
|
40
|
+
X: np.ndarray,
|
|
41
|
+
Y: np.ndarray,
|
|
42
|
+
metric: str = "euclidean",
|
|
43
|
+
p: float = 2.0,
|
|
44
|
+
) -> np.ndarray:
|
|
45
|
+
"""Return (n_X, n_Y) distance matrix."""
|
|
46
|
+
if metric == "euclidean":
|
|
47
|
+
# ||x-y||² = ||x||² + ||y||² - 2 x·y
|
|
48
|
+
sq_X = np.sum(X**2, axis=1, keepdims=True)
|
|
49
|
+
sq_Y = np.sum(Y**2, axis=1, keepdims=True)
|
|
50
|
+
dist2 = sq_X + sq_Y.T - 2 * X @ Y.T
|
|
51
|
+
return np.sqrt(np.maximum(dist2, 0.0))
|
|
52
|
+
elif metric == "manhattan":
|
|
53
|
+
return np.sum(np.abs(X[:, np.newaxis, :] - Y[np.newaxis, :, :]), axis=2)
|
|
54
|
+
elif metric == "minkowski":
|
|
55
|
+
return np.sum(np.abs(X[:, np.newaxis, :] - Y[np.newaxis, :, :]) ** p, axis=2) ** (1.0 / p)
|
|
56
|
+
else:
|
|
57
|
+
raise ValueError(f"Unknown metric '{metric}'. Choose euclidean, manhattan, minkowski.")
|
|
58
|
+
|
|
59
|
+
|
|
60
|
+
class KNeighboursClassifier:
    """
    K-Nearest Neighbours classifier.

    Parameters
    ----------
    n_neighbors : int   number of neighbours consulted per query (>= 1)
    weights : str   'uniform' | 'distance'
    metric  : str   'euclidean' | 'manhattan' | 'minkowski'
    p       : float Minkowski order (only when metric='minkowski')

    Raises
    ------
    ValueError  if ``weights`` is not recognised or ``n_neighbors < 1``.
    """

    def __init__(
        self,
        n_neighbors: int = 5,
        weights: str = "uniform",
        metric: str = "euclidean",
        p: float = 2.0,
    ) -> None:
        if weights not in {"uniform", "distance"}:
            raise ValueError("weights must be 'uniform' or 'distance'.")
        # Fail fast: with zero neighbours the vote below has nothing to
        # count and would fail with an unrelated IndexError / max() error.
        if n_neighbors < 1:
            raise ValueError("n_neighbors must be >= 1.")
        self.n_neighbors = n_neighbors
        self.weights = weights
        self.metric = metric
        self.p = p
        # Training data memorised by fit(); None until then.
        self._X_train: np.ndarray | None = None
        self._y_train: np.ndarray | None = None
        self.classes_: np.ndarray | None = None

    def fit(self, X: np.ndarray, y: np.ndarray) -> KNeighboursClassifier:
        """Store the training set (lazy learner) and return ``self``."""
        X_arr, y_arr = _validate_xy(X, y)
        self._X_train = X_arr
        self._y_train = y_arr
        self.classes_ = np.unique(y_arr)
        return self

    def _get_knn(self, X: np.ndarray) -> tuple[np.ndarray, np.ndarray]:
        """Return (indices, distances) of k nearest neighbours for each row of X.

        Both arrays have one row per query and are ordered nearest-first.
        ``k`` is capped at the number of stored training samples.

        Raises
        ------
        RuntimeError  if called before ``fit``.
        """
        if self._X_train is None:
            raise RuntimeError("Call fit() before predict().")
        D = _pairwise_distances(X, self._X_train, self.metric, self.p)
        # Partition (faster than full sort for large n)
        k = min(self.n_neighbors, len(self._X_train))
        idx = np.argpartition(D, k - 1, axis=1)[:, :k]
        rows = np.arange(len(X))[:, np.newaxis]
        dists = D[rows, idx]
        # Sort within the k neighbours
        order = np.argsort(dists, axis=1)
        idx = idx[rows, order]
        dists = dists[rows, order]
        return idx, dists

    def predict(self, X: np.ndarray) -> np.ndarray:
        """Return the predicted label for each row of ``X``.

        'uniform' takes a plain majority vote among the k neighbours.
        'distance' weights each vote by ``1/d``; when one or more
        neighbours are at distance exactly 0, only those exact matches vote.
        """
        X_arr = _validate_x(X)
        idx, dists = self._get_knn(X_arr)
        neighbours = self._y_train[idx]  # (n_test, k)

        if self.weights == "uniform":
            preds = [Counter(row).most_common(1)[0][0] for row in neighbours]
        else:
            preds = []
            for labels, d in zip(neighbours, dists, strict=True):
                exact = d == 0
                # Avoid division by zero for exact matches
                if np.any(exact):
                    preds.append(Counter(labels[exact]).most_common(1)[0][0])
                else:
                    vote: dict = {}
                    for cls, wi in zip(labels, 1.0 / d, strict=True):
                        vote[cls] = vote.get(cls, 0.0) + wi
                    preds.append(max(vote, key=vote.get))
        return np.array(preds)

    def predict_proba(self, X: np.ndarray) -> np.ndarray:
        """Return per-class vote shares, columns ordered as ``classes_``.

        Votes are accumulated exactly as in ``predict`` (counts for
        'uniform', ``1/d`` weights or exact-match counts for 'distance')
        and each row is then normalised to sum to 1.
        """
        X_arr = _validate_x(X)
        idx, dists = self._get_knn(X_arr)
        n_test = len(X_arr)
        n_cls = len(self.classes_)
        cls_idx = {c: i for i, c in enumerate(self.classes_)}
        proba = np.zeros((n_test, n_cls))

        for i in range(n_test):
            labels = self._y_train[idx[i]]
            if self.weights == "uniform":
                for lbl in labels:
                    proba[i, cls_idx[lbl]] += 1.0
            else:
                d = dists[i]
                if np.any(d == 0):
                    for lbl in labels[d == 0]:
                        proba[i, cls_idx[lbl]] += 1.0
                else:
                    for lbl, wi in zip(labels, 1.0 / d, strict=True):
                        proba[i, cls_idx[lbl]] += wi
            # Normalise by the exact total so each row sums to 1; a row with
            # no positive vote mass is left untouched instead of dividing by 0.
            total = proba[i].sum()
            if total > 0:
                proba[i] /= total

        return proba

    def score(self, X: np.ndarray, y: np.ndarray) -> float:
        """Return classification accuracy on the given data."""
        X_arr, y_arr = _validate_xy(X, y)
        return float(np.mean(self.predict(X_arr) == y_arr))
|
|
163
|
+
|
|
164
|
+
|
|
165
|
+
class KNeighboursRegressor:
    """
    K-Nearest Neighbours regressor.

    Parameters
    ----------
    n_neighbors : int   number of neighbours averaged per query (>= 1)
    weights : str   'uniform' | 'distance'
    metric  : str   'euclidean' | 'manhattan' | 'minkowski'
    p       : float Minkowski order

    Raises
    ------
    ValueError  if ``weights`` is not recognised or ``n_neighbors < 1``.
    """

    def __init__(
        self,
        n_neighbors: int = 5,
        weights: str = "uniform",
        metric: str = "euclidean",
        p: float = 2.0,
    ) -> None:
        if weights not in {"uniform", "distance"}:
            raise ValueError("weights must be 'uniform' or 'distance'.")
        # Fail fast: averaging over zero neighbours would yield NaN.
        if n_neighbors < 1:
            raise ValueError("n_neighbors must be >= 1.")
        self.n_neighbors = n_neighbors
        self.weights = weights
        self.metric = metric
        self.p = p
        # Training data memorised by fit(); None until then.
        self._X_train: np.ndarray | None = None
        self._y_train: np.ndarray | None = None

    def fit(self, X: np.ndarray, y: np.ndarray) -> KNeighboursRegressor:
        """Store the training set (targets cast to float) and return ``self``."""
        X_arr, y_arr = _validate_xy(X, y)
        self._X_train = X_arr
        self._y_train = y_arr.astype(float)
        return self

    def predict(self, X: np.ndarray) -> np.ndarray:
        """Return the (weighted) mean target of the k nearest neighbours.

        'uniform' averages the neighbours' targets.  'distance' weights
        them by ``1/d``; when one or more neighbours are at distance exactly
        0, the mean of those exact matches is returned instead.

        Raises
        ------
        RuntimeError  if called before ``fit``.
        """
        if self._X_train is None:
            raise RuntimeError("Call fit() before predict().")
        X_arr = _validate_x(X)
        # Borrow the classifier's neighbour search on the stored training data.
        clf = KNeighboursClassifier(self.n_neighbors, self.weights, self.metric, self.p)
        clf._X_train = self._X_train
        clf._y_train = self._y_train
        idx, dists = clf._get_knn(X_arr)
        neighbours = self._y_train[idx]

        if self.weights == "uniform":
            return neighbours.mean(axis=1)

        preds = np.zeros(len(X_arr))
        for i, (targets, d) in enumerate(zip(neighbours, dists, strict=True)):
            exact = d == 0
            if np.any(exact):
                # Exact matches would get infinite weight; average them alone.
                preds[i] = targets[exact].mean()
            else:
                w = 1.0 / d
                preds[i] = np.dot(w, targets) / w.sum()
        return preds

    def score(self, X: np.ndarray, y: np.ndarray) -> float:
        """Return the coefficient of determination R^2 of the prediction.

        Returns ``0.0`` when the total sum of squares of ``y`` is at most
        ``1e-12`` (constant target), where R^2 is undefined.
        """
        X_arr, y_arr = _validate_xy(X, y)
        preds = self.predict(X_arr)
        y_arr = y_arr.astype(float)
        ss_res = float(np.sum((y_arr - preds) ** 2))
        ss_tot = float(np.sum((y_arr - y_arr.mean()) ** 2))
        return 1.0 - ss_res / ss_tot if ss_tot > 1e-12 else 0.0
|
|
230
|
+
|
|
231
|
+
|
|
232
|
+
# American-spelling aliases (sklearn-style) for ergonomics.  Each alias is
# bound to the very same class object as its British-spelling name.
KNeighborsClassifier = KNeighboursClassifier
KNeighborsRegressor = KNeighboursRegressor
|