scratchkit 0.2.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (68) hide show
  1. mlscratch/__init__.py +56 -0
  2. mlscratch/__main__.py +118 -0
  3. mlscratch/bayesian/__init__.py +53 -0
  4. mlscratch/bayesian/bayesian_linear_regression.py +171 -0
  5. mlscratch/bayesian/bayesian_network.py +248 -0
  6. mlscratch/bayesian/bayesian_nn.py +315 -0
  7. mlscratch/bayesian/gaussian_process.py +207 -0
  8. mlscratch/bayesian/hmm.py +277 -0
  9. mlscratch/bayesian/init.py +52 -0
  10. mlscratch/bayesian/kalman_filter.py +182 -0
  11. mlscratch/bayesian/naive_bayes.py +209 -0
  12. mlscratch/metrics/__init__.py +59 -0
  13. mlscratch/metrics/classification.py +365 -0
  14. mlscratch/metrics/regression.py +79 -0
  15. mlscratch/neural/__init__.py +121 -0
  16. mlscratch/neural/attention.py +420 -0
  17. mlscratch/neural/autoencoder.py +543 -0
  18. mlscratch/neural/boltzmann.py +231 -0
  19. mlscratch/neural/cnn.py +593 -0
  20. mlscratch/neural/cvnn.py +322 -0
  21. mlscratch/neural/gan.py +364 -0
  22. mlscratch/neural/hopfield.py +193 -0
  23. mlscratch/neural/perceptron.py +398 -0
  24. mlscratch/neural/rbf_network.py +230 -0
  25. mlscratch/neural/recurrent.py +569 -0
  26. mlscratch/preprocessing/__init__.py +38 -0
  27. mlscratch/preprocessing/encoders.py +140 -0
  28. mlscratch/preprocessing/model_selection.py +119 -0
  29. mlscratch/preprocessing/polynomial.py +105 -0
  30. mlscratch/preprocessing/scalers.py +220 -0
  31. mlscratch/py.typed +0 -0
  32. mlscratch/reinforcement/__init__.py +59 -0
  33. mlscratch/reinforcement/ddpg.py +363 -0
  34. mlscratch/reinforcement/dqn.py +319 -0
  35. mlscratch/reinforcement/ppo.py +452 -0
  36. mlscratch/reinforcement/q_learning.py +352 -0
  37. mlscratch/reinforcement/sac.py +382 -0
  38. mlscratch/reinforcement/utils.py +594 -0
  39. mlscratch/supervised/__init__.py +76 -0
  40. mlscratch/supervised/_validation.py +50 -0
  41. mlscratch/supervised/adaboost.py +255 -0
  42. mlscratch/supervised/decision_tree.py +495 -0
  43. mlscratch/supervised/gradient_boosting.py +354 -0
  44. mlscratch/supervised/knn.py +234 -0
  45. mlscratch/supervised/lasso_regression.py +125 -0
  46. mlscratch/supervised/linear_models.py +459 -0
  47. mlscratch/supervised/linear_regression.py +197 -0
  48. mlscratch/supervised/logistic_regression.py +119 -0
  49. mlscratch/supervised/naive_bayes.py +113 -0
  50. mlscratch/supervised/random_forest.py +321 -0
  51. mlscratch/supervised/ridge_regression.py +93 -0
  52. mlscratch/supervised/svm.py +356 -0
  53. mlscratch/unsupervised/__init__.py +39 -0
  54. mlscratch/unsupervised/apriori.py +178 -0
  55. mlscratch/unsupervised/dbscan.py +141 -0
  56. mlscratch/unsupervised/gmm.py +204 -0
  57. mlscratch/unsupervised/hierarchical_clustering.py +137 -0
  58. mlscratch/unsupervised/ica.py +167 -0
  59. mlscratch/unsupervised/kmeans.py +135 -0
  60. mlscratch/unsupervised/kmedoids.py +133 -0
  61. mlscratch/unsupervised/pca.py +103 -0
  62. mlscratch/unsupervised/tsne.py +200 -0
  63. scratchkit-0.2.0.dist-info/METADATA +241 -0
  64. scratchkit-0.2.0.dist-info/RECORD +68 -0
  65. scratchkit-0.2.0.dist-info/WHEEL +5 -0
  66. scratchkit-0.2.0.dist-info/entry_points.txt +2 -0
  67. scratchkit-0.2.0.dist-info/licenses/LICENSE +201 -0
  68. scratchkit-0.2.0.dist-info/top_level.txt +1 -0
@@ -0,0 +1,354 @@
1
+ r"""
2
+ Gradient Boosting
3
+ ==================
4
+ Friedman's TreeBoost: an additive ensemble of shallow regression trees,
5
+ each fit to the negative gradient ("pseudo-residual") of a loss
6
+ function evaluated at the current ensemble prediction, then shrunk by
7
+ a learning rate and added to the running prediction.
8
+
9
+ GradientBoostingRegressor
10
+ --------------------------
11
+ ``loss='squared_error'``:
12
+
13
+ .. math::
14
+ F_m(x) = F_{m-1}(x) + \eta\, h_m(x), \qquad h_m \text{ fit to } y - F_{m-1}
15
+
16
+ A regression tree's leaf-mean is already the exact minimiser of
17
+ squared error, so no further leaf adjustment is needed — residual
18
+ fitting alone implements gradient descent in function space.
19
+
20
+ ``loss='absolute_error'`` (LAD-TreeBoost):
21
+ :math:`h_m` is fit to :math:`\mathrm{sign}(y-F_{m-1})` to choose split
22
+ *structure*, then every leaf value is replaced by the **median**
23
+ residual of the samples routed there — the closed-form minimiser of
24
+ absolute error within a leaf.
25
+
26
+ GradientBoostingClassifier (binary)
27
+ -------------------------------------
28
+ Minimises binomial deviance. :math:`F_0 = \mathrm{logit}(\bar y)`.
29
+ At stage *m*:
30
+
31
+ .. math::
32
+ p_i = \sigma(F_{m-1}(x_i)), \qquad r_i = y_i - p_i
33
+
34
+ :math:`h_m` is fit to :math:`r_i` to choose split structure, then each
35
+ leaf is replaced by a single Newton-Raphson step (Friedman, 2001):
36
+
37
+ .. math::
38
+ \gamma_{\text{leaf}} = \frac{\sum_{i \in \text{leaf}} r_i}
39
+ {\sum_{i \in \text{leaf}} p_i(1-p_i)}
40
+
41
+ Predictions: :math:`\sigma(F_M(x)) \ge 0.5 \Rightarrow` positive class.
42
+
43
+ Design note
44
+ -----------
45
+ Both models choose tree *structure* with the plain weighted-MSE
46
+ criterion (cheap, already implemented by ``DecisionTreeRegressor``)
47
+ rather than the more elaborate "Friedman MSE" split-quality score.
48
+ Only the leaf *values* use the loss-specific closed-form update. This
49
+ is a standard, well-documented simplification that keeps the tree
50
+ code shared and dependency-free while still giving each loss its
51
+ correct, optimal leaf prediction.
52
+
53
+ Complexity
54
+ ----------
55
+ - Training : O(n_estimators * n log n * d)
56
+ - Inference: O(n_estimators * depth)
57
+ """
58
+
59
+ from __future__ import annotations
60
+
61
+ import numpy as np
62
+ from numpy.typing import ArrayLike, NDArray
63
+
64
+ from ._validation import validate_x, validate_xy
65
+ from .decision_tree import DecisionTreeRegressor, group_by_leaf
66
+
67
# Array type aliases used in the annotations of this module.
FloatArray = NDArray[np.float64]
IntArray = NDArray[np.int64]

# Threshold below which a denominator (leaf Hessian sum, total sum of
# squares) is treated as zero to avoid dividing by a vanishing value.
_EPS = 1e-12
71
+
72
+
73
+ # ──────────────────────────────────────────────────────────────────────────
74
+ # Helpers
75
+ # ──────────────────────────────────────────────────────────────────────────
76
+
77
+
78
def _sigmoid(z: FloatArray) -> FloatArray:
    """Return the logistic function of *z*, element-wise.

    The argument is clamped to ``[-500, 500]`` before exponentiation so
    that ``np.exp`` cannot overflow for extreme scores.
    """
    bounded = np.clip(z, -500, 500)
    return 1.0 / (1.0 + np.exp(-bounded))
80
+
81
+
82
def _newton_leaf_refit(
    tree: DecisionTreeRegressor, X: FloatArray, numerator: FloatArray, denominator: FloatArray
) -> None:
    """Replace every leaf value of *tree* with a Newton-step estimate.

    For each leaf reached by the rows of *X*, the new value is
    ``sum(numerator) / sum(denominator)`` taken over the rows routed to
    that leaf.  When the denominator sum does not exceed ``_EPS`` the
    leaf is set to ``0.0`` instead of dividing.  The tree is modified in
    place; nothing is returned.
    """
    leaf_ids = tree.apply(X)
    # group_by_leaf yields (leaf node, row indices) pairs as its mapping values.
    for node, members in group_by_leaf(leaf_ids).values():
        rows = np.asarray(members)
        hessian_sum = float(denominator[rows].sum())
        if hessian_sum > _EPS:
            node.value = float(numerator[rows].sum() / hessian_sum)
        else:
            node.value = 0.0
92
+
93
+
94
def _median_leaf_refit(tree: DecisionTreeRegressor, X: FloatArray, residual: FloatArray) -> None:
    """Replace every leaf value of *tree* with a median residual.

    Each leaf reached by the rows of *X* receives the median of
    *residual* over the rows routed to it (the LAD-TreeBoost leaf
    update).  The tree is modified in place; nothing is returned.
    """
    leaf_ids = tree.apply(X)
    # group_by_leaf yields (leaf node, row indices) pairs as its mapping values.
    for node, members in group_by_leaf(leaf_ids).values():
        rows = np.asarray(members)
        node.value = float(np.median(residual[rows]))
100
+
101
+
102
def _check_common_params(n_estimators: int, learning_rate: float, subsample: float) -> None:
    """Validate the hyper-parameters shared by both boosting estimators.

    Parameters
    ----------
    n_estimators : int
        Number of boosting stages; must be at least 1.
    learning_rate : float
        Shrinkage factor; must be a finite number greater than zero.
    subsample : float
        Row-sampling fraction; must lie in ``(0, 1]``.

    Raises
    ------
    ValueError
        If any argument is outside its allowed range.
    """
    if int(n_estimators) < 1:
        raise ValueError("n_estimators must be >= 1.")
    # ``nan <= 0`` is False, so a plain sign test would let NaN (and inf)
    # through and silently poison every prediction; require finiteness too.
    if not (np.isfinite(learning_rate) and learning_rate > 0):
        raise ValueError("learning_rate must be a finite positive number.")
    if not (0.0 < subsample <= 1.0):
        raise ValueError("subsample must be in (0, 1].")
109
+
110
+
111
+ # ──────────────────────────────────────────────────────────────────────────
112
+ # GradientBoostingRegressor
113
+ # ──────────────────────────────────────────────────────────────────────────
114
+
115
+
116
class GradientBoostingRegressor:
    """Gradient-boosted ensemble of regression trees.

    Parameters
    ----------
    n_estimators : int, default=100
        Number of boosting stages (one tree per stage).
    learning_rate : float, default=0.1
        Shrinkage applied to every tree's contribution.
    max_depth : int, default=3
        Trees are deliberately shallow ("weak learners").
    min_samples_split : int, default=2
        Forwarded to every ``DecisionTreeRegressor``.
    min_samples_leaf : int, default=1
        Forwarded to every ``DecisionTreeRegressor``.
    subsample : float, default=1.0
        Fraction of rows (sampled without replacement) used to fit
        each tree.  ``< 1.0`` gives stochastic gradient boosting.
    loss : str, default='squared_error'
        ``'squared_error'`` or ``'absolute_error'``.
    random_state : int | None, default=None
        Seed for the generator that draws the row subsamples.

    Attributes
    ----------
    estimators_ : the fitted sequence of trees
    init_ : the constant initial prediction (mean of y, for both losses)
    train_score_ : mean training loss after each boosting stage, measured
        with the configured ``loss`` (MSE or MAE) on the full training set
    feature_importances_ : mean impurity-decrease importance across trees
    n_features_in_ : number of columns of X seen by ``fit``

    Raises
    ------
    ValueError
        If ``loss`` is not a supported name or a shared hyper-parameter
        is out of range.
    """

    def __init__(
        self,
        n_estimators: int = 100,
        learning_rate: float = 0.1,
        max_depth: int = 3,
        min_samples_split: int = 2,
        min_samples_leaf: int = 1,
        subsample: float = 1.0,
        loss: str = "squared_error",
        random_state: int | None = None,
    ) -> None:
        if loss not in ("squared_error", "absolute_error"):
            raise ValueError("loss must be 'squared_error' or 'absolute_error'.")
        _check_common_params(n_estimators, learning_rate, subsample)
        self.n_estimators = int(n_estimators)
        self.learning_rate = float(learning_rate)
        self.max_depth = max_depth
        self.min_samples_split = int(min_samples_split)
        self.min_samples_leaf = int(min_samples_leaf)
        self.subsample = float(subsample)
        self.loss = loss
        self.random_state = random_state

        self.estimators_: list[DecisionTreeRegressor] = []
        self.init_: float | None = None
        self.train_score_: FloatArray | None = None
        self.feature_importances_: FloatArray | None = None
        self.n_features_in_: int | None = None

    def _loss_value(self, y: FloatArray, F: FloatArray) -> float:
        """Return the mean training loss of predictions *F* against *y*.

        Uses mean absolute error when ``loss == 'absolute_error'`` and
        mean squared error otherwise, so that ``train_score_`` tracks the
        objective that is actually being minimised.
        """
        err = y - F
        if self.loss == "absolute_error":
            return float(np.mean(np.abs(err)))
        return float(np.mean(err**2))

    def fit(self, X: ArrayLike, y: ArrayLike) -> GradientBoostingRegressor:
        """Fit ``n_estimators`` trees stage by stage and return ``self``.

        Each stage fits a tree to the pseudo-residuals of the current
        ensemble (the residual itself for squared error, its sign for
        absolute error), optionally on a random row subsample, and adds
        the shrunken tree output to the running prediction.
        """
        X_arr, y_arr = validate_xy(X, y)
        y_arr = y_arr.astype(np.float64)
        n = X_arr.shape[0]
        self.n_features_in_ = X_arr.shape[1]
        rng = np.random.default_rng(self.random_state)

        use_lad = self.loss == "absolute_error"
        stochastic = self.subsample < 1.0
        # Subsample size does not change between stages; compute it once.
        n_sub = max(1, int(round(self.subsample * n)))

        self.init_ = float(np.mean(y_arr))
        F = np.full(n, self.init_, dtype=np.float64)
        self.estimators_ = []
        self.train_score_ = np.empty(self.n_estimators, dtype=np.float64)

        for m in range(self.n_estimators):
            residual = y_arr - F
            # Negative gradient: the residual for L2, its sign for L1.
            target = np.sign(residual) if use_lad else residual

            idx = rng.choice(n, size=n_sub, replace=False) if stochastic else np.arange(n)

            tree = DecisionTreeRegressor(
                max_depth=self.max_depth,
                min_samples_split=self.min_samples_split,
                min_samples_leaf=self.min_samples_leaf,
            )
            tree.fit(X_arr[idx], target[idx])
            if use_lad:
                # The sign target only picks the split structure; the leaf
                # value that minimises absolute error is the median residual.
                _median_leaf_refit(tree, X_arr[idx], residual[idx])

            F = F + self.learning_rate * tree.predict(X_arr)
            self.estimators_.append(tree)
            self.train_score_[m] = self._loss_value(y_arr, F)

        self.feature_importances_ = np.mean(
            [t.feature_importances_ for t in self.estimators_], axis=0
        )
        return self

    def predict(self, X: ArrayLike) -> FloatArray:
        """Return the ensemble prediction for every row of *X*.

        Raises
        ------
        RuntimeError
            If called before ``fit``.
        """
        if not self.estimators_:
            raise RuntimeError("Call fit() before predict().")
        X_arr = validate_x(X)
        F = np.full(X_arr.shape[0], self.init_, dtype=np.float64)
        for tree in self.estimators_:
            F += self.learning_rate * tree.predict(X_arr)
        return F

    def staged_predict(self, X: ArrayLike):
        """Yield the running prediction after each boosting stage.

        This is a generator, so the "not fitted" ``RuntimeError`` is
        raised when iteration starts rather than at call time.  Every
        yielded array is an independent copy.
        """
        if not self.estimators_:
            raise RuntimeError("Call fit() before staged_predict().")
        X_arr = validate_x(X)
        F = np.full(X_arr.shape[0], self.init_, dtype=np.float64)
        for tree in self.estimators_:
            F = F + self.learning_rate * tree.predict(X_arr)
            yield F.copy()

    def score(self, X: ArrayLike, y: ArrayLike) -> float:
        """Return the coefficient of determination R^2.

        Returns ``0.0`` when *y* has (numerically) zero variance, since
        R^2 is undefined in that case.
        """
        X_arr, y_arr = validate_xy(X, y)
        preds = self.predict(X_arr)
        ss_res = np.sum((y_arr - preds) ** 2)
        ss_tot = np.sum((y_arr - y_arr.mean()) ** 2)
        return float(1.0 - ss_res / ss_tot) if ss_tot > _EPS else 0.0
238
+
239
+
240
+ # ──────────────────────────────────────────────────────────────────────────
241
+ # GradientBoostingClassifier (binary)
242
+ # ──────────────────────────────────────────────────────────────────────────
243
+
244
+
245
class GradientBoostingClassifier:
    """Gradient-boosted ensemble for binary classification (binomial deviance).

    Parameters mirror :class:`GradientBoostingRegressor` minus ``loss``
    (binomial deviance is the only supported objective).

    Attributes
    ----------
    estimators_, init_, train_score_, feature_importances_ — see
        :class:`GradientBoostingRegressor`.  Here ``init_`` is the logit
        of the (clipped) positive-class frequency and ``train_score_``
        is the mean binomial deviance (log loss) after each stage.
    classes_ : sorted unique labels seen during fit (exactly 2)
    n_features_in_ : number of columns of X seen by ``fit``
    """

    def __init__(
        self,
        n_estimators: int = 100,
        learning_rate: float = 0.1,
        max_depth: int = 3,
        min_samples_split: int = 2,
        min_samples_leaf: int = 1,
        subsample: float = 1.0,
        random_state: int | None = None,
    ) -> None:
        _check_common_params(n_estimators, learning_rate, subsample)
        self.n_estimators = int(n_estimators)
        self.learning_rate = float(learning_rate)
        self.max_depth = max_depth
        self.min_samples_split = int(min_samples_split)
        self.min_samples_leaf = int(min_samples_leaf)
        self.subsample = float(subsample)
        self.random_state = random_state

        self.estimators_: list[DecisionTreeRegressor] = []
        self.init_: float | None = None
        self.train_score_: FloatArray | None = None
        self.feature_importances_: FloatArray | None = None
        self.classes_: NDArray | None = None
        self.n_features_in_: int | None = None

    def fit(self, X: ArrayLike, y: ArrayLike) -> GradientBoostingClassifier:
        """Fit the ensemble on binary targets and return ``self``.

        Raises
        ------
        ValueError
            If *y* does not contain exactly two distinct labels.  The
            estimator's previous state is left untouched in that case.
        """
        X_arr, y_raw = validate_xy(X, y)
        # Check on a local first so a rejected fit cannot overwrite classes_.
        classes = np.unique(y_raw)
        if classes.size != 2:
            raise ValueError("GradientBoostingClassifier supports only binary classification.")
        self.classes_ = classes
        # The larger label (classes_[1]) is the positive class.
        y_bin = (y_raw == classes[1]).astype(np.float64)

        n = X_arr.shape[0]
        self.n_features_in_ = X_arr.shape[1]
        rng = np.random.default_rng(self.random_state)
        stochastic = self.subsample < 1.0
        # Subsample size does not change between stages; compute it once.
        n_sub = max(1, int(round(self.subsample * n)))

        # Clip the prior so the logit stays finite.
        p0 = float(np.clip(y_bin.mean(), 1e-6, 1.0 - 1e-6))
        self.init_ = float(np.log(p0 / (1.0 - p0)))
        F = np.full(n, self.init_, dtype=np.float64)
        self.estimators_ = []
        self.train_score_ = np.empty(self.n_estimators, dtype=np.float64)

        for m in range(self.n_estimators):
            p = _sigmoid(F)
            residual = y_bin - p  # negative gradient of the deviance
            denom = p * (1.0 - p)  # second derivative (Hessian diagonal)

            idx = rng.choice(n, size=n_sub, replace=False) if stochastic else np.arange(n)

            tree = DecisionTreeRegressor(
                max_depth=self.max_depth,
                min_samples_split=self.min_samples_split,
                min_samples_leaf=self.min_samples_leaf,
            )
            tree.fit(X_arr[idx], residual[idx])
            # The residual fit only chooses the splits; leaf values are
            # replaced by one Newton-Raphson step per leaf.
            _newton_leaf_refit(tree, X_arr[idx], residual[idx], denom[idx])

            F = F + self.learning_rate * tree.predict(X_arr)
            self.estimators_.append(tree)

            # Clip before taking logs so the deviance stays finite.
            p_now = np.clip(_sigmoid(F), 1e-12, 1.0 - 1e-12)
            self.train_score_[m] = float(
                -np.mean(y_bin * np.log(p_now) + (1 - y_bin) * np.log(1 - p_now))
            )

        self.feature_importances_ = np.mean(
            [t.feature_importances_ for t in self.estimators_], axis=0
        )
        return self

    def decision_function(self, X: ArrayLike) -> FloatArray:
        """Return the raw (pre-sigmoid) ensemble score.

        Raises
        ------
        RuntimeError
            If called before ``fit``.
        """
        if not self.estimators_:
            raise RuntimeError("Call fit() before decision_function().")
        X_arr = validate_x(X)
        F = np.full(X_arr.shape[0], self.init_, dtype=np.float64)
        for tree in self.estimators_:
            F += self.learning_rate * tree.predict(X_arr)
        return F

    def predict_proba(self, X: ArrayLike) -> FloatArray:
        """Return class probabilities, columns ordered as ``classes_``."""
        p1 = _sigmoid(self.decision_function(X))
        return np.column_stack([1.0 - p1, p1])

    def predict(self, X: ArrayLike) -> NDArray:
        """Return the predicted label for every row of *X*.

        A row is assigned the positive class (``classes_[1]``) when its
        positive-class probability is at least 0.5.  An explicit
        threshold is used instead of ``argmax`` because ``argmax`` would
        resolve an exact 0.5 tie in favour of the negative class.
        """
        p_pos = self.predict_proba(X)[:, 1]
        return self.classes_[(p_pos >= 0.5).astype(np.intp)]

    def score(self, X: ArrayLike, y: ArrayLike) -> float:
        """Return the classification accuracy on the given data."""
        X_arr, y_arr = validate_xy(X, y)
        return float(np.mean(self.predict(X_arr) == y_arr))
@@ -0,0 +1,234 @@
1
+ """
2
+ K-Nearest Neighbours (KNN)
3
+ ===========================
4
+ Instance-based (lazy) learning — no training phase; prediction queries
5
+ the k most similar training samples.
6
+
7
+ Distance metrics supported: euclidean, manhattan, minkowski
8
+ Weighting: 'uniform' (vote equally) or 'distance' (weight by 1/d)
9
+
10
+ KNeighboursClassifier — majority-vote (weighted) classification
11
+ KNeighboursRegressor — mean (weighted) of k nearest target values
12
+
13
+ Time complexity: O(n·d) per prediction (brute-force).
14
+ Only numpy and Python stdlib are used.
15
+ """
16
+
17
+ from __future__ import annotations
18
+
19
+ from collections import Counter
20
+
21
+ import numpy as np
22
+
23
+
24
def _validate_x(X: np.ndarray) -> np.ndarray:
    """Coerce *X* to a float array and require it to be two-dimensional.

    Raises
    ------
    ValueError
        If the converted array does not have exactly two dimensions.
    """
    as_float = np.asarray(X, dtype=float)
    if as_float.ndim == 2:
        return as_float
    raise ValueError("X must be a 2D array of shape (n_samples, n_features).")
29
+
30
+
31
def _validate_xy(X: np.ndarray, y: np.ndarray) -> tuple[np.ndarray, np.ndarray]:
    """Validate a feature matrix together with its target vector.

    *X* goes through ``_validate_x``; *y* is converted to an array and
    flattened to one dimension.  Both are returned as a pair.

    Raises
    ------
    ValueError
        If the number of rows of *X* differs from the length of *y*.
    """
    features = _validate_x(X)
    targets = np.asarray(y).flatten()
    n_x = features.shape[0]
    n_y = targets.shape[0]
    if n_x != n_y:
        raise ValueError(f"X has {n_x} samples but y has {n_y}.")
    return features, targets
37
+
38
+
39
def _pairwise_distances(
    X: np.ndarray,
    Y: np.ndarray,
    metric: str = "euclidean",
    p: float = 2.0,
) -> np.ndarray:
    """Return the (n_X, n_Y) matrix of distances between rows of X and Y.

    Parameters
    ----------
    X, Y : 2D arrays with the same number of columns.
    metric : 'euclidean', 'manhattan' or 'minkowski'.
    p : Minkowski order; only read when ``metric == 'minkowski'``.

    Raises
    ------
    ValueError
        If *metric* is not one of the supported names.
    """
    if metric not in ("euclidean", "manhattan", "minkowski"):
        raise ValueError(f"Unknown metric '{metric}'. Choose euclidean, manhattan, minkowski.")

    if metric == "euclidean":
        # Expansion ||x-y||^2 = ||x||^2 + ||y||^2 - 2 x.y avoids building
        # the full (n_X, n_Y, d) difference tensor.
        x_norms = np.sum(X**2, axis=1, keepdims=True)
        y_norms = np.sum(Y**2, axis=1, keepdims=True)
        squared = x_norms + y_norms.T - 2 * X @ Y.T
        # Round-off can push tiny squared distances below zero; clamp first.
        clamped = np.maximum(squared, 0.0)
        return np.sqrt(clamped)

    # Remaining metrics work on the absolute coordinate differences.
    gaps = np.abs(X[:, np.newaxis, :] - Y[np.newaxis, :, :])
    if metric == "manhattan":
        return np.sum(gaps, axis=2)
    return np.sum(gaps**p, axis=2) ** (1.0 / p)
58
+
59
+
60
class KNeighboursClassifier:
    """
    K-Nearest Neighbours classifier.

    Parameters
    ----------
    n_neighbors : int    number of neighbours consulted (>= 1); capped at
                         the training-set size at query time
    weights     : str    'uniform' | 'distance'
    metric      : str    'euclidean' | 'manhattan' | 'minkowski'
    p           : float  Minkowski order (only when metric='minkowski')

    Attributes
    ----------
    classes_ : sorted unique labels seen during ``fit``

    Raises
    ------
    ValueError
        If ``weights`` is not a supported name or ``n_neighbors`` is not
        a positive integer.
    """

    def __init__(
        self,
        n_neighbors: int = 5,
        weights: str = "uniform",
        metric: str = "euclidean",
        p: float = 2.0,
    ) -> None:
        if weights not in {"uniform", "distance"}:
            raise ValueError("weights must be 'uniform' or 'distance'.")
        # Fail fast: k <= 0 or a non-integer k would otherwise surface as an
        # obscure IndexError / TypeError deep inside predict().
        if not isinstance(n_neighbors, (int, np.integer)) or n_neighbors < 1:
            raise ValueError("n_neighbors must be a positive integer.")
        self.n_neighbors = n_neighbors
        self.weights = weights
        self.metric = metric
        self.p = p
        self._X_train: np.ndarray | None = None
        self._y_train: np.ndarray | None = None
        self.classes_: np.ndarray | None = None

    def fit(self, X: np.ndarray, y: np.ndarray) -> KNeighboursClassifier:
        """Memorise the training data (KNN has no training phase) and return ``self``.

        Raises
        ------
        ValueError
            If the inputs are malformed or the training set is empty.
        """
        X_arr, y_arr = _validate_xy(X, y)
        if X_arr.shape[0] == 0:
            raise ValueError("Cannot fit on an empty training set.")
        self._X_train = X_arr
        self._y_train = y_arr
        self.classes_ = np.unique(y_arr)
        return self

    def _get_knn(self, X: np.ndarray) -> tuple[np.ndarray, np.ndarray]:
        """Return (indices, distances) of k nearest neighbours for each row of X.

        Both arrays have shape (n_queries, k) and are ordered from the
        nearest neighbour to the farthest.
        """
        if self._X_train is None:
            raise RuntimeError("Call fit() before predict().")
        # Without this check a width mismatch can broadcast silently in the
        # manhattan/minkowski paths when either side has a single column.
        if X.shape[1] != self._X_train.shape[1]:
            raise ValueError(
                f"X has {X.shape[1]} features but the model was fitted "
                f"with {self._X_train.shape[1]}."
            )
        D = _pairwise_distances(X, self._X_train, self.metric, self.p)
        # Partition (faster than full sort for large n)
        k = min(self.n_neighbors, len(self._X_train))
        rows = np.arange(len(X))[:, np.newaxis]
        idx = np.argpartition(D, k - 1, axis=1)[:, :k]
        dists = D[rows, idx]
        # Sort within the k neighbours
        order = np.argsort(dists, axis=1)
        return idx[rows, order], dists[rows, order]

    def _neighbour_weights(self, dists: np.ndarray) -> np.ndarray:
        """Return the voting weight of every neighbour, same shape as *dists*.

        'uniform' gives every neighbour weight 1.  'distance' gives 1/d,
        except that a query with one or more exact matches (d == 0) lets
        only those matches vote, with weight 1 each, which avoids dividing
        by zero.
        """
        if self.weights == "uniform":
            return np.ones(dists.shape, dtype=float)
        exact = dists == 0
        has_exact = exact.any(axis=1, keepdims=True)
        # The infinities produced for d == 0 are discarded by np.where below.
        with np.errstate(divide="ignore"):
            inverse = 1.0 / dists
        return np.where(has_exact, exact.astype(float), inverse)

    def predict(self, X: np.ndarray) -> np.ndarray:
        """Return the (weighted) majority label for every row of *X*.

        Ties go to the class encountered first when walking the
        neighbours from nearest to farthest.
        """
        X_arr = _validate_x(X)
        idx, dists = self._get_knn(X_arr)
        neighbours = self._y_train[idx]  # (n_test, k)
        weights = self._neighbour_weights(dists)

        preds = []
        for row_labels, row_weights in zip(neighbours, weights, strict=True):
            votes: dict = {}
            for lbl, wi in zip(row_labels, row_weights, strict=True):
                votes[lbl] = votes.get(lbl, 0.0) + wi
            # max() returns the first maximal key, i.e. the nearest class on ties.
            preds.append(max(votes, key=votes.get))
        return np.array(preds)

    def predict_proba(self, X: np.ndarray) -> np.ndarray:
        """Return class-membership probabilities, columns ordered as ``classes_``.

        Each row is the normalised sum of neighbour weights per class and
        sums to 1.
        """
        X_arr = _validate_x(X)
        idx, dists = self._get_knn(X_arr)
        n_test = len(X_arr)
        labels = self._y_train[idx]
        weights = self._neighbour_weights(dists)

        # classes_ is sorted and contains every training label, so
        # searchsorted maps each label to its column index.
        cols = np.searchsorted(self.classes_, labels)
        proba = np.zeros((n_test, len(self.classes_)))
        # add.at accumulates correctly when one class occurs several times in a row.
        np.add.at(proba, (np.arange(n_test)[:, np.newaxis], cols), weights)

        totals = proba.sum(axis=1, keepdims=True)
        # Exact normalisation; rows with no usable weight are left as zeros.
        np.divide(proba, totals, out=proba, where=totals > 0)
        return proba

    def score(self, X: np.ndarray, y: np.ndarray) -> float:
        """Return classification accuracy on the given data."""
        X_arr, y_arr = _validate_xy(X, y)
        return float(np.mean(self.predict(X_arr) == y_arr))
163
+
164
+
165
class KNeighboursRegressor:
    """
    K-Nearest Neighbours regressor.

    Parameters
    ----------
    n_neighbors : int    number of neighbours averaged (>= 1)
    weights     : str    'uniform' | 'distance'
    metric      : str    'euclidean' | 'manhattan' | 'minkowski'
    p           : float  Minkowski order

    Raises
    ------
    ValueError
        If ``weights`` is not a supported name or ``n_neighbors`` is not
        a positive integer.
    """

    def __init__(
        self,
        n_neighbors: int = 5,
        weights: str = "uniform",
        metric: str = "euclidean",
        p: float = 2.0,
    ) -> None:
        if weights not in {"uniform", "distance"}:
            raise ValueError("weights must be 'uniform' or 'distance'.")
        # Fail fast instead of crashing later inside the neighbour search.
        if not isinstance(n_neighbors, (int, np.integer)) or n_neighbors < 1:
            raise ValueError("n_neighbors must be a positive integer.")
        self.n_neighbors = n_neighbors
        self.weights = weights
        self.metric = metric
        self.p = p
        self._X_train: np.ndarray | None = None
        self._y_train: np.ndarray | None = None

    def fit(self, X: np.ndarray, y: np.ndarray) -> KNeighboursRegressor:
        """Memorise the training data (targets stored as float) and return ``self``.

        Raises
        ------
        ValueError
            If the inputs are malformed or the training set is empty.
        """
        X_arr, y_arr = _validate_xy(X, y)
        if X_arr.shape[0] == 0:
            raise ValueError("Cannot fit on an empty training set.")
        self._X_train = X_arr
        self._y_train = y_arr.astype(float)
        return self

    def predict(self, X: np.ndarray) -> np.ndarray:
        """Return the (weighted) mean target of the k nearest neighbours.

        With ``weights='distance'`` neighbours are weighted by 1/d; a
        query that coincides with one or more training points returns the
        plain mean of those exact matches.

        Raises
        ------
        RuntimeError
            If called before ``fit``.
        """
        if self._X_train is None:
            raise RuntimeError("Call fit() before predict().")
        X_arr = _validate_x(X)
        # Borrow the classifier's neighbour search rather than duplicating it.
        clf = KNeighboursClassifier(self.n_neighbors, self.weights, self.metric, self.p)
        clf._X_train = self._X_train
        clf._y_train = self._y_train
        idx, dists = clf._get_knn(X_arr)
        neighbours = self._y_train[idx]

        if self.weights == "uniform":
            return neighbours.mean(axis=1)

        exact = dists == 0
        has_exact = exact.any(axis=1, keepdims=True)
        # The infinities produced for d == 0 are discarded by np.where below.
        with np.errstate(divide="ignore"):
            inverse = 1.0 / dists
        # Rows with exact matches use 0/1 indicator weights (mean of matches).
        w = np.where(has_exact, exact.astype(float), inverse)
        return (w * neighbours).sum(axis=1) / w.sum(axis=1)

    def score(self, X: np.ndarray, y: np.ndarray) -> float:
        """Return the coefficient of determination R^2 of the prediction.

        Returns ``0.0`` when *y* has (numerically) zero variance.
        """
        X_arr, y_arr = _validate_xy(X, y)
        preds = self.predict(X_arr)
        y_arr = y_arr.astype(float)
        ss_res = float(np.sum((y_arr - preds) ** 2))
        ss_tot = float(np.sum((y_arr - y_arr.mean()) ** 2))
        return 1.0 - ss_res / ss_tot if ss_tot > 1e-12 else 0.0
230
+
231
+
232
# American-spelling aliases (sklearn-style) for ergonomics.
# Each alias is bound to the same class object as the British-spelling
# name, so isinstance checks are interchangeable between the two.
KNeighborsClassifier = KNeighboursClassifier
KNeighborsRegressor = KNeighboursRegressor