scratchkit 0.2.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- mlscratch/__init__.py +56 -0
- mlscratch/__main__.py +118 -0
- mlscratch/bayesian/__init__.py +53 -0
- mlscratch/bayesian/bayesian_linear_regression.py +171 -0
- mlscratch/bayesian/bayesian_network.py +248 -0
- mlscratch/bayesian/bayesian_nn.py +315 -0
- mlscratch/bayesian/gaussian_process.py +207 -0
- mlscratch/bayesian/hmm.py +277 -0
- mlscratch/bayesian/init.py +52 -0
- mlscratch/bayesian/kalman_filter.py +182 -0
- mlscratch/bayesian/naive_bayes.py +209 -0
- mlscratch/metrics/__init__.py +59 -0
- mlscratch/metrics/classification.py +365 -0
- mlscratch/metrics/regression.py +79 -0
- mlscratch/neural/__init__.py +121 -0
- mlscratch/neural/attention.py +420 -0
- mlscratch/neural/autoencoder.py +543 -0
- mlscratch/neural/boltzmann.py +231 -0
- mlscratch/neural/cnn.py +593 -0
- mlscratch/neural/cvnn.py +322 -0
- mlscratch/neural/gan.py +364 -0
- mlscratch/neural/hopfield.py +193 -0
- mlscratch/neural/perceptron.py +398 -0
- mlscratch/neural/rbf_network.py +230 -0
- mlscratch/neural/recurrent.py +569 -0
- mlscratch/preprocessing/__init__.py +38 -0
- mlscratch/preprocessing/encoders.py +140 -0
- mlscratch/preprocessing/model_selection.py +119 -0
- mlscratch/preprocessing/polynomial.py +105 -0
- mlscratch/preprocessing/scalers.py +220 -0
- mlscratch/py.typed +0 -0
- mlscratch/reinforcement/__init__.py +59 -0
- mlscratch/reinforcement/ddpg.py +363 -0
- mlscratch/reinforcement/dqn.py +319 -0
- mlscratch/reinforcement/ppo.py +452 -0
- mlscratch/reinforcement/q_learning.py +352 -0
- mlscratch/reinforcement/sac.py +382 -0
- mlscratch/reinforcement/utils.py +594 -0
- mlscratch/supervised/__init__.py +76 -0
- mlscratch/supervised/_validation.py +50 -0
- mlscratch/supervised/adaboost.py +255 -0
- mlscratch/supervised/decision_tree.py +495 -0
- mlscratch/supervised/gradient_boosting.py +354 -0
- mlscratch/supervised/knn.py +234 -0
- mlscratch/supervised/lasso_regression.py +125 -0
- mlscratch/supervised/linear_models.py +459 -0
- mlscratch/supervised/linear_regression.py +197 -0
- mlscratch/supervised/logistic_regression.py +119 -0
- mlscratch/supervised/naive_bayes.py +113 -0
- mlscratch/supervised/random_forest.py +321 -0
- mlscratch/supervised/ridge_regression.py +93 -0
- mlscratch/supervised/svm.py +356 -0
- mlscratch/unsupervised/__init__.py +39 -0
- mlscratch/unsupervised/apriori.py +178 -0
- mlscratch/unsupervised/dbscan.py +141 -0
- mlscratch/unsupervised/gmm.py +204 -0
- mlscratch/unsupervised/hierarchical_clustering.py +137 -0
- mlscratch/unsupervised/ica.py +167 -0
- mlscratch/unsupervised/kmeans.py +135 -0
- mlscratch/unsupervised/kmedoids.py +133 -0
- mlscratch/unsupervised/pca.py +103 -0
- mlscratch/unsupervised/tsne.py +200 -0
- scratchkit-0.2.0.dist-info/METADATA +241 -0
- scratchkit-0.2.0.dist-info/RECORD +68 -0
- scratchkit-0.2.0.dist-info/WHEEL +5 -0
- scratchkit-0.2.0.dist-info/entry_points.txt +2 -0
- scratchkit-0.2.0.dist-info/licenses/LICENSE +201 -0
- scratchkit-0.2.0.dist-info/top_level.txt +1 -0
|
@@ -0,0 +1,50 @@
|
|
|
1
|
+
r"""
|
|
2
|
+
mlscratch.supervised._validation
|
|
3
|
+
==================================
|
|
4
|
+
Internal, private input-validation helpers shared by every estimator in
|
|
5
|
+
``mlscratch.supervised`` (trees, ensembles, kernel SVM). Not part of the
|
|
6
|
+
public API — do not import this module from outside the package.
|
|
7
|
+
|
|
8
|
+
Centralising these checks means every estimator raises the *same*
|
|
9
|
+
exception type with the *same* message shape for the *same* mistake
|
|
10
|
+
(wrong-dimensional `X`, mismatched `X`/`y` length, malformed
|
|
11
|
+
`sample_weight`), which is what lets the test suite assert on a single
|
|
12
|
+
shared error-message contract (e.g. ``match="samples"``) across five
|
|
13
|
+
otherwise-independent algorithm implementations.
|
|
14
|
+
"""
|
|
15
|
+
|
|
16
|
+
from __future__ import annotations
|
|
17
|
+
|
|
18
|
+
import numpy as np
|
|
19
|
+
from numpy.typing import ArrayLike, NDArray
|
|
20
|
+
|
|
21
|
+
# Alias for a float64 ndarray; used in the annotations of the validators in this module.
FloatArray = NDArray[np.float64]
|
|
22
|
+
|
|
23
|
+
|
|
24
|
+
def validate_x(X: ArrayLike) -> FloatArray:
    """Return ``X`` as a float64 ndarray, accepting only 2D input.

    Parameters
    ----------
    X : array-like
        Candidate feature matrix; converted with ``np.asarray`` using
        ``dtype=np.float64``.

    Returns
    -------
    ndarray of float64
        The converted array, guaranteed to have exactly two dimensions.

    Raises
    ------
    ValueError
        If the converted array is not 2D.
    """
    features = np.asarray(X, dtype=np.float64)
    if features.ndim == 2:
        return features
    raise ValueError("X must be a 2D array of shape (n_samples, n_features).")
|
|
30
|
+
|
|
31
|
+
|
|
32
|
+
def validate_xy(X: ArrayLike, y: ArrayLike) -> tuple[FloatArray, NDArray]:
    """Validate a feature matrix together with its target vector.

    ``X`` is checked by :func:`validate_x`. ``y`` is converted with
    ``np.asarray`` (dtype left to NumPy's inference) and flattened to 1D,
    then its length is compared with the number of rows of ``X``.

    Returns
    -------
    tuple
        ``(X_arr, y_arr)``: the float64 2D feature matrix and the flattened
        target array.

    Raises
    ------
    ValueError
        If ``X`` is not 2D, or if ``X`` and ``y`` disagree on the number of
        samples.
    """
    X_arr = validate_x(X)
    y_arr = np.asarray(y).flatten()
    same_length = X_arr.shape[0] == y_arr.shape[0]
    if same_length:
        return X_arr, y_arr
    raise ValueError(f"X has {X_arr.shape[0]} samples but y has {y_arr.shape[0]}.")
|
|
39
|
+
|
|
40
|
+
|
|
41
|
+
def validate_sample_weight(sample_weight: ArrayLike | None, n_samples: int) -> FloatArray:
    """Return a uniform weight vector if None, else validate a user-supplied one.

    Parameters
    ----------
    sample_weight : array-like or None
        Per-sample weights. ``None`` means "weight every sample equally".
    n_samples : int
        Number of samples the weights must cover.

    Returns
    -------
    ndarray of float64, shape (n_samples,)
        ``np.ones(n_samples)`` when ``sample_weight`` is None, otherwise the
        supplied weights converted to float64 and flattened to 1D.

    Raises
    ------
    ValueError
        If the number of weights differs from ``n_samples``, if any weight is
        negative, or if any weight is NaN or infinite.
    """
    if sample_weight is None:
        return np.ones(n_samples, dtype=np.float64)
    w = np.asarray(sample_weight, dtype=np.float64).flatten()
    if w.shape[0] != n_samples:
        raise ValueError(f"sample_weight has {w.shape[0]} entries but X has {n_samples} samples.")
    if np.any(w < 0):
        raise ValueError("sample_weight entries must be non-negative.")
    # NaN compares False against 0 and +inf is not negative, so both slip past
    # the sign check above; reject them explicitly. Checked last so that every
    # input the sign check already rejected (including -inf) keeps its message.
    if not np.all(np.isfinite(w)):
        raise ValueError("sample_weight entries must be finite.")
    return w
|
|
@@ -0,0 +1,255 @@
|
|
|
1
|
+
r"""
|
|
2
|
+
AdaBoost
|
|
3
|
+
========
|
|
4
|
+
Adaptive Boosting: a weighted ensemble of shallow decision-tree "weak
|
|
5
|
+
learners" (stumps by default), where each successive learner is fit
|
|
6
|
+
on a re-weighted version of the training set that emphasises the
|
|
7
|
+
samples the ensemble-so-far gets wrong.
|
|
8
|
+
|
|
9
|
+
Two algorithms are supported, both natively multiclass
|
|
10
|
+
(Zhu, Rosset, Zhu & Hastie, 2009, "Multi-class AdaBoost"):
|
|
11
|
+
|
|
12
|
+
``'SAMME'`` (discrete)
|
|
13
|
+
-----------------------
|
|
14
|
+
Uses only each weak learner's hard predictions.
|
|
15
|
+
|
|
16
|
+
.. math::
|
|
17
|
+
\alpha_m = \eta \left[ \ln\frac{1-\mathrm{err}_m}{\mathrm{err}_m} + \ln(K-1) \right]
|
|
18
|
+
|
|
19
|
+
.. math::
|
|
20
|
+
w_i \leftarrow w_i \exp\!\big(\alpha_m \cdot \mathbb{1}[\hat y_i \ne y_i]\big),
|
|
21
|
+
\quad \text{then renormalise}
|
|
22
|
+
|
|
23
|
+
``'SAMME.R'`` (real-valued, the modern default)
|
|
24
|
+
--------------------------------------------------
|
|
25
|
+
Uses each weak learner's class-probability estimates directly, which
|
|
26
|
+
typically converges in fewer rounds:
|
|
27
|
+
|
|
28
|
+
.. math::
|
|
29
|
+
h_k^{(m)}(x) = (K-1)\left(\log p_k(x) - \frac1K\sum_{k'} \log p_{k'}(x)\right)
|
|
30
|
+
|
|
31
|
+
.. math::
|
|
32
|
+
w_i \leftarrow w_i \exp\!\left(-\eta\,\frac{K-1}{K}\, y_i^{\mathsf T} \log p(x_i)\right)
|
|
33
|
+
|
|
34
|
+
where :math:`y_i` uses the symmetric :math:`\{-1/(K-1), 1\}` class
|
|
35
|
+
coding. The ensemble decision is :math:`\arg\max_k \sum_m h_k^{(m)}(x)`.
|
|
36
|
+
|
|
37
|
+
For :math:`K=2` both algorithms reduce to the classic binary AdaBoost.
|
|
38
|
+
|
|
39
|
+
Complexity
|
|
40
|
+
----------
|
|
41
|
+
- Training : O(n_estimators * weak-learner fit cost)
|
|
42
|
+
- Inference: O(n_estimators * weak-learner predict cost)
|
|
43
|
+
"""
|
|
44
|
+
|
|
45
|
+
from __future__ import annotations
|
|
46
|
+
|
|
47
|
+
import numpy as np
|
|
48
|
+
from numpy.typing import ArrayLike, NDArray
|
|
49
|
+
|
|
50
|
+
from ._validation import validate_x, validate_xy
|
|
51
|
+
from .decision_tree import DecisionTreeClassifier
|
|
52
|
+
|
|
53
|
+
# Alias for a float64 ndarray, used in this module's annotations.
FloatArray = NDArray[np.float64]
|
|
54
|
+
# Alias for an int64 ndarray, used in this module's annotations.
IntArray = NDArray[np.int64]
|
|
55
|
+
|
|
56
|
+
# Small positive floor: clips the weighted error away from 0 and 1, keeps
# sample weights positive, and keeps probabilities positive before np.log.
_EPS = 1e-10
|
|
57
|
+
|
|
58
|
+
|
|
59
|
+
class AdaBoostClassifier:
    """Adaptive Boosting classifier (SAMME / SAMME.R), natively multiclass.

    Parameters
    ----------
    n_estimators : int, default=50
        Maximum number of weak learners. Boosting may stop earlier if
        a weak learner achieves zero training error or (SAMME only)
        is no better than random guessing.
    learning_rate : float, default=1.0
        Shrinks the contribution of each weak learner. Must be positive.
    algorithm : str, default='SAMME.R'
        ``'SAMME'`` (discrete) or ``'SAMME.R'`` (real-valued).
    max_depth : int, default=1
        Depth of each weak learner; ``1`` gives the classic "decision
        stump".
    random_state : int | None, default=None
        Seed of the generator that draws a fresh ``random_state`` for
        every weak learner.

    Attributes
    ----------
    estimators_ : the fitted sequence of weak learners
    estimator_weights_ : per-estimator combination weight (alpha);
        always ``1.0`` under SAMME.R
    estimator_errors_ : per-estimator weighted training error
    classes_ : sorted unique labels seen during fit
    n_features_in_ : number of columns of the training ``X``
    """

    def __init__(
        self,
        n_estimators: int = 50,
        learning_rate: float = 1.0,
        algorithm: str = "SAMME.R",
        max_depth: int = 1,
        random_state: int | None = None,
    ) -> None:
        """Validate and store the hyper-parameters; no fitting happens here.

        Raises
        ------
        ValueError
            If ``algorithm`` is unknown, ``n_estimators < 1`` or
            ``learning_rate <= 0``.
        """
        if algorithm not in ("SAMME", "SAMME.R"):
            raise ValueError("algorithm must be 'SAMME' or 'SAMME.R'.")
        if int(n_estimators) < 1:
            raise ValueError("n_estimators must be >= 1.")
        if learning_rate <= 0:
            raise ValueError("learning_rate must be positive.")
        self.n_estimators = int(n_estimators)
        self.learning_rate = float(learning_rate)
        self.algorithm = algorithm
        self.max_depth = int(max_depth)
        self.random_state = random_state

        self.estimators_: list[DecisionTreeClassifier] = []
        self.estimator_weights_: FloatArray | None = None
        self.estimator_errors_: FloatArray | None = None
        # Holds the raw labels returned by np.unique, so not necessarily ints.
        self.classes_: NDArray | None = None
        self.n_features_in_: int | None = None

    # -- public API -----------------------------------------------------------

    def fit(self, X: ArrayLike, y: ArrayLike) -> AdaBoostClassifier:
        """Fit the boosted ensemble on ``(X, y)`` and return ``self``.

        Labels are mapped to indices ``0..K-1`` (position in ``classes_``)
        and every weak learner is trained on those indices with the current
        sample weights. After each round the weights are re-weighted,
        floored at ``_EPS`` and renormalised to sum to one.

        Raises
        ------
        ValueError
            If the inputs fail ``validate_xy`` or fewer than two distinct
            labels are present.
        RuntimeError
            (SAMME only) If the very first weak learner has a weighted error
            of at least ``1 - 1/K``.
        """
        X_arr, y_raw = validate_xy(X, y)
        self.classes_, y_idx = np.unique(y_raw, return_inverse=True)
        y_idx = y_idx.astype(np.int64)
        n_classes = self.classes_.size
        if n_classes < 2:
            raise ValueError("AdaBoostClassifier requires at least 2 classes.")
        n = X_arr.shape[0]
        self.n_features_in_ = X_arr.shape[1]
        rng = np.random.default_rng(self.random_state)

        sample_weight = np.full(n, 1.0 / n, dtype=np.float64)
        self.estimators_ = []
        weights: list[float] = []
        errors: list[float] = []

        # Symmetric {-1/(K-1), 1} class coding used by the SAMME.R weight
        # update. It depends only on the labels, so build it once up front.
        y_coding = None
        if self.algorithm == "SAMME.R":
            y_coding = np.full((n, n_classes), -1.0 / (n_classes - 1))
            y_coding[np.arange(n), y_idx] = 1.0

        for m in range(self.n_estimators):
            stump = DecisionTreeClassifier(
                max_depth=self.max_depth, random_state=int(rng.integers(0, 2**31 - 1))
            )
            stump.fit(X_arr, y_idx, sample_weight=sample_weight)

            if self.algorithm == "SAMME":
                pred = stump.predict(X_arr)
                incorrect = pred != y_idx
                err = float(np.average(incorrect, weights=sample_weight))

                # A learner at or above the random-guess error rate would get
                # a non-positive alpha: drop it and stop boosting.
                if err >= 1.0 - 1.0 / n_classes:
                    if not self.estimators_:
                        raise RuntimeError(
                            "BaseEstimator is worse than random guessing on the first "
                            "boosting round; AdaBoost cannot be fit."
                        )
                    break

                # Clip so the log-odds stay finite when err is exactly 0.
                err_clipped = float(np.clip(err, _EPS, 1.0 - _EPS))
                alpha = self.learning_rate * (
                    np.log((1.0 - err_clipped) / err_clipped) + np.log(n_classes - 1)
                )

                self.estimators_.append(stump)
                weights.append(float(alpha))
                errors.append(err)

                # Perfect fit or last round: no re-weighting needed.
                if err <= 0.0 or m == self.n_estimators - 1:
                    break
                sample_weight = sample_weight * np.exp(alpha * incorrect)

            else:  # SAMME.R
                proba = self._safe_proba(stump, X_arr, n_classes)
                logp = np.log(proba)
                pred = np.argmax(proba, axis=1)
                incorrect = pred != y_idx
                err = float(np.average(incorrect, weights=sample_weight))

                self.estimators_.append(stump)
                weights.append(1.0)  # SAMME.R combines via h(x) directly, not a scalar alpha
                errors.append(err)

                if err <= 0.0 or m == self.n_estimators - 1:
                    break

                contrib = (
                    -self.learning_rate
                    * (n_classes - 1)
                    / n_classes
                    * np.sum(y_coding * logp, axis=1)
                )
                sample_weight = sample_weight * np.exp(contrib)

            # Keep every weight strictly positive, then renormalise.
            sample_weight = np.maximum(sample_weight, _EPS)
            sample_weight /= sample_weight.sum()

        self.estimator_weights_ = np.array(weights, dtype=np.float64)
        self.estimator_errors_ = np.array(errors, dtype=np.float64)
        return self

    def decision_function(self, X: ArrayLike) -> FloatArray:
        """Return the per-class ensemble score, shape ``(n_samples, n_classes)``.

        Raises
        ------
        RuntimeError
            If the model holds no fitted estimators.
        ValueError
            If ``X`` fails ``validate_x``.
        """
        if not self.estimators_:
            raise RuntimeError("Call fit() before decision_function().")
        X_arr = validate_x(X)
        scores = np.zeros((X_arr.shape[0], self.classes_.size), dtype=np.float64)
        for stump, alpha in zip(self.estimators_, self.estimator_weights_, strict=True):
            self._add_stage_scores(scores, stump, alpha, X_arr)
        return scores

    def predict(self, X: ArrayLike) -> NDArray:
        """Return, for each row of ``X``, the label with the highest ensemble score."""
        scores = self.decision_function(X)
        return self.classes_[np.argmax(scores, axis=1)]

    def predict_proba(self, X: ArrayLike) -> FloatArray:
        """Softmax of the (rescaled) ensemble decision scores.

        Scores are divided by ``n_classes - 1`` and shifted by their row
        maximum before exponentiation; each returned row sums to one.
        """
        scores = self.decision_function(X)
        n_classes = self.classes_.size
        scaled = scores / max(1, n_classes - 1)
        e = np.exp(scaled - scaled.max(axis=1, keepdims=True))
        return e / e.sum(axis=1, keepdims=True)

    def staged_predict(self, X: ArrayLike):
        """Yield the ensemble's predicted labels after each boosting round.

        The fitted-state check and the validation of ``X`` run immediately
        when this method is called, not lazily on the first ``next()``; the
        returned iterator then produces one label array per fitted estimator.

        Raises
        ------
        RuntimeError
            If the model holds no fitted estimators.
        ValueError
            If ``X`` fails ``validate_x``.
        """
        if not self.estimators_:
            raise RuntimeError("Call fit() before staged_predict().")
        X_arr = validate_x(X)
        return self._iter_staged_labels(X_arr)

    def score(self, X: ArrayLike, y: ArrayLike) -> float:
        """Return the fraction of rows of ``X`` whose predicted label equals ``y``."""
        X_arr, y_arr = validate_xy(X, y)
        return float(np.mean(self.predict(X_arr) == y_arr))

    # -- internals --------------------------------------------------------------

    def _iter_staged_labels(self, X_arr: FloatArray):
        """Generator behind ``staged_predict``; expects an already validated ``X_arr``."""
        cum_scores = np.zeros((X_arr.shape[0], self.classes_.size), dtype=np.float64)
        for stump, alpha in zip(self.estimators_, self.estimator_weights_, strict=True):
            self._add_stage_scores(cum_scores, stump, alpha, X_arr)
            yield self.classes_[np.argmax(cum_scores, axis=1)]

    def _add_stage_scores(
        self,
        scores: FloatArray,
        stump: DecisionTreeClassifier,
        alpha: float,
        X_arr: FloatArray,
    ) -> None:
        """Add one weak learner's contribution to ``scores`` in place.

        SAMME adds ``alpha`` to the column of the class the learner predicts;
        SAMME.R adds ``(K-1) * (log p - mean_k log p)`` and ignores ``alpha``.
        """
        n_samples, n_classes = scores.shape
        if self.algorithm == "SAMME":
            votes = stump.predict(X_arr)
            scores[np.arange(n_samples), votes] += alpha
        else:
            logp = np.log(self._safe_proba(stump, X_arr, n_classes))
            scores += (n_classes - 1) * (logp - logp.mean(axis=1, keepdims=True))

    @staticmethod
    def _safe_proba(stump: DecisionTreeClassifier, X: FloatArray, n_classes: int) -> FloatArray:
        """Map a stump's predict_proba (over its own classes_ subset, which may
        be missing classes absent from its bootstrap/weighted sample) into a
        full, strictly-positive (n_samples, n_classes) probability matrix."""
        p_sub = stump.predict_proba(X)
        # Start every class at the floor so classes the stump never saw still
        # get a positive probability and a finite log.
        proba = np.full((X.shape[0], n_classes), _EPS)
        proba[:, stump.classes_] = np.maximum(p_sub, _EPS)
        proba /= proba.sum(axis=1, keepdims=True)
        return proba
|