scratchkit 0.2.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (68) hide show
  1. mlscratch/__init__.py +56 -0
  2. mlscratch/__main__.py +118 -0
  3. mlscratch/bayesian/__init__.py +53 -0
  4. mlscratch/bayesian/bayesian_linear_regression.py +171 -0
  5. mlscratch/bayesian/bayesian_network.py +248 -0
  6. mlscratch/bayesian/bayesian_nn.py +315 -0
  7. mlscratch/bayesian/gaussian_process.py +207 -0
  8. mlscratch/bayesian/hmm.py +277 -0
  9. mlscratch/bayesian/init.py +52 -0
  10. mlscratch/bayesian/kalman_filter.py +182 -0
  11. mlscratch/bayesian/naive_bayes.py +209 -0
  12. mlscratch/metrics/__init__.py +59 -0
  13. mlscratch/metrics/classification.py +365 -0
  14. mlscratch/metrics/regression.py +79 -0
  15. mlscratch/neural/__init__.py +121 -0
  16. mlscratch/neural/attention.py +420 -0
  17. mlscratch/neural/autoencoder.py +543 -0
  18. mlscratch/neural/boltzmann.py +231 -0
  19. mlscratch/neural/cnn.py +593 -0
  20. mlscratch/neural/cvnn.py +322 -0
  21. mlscratch/neural/gan.py +364 -0
  22. mlscratch/neural/hopfield.py +193 -0
  23. mlscratch/neural/perceptron.py +398 -0
  24. mlscratch/neural/rbf_network.py +230 -0
  25. mlscratch/neural/recurrent.py +569 -0
  26. mlscratch/preprocessing/__init__.py +38 -0
  27. mlscratch/preprocessing/encoders.py +140 -0
  28. mlscratch/preprocessing/model_selection.py +119 -0
  29. mlscratch/preprocessing/polynomial.py +105 -0
  30. mlscratch/preprocessing/scalers.py +220 -0
  31. mlscratch/py.typed +0 -0
  32. mlscratch/reinforcement/__init__.py +59 -0
  33. mlscratch/reinforcement/ddpg.py +363 -0
  34. mlscratch/reinforcement/dqn.py +319 -0
  35. mlscratch/reinforcement/ppo.py +452 -0
  36. mlscratch/reinforcement/q_learning.py +352 -0
  37. mlscratch/reinforcement/sac.py +382 -0
  38. mlscratch/reinforcement/utils.py +594 -0
  39. mlscratch/supervised/__init__.py +76 -0
  40. mlscratch/supervised/_validation.py +50 -0
  41. mlscratch/supervised/adaboost.py +255 -0
  42. mlscratch/supervised/decision_tree.py +495 -0
  43. mlscratch/supervised/gradient_boosting.py +354 -0
  44. mlscratch/supervised/knn.py +234 -0
  45. mlscratch/supervised/lasso_regression.py +125 -0
  46. mlscratch/supervised/linear_models.py +459 -0
  47. mlscratch/supervised/linear_regression.py +197 -0
  48. mlscratch/supervised/logistic_regression.py +119 -0
  49. mlscratch/supervised/naive_bayes.py +113 -0
  50. mlscratch/supervised/random_forest.py +321 -0
  51. mlscratch/supervised/ridge_regression.py +93 -0
  52. mlscratch/supervised/svm.py +356 -0
  53. mlscratch/unsupervised/__init__.py +39 -0
  54. mlscratch/unsupervised/apriori.py +178 -0
  55. mlscratch/unsupervised/dbscan.py +141 -0
  56. mlscratch/unsupervised/gmm.py +204 -0
  57. mlscratch/unsupervised/hierarchical_clustering.py +137 -0
  58. mlscratch/unsupervised/ica.py +167 -0
  59. mlscratch/unsupervised/kmeans.py +135 -0
  60. mlscratch/unsupervised/kmedoids.py +133 -0
  61. mlscratch/unsupervised/pca.py +103 -0
  62. mlscratch/unsupervised/tsne.py +200 -0
  63. scratchkit-0.2.0.dist-info/METADATA +241 -0
  64. scratchkit-0.2.0.dist-info/RECORD +68 -0
  65. scratchkit-0.2.0.dist-info/WHEEL +5 -0
  66. scratchkit-0.2.0.dist-info/entry_points.txt +2 -0
  67. scratchkit-0.2.0.dist-info/licenses/LICENSE +201 -0
  68. scratchkit-0.2.0.dist-info/top_level.txt +1 -0
@@ -0,0 +1,50 @@
1
+ r"""
2
+ mlscratch.supervised._validation
3
+ ==================================
4
+ Internal, private input-validation helpers shared by every estimator in
5
+ ``mlscratch.supervised`` (trees, ensembles, kernel SVM). Not part of the
6
+ public API — do not import this module from outside the package.
7
+
8
+ Centralising these checks means every estimator raises the *same*
9
+ exception type with the *same* message shape for the *same* mistake
10
+ (wrong-dimensional `X`, mismatched `X`/`y` length, malformed
11
+ `sample_weight`), which is what lets the test suite assert on a single
12
+ shared error-message contract (e.g. ``match="samples"``) across five
13
+ otherwise-independent algorithm implementations.
14
+ """
15
+
16
+ from __future__ import annotations
17
+
18
+ import numpy as np
19
+ from numpy.typing import ArrayLike, NDArray
20
+
21
# Annotation alias used by the validators below: an ndarray of float64 values.
FloatArray = NDArray[np.float64]
22
+
23
+
24
def validate_x(X: ArrayLike) -> FloatArray:
    """Convert ``X`` to a float64 ndarray and insist on a 2D layout.

    Parameters
    ----------
    X : array-like
        Candidate feature matrix.

    Returns
    -------
    The float64 array obtained from ``np.asarray``; it has exactly two
    dimensions.

    Raises
    ------
    ValueError
        If the converted array is not two-dimensional.
    """
    features = np.asarray(X, dtype=np.float64)
    if features.ndim == 2:
        return features
    raise ValueError("X must be a 2D array of shape (n_samples, n_features).")
30
+
31
+
32
def validate_xy(X: ArrayLike, y: ArrayLike) -> tuple[FloatArray, NDArray]:
    """Validate ``X`` via ``validate_x`` and check that ``y`` lines up with it.

    Parameters
    ----------
    X : array-like
        Feature matrix; must satisfy ``validate_x``.
    y : array-like
        Targets. Converted with ``np.asarray`` and flattened to 1D, so its
        original dtype is kept.

    Returns
    -------
    ``(X_arr, y_arr)`` — the validated feature matrix and the flattened
    target vector.

    Raises
    ------
    ValueError
        If ``X`` is not 2D, or if the flattened ``y`` does not have one entry
        per row of ``X``.
    """
    features = validate_x(X)
    targets = np.asarray(y).flatten()
    n_rows, n_targets = features.shape[0], targets.shape[0]
    if n_rows == n_targets:
        return features, targets
    raise ValueError(f"X has {n_rows} samples but y has {n_targets}.")
39
+
40
+
41
def validate_sample_weight(sample_weight: ArrayLike | None, n_samples: int) -> FloatArray:
    """Return a uniform weight vector if None, else validate a user-supplied one.

    Parameters
    ----------
    sample_weight : array-like or None
        Per-sample weights. ``None`` means "weight every sample equally".
    n_samples : int
        Number of samples the weights must cover.

    Returns
    -------
    A 1D float64 array of length ``n_samples``. When ``sample_weight`` is
    ``None`` this is a vector of ones.

    Raises
    ------
    ValueError
        If the flattened weights do not have ``n_samples`` entries, contain a
        negative value, or contain NaN / infinity.
    """
    if sample_weight is None:
        return np.ones(n_samples, dtype=np.float64)
    w = np.asarray(sample_weight, dtype=np.float64).flatten()
    if w.shape[0] != n_samples:
        raise ValueError(f"sample_weight has {w.shape[0]} entries but X has {n_samples} samples.")
    if np.any(w < 0):
        raise ValueError("sample_weight entries must be non-negative.")
    # NaN compares False against 0 and +inf is "non-negative", so both slip
    # past the sign check above; either one would poison every weighted sum
    # and normalisation computed from these weights.
    if not np.all(np.isfinite(w)):
        raise ValueError("sample_weight entries must be finite.")
    return w
@@ -0,0 +1,255 @@
1
+ r"""
2
+ AdaBoost
3
+ ========
4
+ Adaptive Boosting: a weighted ensemble of shallow decision-tree "weak
5
+ learners" (stumps by default), where each successive learner is fit
6
+ on a re-weighted version of the training set that emphasises the
7
+ samples the ensemble-so-far gets wrong.
8
+
9
+ Two algorithms are supported, both natively multiclass
10
+ (Zhu, Rosset, Zhu & Hastie, 2009, "Multi-class AdaBoost"):
11
+
12
+ ``'SAMME'`` (discrete)
13
+ -----------------------
14
+ Uses only each weak learner's hard predictions.
15
+
16
+ .. math::
17
+ \alpha_m = \eta \left[ \ln\frac{1-\mathrm{err}_m}{\mathrm{err}_m} + \ln(K-1) \right]
18
+
19
+ .. math::
20
+ w_i \leftarrow w_i \exp\!\big(\alpha_m \cdot \mathbb{1}[\hat y_i \ne y_i]\big),
21
+ \quad \text{then renormalise}
22
+
23
+ ``'SAMME.R'`` (real-valued, the modern default)
24
+ --------------------------------------------------
25
+ Uses each weak learner's class-probability estimates directly, which
26
+ typically converges in fewer rounds:
27
+
28
+ .. math::
29
+ h_k^{(m)}(x) = (K-1)\left(\log p_k(x) - \frac1K\sum_{k'} \log p_{k'}(x)\right)
30
+
31
+ .. math::
32
+ w_i \leftarrow w_i \exp\!\left(-\eta\,\frac{K-1}{K}\, y_i^{\mathsf T} \log p(x_i)\right)
33
+
34
+ where :math:`y_i` uses the symmetric :math:`\{-1/(K-1), 1\}` class
35
+ coding. The ensemble decision is :math:`\arg\max_k \sum_m h_k^{(m)}(x)`.
36
+
37
+ For :math:`K=2`, ``'SAMME'`` reduces to the classic discrete binary AdaBoost and ``'SAMME.R'`` to its real-valued counterpart (Real AdaBoost).
38
+
39
+ Complexity
40
+ ----------
41
+ - Training : O(n_estimators * weak-learner fit cost)
42
+ - Inference: O(n_estimators * weak-learner predict cost)
43
+ """
44
+
45
+ from __future__ import annotations
46
+
47
+ import numpy as np
48
+ from numpy.typing import ArrayLike, NDArray
49
+
50
+ from ._validation import validate_x, validate_xy
51
+ from .decision_tree import DecisionTreeClassifier
52
+
53
# Annotation aliases for float64 and int64 ndarrays.
FloatArray = NDArray[np.float64]
IntArray = NDArray[np.int64]

# Small positive floor used below to clip the weighted error away from 0 and 1
# before taking logs, to keep sample weights strictly positive, and to keep
# class probabilities strictly positive before np.log is applied to them.
_EPS = 1e-10
57
+
58
+
59
class AdaBoostClassifier:
    """Adaptive Boosting classifier (SAMME / SAMME.R), natively multiclass.

    Parameters
    ----------
    n_estimators : int, default=50
        Maximum number of weak learners. Boosting may stop earlier if
        a weak learner achieves zero training error or (SAMME only)
        becomes worse than random guessing.
    learning_rate : float, default=1.0
        Shrinks the contribution of each weak learner. Must be a positive,
        finite number.
    algorithm : str, default='SAMME.R'
        ``'SAMME'`` (discrete) or ``'SAMME.R'`` (real-valued).
    max_depth : int, default=1
        Depth of each weak learner; ``1`` gives the classic "decision
        stump".
    random_state : int | None, default=None
        Seed for the generator that draws a separate integer seed for every
        weak learner.

    Attributes
    ----------
    estimators_ : the fitted sequence of weak learners
    estimator_weights_ : per-estimator combination weight (alpha); always
        ``1.0`` for SAMME.R, which combines class scores directly
    estimator_errors_ : per-estimator weighted training error
    classes_ : sorted unique labels seen during fit
    n_features_in_ : number of feature columns seen during fit
    """

    def __init__(
        self,
        n_estimators: int = 50,
        learning_rate: float = 1.0,
        algorithm: str = "SAMME.R",
        max_depth: int = 1,
        random_state: int | None = None,
    ) -> None:
        """Validate and store the hyper-parameters.

        Raises
        ------
        ValueError
            If ``algorithm`` is unknown, ``n_estimators`` is below 1, or
            ``learning_rate`` is not a positive finite number.
        """
        if algorithm not in ("SAMME", "SAMME.R"):
            raise ValueError("algorithm must be 'SAMME' or 'SAMME.R'.")
        if int(n_estimators) < 1:
            raise ValueError("n_estimators must be >= 1.")
        if learning_rate <= 0:
            raise ValueError("learning_rate must be positive.")
        # NaN compares False against 0 and +inf is "positive", so both pass the
        # check above; either would turn the boosting weights into NaN/inf.
        if not np.isfinite(learning_rate):
            raise ValueError("learning_rate must be finite.")
        self.n_estimators = int(n_estimators)
        self.learning_rate = float(learning_rate)
        self.algorithm = algorithm
        self.max_depth = int(max_depth)
        self.random_state = random_state

        self.estimators_: list[DecisionTreeClassifier] = []
        self.estimator_weights_: FloatArray | None = None
        self.estimator_errors_: FloatArray | None = None
        self.classes_: IntArray | None = None
        self.n_features_in_: int | None = None

    # -- public API -----------------------------------------------------------

    def fit(self, X: ArrayLike, y: ArrayLike) -> AdaBoostClassifier:
        """Fit the boosted ensemble on ``X`` / ``y`` and return ``self``.

        Raises
        ------
        ValueError
            If the inputs fail validation or ``y`` holds fewer than 2 classes.
        RuntimeError
            (SAMME only) If the very first weak learner is no better than
            random guessing.
        """
        X_arr, y_raw = validate_xy(X, y)
        classes, y_idx = np.unique(y_raw, return_inverse=True)
        n_classes = classes.size
        if n_classes < 2:
            raise ValueError("AdaBoostClassifier requires at least 2 classes.")
        y_idx = y_idx.astype(np.int64)
        n = X_arr.shape[0]
        rng = np.random.default_rng(self.random_state)

        # Touch fitted state only once the inputs are known to be usable, and
        # clear all of it together so that a failed refit can never pair new
        # classes_ with estimators or weights left over from an earlier fit.
        self.classes_ = classes
        self.n_features_in_ = X_arr.shape[1]
        self.estimators_ = []
        self.estimator_weights_ = None
        self.estimator_errors_ = None

        sample_weight = np.full(n, 1.0 / n, dtype=np.float64)
        weights: list[float] = []
        errors: list[float] = []

        discrete = self.algorithm == "SAMME"
        if not discrete:
            # Symmetric {-1/(K-1), 1} class coding and the scalar factor of the
            # SAMME.R weight update; both are identical in every round.
            y_coding = np.full((n, n_classes), -1.0 / (n_classes - 1))
            y_coding[np.arange(n), y_idx] = 1.0
            real_scale = -self.learning_rate * (n_classes - 1) / n_classes

        for m in range(self.n_estimators):
            stump = DecisionTreeClassifier(
                max_depth=self.max_depth, random_state=int(rng.integers(0, 2**31 - 1))
            )
            stump.fit(X_arr, y_idx, sample_weight=sample_weight)
            is_last_round = m == self.n_estimators - 1

            if discrete:
                incorrect = stump.predict(X_arr) != y_idx
                err = float(np.average(incorrect, weights=sample_weight))

                # A learner at or below chance level would get alpha <= 0.
                if err >= 1.0 - 1.0 / n_classes:
                    if not self.estimators_:
                        raise RuntimeError(
                            "BaseEstimator is worse than random guessing on the first "
                            "boosting round; AdaBoost cannot be fit."
                        )
                    break

                # Clip so that a perfect learner yields a large finite alpha.
                err_clipped = float(np.clip(err, _EPS, 1.0 - _EPS))
                alpha = self.learning_rate * (
                    np.log((1.0 - err_clipped) / err_clipped) + np.log(n_classes - 1)
                )

                self.estimators_.append(stump)
                weights.append(float(alpha))
                errors.append(err)

                if err <= 0.0 or is_last_round:
                    break
                # Up-weight only the samples this learner misclassified.
                sample_weight = sample_weight * np.exp(alpha * incorrect)

            else:  # SAMME.R
                proba = self._safe_proba(stump, X_arr, n_classes)
                incorrect = np.argmax(proba, axis=1) != y_idx
                err = float(np.average(incorrect, weights=sample_weight))

                self.estimators_.append(stump)
                weights.append(1.0)  # SAMME.R combines via h(x) directly, not a scalar alpha
                errors.append(err)

                if err <= 0.0 or is_last_round:
                    break

                contrib = real_scale * np.sum(y_coding * np.log(proba), axis=1)
                sample_weight = sample_weight * np.exp(contrib)

            # Floor, then renormalise so the weights remain a distribution.
            sample_weight = np.maximum(sample_weight, _EPS)
            sample_weight /= sample_weight.sum()

        self.estimator_weights_ = np.array(weights, dtype=np.float64)
        self.estimator_errors_ = np.array(errors, dtype=np.float64)
        return self

    def decision_function(self, X: ArrayLike) -> FloatArray:
        """Return the per-class ensemble score, shape ``(n_samples, n_classes)``.

        Raises
        ------
        RuntimeError
            If the model has not been fitted.
        ValueError
            If ``X`` is not 2D or its column count differs from the one seen
            during fit.
        """
        X_arr = self._check_predict_input(X, "decision_function")
        scores = np.zeros((X_arr.shape[0], self.classes_.size), dtype=np.float64)
        for stump, alpha in zip(self.estimators_, self.estimator_weights_, strict=True):
            self._add_stage_scores(scores, stump, alpha, X_arr)
        return scores

    def predict(self, X: ArrayLike) -> NDArray:
        """Return the label with the highest ensemble score for each row of ``X``."""
        scores = self.decision_function(X)
        return self.classes_[np.argmax(scores, axis=1)]

    def predict_proba(self, X: ArrayLike) -> FloatArray:
        """Softmax of the (rescaled) ensemble decision scores."""
        scores = self.decision_function(X)
        n_classes = self.classes_.size
        scaled = scores / max(1, n_classes - 1)
        # Subtract the row maximum before exponentiating to avoid overflow.
        e = np.exp(scaled - scaled.max(axis=1, keepdims=True))
        return e / e.sum(axis=1, keepdims=True)

    def staged_predict(self, X: ArrayLike):
        """Yield the ensemble's predicted labels after each boosting round.

        This is a generator, so the fitted / shape checks run when the first
        item is requested rather than when the method is called.
        """
        X_arr = self._check_predict_input(X, "staged_predict")
        cum_scores = np.zeros((X_arr.shape[0], self.classes_.size), dtype=np.float64)
        for stump, alpha in zip(self.estimators_, self.estimator_weights_, strict=True):
            self._add_stage_scores(cum_scores, stump, alpha, X_arr)
            yield self.classes_[np.argmax(cum_scores, axis=1)]

    def score(self, X: ArrayLike, y: ArrayLike) -> float:
        """Return the mean accuracy of ``predict(X)`` against ``y``."""
        X_arr, y_arr = validate_xy(X, y)
        return float(np.mean(self.predict(X_arr) == y_arr))

    # -- internals --------------------------------------------------------------

    def _check_predict_input(self, X: ArrayLike, caller: str) -> FloatArray:
        """Require a fitted model and an ``X`` matching the training width."""
        if not self.estimators_:
            raise RuntimeError(f"Call fit() before {caller}().")
        X_arr = validate_x(X)
        if self.n_features_in_ is not None and X_arr.shape[1] != self.n_features_in_:
            raise ValueError(
                f"X has {X_arr.shape[1]} features but AdaBoostClassifier was "
                f"fitted with {self.n_features_in_} features."
            )
        return X_arr

    def _add_stage_scores(
        self, scores: FloatArray, stump: DecisionTreeClassifier, alpha: float, X_arr: FloatArray
    ) -> None:
        """Add one weak learner's contribution to ``scores`` in place."""
        n_classes = scores.shape[1]
        if self.algorithm == "SAMME":
            # Discrete vote: alpha goes to the single class the learner predicts.
            pred = stump.predict(X_arr)
            scores[np.arange(X_arr.shape[0]), pred] += alpha
        else:
            # SAMME.R: centred log-probabilities, scaled by (K - 1).
            logp = np.log(self._safe_proba(stump, X_arr, n_classes))
            scores += (n_classes - 1) * (logp - logp.mean(axis=1, keepdims=True))

    @staticmethod
    def _safe_proba(stump: DecisionTreeClassifier, X: FloatArray, n_classes: int) -> FloatArray:
        """Map a stump's predict_proba (over its own classes_ subset, which may
        be missing classes absent from its bootstrap/weighted sample) into a
        full, strictly-positive (n_samples, n_classes) probability matrix."""
        p_sub = stump.predict_proba(X)
        # Start every class at the floor so that np.log never sees a zero.
        proba = np.full((X.shape[0], n_classes), _EPS)
        proba[:, stump.classes_] = np.maximum(p_sub, _EPS)
        proba /= proba.sum(axis=1, keepdims=True)
        return proba