scratchkit 0.2.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (68) hide show
  1. mlscratch/__init__.py +56 -0
  2. mlscratch/__main__.py +118 -0
  3. mlscratch/bayesian/__init__.py +53 -0
  4. mlscratch/bayesian/bayesian_linear_regression.py +171 -0
  5. mlscratch/bayesian/bayesian_network.py +248 -0
  6. mlscratch/bayesian/bayesian_nn.py +315 -0
  7. mlscratch/bayesian/gaussian_process.py +207 -0
  8. mlscratch/bayesian/hmm.py +277 -0
  9. mlscratch/bayesian/init.py +52 -0
  10. mlscratch/bayesian/kalman_filter.py +182 -0
  11. mlscratch/bayesian/naive_bayes.py +209 -0
  12. mlscratch/metrics/__init__.py +59 -0
  13. mlscratch/metrics/classification.py +365 -0
  14. mlscratch/metrics/regression.py +79 -0
  15. mlscratch/neural/__init__.py +121 -0
  16. mlscratch/neural/attention.py +420 -0
  17. mlscratch/neural/autoencoder.py +543 -0
  18. mlscratch/neural/boltzmann.py +231 -0
  19. mlscratch/neural/cnn.py +593 -0
  20. mlscratch/neural/cvnn.py +322 -0
  21. mlscratch/neural/gan.py +364 -0
  22. mlscratch/neural/hopfield.py +193 -0
  23. mlscratch/neural/perceptron.py +398 -0
  24. mlscratch/neural/rbf_network.py +230 -0
  25. mlscratch/neural/recurrent.py +569 -0
  26. mlscratch/preprocessing/__init__.py +38 -0
  27. mlscratch/preprocessing/encoders.py +140 -0
  28. mlscratch/preprocessing/model_selection.py +119 -0
  29. mlscratch/preprocessing/polynomial.py +105 -0
  30. mlscratch/preprocessing/scalers.py +220 -0
  31. mlscratch/py.typed +0 -0
  32. mlscratch/reinforcement/__init__.py +59 -0
  33. mlscratch/reinforcement/ddpg.py +363 -0
  34. mlscratch/reinforcement/dqn.py +319 -0
  35. mlscratch/reinforcement/ppo.py +452 -0
  36. mlscratch/reinforcement/q_learning.py +352 -0
  37. mlscratch/reinforcement/sac.py +382 -0
  38. mlscratch/reinforcement/utils.py +594 -0
  39. mlscratch/supervised/__init__.py +76 -0
  40. mlscratch/supervised/_validation.py +50 -0
  41. mlscratch/supervised/adaboost.py +255 -0
  42. mlscratch/supervised/decision_tree.py +495 -0
  43. mlscratch/supervised/gradient_boosting.py +354 -0
  44. mlscratch/supervised/knn.py +234 -0
  45. mlscratch/supervised/lasso_regression.py +125 -0
  46. mlscratch/supervised/linear_models.py +459 -0
  47. mlscratch/supervised/linear_regression.py +197 -0
  48. mlscratch/supervised/logistic_regression.py +119 -0
  49. mlscratch/supervised/naive_bayes.py +113 -0
  50. mlscratch/supervised/random_forest.py +321 -0
  51. mlscratch/supervised/ridge_regression.py +93 -0
  52. mlscratch/supervised/svm.py +356 -0
  53. mlscratch/unsupervised/__init__.py +39 -0
  54. mlscratch/unsupervised/apriori.py +178 -0
  55. mlscratch/unsupervised/dbscan.py +141 -0
  56. mlscratch/unsupervised/gmm.py +204 -0
  57. mlscratch/unsupervised/hierarchical_clustering.py +137 -0
  58. mlscratch/unsupervised/ica.py +167 -0
  59. mlscratch/unsupervised/kmeans.py +135 -0
  60. mlscratch/unsupervised/kmedoids.py +133 -0
  61. mlscratch/unsupervised/pca.py +103 -0
  62. mlscratch/unsupervised/tsne.py +200 -0
  63. scratchkit-0.2.0.dist-info/METADATA +241 -0
  64. scratchkit-0.2.0.dist-info/RECORD +68 -0
  65. scratchkit-0.2.0.dist-info/WHEEL +5 -0
  66. scratchkit-0.2.0.dist-info/entry_points.txt +2 -0
  67. scratchkit-0.2.0.dist-info/licenses/LICENSE +201 -0
  68. scratchkit-0.2.0.dist-info/top_level.txt +1 -0
@@ -0,0 +1,321 @@
1
+ r"""
2
+ Random Forest
3
+ =============
4
+ Bootstrap-aggregated ("bagged") ensembles of :class:`DecisionTreeClassifier`
5
+ / :class:`DecisionTreeRegressor` trees, decorrelated by also restricting
6
+ each tree to a random subset of features (the "random subspace" method).
7
+
8
+ Algorithm
9
+ ---------
10
+ For each of ``n_estimators`` trees:
11
+
12
+ 1. Draw a bootstrap sample of ``n`` rows with replacement (if
13
+ ``bootstrap=True``).
14
+ 2. Draw ``max_features`` columns without replacement.
15
+ 3. Fit a full (or depth-limited) tree on that bootstrap sample restricted
16
+ to those columns.
17
+
18
+ ``RandomForestClassifier`` combines trees by averaging their
19
+ ``predict_proba`` output (soft voting) and taking the arg-max. A tree
20
+ whose bootstrap draw happened to miss a class entirely is handled
21
+ naturally: that tree contributes a probability of zero for the missing
22
+ class rather than an undefined value.
23
+
24
+ ``RandomForestRegressor`` combines trees by averaging their scalar
25
+ predictions.
26
+
27
+ Out-of-bag (OOB) estimation
28
+ ----------------------------
29
+ When ``oob_score=True``, each tree's prediction is also collected for
30
+ the ``~37%`` of rows it never trained on (the rows not drawn by its
31
+ bootstrap sample), giving an unbiased estimate of generalisation
32
+ performance without held-out data.
33
+
34
+ Complexity
35
+ ----------
36
+ - Training : O(n_estimators * n d log n)
37
+ - Inference: O(n_estimators * depth)
38
+ """
39
+
40
+ from __future__ import annotations
41
+
42
+ import numpy as np
43
+ from numpy.typing import ArrayLike, NDArray
44
+
45
+ from ._validation import validate_x, validate_xy
46
+ from .decision_tree import DecisionTreeClassifier, DecisionTreeRegressor
47
+
48
+ FloatArray = NDArray[np.float64]
49
+ IntArray = NDArray[np.int64]
50
+
51
+ _EPS = 1e-12
52
+
53
+
54
def _resolve_max_features(max_features: int | float | str | None, n_features: int) -> int:
    """Translate a ``max_features`` specification into a concrete column count.

    Parameters
    ----------
    max_features : int | float | str | None
        ``None`` selects every feature.  ``'sqrt'`` / ``'log2'`` select the
        truncated square root / base-2 logarithm of ``n_features``.  A float
        (Python or NumPy floating scalar) in (0, 1] selects that fraction,
        rounded with the built-in ``round``.  Anything else is converted with
        ``int()`` and clamped to ``[1, n_features]``.
    n_features : int
        Total number of available columns.

    Returns
    -------
    int
        Number of columns to draw.  Every branch except ``None`` returns at
        least 1; ``None`` returns ``n_features`` unchanged.

    Raises
    ------
    ValueError
        If a string other than ``'sqrt'`` / ``'log2'`` is given, or a float
        lies outside (0, 1].
    """
    if max_features is None:
        return n_features

    if isinstance(max_features, str):
        if max_features == "sqrt":
            return max(1, int(np.sqrt(n_features)))
        if max_features == "log2":
            return max(1, int(np.log2(n_features)))
        raise ValueError("max_features must be None, int, float, 'sqrt', or 'log2'.")

    # NumPy floating scalars such as np.float32 are not subclasses of the
    # built-in float.  Without the np.floating check they would reach the
    # integer branch below, be truncated to 0 and silently clamped to 1.
    if isinstance(max_features, (float, np.floating)):
        fraction = float(max_features)
        if not (0.0 < fraction <= 1.0):
            raise ValueError("max_features as a float must be in (0, 1].")
        return max(1, int(round(fraction * n_features)))

    # Integer-like request: clamp into the valid range instead of raising.
    return max(1, min(int(max_features), n_features))
68
+
69
+
70
+ # ──────────────────────────────────────────────────────────────────────────
71
+ # RandomForestClassifier
72
+ # ──────────────────────────────────────────────────────────────────────────
73
+
74
+
75
class RandomForestClassifier:
    """Bagged ensemble of decision-tree classifiers with feature subsampling.

    Each tree is fitted on its own random subset of columns and, when
    ``bootstrap=True``, on a bootstrap resample of the rows.  Predictions
    average the per-tree class probabilities and take the arg-max.

    Parameters
    ----------
    n_estimators : int, default=100
        Number of trees; must be at least 1.
    max_depth : int | None, default=None
        Forwarded to each tree.
    min_samples_split : int, default=2
        Forwarded to each tree.
    min_samples_leaf : int, default=1
        Forwarded to each tree.
    criterion : str, default='gini'
        ``'gini'`` or ``'entropy'``, forwarded to each tree.
    max_features : int | float | str | None, default='sqrt'
        Number of features considered by each tree: an int (exact count),
        a float in (0, 1] (fraction), ``'sqrt'``, ``'log2'``, or ``None``
        (use all features).
    bootstrap : bool, default=True
        Whether each tree is trained on a bootstrap resample.
    oob_score : bool, default=False
        Whether to compute an out-of-bag accuracy estimate (``oob_score_``).
    random_state : int | None, default=None
        Seed handed to ``numpy.random.default_rng`` at the start of ``fit``.

    Attributes
    ----------
    estimators_ : list of (tree, feature_indices) tuples
    classes_ : sorted unique labels seen during fit
    n_features_in_ : number of columns seen during fit
    feature_importances_ : mean impurity-decrease importance across trees
    oob_score_ : float, only set when ``oob_score=True``

    Raises
    ------
    ValueError
        If ``n_estimators < 1`` or if ``oob_score=True`` while
        ``bootstrap=False``.
    """

    def __init__(
        self,
        n_estimators: int = 100,
        max_depth: int | None = None,
        min_samples_split: int = 2,
        min_samples_leaf: int = 1,
        criterion: str = "gini",
        max_features: int | float | str | None = "sqrt",
        bootstrap: bool = True,
        oob_score: bool = False,
        random_state: int | None = None,
    ) -> None:
        if int(n_estimators) < 1:
            raise ValueError("n_estimators must be >= 1.")
        # Without resampling every row is in-bag, so there is nothing to score.
        if oob_score and not bootstrap:
            raise ValueError("oob_score requires bootstrap=True.")
        self.n_estimators = int(n_estimators)
        self.max_depth = max_depth
        self.min_samples_split = int(min_samples_split)
        self.min_samples_leaf = int(min_samples_leaf)
        self.criterion = criterion
        self.max_features = max_features
        self.bootstrap = bootstrap
        self.oob_score = oob_score
        self.random_state = random_state

        # Fitted state; populated by fit().
        self.estimators_: list[tuple[DecisionTreeClassifier, IntArray]] = []
        self.classes_: IntArray | None = None
        self.n_features_in_: int | None = None
        self.feature_importances_: FloatArray | None = None
        self.oob_score_: float | None = None

    def fit(self, X: ArrayLike, y: ArrayLike) -> RandomForestClassifier:
        """Grow ``n_estimators`` trees on ``X`` / ``y`` and return ``self``.

        Labels are encoded to ``0..n_classes-1`` with ``np.unique`` before
        being handed to the trees.  When ``oob_score`` is enabled, the
        out-of-bag accuracy is stored in ``oob_score_`` (``nan`` if no row was
        ever out of bag).
        """
        X_arr, y_raw = validate_xy(X, y)
        self.classes_, y_idx = np.unique(y_raw, return_inverse=True)
        y_idx = y_idx.astype(np.int64)
        n_samples, n_features = X_arr.shape
        self.n_features_in_ = n_features
        n_classes = self.classes_.size
        n_feat_sub = _resolve_max_features(self.max_features, n_features)

        rng = np.random.default_rng(self.random_state)
        self.estimators_ = []
        # Discard any estimate left over from a previous fit so a refit that
        # does not compute OOB cannot expose a stale score.
        self.oob_score_ = None
        importances = np.zeros(n_features, dtype=np.float64)

        oob_proba = np.zeros((n_samples, n_classes)) if self.oob_score else None
        oob_count = np.zeros(n_samples, dtype=np.int64) if self.oob_score else None

        for _ in range(self.n_estimators):
            # Draw order (rows, then columns) determines the random stream and
            # therefore reproducibility for a given random_state.
            sample_idx = (
                rng.integers(0, n_samples, n_samples) if self.bootstrap else np.arange(n_samples)
            )
            feat_idx = rng.choice(n_features, size=n_feat_sub, replace=False)

            tree = DecisionTreeClassifier(
                max_depth=self.max_depth,
                min_samples_split=self.min_samples_split,
                min_samples_leaf=self.min_samples_leaf,
                criterion=self.criterion,
            )
            tree.fit(X_arr[sample_idx][:, feat_idx], y_idx[sample_idx])
            self.estimators_.append((tree, feat_idx))
            # Scatter the tree's importances back to the original column positions.
            importances[feat_idx] += tree.feature_importances_

            if self.oob_score:
                in_bag = np.zeros(n_samples, dtype=bool)
                in_bag[sample_idx] = True
                oob_idx = np.flatnonzero(~in_bag)
                if oob_idx.size:
                    proba = tree.predict_proba(X_arr[oob_idx][:, feat_idx])
                    # tree.classes_ is used as column indices here; presumably it
                    # holds the encoded labels this tree saw, so classes missing
                    # from its bootstrap draw simply receive no mass.
                    oob_proba[np.ix_(oob_idx, tree.classes_)] += proba
                    oob_count[oob_idx] += 1

        importances /= self.n_estimators
        total = importances.sum()
        # Normalise to sum to one unless the total is effectively zero.
        self.feature_importances_ = importances / total if total > _EPS else importances

        if self.oob_score:
            has_oob = oob_count > 0
            if np.any(has_oob):
                pred_idx = np.argmax(oob_proba[has_oob], axis=1)
                self.oob_score_ = float(np.mean(pred_idx == y_idx[has_oob]))
            else:
                self.oob_score_ = float("nan")
        return self

    def predict_proba(self, X: ArrayLike) -> FloatArray:
        """Return the tree-averaged class probabilities for each row of ``X``.

        Columns follow the order of ``classes_``.

        Raises
        ------
        RuntimeError
            If the forest has not been fitted.
        ValueError
            If ``X`` does not have the number of columns seen during ``fit``.
        """
        if not self.estimators_:
            raise RuntimeError("Call fit() before predict_proba().")
        X_arr = validate_x(X)
        # A wider matrix would otherwise be indexed with the training-time
        # feature indices and yield predictions from the wrong columns.
        if self.n_features_in_ is not None and X_arr.shape[1] != self.n_features_in_:
            raise ValueError(
                f"X has {X_arr.shape[1]} features, but this forest was fitted "
                f"with {self.n_features_in_}."
            )
        n_classes = self.classes_.size
        proba = np.zeros((X_arr.shape[0], n_classes), dtype=np.float64)
        for tree, feat_idx in self.estimators_:
            p = tree.predict_proba(X_arr[:, feat_idx])
            proba[:, tree.classes_] += p
        proba /= len(self.estimators_)
        return proba

    def predict(self, X: ArrayLike) -> NDArray:
        """Return the label with the highest averaged probability per row."""
        proba = self.predict_proba(X)
        return self.classes_[np.argmax(proba, axis=1)]

    def score(self, X: ArrayLike, y: ArrayLike) -> float:
        """Return the fraction of rows in ``X`` whose prediction equals ``y``."""
        X_arr, y_arr = validate_xy(X, y)
        return float(np.mean(self.predict(X_arr) == y_arr))
209
+
210
+
211
+ # ──────────────────────────────────────────────────────────────────────────
212
+ # RandomForestRegressor
213
+ # ──────────────────────────────────────────────────────────────────────────
214
+
215
+
216
class RandomForestRegressor:
    """Bagged ensemble of decision-tree regressors with feature subsampling.

    Parameters mirror :class:`RandomForestClassifier`, except that
    ``max_features`` defaults to ``1.0`` (each tree receives every feature,
    i.e. plain bagging) and there is no ``criterion`` argument: trees are
    built with :class:`DecisionTreeRegressor`'s own splitting rule.

    Parameters
    ----------
    n_estimators : int, default=100
        Number of trees; must be at least 1.
    max_depth : int | None, default=None
        Forwarded to each tree.
    min_samples_split : int, default=2
        Forwarded to each tree.
    min_samples_leaf : int, default=1
        Forwarded to each tree.
    max_features : int | float | str | None, default=1.0
        Number of features drawn for each tree: an int (exact count), a
        float in (0, 1] (fraction), ``'sqrt'``, ``'log2'``, or ``None``.
    bootstrap : bool, default=True
        Whether each tree is trained on a bootstrap resample.
    oob_score : bool, default=False
        Whether to compute an out-of-bag R² estimate (``oob_score_``).
    random_state : int | None, default=None
        Seed handed to ``numpy.random.default_rng`` at the start of ``fit``.

    Attributes
    ----------
    estimators_ : list of (tree, feature_indices) tuples
    n_features_in_ : number of columns seen during fit
    feature_importances_ : mean importance across trees, normalised to sum to one
    oob_score_ : float, only set when ``oob_score=True``

    Raises
    ------
    ValueError
        If ``n_estimators < 1`` or if ``oob_score=True`` while
        ``bootstrap=False``.
    """

    def __init__(
        self,
        n_estimators: int = 100,
        max_depth: int | None = None,
        min_samples_split: int = 2,
        min_samples_leaf: int = 1,
        max_features: int | float | str | None = 1.0,
        bootstrap: bool = True,
        oob_score: bool = False,
        random_state: int | None = None,
    ) -> None:
        if int(n_estimators) < 1:
            raise ValueError("n_estimators must be >= 1.")
        # Without resampling every row is in-bag, so there is nothing to score.
        if oob_score and not bootstrap:
            raise ValueError("oob_score requires bootstrap=True.")
        self.n_estimators = int(n_estimators)
        self.max_depth = max_depth
        self.min_samples_split = int(min_samples_split)
        self.min_samples_leaf = int(min_samples_leaf)
        self.max_features = max_features
        self.bootstrap = bootstrap
        self.oob_score = oob_score
        self.random_state = random_state

        # Fitted state; populated by fit().
        self.estimators_: list[tuple[DecisionTreeRegressor, IntArray]] = []
        self.n_features_in_: int | None = None
        self.feature_importances_: FloatArray | None = None
        self.oob_score_: float | None = None

    def fit(self, X: ArrayLike, y: ArrayLike) -> RandomForestRegressor:
        """Grow ``n_estimators`` trees on ``X`` / ``y`` and return ``self``.

        When ``oob_score`` is enabled, the out-of-bag R² is stored in
        ``oob_score_``: ``0.0`` if the out-of-bag targets have (near-)zero
        variance and ``nan`` if no row was ever out of bag.
        """
        X_arr, y_arr = validate_xy(X, y)
        y_arr = y_arr.astype(np.float64)
        n_samples, n_features = X_arr.shape
        self.n_features_in_ = n_features
        n_feat_sub = _resolve_max_features(self.max_features, n_features)

        rng = np.random.default_rng(self.random_state)
        self.estimators_ = []
        # Discard any estimate left over from a previous fit so a refit that
        # does not compute OOB cannot expose a stale score.
        self.oob_score_ = None
        importances = np.zeros(n_features, dtype=np.float64)

        oob_sum = np.zeros(n_samples) if self.oob_score else None
        oob_count = np.zeros(n_samples, dtype=np.int64) if self.oob_score else None

        for _ in range(self.n_estimators):
            # Draw order (rows, then columns) determines the random stream and
            # therefore reproducibility for a given random_state.
            sample_idx = (
                rng.integers(0, n_samples, n_samples) if self.bootstrap else np.arange(n_samples)
            )
            feat_idx = rng.choice(n_features, size=n_feat_sub, replace=False)

            tree = DecisionTreeRegressor(
                max_depth=self.max_depth,
                min_samples_split=self.min_samples_split,
                min_samples_leaf=self.min_samples_leaf,
            )
            tree.fit(X_arr[sample_idx][:, feat_idx], y_arr[sample_idx])
            self.estimators_.append((tree, feat_idx))
            # Scatter the tree's importances back to the original column positions.
            importances[feat_idx] += tree.feature_importances_

            if self.oob_score:
                in_bag = np.zeros(n_samples, dtype=bool)
                in_bag[sample_idx] = True
                oob_idx = np.flatnonzero(~in_bag)
                if oob_idx.size:
                    oob_sum[oob_idx] += tree.predict(X_arr[oob_idx][:, feat_idx])
                    oob_count[oob_idx] += 1

        importances /= self.n_estimators
        total = importances.sum()
        # Normalise to sum to one unless the total is effectively zero.
        self.feature_importances_ = importances / total if total > _EPS else importances

        if self.oob_score:
            has_oob = oob_count > 0
            if np.any(has_oob):
                # Average only over the trees for which each row was out of bag.
                oob_pred = oob_sum[has_oob] / oob_count[has_oob]
                y_true = y_arr[has_oob]
                ss_res = np.sum((y_true - oob_pred) ** 2)
                ss_tot = np.sum((y_true - y_true.mean()) ** 2)
                self.oob_score_ = float(1.0 - ss_res / ss_tot) if ss_tot > _EPS else 0.0
            else:
                self.oob_score_ = float("nan")
        return self

    def predict(self, X: ArrayLike) -> FloatArray:
        """Return the mean of the per-tree predictions for each row of ``X``.

        Raises
        ------
        RuntimeError
            If the forest has not been fitted.
        ValueError
            If ``X`` does not have the number of columns seen during ``fit``.
        """
        if not self.estimators_:
            raise RuntimeError("Call fit() before predict().")
        X_arr = validate_x(X)
        # A wider matrix would otherwise be indexed with the training-time
        # feature indices and yield predictions from the wrong columns.
        if self.n_features_in_ is not None and X_arr.shape[1] != self.n_features_in_:
            raise ValueError(
                f"X has {X_arr.shape[1]} features, but this forest was fitted "
                f"with {self.n_features_in_}."
            )
        preds = np.zeros(X_arr.shape[0], dtype=np.float64)
        for tree, feat_idx in self.estimators_:
            preds += tree.predict(X_arr[:, feat_idx])
        return preds / len(self.estimators_)

    def score(self, X: ArrayLike, y: ArrayLike) -> float:
        """Return R² of the forest on ``X`` / ``y`` (``0.0`` for near-constant ``y``)."""
        X_arr, y_arr = validate_xy(X, y)
        preds = self.predict(X_arr)
        ss_res = np.sum((y_arr - preds) ** 2)
        ss_tot = np.sum((y_arr - y_arr.mean()) ** 2)
        return float(1.0 - ss_res / ss_tot) if ss_tot > _EPS else 0.0
@@ -0,0 +1,93 @@
1
+ """
2
+ Ridge Regression
3
+ ================
4
+
5
+ Ridge regression using the closed-form regularized normal equations.
6
+ """
7
+
8
+ from __future__ import annotations
9
+
10
+ import numpy as np
11
+ from numpy.typing import ArrayLike, NDArray
12
+
13
+ FloatArray = NDArray[np.float64]
14
+
15
+
16
def _validate_regression_inputs(
    X: ArrayLike, y: ArrayLike,
) -> tuple[FloatArray, FloatArray]:
    """Coerce a feature matrix and a target vector to float arrays and check them.

    Parameters
    ----------
    X : ArrayLike
        Feature matrix; must become a 2D array.
    y : ArrayLike
        Targets; flattened to one dimension after conversion.

    Returns
    -------
    tuple[FloatArray, FloatArray]
        The converted ``X`` and the flattened ``y``.

    Raises
    ------
    ValueError
        If ``X`` is not 2D, or if ``X`` and ``y`` disagree on the number of
        samples.
    """
    # Both conversions happen before any shape check.
    features = np.asarray(X, dtype=float)
    targets = np.asarray(y, dtype=float).flatten()

    if features.ndim != 2:
        raise ValueError("X must be a 2D array of shape (n_samples, n_features).")

    n_rows, n_targets = features.shape[0], targets.shape[0]
    if n_rows != n_targets:
        raise ValueError(f"X has {n_rows} samples but y has {n_targets}.")

    return features, targets
28
+
29
+
30
class RidgeRegression:
    """Linear regression with an L2 coefficient penalty, solved in closed form.

    Parameters
    ----------
    alpha : float, default=1.0
        Weight of the L2 penalty on the coefficients.
    add_intercept : bool, default=True
        If true, an intercept is fitted and left out of the penalty.

    Attributes
    ----------
    coef_ : FloatArray
        One fitted coefficient per feature.
    intercept_ : float
        Fitted intercept (``0.0`` when ``add_intercept`` is false).
    """

    def __init__(self, alpha: float = 1.0, add_intercept: bool = True) -> None:
        self.alpha = float(alpha)
        self.add_intercept = add_intercept
        # Both stay None until fit() has run.
        self.coef_: FloatArray | None = None
        self.intercept_: float | None = None

    def fit(self, X: ArrayLike, y: ArrayLike) -> "RidgeRegression":
        """Solve the regularised normal equations and store the solution."""
        design, target = _validate_regression_inputs(X, y)
        with_bias = self.add_intercept

        if with_bias:
            # A leading column of ones carries the intercept.
            design = np.column_stack([np.ones(design.shape[0]), design])

        penalty = np.eye(design.shape[1])
        if with_bias:
            # Zero the first diagonal entry so the intercept is not shrunk.
            penalty[0, 0] = 0.0

        gram = design.T @ design + self.alpha * penalty
        moment = design.T @ target
        weights = np.linalg.solve(gram, moment)

        if not with_bias:
            self.intercept_ = 0.0
            self.coef_ = weights.astype(np.float64)
            return self

        self.intercept_ = float(weights[0])
        self.coef_ = weights[1:].astype(np.float64)
        return self

    def predict(self, X: ArrayLike) -> FloatArray:
        """Return ``X @ coef_ + intercept_`` for a 2D ``X``."""
        is_fitted = self.coef_ is not None and self.intercept_ is not None
        if not is_fitted:
            raise RuntimeError("Call fit() before predict().")

        samples = np.asarray(X, dtype=float)
        if samples.ndim != 2:
            raise ValueError("X must be a 2D array.")

        linear_part = samples @ self.coef_
        return (linear_part + self.intercept_).astype(np.float64)

    def score(self, X: ArrayLike, y: ArrayLike) -> float:
        """Return R² on ``X`` / ``y``; ``0.0`` when ``y`` has no variance."""
        X_arr, y_true = _validate_regression_inputs(X, y)
        residual = y_true - self.predict(X_arr)
        ss_res = np.sum(residual ** 2)
        ss_tot = np.sum((y_true - np.mean(y_true)) ** 2)
        if ss_tot > 0:
            return float(1.0 - ss_res / ss_tot)
        return 0.0