scratchkit 0.2.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (68) hide show
  1. mlscratch/__init__.py +56 -0
  2. mlscratch/__main__.py +118 -0
  3. mlscratch/bayesian/__init__.py +53 -0
  4. mlscratch/bayesian/bayesian_linear_regression.py +171 -0
  5. mlscratch/bayesian/bayesian_network.py +248 -0
  6. mlscratch/bayesian/bayesian_nn.py +315 -0
  7. mlscratch/bayesian/gaussian_process.py +207 -0
  8. mlscratch/bayesian/hmm.py +277 -0
  9. mlscratch/bayesian/init.py +52 -0
  10. mlscratch/bayesian/kalman_filter.py +182 -0
  11. mlscratch/bayesian/naive_bayes.py +209 -0
  12. mlscratch/metrics/__init__.py +59 -0
  13. mlscratch/metrics/classification.py +365 -0
  14. mlscratch/metrics/regression.py +79 -0
  15. mlscratch/neural/__init__.py +121 -0
  16. mlscratch/neural/attention.py +420 -0
  17. mlscratch/neural/autoencoder.py +543 -0
  18. mlscratch/neural/boltzmann.py +231 -0
  19. mlscratch/neural/cnn.py +593 -0
  20. mlscratch/neural/cvnn.py +322 -0
  21. mlscratch/neural/gan.py +364 -0
  22. mlscratch/neural/hopfield.py +193 -0
  23. mlscratch/neural/perceptron.py +398 -0
  24. mlscratch/neural/rbf_network.py +230 -0
  25. mlscratch/neural/recurrent.py +569 -0
  26. mlscratch/preprocessing/__init__.py +38 -0
  27. mlscratch/preprocessing/encoders.py +140 -0
  28. mlscratch/preprocessing/model_selection.py +119 -0
  29. mlscratch/preprocessing/polynomial.py +105 -0
  30. mlscratch/preprocessing/scalers.py +220 -0
  31. mlscratch/py.typed +0 -0
  32. mlscratch/reinforcement/__init__.py +59 -0
  33. mlscratch/reinforcement/ddpg.py +363 -0
  34. mlscratch/reinforcement/dqn.py +319 -0
  35. mlscratch/reinforcement/ppo.py +452 -0
  36. mlscratch/reinforcement/q_learning.py +352 -0
  37. mlscratch/reinforcement/sac.py +382 -0
  38. mlscratch/reinforcement/utils.py +594 -0
  39. mlscratch/supervised/__init__.py +76 -0
  40. mlscratch/supervised/_validation.py +50 -0
  41. mlscratch/supervised/adaboost.py +255 -0
  42. mlscratch/supervised/decision_tree.py +495 -0
  43. mlscratch/supervised/gradient_boosting.py +354 -0
  44. mlscratch/supervised/knn.py +234 -0
  45. mlscratch/supervised/lasso_regression.py +125 -0
  46. mlscratch/supervised/linear_models.py +459 -0
  47. mlscratch/supervised/linear_regression.py +197 -0
  48. mlscratch/supervised/logistic_regression.py +119 -0
  49. mlscratch/supervised/naive_bayes.py +113 -0
  50. mlscratch/supervised/random_forest.py +321 -0
  51. mlscratch/supervised/ridge_regression.py +93 -0
  52. mlscratch/supervised/svm.py +356 -0
  53. mlscratch/unsupervised/__init__.py +39 -0
  54. mlscratch/unsupervised/apriori.py +178 -0
  55. mlscratch/unsupervised/dbscan.py +141 -0
  56. mlscratch/unsupervised/gmm.py +204 -0
  57. mlscratch/unsupervised/hierarchical_clustering.py +137 -0
  58. mlscratch/unsupervised/ica.py +167 -0
  59. mlscratch/unsupervised/kmeans.py +135 -0
  60. mlscratch/unsupervised/kmedoids.py +133 -0
  61. mlscratch/unsupervised/pca.py +103 -0
  62. mlscratch/unsupervised/tsne.py +200 -0
  63. scratchkit-0.2.0.dist-info/METADATA +241 -0
  64. scratchkit-0.2.0.dist-info/RECORD +68 -0
  65. scratchkit-0.2.0.dist-info/WHEEL +5 -0
  66. scratchkit-0.2.0.dist-info/entry_points.txt +2 -0
  67. scratchkit-0.2.0.dist-info/licenses/LICENSE +201 -0
  68. scratchkit-0.2.0.dist-info/top_level.txt +1 -0
@@ -0,0 +1,197 @@
1
+ """
2
+ Linear Regression
3
+ =================
4
+
5
+ Ordinary Least Squares and mini-batch gradient descent implementations for
6
+ linear regression.
7
+
8
+ The module uses a clear, sklearn-compatible interface with explicit math.
9
+ """
10
+
11
+ from __future__ import annotations
12
+
13
+ from typing import Callable
14
+
15
+ import numpy as np
16
+ from numpy.typing import ArrayLike, NDArray
17
+
18
+ FloatArray = NDArray[np.float64]
19
+
20
+
21
def _validate_regression_inputs(
    X: ArrayLike, y: ArrayLike,
) -> tuple[FloatArray, FloatArray]:
    """Coerce regression inputs to float arrays and check their shapes.

    ``X`` is converted with ``np.asarray(..., dtype=float)`` and must be
    two-dimensional. ``y`` is converted the same way and then flattened to one
    dimension, so a column vector of shape ``(n, 1)`` is accepted.

    Returns
    -------
    tuple[FloatArray, FloatArray]
        The converted feature matrix and the flattened target vector.

    Raises
    ------
    ValueError
        If ``X`` is not 2D, or if the number of rows in ``X`` differs from the
        number of elements in the flattened ``y``.
    """
    features = np.asarray(X, dtype=float)
    targets = np.asarray(y, dtype=float).flatten()

    if features.ndim != 2:
        raise ValueError("X must be a 2D array of shape (n_samples, n_features).")

    n_rows = features.shape[0]
    n_targets = targets.shape[0]
    if n_rows != n_targets:
        raise ValueError(f"X has {n_rows} samples but y has {n_targets}.")

    return features, targets
33
+
34
+
35
class OrdinaryLeastSquares:
    """Ordinary least squares regression solved with ``np.linalg.lstsq``.

    Parameters
    ----------
    add_intercept : bool, default=True
        If True, the model fits an intercept term by prepending a column of ones
        to the design matrix.

    Attributes
    ----------
    coef_ : FloatArray
        Estimated regression coefficients for each feature.
    intercept_ : float
        Estimated bias term (``0.0`` when ``add_intercept`` is False).
    residuals_ : FloatArray
        Training residuals ``y - y_pred`` computed at the end of ``fit``.
    """

    def __init__(self, add_intercept: bool = True) -> None:
        self.add_intercept = add_intercept
        self.coef_: FloatArray | None = None
        self.intercept_: float | None = None
        self.residuals_: FloatArray | None = None

    def fit(self, X: ArrayLike, y: ArrayLike) -> "OrdinaryLeastSquares":
        r"""Fit the linear regression model.

        The least-squares problem is handed to ``np.linalg.lstsq`` rather than
        solved through an explicit matrix inverse:

        .. math::
            \hat{\beta} = \operatorname{argmin}_\beta \|X \beta - y\|_2^2

        Parameters
        ----------
        X : array-like of shape (n_samples, n_features)
        y : array-like with n_samples elements

        Returns
        -------
        self : OrdinaryLeastSquares

        Raises
        ------
        ValueError
            If the inputs fail shape validation.
        """
        X_arr, y_arr = _validate_regression_inputs(X, y)
        if self.add_intercept:
            design = np.column_stack([np.ones(X_arr.shape[0]), X_arr])
        else:
            design = X_arr

        # Only the solution vector is used; residuals are recomputed below so
        # that ``residuals_`` always has one entry per training sample.
        solution = np.linalg.lstsq(design, y_arr, rcond=None)[0]
        if self.add_intercept:
            self.intercept_ = float(solution[0])
            self.coef_ = solution[1:].astype(np.float64)
        else:
            self.intercept_ = 0.0
            self.coef_ = solution.astype(np.float64)

        self.residuals_ = y_arr - (X_arr @ self.coef_ + self.intercept_)
        return self

    def predict(self, X: ArrayLike) -> FloatArray:
        """Predict target values for new data.

        Raises
        ------
        RuntimeError
            If the model has not been fitted.
        ValueError
            If ``X`` is not 2D or its column count differs from the number of
            fitted coefficients.
        """
        if self.coef_ is None or self.intercept_ is None:
            raise RuntimeError("Call fit() before predict().")
        X_arr = np.asarray(X, dtype=float)
        if X_arr.ndim != 2:
            raise ValueError("X must be a 2D array.")
        n_expected = self.coef_.shape[0]
        if X_arr.shape[1] != n_expected:
            # Fail with a readable message instead of a raw matmul error.
            raise ValueError(
                f"X has {X_arr.shape[1]} features but the model was "
                f"fitted with {n_expected}."
            )
        result = X_arr @ self.coef_ + self.intercept_
        return result.astype(np.float64)

    def score(self, X: ArrayLike, y: ArrayLike) -> float:
        """Return the coefficient of determination R² on the given data.

        Returns ``0.0`` when the targets have zero variance, because R² is
        undefined in that case.
        """
        X_arr, y_arr = _validate_regression_inputs(X, y)
        y_pred = self.predict(X_arr)
        ss_res = np.sum((y_arr - y_pred) ** 2)
        ss_tot = np.sum((y_arr - np.mean(y_arr)) ** 2)
        return float(1.0 - ss_res / ss_tot) if ss_tot > 0 else 0.0
107
+
108
+
109
class GradientDescentRegressor:
    r"""Linear regression using mini-batch gradient descent.

    The squared error loss is:

    .. math::
        L(W, b) = \frac{1}{n} \sum_{i=1}^n (y_i - X_i W - b)^2

    The gradient with respect to the weights is:

    .. math::
        \frac{\partial L}{\partial W} = -\frac{2}{n} X^\top (y - XW - b)

    After every epoch the loss on the full training set is evaluated. If it
    rose above the previous epoch's loss, or is not finite, that epoch's
    parameter update is discarded and the learning rate is halved.

    Parameters
    ----------
    learning_rate : float, default=0.01
        Initial step size.
    n_epochs : int, default=1000
        Number of passes over the training data.
    batch_size : int, default=32
        Number of samples per gradient step; must be at least 1.
    random_state : int or None, default=None
        Seed passed to ``np.random.default_rng`` for shuffling.
    verbose : bool, default=False
        If True, print the loss every 100 epochs.

    Attributes
    ----------
    coef_ : FloatArray
        Learned weights, one per feature.
    intercept_ : float
        Learned bias term.
    loss_history_ : list[float]
        Full-data mean squared error recorded after each epoch.
    """

    def __init__(
        self,
        learning_rate: float = 0.01,
        n_epochs: int = 1000,
        batch_size: int = 32,
        random_state: int | None = None,
        verbose: bool = False,
    ) -> None:
        self.learning_rate = learning_rate
        self.n_epochs = n_epochs
        self.batch_size = batch_size
        self.random_state = random_state
        self.verbose = verbose
        self.coef_: FloatArray | None = None
        self.intercept_: float | None = None
        self.loss_history_: list[float] = []

    def fit(self, X: ArrayLike, y: ArrayLike) -> "GradientDescentRegressor":
        """Fit the model using mini-batch gradient descent.

        Raises
        ------
        ValueError
            If ``batch_size`` is smaller than 1 or the inputs fail shape
            validation.
        """
        # A negative batch size would make the batch loop empty and silently
        # skip training; zero would fail inside range() with an unrelated message.
        if self.batch_size < 1:
            raise ValueError(
                f"batch_size must be a positive integer, got {self.batch_size!r}."
            )

        X_arr, y_arr = _validate_regression_inputs(X, y)
        rng = np.random.default_rng(self.random_state)
        n_samples, n_features = X_arr.shape
        self.coef_ = np.zeros(n_features, dtype=np.float64)
        self.intercept_ = 0.0
        self.loss_history_ = []
        learning_rate = self.learning_rate
        prev_loss = float("inf")

        def full_loss(coef: FloatArray, intercept: float) -> float:
            # Mean squared error over the whole training set.
            return float(np.mean((X_arr @ coef + intercept - y_arr) ** 2))

        for epoch in range(self.n_epochs):
            order = rng.permutation(n_samples)
            X_shuffled = X_arr[order]
            y_shuffled = y_arr[order]
            # Snapshot so a harmful epoch can be undone.
            coef_before = self.coef_.copy()
            intercept_before = self.intercept_

            for start in range(0, n_samples, self.batch_size):
                stop = start + self.batch_size
                X_batch = X_shuffled[start:stop]
                y_batch = y_shuffled[start:stop]
                errors = X_batch @ self.coef_ + self.intercept_ - y_batch
                grad_w = (2.0 / X_batch.shape[0]) * (X_batch.T @ errors)
                grad_b = (2.0 / X_batch.shape[0]) * np.sum(errors)
                self.coef_ -= learning_rate * grad_w
                self.intercept_ -= learning_rate * grad_b

            loss = full_loss(self.coef_, self.intercept_)
            # NaN compares False against everything and inf > inf is False, so
            # a diverged epoch must be detected explicitly.
            if not np.isfinite(loss) or loss > prev_loss + 1e-12:
                self.coef_ = coef_before
                self.intercept_ = intercept_before
                learning_rate *= 0.5
                if np.isfinite(prev_loss):
                    loss = prev_loss
                else:
                    # No earlier epoch to fall back on: report the loss of the
                    # restored parameters instead of an infinite placeholder.
                    loss = full_loss(coef_before, intercept_before)

            self.loss_history_.append(float(loss))
            prev_loss = loss
            if self.verbose and epoch % 100 == 0:
                print(f"Epoch {epoch} loss={loss:.6f}")

        # In-place updates turn the intercept into a NumPy scalar; store a
        # plain float as the attribute annotation promises.
        self.intercept_ = float(self.intercept_)
        return self

    def predict(self, X: ArrayLike) -> FloatArray:
        """Predict target values for new data.

        Raises
        ------
        RuntimeError
            If the model has not been fitted.
        ValueError
            If ``X`` is not 2D or its column count differs from the number of
            fitted coefficients.
        """
        if self.coef_ is None or self.intercept_ is None:
            raise RuntimeError("Call fit() before predict().")
        X_arr = np.asarray(X, dtype=float)
        if X_arr.ndim != 2:
            raise ValueError("X must be a 2D array.")
        n_expected = self.coef_.shape[0]
        if X_arr.shape[1] != n_expected:
            raise ValueError(
                f"X has {X_arr.shape[1]} features but the model was "
                f"fitted with {n_expected}."
            )
        return (X_arr @ self.coef_ + self.intercept_).astype(np.float64)

    def score(self, X: ArrayLike, y: ArrayLike) -> float:
        """Return the coefficient of determination R² on the given data.

        Returns ``0.0`` when the targets have zero variance, because R² is
        undefined in that case.
        """
        X_arr, y_arr = _validate_regression_inputs(X, y)
        y_pred = self.predict(X_arr)
        ss_res = np.sum((y_arr - y_pred) ** 2)
        ss_tot = np.sum((y_arr - np.mean(y_arr)) ** 2)
        return float(1.0 - ss_res / ss_tot) if ss_tot > 0 else 0.0
@@ -0,0 +1,119 @@
1
+ """
2
+ Logistic Regression
3
+ ===================
4
+
5
+ A from-scratch binary classifier using gradient descent and a sigmoid link.
6
+ """
7
+
8
+ from __future__ import annotations
9
+
10
+ import numpy as np
11
+ from numpy.typing import ArrayLike, NDArray
12
+
13
+ FloatArray = NDArray[np.float64]
14
+
15
+
16
def _validate_classification_inputs(
    X: ArrayLike, y: ArrayLike,
) -> tuple[FloatArray, FloatArray]:
    """Coerce binary-classification inputs to float arrays and validate them.

    ``X`` must be two-dimensional after conversion. ``y`` is flattened to one
    dimension and may only contain the values 0 and 1.

    Returns
    -------
    tuple[FloatArray, FloatArray]
        The converted feature matrix and the flattened label vector.

    Raises
    ------
    ValueError
        If ``X`` is not 2D, if the sample counts of ``X`` and ``y`` differ, or
        if ``y`` holds a value other than 0 or 1.
    """
    features = np.asarray(X, dtype=float)
    labels = np.asarray(y, dtype=float).flatten()

    if features.ndim != 2:
        raise ValueError("X must be a 2D array of shape (n_samples, n_features).")

    n_rows = features.shape[0]
    n_labels = labels.shape[0]
    if n_rows != n_labels:
        raise ValueError(f"X has {n_rows} samples but y has {n_labels}.")

    is_binary = np.isin(labels, [0.0, 1.0])
    if not np.all(is_binary):
        raise ValueError("y must contain only binary labels 0 and 1.")

    return features, labels
30
+
31
+
32
class LogisticRegression:
    r"""Binary logistic regression trained with mini-batch gradient descent.

    The model is:

    .. math::
        p(y=1 \mid x) = \sigma(w^\top x + b),
        \quad \sigma(z) = \frac{1}{1 + e^{-z}}

    The loss is the binary cross-entropy:

    .. math::
        L = -\frac{1}{n} \sum_{i=1}^n
        \left[y_i \log \sigma(z_i) + (1-y_i) \log (1-\sigma(z_i))\right]

    Parameters
    ----------
    learning_rate : float, default=0.01
        Step size for each gradient update.
    n_epochs : int, default=1000
        Number of passes over the training data.
    batch_size : int, default=32
        Number of samples per gradient step; must be at least 1.
    random_state : int or None, default=None
        Seed passed to ``np.random.default_rng`` for shuffling.
    verbose : bool, default=False
        If True, print the loss every 100 epochs.

    Attributes
    ----------
    weights_ : FloatArray
        Learned weights, one per feature.
    bias_ : float
        Learned bias term.
    loss_history_ : list[float]
        Full-data binary cross-entropy recorded after each epoch.
    """

    def __init__(
        self,
        learning_rate: float = 0.01,
        n_epochs: int = 1000,
        batch_size: int = 32,
        random_state: int | None = None,
        verbose: bool = False,
    ) -> None:
        self.learning_rate = learning_rate
        self.n_epochs = n_epochs
        self.batch_size = batch_size
        self.random_state = random_state
        self.verbose = verbose
        self.weights_: FloatArray | None = None
        self.bias_: float | None = None
        self.loss_history_: list[float] = []

    def fit(self, X: ArrayLike, y: ArrayLike) -> "LogisticRegression":
        """Fit the logistic regression model to binary data.

        Raises
        ------
        ValueError
            If ``batch_size`` is smaller than 1, or if the inputs fail shape or
            label validation.
        """
        # A negative batch size would make the batch loop empty and silently
        # skip training; zero would fail inside range() with an unrelated message.
        if self.batch_size < 1:
            raise ValueError(
                f"batch_size must be a positive integer, got {self.batch_size!r}."
            )

        X_arr, y_arr = _validate_classification_inputs(X, y)
        rng = np.random.default_rng(self.random_state)
        n_samples, n_features = X_arr.shape
        self.weights_ = np.zeros(n_features, dtype=np.float64)
        self.bias_ = 0.0
        self.loss_history_ = []

        for epoch in range(self.n_epochs):
            perm = rng.permutation(n_samples)
            for start in range(0, n_samples, self.batch_size):
                batch = perm[start:start + self.batch_size]
                X_batch = X_arr[batch]
                y_batch = y_arr[batch]
                z = X_batch @ self.weights_ + self.bias_
                # For sigmoid + cross-entropy the gradient w.r.t. z is simply
                # (prediction - label).
                errors = self._sigmoid(z) - y_batch
                grad_w = X_batch.T @ errors / X_batch.shape[0]
                grad_b = np.mean(errors)
                self.weights_ -= self.learning_rate * grad_w
                self.bias_ -= self.learning_rate * grad_b

            loss = self._binary_cross_entropy(y_arr, self.predict_proba(X_arr))
            self.loss_history_.append(float(loss))
            if self.verbose and epoch % 100 == 0:
                print(f"Epoch {epoch} loss={loss:.6f}")

        # In-place updates turn the bias into a NumPy scalar; store a plain
        # float as the attribute annotation promises.
        self.bias_ = float(self.bias_)
        return self

    def predict_proba(self, X: ArrayLike) -> FloatArray:
        """Return probability estimates for the positive class.

        Raises
        ------
        RuntimeError
            If the model has not been fitted.
        ValueError
            If ``X`` is not 2D or its column count differs from the number of
            fitted weights.
        """
        if self.weights_ is None or self.bias_ is None:
            raise RuntimeError("Call fit() before predict_proba().")
        X_arr = np.asarray(X, dtype=float)
        if X_arr.ndim != 2:
            raise ValueError("X must be a 2D array.")
        n_expected = self.weights_.shape[0]
        if X_arr.shape[1] != n_expected:
            raise ValueError(
                f"X has {X_arr.shape[1]} features but the model was "
                f"fitted with {n_expected}."
            )
        return self._sigmoid(X_arr @ self.weights_ + self.bias_)

    def predict(self, X: ArrayLike) -> NDArray[np.int64]:
        """Return binary predictions (1 where the probability is >= 0.5)."""
        return (self.predict_proba(X) >= 0.5).astype(np.int64)

    def score(self, X: ArrayLike, y: ArrayLike) -> float:
        """Return classification accuracy on the given dataset."""
        X_arr, y_arr = _validate_classification_inputs(X, y)
        y_pred = self.predict(X_arr)
        return float(np.mean(y_pred == y_arr))

    def _sigmoid(self, z: FloatArray) -> FloatArray:
        """Evaluate the logistic function element-wise."""
        # Clipping keeps np.exp well inside the float64 range.
        z = np.clip(z, -500.0, 500.0)
        return 1.0 / (1.0 + np.exp(-z))

    def _binary_cross_entropy(self, y_true: FloatArray, y_prob: FloatArray) -> float:
        """Return the mean binary cross-entropy of ``y_prob`` against ``y_true``."""
        # Keep probabilities away from 0 and 1 so neither log receives zero.
        y_prob = np.clip(y_prob, 1e-12, 1.0 - 1e-12)
        return float(-np.mean(y_true * np.log(y_prob) + (1.0 - y_true) * np.log(1.0 - y_prob)))
@@ -0,0 +1,113 @@
1
+ r"""
2
+ Gaussian Naive Bayes Classifier
3
+ ================================
4
+
5
+ A probabilistic classifier that assumes each class follows a Gaussian
6
+ distribution and that features are conditionally independent given the class.
7
+
8
+ The model computes class log-likelihoods as:
9
+
10
+ .. math::
11
+ \log p(\mathbf{x}, y_k)
12
+ = \log \pi_k - \frac{1}{2} \sum_{j=1}^d \left[
13
+ \log(2\pi \sigma_{kj}^2)
14
+ + \frac{(x_j - \mu_{kj})^2}{\sigma_{kj}^2}
15
+ \right]
16
+
17
+ Complexity
18
+ ----------
19
+ - Training: O(n d)
20
+ - Inference: O(n d)
21
+ - Space: O(K d)
22
+ """
23
+
24
+ from __future__ import annotations
25
+
26
+ import numpy as np
27
+ from numpy.typing import ArrayLike, NDArray
28
+
29
+ FloatArray = NDArray[np.float64]
30
+ IntArray = NDArray[np.int64]
31
+
32
+
33
def _validate_classification_inputs(
    X: ArrayLike,
    y: ArrayLike,
) -> tuple[FloatArray, IntArray]:
    """Coerce classification inputs to arrays and check their shapes.

    ``X`` is converted to a float array and must be two-dimensional. ``y`` is
    flattened to one dimension and converted to integer class labels.

    Floating-point labels are accepted only when every value is a whole
    number. Casting straight to ``int`` would silently truncate a label such
    as ``0.7`` to ``0`` and merge it with a different class.

    Returns
    -------
    tuple[FloatArray, IntArray]
        The converted feature matrix and the flattened integer label vector.

    Raises
    ------
    ValueError
        If ``y`` holds non-integer (or non-finite) floating-point values, if
        ``X`` is not 2D, or if the sample counts of ``X`` and ``y`` differ.
    """
    X_arr = np.asarray(X, dtype=float)
    y_raw = np.asarray(y).flatten()
    y_arr = y_raw.astype(int)
    # Compare the integer cast against the float originals: any mismatch
    # (fractional part, NaN, inf, out-of-range) means the cast lost information.
    if y_raw.dtype.kind == "f" and np.any(y_arr != y_raw):
        raise ValueError(
            "y must contain integer class labels; got non-integer values."
        )
    if X_arr.ndim != 2:
        raise ValueError("X must be a 2D array of shape (n_samples, n_features).")
    if X_arr.shape[0] != y_arr.shape[0]:
        raise ValueError(
            f"X has {X_arr.shape[0]} samples but y has {y_arr.shape[0]}."
        )
    return X_arr, y_arr
46
+
47
+
48
class GaussianNB:
    """Gaussian Naive Bayes classifier.

    Parameters
    ----------
    var_smoothing : float, default=1e-9
        Portion of the largest variance of all features added to variances for
        stability in the Gaussian likelihood denominator.

    Attributes
    ----------
    classes_ : IntArray
        Sorted unique class labels seen during ``fit``.
    class_count_ : IntArray
        Number of training samples per class.
    class_prior_ : FloatArray
        Empirical class probabilities.
    class_mean_ : FloatArray of shape (n_classes, n_features)
        Per-class feature means.
    class_var_ : FloatArray of shape (n_classes, n_features)
        Per-class feature variances, including the smoothing term.
    epsilon_ : float
        Absolute value added to every variance.
    n_features_in_ : int
        Number of features seen during ``fit``.
    """

    def __init__(self, var_smoothing: float = 1e-9) -> None:
        self.var_smoothing = float(var_smoothing)
        self.class_count_: IntArray | None = None
        self.class_prior_: FloatArray | None = None
        self.class_mean_: FloatArray | None = None
        self.class_var_: FloatArray | None = None
        self.classes_: IntArray | None = None
        self.n_features_in_: int | None = None
        self.epsilon_: float | None = None

    def fit(self, X: ArrayLike, y: ArrayLike) -> "GaussianNB":
        """Estimate class priors and per-class feature means and variances.

        Raises
        ------
        ValueError
            If the inputs fail validation or the training set is empty.
        """
        X_arr, y_arr = _validate_classification_inputs(X, y)
        n_samples, n_features = X_arr.shape
        if n_samples == 0:
            raise ValueError("Cannot fit GaussianNB on an empty dataset.")

        self.n_features_in_ = n_features
        self.classes_, counts = np.unique(y_arr, return_counts=True)
        self.class_count_ = counts.astype(np.int64)
        self.class_prior_ = counts.astype(np.float64) / float(n_samples)

        # Scale the smoothing term by the largest feature variance, as the
        # parameter is documented, so it stays meaningful for any data scale.
        # If all variances are zero, fall back to the raw value so the
        # likelihood denominator can never be zero.
        max_var = float(X_arr.var(axis=0).max()) if n_features else 0.0
        if max_var > 0.0:
            self.epsilon_ = self.var_smoothing * max_var
        else:
            self.epsilon_ = self.var_smoothing

        means = []
        variances = []
        for label in self.classes_:
            X_class = X_arr[y_arr == label]
            means.append(X_class.mean(axis=0))
            variances.append(X_class.var(axis=0) + self.epsilon_)

        self.class_mean_ = np.vstack(means)
        self.class_var_ = np.vstack(variances)
        return self

    def predict(self, X: ArrayLike) -> IntArray:
        """Return the most likely class label for each row of ``X``.

        Raises
        ------
        RuntimeError
            If the classifier has not been fitted.
        ValueError
            If ``X`` is not 2D or its column count differs from the fitted
            feature count.
        """
        if (
            self.classes_ is None
            or self.class_prior_ is None
            or self.class_mean_ is None
            or self.class_var_ is None
        ):
            raise RuntimeError("Call fit() before predict().")
        X_arr = np.asarray(X, dtype=float)
        if X_arr.ndim != 2:
            raise ValueError("X must be a 2D array.")
        n_expected = self.class_mean_.shape[1]
        if X_arr.shape[1] != n_expected:
            # Without this check a single-column X (or a one-feature model)
            # would broadcast against the class means and silently return
            # meaningless predictions.
            raise ValueError(
                f"X has {X_arr.shape[1]} features but the model was "
                f"fitted with {n_expected}."
            )
        log_likelihood = self._joint_log_likelihood(X_arr)
        return self.classes_[np.argmax(log_likelihood, axis=1)]

    def score(self, X: ArrayLike, y: ArrayLike) -> float:
        """Return classification accuracy on the given dataset."""
        X_arr, y_arr = _validate_classification_inputs(X, y)
        return float(np.mean(self.predict(X_arr) == y_arr))

    def _joint_log_likelihood(self, X: FloatArray) -> FloatArray:
        """Return ``log p(x, y_k)`` for every sample and class.

        The result has one row per sample and one column per class, in the
        order of ``classes_``.
        """
        if (
            self.classes_ is None
            or self.class_mean_ is None
            or self.class_var_ is None
            or self.class_prior_ is None
        ):
            raise RuntimeError("Classifier must be fitted before computing likelihoods.")

        joint = np.empty((X.shape[0], self.classes_.size), dtype=np.float64)
        for idx, (prior, mean, var) in enumerate(
            zip(self.class_prior_, self.class_mean_, self.class_var_)
        ):
            log_prior = np.log(prior)
            # Gaussian normalisation constant, summed over independent features.
            log_norm = -0.5 * np.sum(np.log(2.0 * np.pi * var))
            diff = X - mean
            exp_term = -0.5 * np.sum((diff ** 2) / var, axis=1)
            joint[:, idx] = log_prior + log_norm + exp_term
        return joint