scratchkit 0.2.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- mlscratch/__init__.py +56 -0
- mlscratch/__main__.py +118 -0
- mlscratch/bayesian/__init__.py +53 -0
- mlscratch/bayesian/bayesian_linear_regression.py +171 -0
- mlscratch/bayesian/bayesian_network.py +248 -0
- mlscratch/bayesian/bayesian_nn.py +315 -0
- mlscratch/bayesian/gaussian_process.py +207 -0
- mlscratch/bayesian/hmm.py +277 -0
- mlscratch/bayesian/init.py +52 -0
- mlscratch/bayesian/kalman_filter.py +182 -0
- mlscratch/bayesian/naive_bayes.py +209 -0
- mlscratch/metrics/__init__.py +59 -0
- mlscratch/metrics/classification.py +365 -0
- mlscratch/metrics/regression.py +79 -0
- mlscratch/neural/__init__.py +121 -0
- mlscratch/neural/attention.py +420 -0
- mlscratch/neural/autoencoder.py +543 -0
- mlscratch/neural/boltzmann.py +231 -0
- mlscratch/neural/cnn.py +593 -0
- mlscratch/neural/cvnn.py +322 -0
- mlscratch/neural/gan.py +364 -0
- mlscratch/neural/hopfield.py +193 -0
- mlscratch/neural/perceptron.py +398 -0
- mlscratch/neural/rbf_network.py +230 -0
- mlscratch/neural/recurrent.py +569 -0
- mlscratch/preprocessing/__init__.py +38 -0
- mlscratch/preprocessing/encoders.py +140 -0
- mlscratch/preprocessing/model_selection.py +119 -0
- mlscratch/preprocessing/polynomial.py +105 -0
- mlscratch/preprocessing/scalers.py +220 -0
- mlscratch/py.typed +0 -0
- mlscratch/reinforcement/__init__.py +59 -0
- mlscratch/reinforcement/ddpg.py +363 -0
- mlscratch/reinforcement/dqn.py +319 -0
- mlscratch/reinforcement/ppo.py +452 -0
- mlscratch/reinforcement/q_learning.py +352 -0
- mlscratch/reinforcement/sac.py +382 -0
- mlscratch/reinforcement/utils.py +594 -0
- mlscratch/supervised/__init__.py +76 -0
- mlscratch/supervised/_validation.py +50 -0
- mlscratch/supervised/adaboost.py +255 -0
- mlscratch/supervised/decision_tree.py +495 -0
- mlscratch/supervised/gradient_boosting.py +354 -0
- mlscratch/supervised/knn.py +234 -0
- mlscratch/supervised/lasso_regression.py +125 -0
- mlscratch/supervised/linear_models.py +459 -0
- mlscratch/supervised/linear_regression.py +197 -0
- mlscratch/supervised/logistic_regression.py +119 -0
- mlscratch/supervised/naive_bayes.py +113 -0
- mlscratch/supervised/random_forest.py +321 -0
- mlscratch/supervised/ridge_regression.py +93 -0
- mlscratch/supervised/svm.py +356 -0
- mlscratch/unsupervised/__init__.py +39 -0
- mlscratch/unsupervised/apriori.py +178 -0
- mlscratch/unsupervised/dbscan.py +141 -0
- mlscratch/unsupervised/gmm.py +204 -0
- mlscratch/unsupervised/hierarchical_clustering.py +137 -0
- mlscratch/unsupervised/ica.py +167 -0
- mlscratch/unsupervised/kmeans.py +135 -0
- mlscratch/unsupervised/kmedoids.py +133 -0
- mlscratch/unsupervised/pca.py +103 -0
- mlscratch/unsupervised/tsne.py +200 -0
- scratchkit-0.2.0.dist-info/METADATA +241 -0
- scratchkit-0.2.0.dist-info/RECORD +68 -0
- scratchkit-0.2.0.dist-info/WHEEL +5 -0
- scratchkit-0.2.0.dist-info/entry_points.txt +2 -0
- scratchkit-0.2.0.dist-info/licenses/LICENSE +201 -0
- scratchkit-0.2.0.dist-info/top_level.txt +1 -0
|
@@ -0,0 +1,197 @@
|
|
|
1
|
+
"""
|
|
2
|
+
Linear Regression
|
|
3
|
+
=================
|
|
4
|
+
|
|
5
|
+
Ordinary Least Squares and mini-batch gradient descent implementations for
|
|
6
|
+
linear regression.
|
|
7
|
+
|
|
8
|
+
The module uses a clear, sklearn-compatible interface with explicit math.
|
|
9
|
+
"""
|
|
10
|
+
|
|
11
|
+
from __future__ import annotations
|
|
12
|
+
|
|
13
|
+
from typing import Callable
|
|
14
|
+
|
|
15
|
+
import numpy as np
|
|
16
|
+
from numpy.typing import ArrayLike, NDArray
|
|
17
|
+
|
|
18
|
+
# Type alias used in this module's annotations for NumPy arrays of float64 values.
FloatArray = NDArray[np.float64]
|
|
19
|
+
|
|
20
|
+
|
|
21
|
+
def _validate_regression_inputs(
|
|
22
|
+
X: ArrayLike, y: ArrayLike,
|
|
23
|
+
) -> tuple[FloatArray, FloatArray]:
|
|
24
|
+
X_arr = np.asarray(X, dtype=float)
|
|
25
|
+
y_arr = np.asarray(y, dtype=float).flatten()
|
|
26
|
+
if X_arr.ndim != 2:
|
|
27
|
+
raise ValueError("X must be a 2D array of shape (n_samples, n_features).")
|
|
28
|
+
if X_arr.shape[0] != y_arr.shape[0]:
|
|
29
|
+
raise ValueError(
|
|
30
|
+
f"X has {X_arr.shape[0]} samples but y has {y_arr.shape[0]}."
|
|
31
|
+
)
|
|
32
|
+
return X_arr, y_arr
|
|
33
|
+
|
|
34
|
+
|
|
35
|
+
class OrdinaryLeastSquares:
    """Ordinary least squares regression solved with ``numpy.linalg.lstsq``.

    Parameters
    ----------
    add_intercept : bool, default=True
        If True, the model fits an intercept term by prepending a column of ones
        to the design matrix.

    Attributes
    ----------
    coef_ : FloatArray
        Estimated regression coefficients for each feature.
    intercept_ : float
        Estimated bias term (``0.0`` when ``add_intercept`` is False).
    residuals_ : FloatArray
        Residuals ``y - y_pred`` on the training data after fitting.
    """

    def __init__(self, add_intercept: bool = True) -> None:
        self.add_intercept = add_intercept
        self.coef_: FloatArray | None = None
        self.intercept_: float | None = None
        self.residuals_: FloatArray | None = None

    def fit(self, X: ArrayLike, y: ArrayLike) -> "OrdinaryLeastSquares":
        r"""Fit the linear regression model.

        The closed-form least-squares solution is computed via a numerically
        stable least-squares solver:

        .. math::
            \hat{\beta} = \operatorname{argmin}_\beta \|X \beta - y\|_2^2

        Parameters
        ----------
        X : ArrayLike
            Training features, 2D.
        y : ArrayLike
            Training targets, one per row of ``X``.

        Returns
        -------
        self : OrdinaryLeastSquares

        Raises
        ------
        ValueError
            If the inputs fail shape validation.
        """
        X_arr, y_arr = _validate_regression_inputs(X, y)

        design = X_arr
        if self.add_intercept:
            design = np.column_stack([np.ones(X_arr.shape[0]), X_arr])

        # Only the solution vector is needed; residuals are recomputed below
        # so they are always per-sample, whatever lstsq chooses to return.
        solution = np.linalg.lstsq(design, y_arr, rcond=None)[0]

        if self.add_intercept:
            self.intercept_ = float(solution[0])
            self.coef_ = solution[1:].astype(np.float64)
        else:
            self.intercept_ = 0.0
            self.coef_ = solution.astype(np.float64)

        self.residuals_ = y_arr - (X_arr @ self.coef_ + self.intercept_)
        return self

    def predict(self, X: ArrayLike) -> FloatArray:
        """Predict target values for new data.

        Raises
        ------
        RuntimeError
            If the model has not been fitted.
        ValueError
            If ``X`` is not 2D or its column count differs from the number of
            fitted coefficients.
        """
        if self.coef_ is None or self.intercept_ is None:
            raise RuntimeError("Call fit() before predict().")
        X_arr = np.asarray(X, dtype=float)
        if X_arr.ndim != 2:
            raise ValueError("X must be a 2D array.")
        n_expected = self.coef_.shape[0]
        if X_arr.shape[1] != n_expected:
            # Raise a readable error instead of the low-level matmul one.
            raise ValueError(
                f"X has {X_arr.shape[1]} features but the model was fitted "
                f"with {n_expected}."
            )
        result = X_arr @ self.coef_ + self.intercept_
        return result.astype(np.float64)

    def score(self, X: ArrayLike, y: ArrayLike) -> float:
        """Return the coefficient of determination R² on the given data.

        For a constant target the score is 1.0 when every prediction is exact
        and 0.0 otherwise.
        """
        _, y_arr = _validate_regression_inputs(X, y)
        return self._r2_score(y_arr, self.predict(X))

    @staticmethod
    def _r2_score(y_true: FloatArray, y_pred: FloatArray) -> float:
        """Compute R² from true and predicted targets, guarding the zero-variance case."""
        ss_res = float(np.sum((y_true - y_pred) ** 2))
        ss_tot = float(np.sum((y_true - np.mean(y_true)) ** 2))
        if ss_tot > 0:
            return 1.0 - ss_res / ss_tot
        # Constant (or empty) target: the ratio is undefined. A perfect fit on
        # a non-empty target scores 1.0; everything else keeps the 0.0 fallback.
        return 1.0 if (y_true.size > 0 and ss_res == 0) else 0.0
|
|
107
|
+
|
|
108
|
+
|
|
109
|
+
class GradientDescentRegressor:
    r"""Linear regression using mini-batch gradient descent.

    The squared error loss is:

    .. math::
        L(W, b) = \frac{1}{n} \sum_{i=1}^n (y_i - X_i W - b)^2

    The gradient with respect to the weights is:

    .. math::
        \frac{\partial L}{\partial W} = -\frac{2}{n} X^\top (y - XW - b)

    After every epoch the loss on the full training set is compared with the
    previous epoch. If it increased, or is NaN/inf, the epoch's updates are
    rolled back and the working learning rate is halved.

    Parameters
    ----------
    learning_rate : float, default=0.01
        Initial step size.
    n_epochs : int, default=1000
        Number of passes over the training data.
    batch_size : int, default=32
        Number of samples per gradient step; must be at least 1.
    random_state : int or None, default=None
        Seed passed to ``numpy.random.default_rng`` for shuffling.
    verbose : bool, default=False
        If True, print the loss every 100 epochs.

    Attributes
    ----------
    coef_ : FloatArray
        Fitted weights, one per feature.
    intercept_ : float
        Fitted bias term.
    loss_history_ : list[float]
        Full-data mean squared error recorded after each epoch.
    """

    def __init__(
        self,
        learning_rate: float = 0.01,
        n_epochs: int = 1000,
        batch_size: int = 32,
        random_state: int | None = None,
        verbose: bool = False,
    ) -> None:
        self.learning_rate = learning_rate
        self.n_epochs = n_epochs
        self.batch_size = batch_size
        self.random_state = random_state
        self.verbose = verbose
        self.coef_: FloatArray | None = None
        self.intercept_: float | None = None
        self.loss_history_: list[float] = []

    def fit(self, X: ArrayLike, y: ArrayLike) -> "GradientDescentRegressor":
        """Fit the model using mini-batch gradient descent.

        Raises
        ------
        ValueError
            If ``batch_size`` is smaller than 1 or the inputs fail shape
            validation.
        """
        # A non-positive batch size would either crash inside range() or
        # silently skip every update, so reject it up front.
        if self.batch_size < 1:
            raise ValueError("batch_size must be a positive integer.")

        X_arr, y_arr = _validate_regression_inputs(X, y)
        rng = np.random.default_rng(self.random_state)
        n_samples, n_features = X_arr.shape
        self.coef_ = np.zeros(n_features, dtype=np.float64)
        self.intercept_ = 0.0
        self.loss_history_ = []
        learning_rate = self.learning_rate
        prev_loss = float("inf")

        for epoch in range(self.n_epochs):
            indices = rng.permutation(n_samples)
            X_shuffled = X_arr[indices]
            y_shuffled = y_arr[indices]
            # Snapshot so a bad epoch can be undone.
            coef_before = self.coef_.copy()
            intercept_before = self.intercept_

            # Overflow during a diverging epoch is handled by the backtracking
            # step below, so the floating-point warnings are suppressed here.
            with np.errstate(over="ignore", invalid="ignore"):
                for start in range(0, n_samples, self.batch_size):
                    end = start + self.batch_size
                    X_batch = X_shuffled[start:end]
                    y_batch = y_shuffled[start:end]
                    errors = X_batch @ self.coef_ + self.intercept_ - y_batch
                    scale = 2.0 / X_batch.shape[0]
                    grad_w = scale * (X_batch.T @ errors)
                    grad_b = scale * np.sum(errors)
                    self.coef_ -= learning_rate * grad_w
                    # Keep the intercept a plain float, as annotated.
                    self.intercept_ = float(self.intercept_ - learning_rate * grad_b)
                loss = self._mse(X_arr, y_arr)

            if self._should_backtrack(loss, prev_loss):
                self.coef_ = coef_before
                self.intercept_ = intercept_before
                learning_rate *= 0.5
                # On the very first epoch there is no previous loss yet, so
                # report the loss of the restored parameters instead of inf.
                loss = prev_loss if np.isfinite(prev_loss) else self._mse(X_arr, y_arr)

            self.loss_history_.append(float(loss))
            prev_loss = loss
            if self.verbose and epoch % 100 == 0:
                print(f"Epoch {epoch} loss={loss:.6f}")
        return self

    def predict(self, X: ArrayLike) -> FloatArray:
        """Predict target values for new data.

        Raises
        ------
        RuntimeError
            If the model has not been fitted.
        ValueError
            If ``X`` is not 2D or its column count differs from the number of
            fitted coefficients.
        """
        if self.coef_ is None or self.intercept_ is None:
            raise RuntimeError("Call fit() before predict().")
        X_arr = np.asarray(X, dtype=float)
        if X_arr.ndim != 2:
            raise ValueError("X must be a 2D array.")
        n_expected = self.coef_.shape[0]
        if X_arr.shape[1] != n_expected:
            raise ValueError(
                f"X has {X_arr.shape[1]} features but the model was fitted "
                f"with {n_expected}."
            )
        return (X_arr @ self.coef_ + self.intercept_).astype(np.float64)

    def score(self, X: ArrayLike, y: ArrayLike) -> float:
        """Return the coefficient of determination R² on the given data.

        For a constant target the score is 1.0 when every prediction is exact
        and 0.0 otherwise.
        """
        _, y_arr = _validate_regression_inputs(X, y)
        return self._r2_score(y_arr, self.predict(X))

    def _mse(self, X: FloatArray, y: FloatArray) -> float:
        """Return the mean squared error of the current parameters on ``(X, y)``."""
        return float(np.mean((X @ self.coef_ + self.intercept_ - y) ** 2))

    @staticmethod
    def _should_backtrack(loss: float, prev_loss: float) -> bool:
        """Return True when an epoch must be undone: loss is NaN/inf or got worse."""
        # NaN compares False against everything, so it needs its own check.
        return bool(not np.isfinite(loss) or loss > prev_loss + 1e-12)

    @staticmethod
    def _r2_score(y_true: FloatArray, y_pred: FloatArray) -> float:
        """Compute R² from true and predicted targets, guarding the zero-variance case."""
        ss_res = float(np.sum((y_true - y_pred) ** 2))
        ss_tot = float(np.sum((y_true - np.mean(y_true)) ** 2))
        if ss_tot > 0:
            return 1.0 - ss_res / ss_tot
        # Constant (or empty) target: a perfect fit on a non-empty target
        # scores 1.0; everything else keeps the 0.0 fallback.
        return 1.0 if (y_true.size > 0 and ss_res == 0) else 0.0
|
|
@@ -0,0 +1,119 @@
|
|
|
1
|
+
"""
|
|
2
|
+
Logistic Regression
|
|
3
|
+
===================
|
|
4
|
+
|
|
5
|
+
A from-scratch binary classifier using gradient descent and a sigmoid link.
|
|
6
|
+
"""
|
|
7
|
+
|
|
8
|
+
from __future__ import annotations
|
|
9
|
+
|
|
10
|
+
import numpy as np
|
|
11
|
+
from numpy.typing import ArrayLike, NDArray
|
|
12
|
+
|
|
13
|
+
# Type alias used in this module's annotations for NumPy arrays of float64 values.
FloatArray = NDArray[np.float64]
|
|
14
|
+
|
|
15
|
+
|
|
16
|
+
def _validate_classification_inputs(
|
|
17
|
+
X: ArrayLike, y: ArrayLike,
|
|
18
|
+
) -> tuple[FloatArray, FloatArray]:
|
|
19
|
+
X_arr = np.asarray(X, dtype=float)
|
|
20
|
+
y_arr = np.asarray(y, dtype=float).flatten()
|
|
21
|
+
if X_arr.ndim != 2:
|
|
22
|
+
raise ValueError("X must be a 2D array of shape (n_samples, n_features).")
|
|
23
|
+
if X_arr.shape[0] != y_arr.shape[0]:
|
|
24
|
+
raise ValueError(
|
|
25
|
+
f"X has {X_arr.shape[0]} samples but y has {y_arr.shape[0]}."
|
|
26
|
+
)
|
|
27
|
+
if not np.all(np.isin(y_arr, [0.0, 1.0])):
|
|
28
|
+
raise ValueError("y must contain only binary labels 0 and 1.")
|
|
29
|
+
return X_arr, y_arr
|
|
30
|
+
|
|
31
|
+
|
|
32
|
+
class LogisticRegression:
    r"""Binary logistic regression using mini-batch gradient descent.

    The model is:

    .. math::
        p(y=1 \mid x) = \sigma(w^\top x + b),
        \quad \sigma(z) = \frac{1}{1 + e^{-z}}

    The loss is the binary cross-entropy:

    .. math::
        L = -\frac{1}{n} \sum_{i=1}^n
        \left[y_i \log \sigma(z_i) + (1-y_i) \log (1-\sigma(z_i))\right]

    Parameters
    ----------
    learning_rate : float, default=0.01
        Step size for each gradient update.
    n_epochs : int, default=1000
        Number of passes over the training data.
    batch_size : int, default=32
        Number of samples per gradient step; must be at least 1.
    random_state : int or None, default=None
        Seed passed to ``numpy.random.default_rng`` for shuffling.
    verbose : bool, default=False
        If True, print the loss every 100 epochs.

    Attributes
    ----------
    weights_ : FloatArray
        Fitted weights, one per feature.
    bias_ : float
        Fitted bias term.
    loss_history_ : list[float]
        Full-data binary cross-entropy recorded after each epoch.
    """

    def __init__(
        self,
        learning_rate: float = 0.01,
        n_epochs: int = 1000,
        batch_size: int = 32,
        random_state: int | None = None,
        verbose: bool = False,
    ) -> None:
        self.learning_rate = learning_rate
        self.n_epochs = n_epochs
        self.batch_size = batch_size
        self.random_state = random_state
        self.verbose = verbose
        self.weights_: FloatArray | None = None
        self.bias_: float | None = None
        self.loss_history_: list[float] = []

    def fit(self, X: ArrayLike, y: ArrayLike) -> "LogisticRegression":
        """Fit the logistic regression model to binary data.

        Raises
        ------
        ValueError
            If ``batch_size`` is smaller than 1, or the inputs fail shape or
            label validation.
        """
        # A non-positive batch size would either crash inside range() or
        # silently skip every update, so reject it up front.
        if self.batch_size < 1:
            raise ValueError("batch_size must be a positive integer.")

        X_arr, y_arr = _validate_classification_inputs(X, y)
        rng = np.random.default_rng(self.random_state)
        n_samples, n_features = X_arr.shape
        self.weights_ = np.zeros(n_features, dtype=np.float64)
        self.bias_ = 0.0
        self.loss_history_ = []

        for epoch in range(self.n_epochs):
            perm = rng.permutation(n_samples)
            for start in range(0, n_samples, self.batch_size):
                batch = perm[start:start + self.batch_size]
                X_batch = X_arr[batch]
                y_batch = y_arr[batch]
                predictions = self._sigmoid(X_batch @ self.weights_ + self.bias_)
                errors = predictions - y_batch
                grad_w = X_batch.T @ errors / X_batch.shape[0]
                grad_b = np.mean(errors)
                self.weights_ -= self.learning_rate * grad_w
                # Keep the bias a plain float, as annotated.
                self.bias_ = float(self.bias_ - self.learning_rate * grad_b)

            loss = self._binary_cross_entropy(y_arr, self.predict_proba(X_arr))
            self.loss_history_.append(float(loss))
            if self.verbose and epoch % 100 == 0:
                print(f"Epoch {epoch} loss={loss:.6f}")
        return self

    def predict_proba(self, X: ArrayLike) -> FloatArray:
        """Return probability estimates for the positive class.

        Raises
        ------
        RuntimeError
            If the model has not been fitted.
        ValueError
            If ``X`` is not 2D or its column count differs from the number of
            fitted weights.
        """
        if self.weights_ is None or self.bias_ is None:
            raise RuntimeError("Call fit() before predict_proba().")
        X_arr = np.asarray(X, dtype=float)
        if X_arr.ndim != 2:
            raise ValueError("X must be a 2D array.")
        n_expected = self.weights_.shape[0]
        if X_arr.shape[1] != n_expected:
            # Raise a readable error instead of the low-level matmul one.
            raise ValueError(
                f"X has {X_arr.shape[1]} features but the model was fitted "
                f"with {n_expected}."
            )
        return self._sigmoid(X_arr @ self.weights_ + self.bias_)

    def predict(self, X: ArrayLike) -> NDArray[np.int64]:
        """Return binary predictions (probability >= 0.5 maps to 1)."""
        return (self.predict_proba(X) >= 0.5).astype(np.int64)

    def score(self, X: ArrayLike, y: ArrayLike) -> float:
        """Return classification accuracy on the given dataset."""
        X_arr, y_arr = _validate_classification_inputs(X, y)
        y_pred = self.predict(X_arr)
        return float(np.mean(y_pred == y_arr))

    def _sigmoid(self, z: FloatArray) -> FloatArray:
        """Apply the logistic function element-wise."""
        # Clip so that exp() cannot overflow for extreme logits.
        z = np.clip(z, -500.0, 500.0)
        return 1.0 / (1.0 + np.exp(-z))

    def _binary_cross_entropy(self, y_true: FloatArray, y_prob: FloatArray) -> float:
        """Return the mean binary cross-entropy between labels and probabilities."""
        # Keep probabilities strictly inside (0, 1) so log() stays finite.
        y_prob = np.clip(y_prob, 1e-12, 1.0 - 1e-12)
        return float(-np.mean(y_true * np.log(y_prob) + (1.0 - y_true) * np.log(1.0 - y_prob)))
|
|
@@ -0,0 +1,113 @@
|
|
|
1
|
+
r"""
|
|
2
|
+
Gaussian Naive Bayes Classifier
|
|
3
|
+
================================
|
|
4
|
+
|
|
5
|
+
A probabilistic classifier that assumes each class follows a Gaussian
|
|
6
|
+
distribution and that features are conditionally independent given the class.
|
|
7
|
+
|
|
8
|
+
The model computes class log-likelihoods as:
|
|
9
|
+
|
|
10
|
+
.. math::
|
|
11
|
+
\log p(\mathbf{x}, y_k)
|
|
12
|
+
= \log \pi_k - \frac{1}{2} \sum_{j=1}^d \left[
|
|
13
|
+
\log(2\pi \sigma_{kj}^2)
|
|
14
|
+
+ \frac{(x_j - \mu_{kj})^2}{\sigma_{kj}^2}
|
|
15
|
+
\right]
|
|
16
|
+
|
|
17
|
+
Complexity
|
|
18
|
+
----------
|
|
19
|
+
- Training: O(n d)
|
|
20
|
+
- Inference: O(n d)
|
|
21
|
+
- Space: O(K d)
|
|
22
|
+
"""
|
|
23
|
+
|
|
24
|
+
from __future__ import annotations
|
|
25
|
+
|
|
26
|
+
import numpy as np
|
|
27
|
+
from numpy.typing import ArrayLike, NDArray
|
|
28
|
+
|
|
29
|
+
# Type alias used in this module's annotations for NumPy arrays of float64 values.
FloatArray = NDArray[np.float64]
|
|
30
|
+
# Type alias used in this module's annotations for NumPy arrays of int64 values.
IntArray = NDArray[np.int64]
|
|
31
|
+
|
|
32
|
+
|
|
33
|
+
def _validate_classification_inputs(
|
|
34
|
+
X: ArrayLike,
|
|
35
|
+
y: ArrayLike,
|
|
36
|
+
) -> tuple[FloatArray, IntArray]:
|
|
37
|
+
X_arr = np.asarray(X, dtype=float)
|
|
38
|
+
y_arr = np.asarray(y, dtype=int).flatten()
|
|
39
|
+
if X_arr.ndim != 2:
|
|
40
|
+
raise ValueError("X must be a 2D array of shape (n_samples, n_features).")
|
|
41
|
+
if X_arr.shape[0] != y_arr.shape[0]:
|
|
42
|
+
raise ValueError(
|
|
43
|
+
f"X has {X_arr.shape[0]} samples but y has {y_arr.shape[0]}."
|
|
44
|
+
)
|
|
45
|
+
return X_arr, y_arr
|
|
46
|
+
|
|
47
|
+
|
|
48
|
+
class GaussianNB:
    """Gaussian Naive Bayes classifier.

    Parameters
    ----------
    var_smoothing : float, default=1e-9
        Portion of the largest variance of all features added to variances for
        stability in the Gaussian likelihood denominator. When every feature
        has zero variance the value itself is added instead, so the
        denominator never becomes zero.

    Attributes
    ----------
    classes_ : IntArray
        Sorted unique class labels seen during ``fit``.
    class_count_ : IntArray
        Number of training samples per class.
    class_prior_ : FloatArray
        Empirical class frequencies.
    class_mean_ : FloatArray
        Per-class feature means, one row per class.
    class_var_ : FloatArray
        Per-class smoothed feature variances, one row per class.
    n_features_in_ : int
        Number of feature columns seen during ``fit``.
    """

    def __init__(self, var_smoothing: float = 1e-9) -> None:
        self.var_smoothing = float(var_smoothing)
        self.class_count_: IntArray | None = None
        self.class_prior_: FloatArray | None = None
        self.class_mean_: FloatArray | None = None
        self.class_var_: FloatArray | None = None
        self.classes_: IntArray | None = None
        self.n_features_in_: int | None = None

    def fit(self, X: ArrayLike, y: ArrayLike) -> "GaussianNB":
        """Estimate class priors and per-class feature means and variances.

        Returns
        -------
        self : GaussianNB

        Raises
        ------
        ValueError
            If the inputs fail shape validation.
        """
        X_arr, y_arr = _validate_classification_inputs(X, y)
        self.n_features_in_ = X_arr.shape[1]
        self.classes_, counts = np.unique(y_arr, return_counts=True)
        self.class_count_ = counts.astype(np.int64)
        self.class_prior_ = counts.astype(np.float64) / float(y_arr.size)

        # Scale the smoothing term by the largest feature variance so it stays
        # proportionate whatever the units of the data are.
        max_var = float(X_arr.var(axis=0).max()) if X_arr.size else 0.0
        epsilon = self.var_smoothing * max_var if max_var > 0.0 else self.var_smoothing

        means = []
        variances = []
        for clazz in self.classes_:
            X_class = X_arr[y_arr == clazz]
            means.append(X_class.mean(axis=0))
            variances.append(X_class.var(axis=0) + epsilon)

        self.class_mean_ = np.vstack(means)
        self.class_var_ = np.vstack(variances)
        return self

    def predict(self, X: ArrayLike) -> IntArray:
        """Return the most likely class label for each row of ``X``.

        Raises
        ------
        RuntimeError
            If the classifier has not been fitted.
        ValueError
            If ``X`` is not 2D or its column count differs from the fitted
            feature count.
        """
        if self.class_prior_ is None or self.class_mean_ is None or self.class_var_ is None:
            raise RuntimeError("Call fit() before predict().")
        X_arr = np.asarray(X, dtype=float)
        if X_arr.ndim != 2:
            raise ValueError("X must be a 2D array.")
        n_expected = self.class_mean_.shape[1]
        if X_arr.shape[1] != n_expected:
            # Without this check a single-column X would broadcast against the
            # class means and yield predictions without any error.
            raise ValueError(
                f"X has {X_arr.shape[1]} features but the model was fitted "
                f"with {n_expected}."
            )
        log_likelihood = self._joint_log_likelihood(X_arr)
        return self.classes_[np.argmax(log_likelihood, axis=1)]

    def score(self, X: ArrayLike, y: ArrayLike) -> float:
        """Return classification accuracy on the given dataset."""
        X_arr, y_arr = _validate_classification_inputs(X, y)
        return float(np.mean(self.predict(X_arr) == y_arr))

    def _joint_log_likelihood(self, X: FloatArray) -> FloatArray:
        """Return ``log p(x, class)`` with one row per sample and one column per class."""
        if self.class_mean_ is None or self.class_var_ is None or self.class_prior_ is None:
            raise RuntimeError("Classifier must be fitted before computing likelihoods.")

        joint = np.empty((X.shape[0], self.classes_.size), dtype=np.float64)
        for idx, (prior, mean, var) in enumerate(
            zip(self.class_prior_, self.class_mean_, self.class_var_)
        ):
            log_prior = np.log(prior)
            # Normalisation constant of the diagonal Gaussian.
            log_det = -0.5 * np.sum(np.log(2.0 * np.pi * var))
            # Variance-scaled squared distance to the class mean.
            exp_term = -0.5 * np.sum(((X - mean) ** 2) / var, axis=1)
            joint[:, idx] = log_prior + log_det + exp_term
        return joint
|