scratchkit 0.2.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- mlscratch/__init__.py +56 -0
- mlscratch/__main__.py +118 -0
- mlscratch/bayesian/__init__.py +53 -0
- mlscratch/bayesian/bayesian_linear_regression.py +171 -0
- mlscratch/bayesian/bayesian_network.py +248 -0
- mlscratch/bayesian/bayesian_nn.py +315 -0
- mlscratch/bayesian/gaussian_process.py +207 -0
- mlscratch/bayesian/hmm.py +277 -0
- mlscratch/bayesian/init.py +52 -0
- mlscratch/bayesian/kalman_filter.py +182 -0
- mlscratch/bayesian/naive_bayes.py +209 -0
- mlscratch/metrics/__init__.py +59 -0
- mlscratch/metrics/classification.py +365 -0
- mlscratch/metrics/regression.py +79 -0
- mlscratch/neural/__init__.py +121 -0
- mlscratch/neural/attention.py +420 -0
- mlscratch/neural/autoencoder.py +543 -0
- mlscratch/neural/boltzmann.py +231 -0
- mlscratch/neural/cnn.py +593 -0
- mlscratch/neural/cvnn.py +322 -0
- mlscratch/neural/gan.py +364 -0
- mlscratch/neural/hopfield.py +193 -0
- mlscratch/neural/perceptron.py +398 -0
- mlscratch/neural/rbf_network.py +230 -0
- mlscratch/neural/recurrent.py +569 -0
- mlscratch/preprocessing/__init__.py +38 -0
- mlscratch/preprocessing/encoders.py +140 -0
- mlscratch/preprocessing/model_selection.py +119 -0
- mlscratch/preprocessing/polynomial.py +105 -0
- mlscratch/preprocessing/scalers.py +220 -0
- mlscratch/py.typed +0 -0
- mlscratch/reinforcement/__init__.py +59 -0
- mlscratch/reinforcement/ddpg.py +363 -0
- mlscratch/reinforcement/dqn.py +319 -0
- mlscratch/reinforcement/ppo.py +452 -0
- mlscratch/reinforcement/q_learning.py +352 -0
- mlscratch/reinforcement/sac.py +382 -0
- mlscratch/reinforcement/utils.py +594 -0
- mlscratch/supervised/__init__.py +76 -0
- mlscratch/supervised/_validation.py +50 -0
- mlscratch/supervised/adaboost.py +255 -0
- mlscratch/supervised/decision_tree.py +495 -0
- mlscratch/supervised/gradient_boosting.py +354 -0
- mlscratch/supervised/knn.py +234 -0
- mlscratch/supervised/lasso_regression.py +125 -0
- mlscratch/supervised/linear_models.py +459 -0
- mlscratch/supervised/linear_regression.py +197 -0
- mlscratch/supervised/logistic_regression.py +119 -0
- mlscratch/supervised/naive_bayes.py +113 -0
- mlscratch/supervised/random_forest.py +321 -0
- mlscratch/supervised/ridge_regression.py +93 -0
- mlscratch/supervised/svm.py +356 -0
- mlscratch/unsupervised/__init__.py +39 -0
- mlscratch/unsupervised/apriori.py +178 -0
- mlscratch/unsupervised/dbscan.py +141 -0
- mlscratch/unsupervised/gmm.py +204 -0
- mlscratch/unsupervised/hierarchical_clustering.py +137 -0
- mlscratch/unsupervised/ica.py +167 -0
- mlscratch/unsupervised/kmeans.py +135 -0
- mlscratch/unsupervised/kmedoids.py +133 -0
- mlscratch/unsupervised/pca.py +103 -0
- mlscratch/unsupervised/tsne.py +200 -0
- scratchkit-0.2.0.dist-info/METADATA +241 -0
- scratchkit-0.2.0.dist-info/RECORD +68 -0
- scratchkit-0.2.0.dist-info/WHEEL +5 -0
- scratchkit-0.2.0.dist-info/entry_points.txt +2 -0
- scratchkit-0.2.0.dist-info/licenses/LICENSE +201 -0
- scratchkit-0.2.0.dist-info/top_level.txt +1 -0
|
@@ -0,0 +1,193 @@
|
|
|
1
|
+
"""
|
|
2
|
+
Hopfield Network
|
|
3
|
+
=================
|
|
4
|
+
A recurrent, fully-connected network that stores patterns as attractors
|
|
5
|
+
of an energy landscape (Hopfield, 1982). Used as a content-addressable
|
|
6
|
+
associative memory.
|
|
7
|
+
|
|
8
|
+
Storage (Hebbian learning rule)
|
|
9
|
+
---------------------------------
|
|
10
|
+
For patterns p^(1), ..., p^(M) ∈ {-1, +1}^N:
|
|
11
|
+
|
|
12
|
+
W_ij = (1/N) Σ_μ p_i^(μ) p_j^(μ), W_ii = 0
|
|
13
|
+
|
|
14
|
+
Energy function
|
|
15
|
+
-----------------
|
|
16
|
+
E(s) = -½ Σ_ij W_ij s_i s_j
|
|
17
|
+
|
|
18
|
+
Recall dynamics
|
|
19
|
+
-----------------
|
|
20
|
+
Asynchronous update (one neuron at a time, random order) or synchronous
|
|
21
|
+
update (all neurons at once):
|
|
22
|
+
|
|
23
|
+
s_i ← sign(Σ_j W_ij s_j)
|
|
24
|
+
|
|
25
|
+
The network converges to a local minimum of E — ideally the nearest
|
|
26
|
+
stored pattern, enabling error correction / pattern completion.
|
|
27
|
+
|
|
28
|
+
Capacity
|
|
29
|
+
---------
|
|
30
|
+
Approximately 0.138 N patterns can be stored reliably (Amit et al., 1985).
|
|
31
|
+
|
|
32
|
+
Reference
|
|
33
|
+
----------
|
|
34
|
+
Hopfield, J. J. (1982). Neural networks and physical systems with emergent
|
|
35
|
+
collective computational abilities. PNAS, 79(8), 2554-2558.
|
|
36
|
+
|
|
37
|
+
Only numpy is used.
|
|
38
|
+
"""
|
|
39
|
+
|
|
40
|
+
from __future__ import annotations
|
|
41
|
+
|
|
42
|
+
import numpy as np
|
|
43
|
+
|
|
44
|
+
|
|
45
|
+
class HopfieldNetwork:
    """
    Discrete Hopfield Network with bipolar {-1, +1} states.

    Patterns are stored with the Hebbian outer-product rule and retrieved
    by repeatedly applying ``s_i <- sign(sum_j W_ij s_j)``.  All public
    methods accept any array-like (list, tuple or ndarray).

    Parameters
    ----------
    n_units : int
        Number of neurons (= dimensionality of stored patterns).
    random_state : int or None
        Seed for asynchronous update ordering.

    Attributes
    ----------
    weights : ndarray of shape (n_units, n_units)
        Connection matrix; all zeros until ``fit`` is called.
    n_patterns_stored_ : int
        Number of patterns passed to the most recent ``fit`` call.
    """

    def __init__(self, n_units: int, random_state: int | None = None) -> None:
        self.n_units = n_units
        self.weights = np.zeros((n_units, n_units))
        self._rng = np.random.default_rng(random_state)
        self.n_patterns_stored_ = 0

    # ------------------------------------------------------------------
    # Internal helpers
    # ------------------------------------------------------------------

    @staticmethod
    def _binarise(state) -> np.ndarray:
        """
        Return a new float array holding the sign of ``state``.

        Exact zeros are mapped to +1 so every unit ends up in {-1, +1};
        the input object itself is never modified.
        """
        s = np.sign(np.asarray(state, dtype=float))
        s[s == 0] = 1.0  # break ties
        return s

    # ------------------------------------------------------------------
    # Storage
    # ------------------------------------------------------------------

    def fit(self, patterns: np.ndarray) -> "HopfieldNetwork":
        """
        Store patterns via the Hebbian outer-product rule.

        The weight matrix is rebuilt from scratch on every call:
        ``W = (Pᵗ P) / n_units`` with the diagonal forced to zero.

        Parameters
        ----------
        patterns : array-like of shape (n_patterns, n_units)
            Each row is a bipolar pattern with values in {-1, +1}.
            A single 1-D pattern is treated as one row.

        Returns
        -------
        self

        Raises
        ------
        ValueError
            If the pattern dimension differs from ``n_units``.
        """
        patterns = np.atleast_2d(patterns).astype(float)
        if patterns.shape[1] != self.n_units:
            raise ValueError(
                f"Pattern dimension {patterns.shape[1]} != n_units {self.n_units}."
            )

        self.weights = (patterns.T @ patterns) / self.n_units
        np.fill_diagonal(self.weights, 0.0)  # no self-connections (W_ii = 0)
        self.n_patterns_stored_ = len(patterns)
        return self

    # ------------------------------------------------------------------
    # Energy
    # ------------------------------------------------------------------

    def energy(self, state: np.ndarray) -> float:
        """
        Compute E(s) = -½ sᵗ W s.

        Parameters
        ----------
        state : array-like of shape (n_units,)
            Used as given (it is not passed through ``sign``).

        Returns
        -------
        float
        """
        # Coerce first: ``-0.5 * state`` is a TypeError for a plain list.
        s = np.asarray(state, dtype=float)
        return float(-0.5 * s @ self.weights @ s)

    # ------------------------------------------------------------------
    # Recall
    # ------------------------------------------------------------------

    def recall(
        self,
        state: np.ndarray,
        mode: str = "async",
        max_iter: int = 100,
    ) -> np.ndarray:
        """
        Run network dynamics from an initial state to convergence.

        Parameters
        ----------
        state : array-like of shape (n_units,)
            Initial state, values in {-1, +1} (or any sign-able reals;
            zeros are treated as +1).  The input is not modified.
        mode : str
            ``'async'`` — update one randomly-chosen neuron at a time
                          (classic Hopfield dynamics, guaranteed convergence
                          for symmetric W with zero diagonal).
            ``'sync'``  — update all neurons simultaneously each step.
        max_iter : int
            Maximum number of update sweeps.  The loop stops earlier as
            soon as a sweep leaves the state unchanged.

        Returns
        -------
        ndarray of shape (n_units,) — state after the last sweep

        Raises
        ------
        ValueError
            If ``mode`` is neither ``'async'`` nor ``'sync'``.
        """
        if mode not in {"async", "sync"}:
            raise ValueError("mode must be 'async' or 'sync'.")

        s = self._binarise(state)

        if mode == "sync":
            for _ in range(max_iter):
                s_new = np.sign(self.weights @ s)
                s_new[s_new == 0] = 1.0  # zero activation resolves to +1
                if np.array_equal(s_new, s):
                    break
                s = s_new
            return s

        # Asynchronous updates: each sweep visits every unit once, in a
        # fresh random order, and later units see earlier units' new values.
        for _ in range(max_iter):
            order = self._rng.permutation(self.n_units)
            changed = False
            for i in order:
                activation = self.weights[i] @ s
                new_val = 1.0 if activation >= 0 else -1.0
                if new_val != s[i]:
                    s[i] = new_val
                    changed = True
            if not changed:
                break

        return s

    # ------------------------------------------------------------------
    # Evaluation helpers
    # ------------------------------------------------------------------

    def is_stable(self, pattern: np.ndarray) -> bool:
        """
        Check whether ``pattern`` is a fixed point of the dynamics.

        The pattern is binarised exactly as ``recall`` does (zeros become
        +1) and one synchronous step is applied; the pattern is stable
        when that step leaves the binarised state unchanged.

        Parameters
        ----------
        pattern : array-like of shape (n_units,)

        Returns
        -------
        bool
        """
        start = self._binarise(pattern)
        recalled = self.recall(start, mode="sync", max_iter=1)
        # Compare with the same binarised start state that recall used, so
        # the zero -> +1 tie-break is applied on both sides of the test.
        return bool(np.array_equal(recalled, start))

    def hamming_distance(self, a: np.ndarray, b: np.ndarray) -> int:
        """
        Number of positions where ``sign(a)`` and ``sign(b)`` differ.

        Note that ``np.sign`` maps 0 to 0, so a zero entry counts as
        different from both -1 and +1.
        """
        return int(np.sum(np.sign(a) != np.sign(b)))

    def overlap(self, a: np.ndarray, b: np.ndarray) -> float:
        """
        Normalised overlap (similarity) between two bipolar states,
        in [-1, 1].  +1 = identical, -1 = exact inverse, 0 = orthogonal.

        Computed as ``sign(a) · sign(b) / n_units``.
        """
        return float(np.dot(np.sign(a), np.sign(b)) / self.n_units)
|
|
@@ -0,0 +1,398 @@
|
|
|
1
|
+
"""
|
|
2
|
+
Perceptrons — Single-Layer and Multi-Layer
|
|
3
|
+
===========================================
|
|
4
|
+
The foundational building blocks of neural networks.
|
|
5
|
+
|
|
6
|
+
SingleLayerPerceptron
|
|
7
|
+
---------------------
|
|
8
|
+
A single layer of neurons with a configurable activation function.
|
|
9
|
+
Supports binary classification (sigmoid + binary cross-entropy) and
|
|
10
|
+
regression (linear + MSE), making the original two scripts a single
|
|
11
|
+
clean class with a ``task`` switch.
|
|
12
|
+
|
|
13
|
+
z = X W + b
|
|
14
|
+
ŷ = σ(z) # classification
|
|
15
|
+
ŷ = z # regression
|
|
16
|
+
|
|
17
|
+
Multi-Layer Perceptron
|
|
18
|
+
-----------------------
|
|
19
|
+
Fully-connected feedforward network with:
|
|
20
|
+
- Arbitrary depth via ``hidden_sizes``
|
|
21
|
+
- ReLU hidden activations
|
|
22
|
+
- Softmax output for multi-class classification
|
|
23
|
+
- Linear output for regression
|
|
24
|
+
- Mini-batch SGD with momentum
|
|
25
|
+
- He weight initialisation
|
|
26
|
+
|
|
27
|
+
References
|
|
28
|
+
----------
|
|
29
|
+
Rosenblatt, F. (1958). The perceptron: a probabilistic model for information
|
|
30
|
+
storage and organization in the brain. Psychological Review, 65(6), 386–408.
|
|
31
|
+
|
|
32
|
+
Rumelhart, D. E., Hinton, G. E., & Williams, R. J. (1986). Learning
|
|
33
|
+
representations by back-propagating errors. Nature, 323, 533–536.
|
|
34
|
+
|
|
35
|
+
Only numpy is used.
|
|
36
|
+
"""
|
|
37
|
+
|
|
38
|
+
from __future__ import annotations
|
|
39
|
+
|
|
40
|
+
import numpy as np
|
|
41
|
+
|
|
42
|
+
|
|
43
|
+
# ============================================================
|
|
44
|
+
# Activations (module-level helpers)
|
|
45
|
+
# ============================================================
|
|
46
|
+
|
|
47
|
+
def _sigmoid(x: np.ndarray) -> np.ndarray:
    """Logistic function 1 / (1 + e^(-x)), with x clipped to [-500, 500] first."""
    # Bounding the exponent keeps np.exp finite for float64 inputs.
    bounded = np.clip(x, -500, 500)
    return 1.0 / (1.0 + np.exp(-bounded))
|
|
49
|
+
|
|
50
|
+
|
|
51
|
+
def _relu(x: np.ndarray) -> np.ndarray:
    """Rectified linear unit: element-wise maximum of 0 and ``x``."""
    rectified = np.maximum(0.0, x)
    return rectified
|
|
53
|
+
|
|
54
|
+
|
|
55
|
+
def _relu_grad(x: np.ndarray) -> np.ndarray:
    """Derivative of ReLU: 1.0 where ``x`` is strictly positive, else 0.0."""
    positive = x > 0.0
    return positive.astype(float)
|
|
57
|
+
|
|
58
|
+
|
|
59
|
+
def _softmax(x: np.ndarray) -> np.ndarray:
    """Softmax over the last axis, shifted by the per-row maximum before exponentiating."""
    # Subtracting the row maximum leaves the result unchanged mathematically
    # while keeping every exponent <= 0.
    row_max = x.max(axis=-1, keepdims=True)
    exps = np.exp(x - row_max)
    totals = exps.sum(axis=-1, keepdims=True)
    return exps / totals
|
|
62
|
+
|
|
63
|
+
|
|
64
|
+
# ============================================================
|
|
65
|
+
# Single-Layer Perceptron
|
|
66
|
+
# ============================================================
|
|
67
|
+
|
|
68
|
+
class SingleLayerPerceptron:
    """
    Single-Layer Perceptron for binary classification or regression.

    The model is ``z = X w + b`` followed by a sigmoid (classification)
    or the identity (regression), trained by full-batch gradient descent.
    ``fit``/``predict`` accept any array-like input.

    Parameters
    ----------
    input_size : int
        Number of input features.
    task : str
        ``'classification'`` (sigmoid + binary cross-entropy) or
        ``'regression'`` (linear + MSE).
    learning_rate : float
        Gradient-descent step size.
    epochs : int
        Number of full passes over the training data.
    random_state : int or None
        Seed for reproducible weight initialisation.

    Attributes
    ----------
    weights_ : ndarray of shape (input_size,) or None
        Learned weights; ``None`` until ``fit`` is called.
    bias_ : float or None
        Learned bias; ``None`` until ``fit`` is called.
    losses_ : list[float]
        Training loss recorded once per epoch (before that epoch's update).
    """

    def __init__(
        self,
        input_size: int,
        task: str = "classification",
        learning_rate: float = 0.01,
        epochs: int = 1000,
        random_state: int | None = None,
    ) -> None:
        if task not in {"classification", "regression"}:
            raise ValueError("task must be 'classification' or 'regression'.")
        self.input_size = input_size
        self.task = task
        self.learning_rate = learning_rate
        self.epochs = epochs
        self._rng = np.random.default_rng(random_state)

        # Parameters (initialised in fit)
        self.weights_: np.ndarray | None = None
        self.bias_: float | None = None
        self.losses_: list[float] = []

    # ------------------------------------------------------------------
    # Internal helpers
    # ------------------------------------------------------------------

    def _activate(self, z: np.ndarray) -> np.ndarray:
        """Apply the output non-linearity: sigmoid for classification, identity otherwise."""
        return _sigmoid(z) if self.task == "classification" else z

    def _loss(self, y: np.ndarray, y_hat: np.ndarray, eps: float = 1e-8) -> float:
        """
        Mean binary cross-entropy (classification) or mean squared error
        (regression).  ``eps`` is added inside the logarithms so that
        predictions of exactly 0 or 1 do not produce ``log(0)``.
        """
        if self.task == "classification":
            return float(-np.mean(
                y * np.log(y_hat + eps) + (1 - y) * np.log(1 - y_hat + eps)
            ))
        return float(np.mean((y_hat - y) ** 2))

    def _decision(self, X) -> np.ndarray:
        """Return the linear response ``X w + b``; raise if the model is unfitted."""
        if self.weights_ is None:
            raise RuntimeError(
                "SingleLayerPerceptron is not fitted yet; call fit() first."
            )
        return np.asarray(X, dtype=float) @ self.weights_ + self.bias_

    # ------------------------------------------------------------------
    # Public API
    # ------------------------------------------------------------------

    def fit(self, X: np.ndarray, y: np.ndarray) -> "SingleLayerPerceptron":
        """
        Train the perceptron on (X, y).

        Parameters are re-initialised on every call (weights drawn from
        N(0, 2 / input_size), bias 0), so repeated calls do not continue
        from the previous fit.

        Parameters
        ----------
        X : array-like of shape (n_samples, n_features)
        y : array-like of shape (n_samples,) or (n_samples, 1)

        Returns
        -------
        self

        Raises
        ------
        ValueError
            If ``X`` is not 2-D with ``input_size`` columns, is empty, or
            ``y`` does not hold exactly one target per sample.
        """
        X = np.asarray(X, dtype=float)
        # Flatten y: an (n, 1) column would broadcast ``y_hat - y`` to (n, n).
        y = np.asarray(y, dtype=float).ravel()

        if X.ndim != 2 or X.shape[1] != self.input_size:
            raise ValueError(
                f"X must have shape (n_samples, {self.input_size}); "
                f"got {X.shape}."
            )
        n_samples = X.shape[0]
        if n_samples == 0:
            raise ValueError("X must contain at least one sample.")
        if y.shape[0] != n_samples:
            raise ValueError(
                f"y has {y.shape[0]} targets but X has {n_samples} samples."
            )

        scale = np.sqrt(2.0 / self.input_size)
        self.weights_ = self._rng.normal(0, scale, self.input_size)
        self.bias_ = 0.0
        self.losses_ = []

        for _ in range(self.epochs):
            z = X @ self.weights_ + self.bias_
            y_hat = self._activate(z)

            self.losses_.append(self._loss(y, y_hat))

            # Same update expression for both tasks: (y_hat - y) is the
            # cross-entropy gradient w.r.t. z for the sigmoid output, and
            # half of the MSE gradient w.r.t. z for the linear output.
            error = y_hat - y
            dw = X.T @ error / n_samples
            db = error.mean()

            self.weights_ -= self.learning_rate * dw
            self.bias_ -= self.learning_rate * db

        return self

    def predict(self, X: np.ndarray) -> np.ndarray:
        """
        Predict class labels (classification) or values (regression).

        Classification labels are 1 where the sigmoid output is >= 0.5
        and 0 otherwise.

        Returns
        -------
        ndarray of shape (n_samples,)

        Raises
        ------
        RuntimeError
            If called before ``fit``.
        """
        y_hat = self._activate(self._decision(X))
        if self.task == "classification":
            return (y_hat >= 0.5).astype(int)
        return y_hat

    def predict_proba(self, X: np.ndarray) -> np.ndarray:
        """
        Return sigmoid probabilities (classification only).

        Returns
        -------
        ndarray of shape (n_samples,)

        Raises
        ------
        ValueError
            If the model was created with ``task='regression'``.
        RuntimeError
            If called before ``fit``.
        """
        if self.task != "classification":
            raise ValueError("predict_proba is only available for classification.")
        return _sigmoid(self._decision(X))
|
|
186
|
+
|
|
187
|
+
|
|
188
|
+
# ============================================================
|
|
189
|
+
# Multi-Layer Perceptron
|
|
190
|
+
# ============================================================
|
|
191
|
+
|
|
192
|
+
class MultiLayerPerceptron:
    """
    Multi-Layer Perceptron (fully-connected feedforward network).

    Hidden layers use ReLU; the output layer is softmax (classification)
    or linear with a single unit (regression).  Training is mini-batch
    SGD with momentum.

    Parameters
    ----------
    hidden_sizes : list[int] or None
        Sizes of the hidden layers (e.g. [64, 64]).  ``None`` selects the
        default ``[64, 64]``; an empty list builds a network with no
        hidden layer (a single linear/softmax layer).
    task : str
        ``'classification'`` (softmax + cross-entropy) or
        ``'regression'`` (linear + MSE).
    n_classes : int
        Number of output classes (ignored for regression).
    learning_rate : float
        Step size applied to each gradient.
    momentum : float
        Momentum coefficient for SGD (0 = vanilla SGD).
    epochs : int
        Number of passes over the training data.
    batch_size : int or None
        Mini-batch size.  None = full-batch.
    random_state : int or None
        Seed for weight initialisation and mini-batch shuffling.

    Attributes
    ----------
    weights_, biases_ : list[ndarray]
        Per-layer parameters; empty until ``fit`` is called.
    losses_ : list[float]
        Mean mini-batch loss for each epoch.
    """

    def __init__(
        self,
        hidden_sizes: list[int] | None = None,
        task: str = "classification",
        n_classes: int = 2,
        learning_rate: float = 0.01,
        momentum: float = 0.9,
        epochs: int = 200,
        batch_size: int | None = 32,
        random_state: int | None = None,
    ) -> None:
        if task not in {"classification", "regression"}:
            raise ValueError("task must be 'classification' or 'regression'.")
        # Test for None explicitly: a truthiness test would silently replace
        # an explicitly requested empty list with the default.
        self.hidden_sizes = [64, 64] if hidden_sizes is None else hidden_sizes
        self.task = task
        self.n_classes = n_classes
        self.learning_rate = learning_rate
        self.momentum = momentum
        self.epochs = epochs
        self.batch_size = batch_size
        self._rng = np.random.default_rng(random_state)

        # Built in fit()
        self.weights_: list[np.ndarray] = []
        self.biases_: list[np.ndarray] = []
        self.losses_: list[float] = []

    # ------------------------------------------------------------------
    # Build
    # ------------------------------------------------------------------

    def _build(self, n_features: int) -> None:
        """(Re)create all layer parameters for inputs with ``n_features`` columns."""
        n_out = 1 if self.task == "regression" else self.n_classes
        sizes = [n_features] + list(self.hidden_sizes) + [n_out]

        self.weights_ = []
        self.biases_ = []
        for fan_in, fan_out in zip(sizes[:-1], sizes[1:]):
            scale = np.sqrt(2.0 / fan_in)  # He initialisation
            self.weights_.append(self._rng.normal(0, scale, (fan_in, fan_out)))
            self.biases_.append(np.zeros(fan_out))

    def _check_fitted(self) -> None:
        """Raise if no layers exist yet (``_forward`` would just echo its input)."""
        if not self.weights_:
            raise RuntimeError(
                "MultiLayerPerceptron is not fitted yet; call fit() first."
            )

    # ------------------------------------------------------------------
    # Forward
    # ------------------------------------------------------------------

    def _forward(self, X: np.ndarray) -> tuple[list, list]:
        """
        Return (pre_activations, activations) for backprop.

        ``activations[0]`` is the input itself, so ``activations`` has one
        more entry than ``pre_activations``; ``activations[-1]`` is the
        network output.
        """
        pre_acts, acts = [], [X]
        a = X
        last = len(self.weights_) - 1
        for i, (W, b) in enumerate(zip(self.weights_, self.biases_)):
            z = a @ W + b
            pre_acts.append(z)
            if i < last:
                a = _relu(z)
            else:
                a = _softmax(z) if self.task == "classification" else z
            acts.append(a)
        return pre_acts, acts

    # ------------------------------------------------------------------
    # Loss
    # ------------------------------------------------------------------

    def _loss(self, y_hot: np.ndarray, y_hat: np.ndarray, eps: float = 1e-8) -> float:
        """
        Mean categorical cross-entropy (classification) or mean squared
        error (regression).  ``eps`` guards the logarithm against zero
        probabilities.
        """
        if self.task == "classification":
            return float(-np.mean(np.sum(y_hot * np.log(y_hat + eps), axis=1)))
        return float(np.mean((y_hat.ravel() - y_hot.ravel()) ** 2))

    # ------------------------------------------------------------------
    # Backward
    # ------------------------------------------------------------------

    def _backward(
        self,
        pre_acts: list,
        acts: list,
        y_hot: np.ndarray,
        vel_w: list,
        vel_b: list,
    ) -> None:
        """
        Backpropagate one mini-batch and apply a momentum-SGD step in place.

        Parameters
        ----------
        pre_acts, acts : list
            Output of ``_forward`` for this mini-batch.
        y_hot : ndarray
            One-hot targets (classification) or an (n, 1) column of
            targets (regression).
        vel_w, vel_b : list
            Velocity buffers, one per layer; updated in place.
        """
        n = len(y_hot)
        y_hat = acts[-1]

        # Output delta: gradient of the mean loss w.r.t. the last pre-activation.
        if self.task == "classification":
            delta = (y_hat - y_hot) / n
        else:
            delta = 2.0 * (y_hat - y_hot) / n

        for i in reversed(range(len(self.weights_))):
            dW = acts[i].T @ delta
            db = delta.sum(axis=0)

            # Propagate the error to the previous layer BEFORE this layer's
            # weights are modified: the gradient must flow through the
            # weights that were used in the forward pass.
            prev_delta = None
            if i > 0:
                prev_delta = (delta @ self.weights_[i].T) * _relu_grad(pre_acts[i - 1])

            # Momentum update
            vel_w[i] = self.momentum * vel_w[i] + self.learning_rate * dW
            vel_b[i] = self.momentum * vel_b[i] + self.learning_rate * db

            self.weights_[i] -= vel_w[i]
            self.biases_[i] -= vel_b[i]

            if prev_delta is not None:
                delta = prev_delta

    # ------------------------------------------------------------------
    # Public API
    # ------------------------------------------------------------------

    def fit(self, X: np.ndarray, y: np.ndarray) -> "MultiLayerPerceptron":
        """
        Train the MLP.

        All parameters are rebuilt on every call, so repeated calls do not
        continue from a previous fit.

        Parameters
        ----------
        X : array-like of shape (n_samples, n_features)
        y : array-like of shape (n_samples,) — integer class labels in
            ``[0, n_classes)`` or floats.  An (n_samples, 1) column is
            also accepted.

        Returns
        -------
        self

        Raises
        ------
        ValueError
            If ``X`` is not 2-D or is empty, if ``y`` does not hold one
            target per sample, or if a class label lies outside
            ``[0, n_classes)``.
        """
        X = np.asarray(X, dtype=float)
        y = np.asarray(y)

        if X.ndim != 2:
            raise ValueError(
                f"X must have shape (n_samples, n_features); got {X.shape}."
            )
        n_samples = X.shape[0]
        if n_samples == 0:
            raise ValueError("X must contain at least one sample.")
        if y.size != n_samples:
            raise ValueError(
                f"y has {y.size} targets but X has {n_samples} samples."
            )

        # One-hot encode targets for classification
        if self.task == "classification":
            n_cls = self.n_classes
            labels = y.ravel().astype(int)
            # Negative labels would wrap around in the fancy index below and
            # silently mark the wrong class, so reject them up front.
            if labels.min() < 0 or labels.max() >= n_cls:
                raise ValueError(
                    f"class labels must lie in [0, {n_cls}); "
                    f"got range [{labels.min()}, {labels.max()}]."
                )
            y_hot = np.zeros((n_samples, n_cls))
            y_hot[np.arange(n_samples), labels] = 1.0
        else:
            y_hot = y.reshape(-1, 1).astype(float)

        self._build(X.shape[1])

        # Velocity buffers for momentum
        vel_w = [np.zeros_like(w) for w in self.weights_]
        vel_b = [np.zeros_like(b) for b in self.biases_]

        bs = self.batch_size or n_samples
        self.losses_ = []

        for _ in range(self.epochs):
            idx = self._rng.permutation(n_samples)  # reshuffle every epoch
            epoch_loss = 0.0
            n_batches = 0

            for start in range(0, n_samples, bs):
                mb = idx[start:start + bs]
                Xb = X[mb]
                yb = y_hot[mb]

                pre_acts, acts = self._forward(Xb)
                epoch_loss += self._loss(yb, acts[-1])
                n_batches += 1

                self._backward(pre_acts, acts, yb, vel_w, vel_b)

            self.losses_.append(epoch_loss / n_batches)

        return self

    def predict(self, X: np.ndarray) -> np.ndarray:
        """
        Predict class labels (classification) or values (regression).

        Returns
        -------
        ndarray of shape (n_samples,)

        Raises
        ------
        RuntimeError
            If called before ``fit``.
        """
        self._check_fitted()
        _, acts = self._forward(np.asarray(X, dtype=float))
        y_hat = acts[-1]
        if self.task == "classification":
            return np.argmax(y_hat, axis=1)
        return y_hat.ravel()

    def predict_proba(self, X: np.ndarray) -> np.ndarray:
        """
        Return softmax probabilities (classification only).

        Returns
        -------
        ndarray of shape (n_samples, n_classes)

        Raises
        ------
        ValueError
            If the model was created with ``task='regression'``.
        RuntimeError
            If called before ``fit``.
        """
        if self.task != "classification":
            raise ValueError("predict_proba is only available for classification.")
        self._check_fitted()
        _, acts = self._forward(np.asarray(X, dtype=float))
        return acts[-1]
|