scratchkit 0.2.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- mlscratch/__init__.py +56 -0
- mlscratch/__main__.py +118 -0
- mlscratch/bayesian/__init__.py +53 -0
- mlscratch/bayesian/bayesian_linear_regression.py +171 -0
- mlscratch/bayesian/bayesian_network.py +248 -0
- mlscratch/bayesian/bayesian_nn.py +315 -0
- mlscratch/bayesian/gaussian_process.py +207 -0
- mlscratch/bayesian/hmm.py +277 -0
- mlscratch/bayesian/init.py +52 -0
- mlscratch/bayesian/kalman_filter.py +182 -0
- mlscratch/bayesian/naive_bayes.py +209 -0
- mlscratch/metrics/__init__.py +59 -0
- mlscratch/metrics/classification.py +365 -0
- mlscratch/metrics/regression.py +79 -0
- mlscratch/neural/__init__.py +121 -0
- mlscratch/neural/attention.py +420 -0
- mlscratch/neural/autoencoder.py +543 -0
- mlscratch/neural/boltzmann.py +231 -0
- mlscratch/neural/cnn.py +593 -0
- mlscratch/neural/cvnn.py +322 -0
- mlscratch/neural/gan.py +364 -0
- mlscratch/neural/hopfield.py +193 -0
- mlscratch/neural/perceptron.py +398 -0
- mlscratch/neural/rbf_network.py +230 -0
- mlscratch/neural/recurrent.py +569 -0
- mlscratch/preprocessing/__init__.py +38 -0
- mlscratch/preprocessing/encoders.py +140 -0
- mlscratch/preprocessing/model_selection.py +119 -0
- mlscratch/preprocessing/polynomial.py +105 -0
- mlscratch/preprocessing/scalers.py +220 -0
- mlscratch/py.typed +0 -0
- mlscratch/reinforcement/__init__.py +59 -0
- mlscratch/reinforcement/ddpg.py +363 -0
- mlscratch/reinforcement/dqn.py +319 -0
- mlscratch/reinforcement/ppo.py +452 -0
- mlscratch/reinforcement/q_learning.py +352 -0
- mlscratch/reinforcement/sac.py +382 -0
- mlscratch/reinforcement/utils.py +594 -0
- mlscratch/supervised/__init__.py +76 -0
- mlscratch/supervised/_validation.py +50 -0
- mlscratch/supervised/adaboost.py +255 -0
- mlscratch/supervised/decision_tree.py +495 -0
- mlscratch/supervised/gradient_boosting.py +354 -0
- mlscratch/supervised/knn.py +234 -0
- mlscratch/supervised/lasso_regression.py +125 -0
- mlscratch/supervised/linear_models.py +459 -0
- mlscratch/supervised/linear_regression.py +197 -0
- mlscratch/supervised/logistic_regression.py +119 -0
- mlscratch/supervised/naive_bayes.py +113 -0
- mlscratch/supervised/random_forest.py +321 -0
- mlscratch/supervised/ridge_regression.py +93 -0
- mlscratch/supervised/svm.py +356 -0
- mlscratch/unsupervised/__init__.py +39 -0
- mlscratch/unsupervised/apriori.py +178 -0
- mlscratch/unsupervised/dbscan.py +141 -0
- mlscratch/unsupervised/gmm.py +204 -0
- mlscratch/unsupervised/hierarchical_clustering.py +137 -0
- mlscratch/unsupervised/ica.py +167 -0
- mlscratch/unsupervised/kmeans.py +135 -0
- mlscratch/unsupervised/kmedoids.py +133 -0
- mlscratch/unsupervised/pca.py +103 -0
- mlscratch/unsupervised/tsne.py +200 -0
- scratchkit-0.2.0.dist-info/METADATA +241 -0
- scratchkit-0.2.0.dist-info/RECORD +68 -0
- scratchkit-0.2.0.dist-info/WHEEL +5 -0
- scratchkit-0.2.0.dist-info/entry_points.txt +2 -0
- scratchkit-0.2.0.dist-info/licenses/LICENSE +201 -0
- scratchkit-0.2.0.dist-info/top_level.txt +1 -0
|
@@ -0,0 +1,569 @@
|
|
|
1
|
+
"""
|
|
2
|
+
Recurrent Neural Networks
|
|
3
|
+
==========================
|
|
4
|
+
Sequential-data architectures that maintain a hidden state across timesteps.
|
|
5
|
+
|
|
6
|
+
SimpleRNN
|
|
7
|
+
---------
|
|
8
|
+
Elman recurrent network:
|
|
9
|
+
h_t = tanh(W_xh x_t + W_hh h_{t-1} + b_h)
|
|
10
|
+
y_t = W_hy h_t + b_y (output layer, optional)
|
|
11
|
+
|
|
12
|
+
LSTMCell / LSTM
|
|
13
|
+
---------------
|
|
14
|
+
Long Short-Term Memory (Hochreiter & Schmidhuber, 1997).
|
|
15
|
+
Four gates operating on the concatenated [x_t; h_{t-1}]:
|
|
16
|
+
|
|
17
|
+
i_t = σ(W_i [x_t; h_{t-1}] + b_i) input gate
|
|
18
|
+
f_t = σ(W_f [x_t; h_{t-1}] + b_f) forget gate
|
|
19
|
+
g_t = tanh(W_g [x_t; h_{t-1}] + b_g) cell gate (candidate)
|
|
20
|
+
o_t = σ(W_o [x_t; h_{t-1}] + b_o) output gate
|
|
21
|
+
c_t = f_t ⊙ c_{t-1} + i_t ⊙ g_t
|
|
22
|
+
h_t = o_t ⊙ tanh(c_t)
|
|
23
|
+
|
|
24
|
+
EncoderDecoder
|
|
25
|
+
--------------
|
|
26
|
+
Sequence-to-sequence architecture with an RNN encoder that compresses
|
|
27
|
+
an input sequence to a context vector, and an RNN decoder that
|
|
28
|
+
unrolls to produce the output sequence.
|
|
29
|
+
|
|
30
|
+
References
|
|
31
|
+
----------
|
|
32
|
+
Elman, J. (1990). Finding structure in time. Cognitive Science, 14(2), 179-211.
|
|
33
|
+
Hochreiter & Schmidhuber (1997). Long short-term memory. Neural Computation.
|
|
34
|
+
Sutskever et al. (2014). Sequence to sequence learning with neural networks. NeurIPS.
|
|
35
|
+
|
|
36
|
+
Only numpy is used.
|
|
37
|
+
"""
|
|
38
|
+
|
|
39
|
+
from __future__ import annotations
|
|
40
|
+
|
|
41
|
+
import numpy as np
|
|
42
|
+
|
|
43
|
+
|
|
44
|
+
# ============================================================
|
|
45
|
+
# Helpers
|
|
46
|
+
# ============================================================
|
|
47
|
+
|
|
48
|
+
def _sigmoid(x: np.ndarray) -> np.ndarray:
    """Element-wise logistic function ``1 / (1 + exp(-x))``.

    The argument is clamped to ``[-500, 500]`` before it is negated and
    exponentiated, which keeps ``exp`` inside the float64 range.
    """
    bounded = np.clip(x, -500, 500)
    return 1.0 / (1.0 + np.exp(-bounded))
|
|
50
|
+
|
|
51
|
+
|
|
52
|
+
def _softmax(x: np.ndarray) -> np.ndarray:
    """Softmax over the last axis of ``x``.

    The per-row maximum is subtracted before exponentiation, so the largest
    exponent is always zero; the result is then normalised to sum to one
    along the last axis.
    """
    shifted = x - x.max(axis=-1, keepdims=True)
    weights = np.exp(shifted)
    return weights / weights.sum(axis=-1, keepdims=True)
|
|
55
|
+
|
|
56
|
+
|
|
57
|
+
# ============================================================
|
|
58
|
+
# SimpleRNN
|
|
59
|
+
# ============================================================
|
|
60
|
+
|
|
61
|
+
class SimpleRNN:
    """
    Simple Elman RNN.

    The recurrence is ``h_t = tanh(x_t W_xh + h_{t-1} W_hh + b_h)`` with a
    zero initial hidden state.  When ``output_size`` is given, a linear
    head ``h W_hy + b_y`` is applied to whatever ``forward`` returns.
    ``fit`` trains with per-sample SGD on a squared-error loss computed from
    the head applied to the *final* hidden state.

    Parameters
    ----------
    input_size : int
    hidden_size : int
    output_size : int or None
        If None, the network is a feature extractor (returns hidden states).
    return_sequences : bool
        If True, return hidden state at every timestep.
        If False (default), return only the final hidden state.
    learning_rate : float
    epochs : int
    random_state : int or None

    Attributes
    ----------
    losses_ : list of float
        Mean per-sample loss of each epoch of the most recent ``fit`` call.
    """

    def __init__(
        self,
        input_size: int,
        hidden_size: int,
        output_size: int | None = None,
        return_sequences: bool = False,
        learning_rate: float = 1e-3,
        epochs: int = 50,
        random_state: int | None = None,
    ) -> None:
        self.input_size = input_size
        self.hidden_size = hidden_size
        self.output_size = output_size
        self.return_sequences = return_sequences
        self.learning_rate = learning_rate
        self.epochs = epochs
        self._rng = np.random.default_rng(random_state)

        self._init_params()
        self.losses_: list[float] = []

    # ------------------------------------------------------------------
    # Init
    # ------------------------------------------------------------------

    def _init_params(self) -> None:
        """Draw Gaussian weights with std ``sqrt(2 / fan_in)``; zero the biases."""
        D, H = self.input_size, self.hidden_size

        # Draw order (W_xh, W_hh, W_hy) is kept fixed so that a given
        # random_state always yields the same weights.
        self.W_xh = self._rng.normal(0, np.sqrt(2.0 / D), (D, H))
        self.W_hh = self._rng.normal(0, np.sqrt(2.0 / H), (H, H))
        self.b_h = np.zeros(H)

        if self.output_size is not None:
            self.W_hy = self._rng.normal(0, np.sqrt(2.0 / H), (H, self.output_size))
            self.b_y = np.zeros(self.output_size)

    # ------------------------------------------------------------------
    # Forward
    # ------------------------------------------------------------------

    def forward(self, X: np.ndarray) -> np.ndarray:
        """
        Forward pass through the RNN.

        Parameters
        ----------
        X : array-like of shape (seq_len, input_size) or
            (batch, seq_len, input_size)

        Returns
        -------
        ndarray
            Hidden states (all timesteps if ``return_sequences`` else the
            last one), projected through the output head when
            ``output_size`` is set.  The batch axis is dropped again for
            unbatched input.

        Raises
        ------
        ValueError
            If the sequence has no timesteps.
        """
        X = np.asarray(X)
        batched = X.ndim == 3
        if not batched:
            X = X[np.newaxis, :]  # (1, T, D)

        B, T, _ = X.shape
        if T == 0:
            raise ValueError("X must contain at least one timestep.")

        h = np.zeros((B, self.hidden_size))
        hidden_states = []
        for t in range(T):
            # tanh allocates a fresh array each step, so no copy is needed.
            h = np.tanh(X[:, t, :] @ self.W_xh + h @ self.W_hh + self.b_h)
            hidden_states.append(h)

        out = np.stack(hidden_states, axis=1)  # (B, T, H)
        if not self.return_sequences:
            out = out[:, -1, :]  # (B, H)

        if self.output_size is not None:
            out = out @ self.W_hy + self.b_y

        return out if batched else out[0]

    def fit(self, X: np.ndarray, y: np.ndarray) -> "SimpleRNN":
        """
        Train the RNN on sequences X with targets y.

        Samples are visited one at a time in a freshly shuffled order each
        epoch; every sample triggers one full (untruncated) BPTT update.

        Parameters
        ----------
        X : ndarray (n_samples, seq_len, input_size)
            Any indexable collection of per-sample (seq_len, input_size)
            arrays is accepted.
        y : array-like (n_samples,) or (n_samples, output_size)

        Returns
        -------
        self

        Raises
        ------
        ValueError
            If ``output_size`` is None, the dataset is empty, or X and y
            disagree on the number of samples.
        """
        if self.output_size is None:
            raise ValueError("output_size must be set to use fit().")

        y = np.asarray(y)
        n = len(X)
        if n == 0:
            # Without this guard the epoch average below divides by zero.
            raise ValueError("Cannot fit on an empty dataset.")
        if len(y) != n:
            raise ValueError(
                f"X and y have inconsistent lengths: {n} != {len(y)}."
            )

        self.losses_ = []
        for _ in range(self.epochs):
            epoch_loss = 0.0
            for i in self._rng.permutation(n):
                # X is indexed per sample (never converted wholesale) so a
                # list of sequences keeps working.
                epoch_loss += self._sgd_step(np.asarray(X[i]), y[i:i + 1])
            self.losses_.append(epoch_loss / n)

        return self

    def _sgd_step(self, xi: np.ndarray, yi: np.ndarray) -> float:
        """Run forward + BPTT on one sequence, update weights, return its loss."""
        T_len = xi.shape[0]
        H = self.hidden_size
        lr = self.learning_rate

        # Forward; hs[0] is the zero initial state, hs[t + 1] the state after x_t.
        hs = np.zeros((T_len + 1, H))
        for t in range(T_len):
            hs[t + 1] = np.tanh(
                xi[t:t + 1] @ self.W_xh + hs[t:t + 1] @ self.W_hh + self.b_h
            )

        out = hs[-1:] @ self.W_hy + self.b_y  # (1, output_size)
        error = out - yi.reshape(1, -1)
        loss = float(np.mean(error ** 2))

        # Backward through output layer.
        # NOTE: the gradient is that of the *summed* squared error (2 * error),
        # while the reported loss is the mean over outputs.
        d_out = 2.0 * error
        dW_hy = hs[-1:].T @ d_out
        # reshape(-1) keeps a 1-D gradient even when output_size == 1.
        db_y = d_out.reshape(-1)

        # BPTT
        dh_next = d_out @ self.W_hy.T
        dW_xh = np.zeros_like(self.W_xh)
        dW_hh = np.zeros_like(self.W_hh)
        db_h = np.zeros(H)

        for t in reversed(range(T_len)):
            dtanh = dh_next * (1.0 - hs[t + 1] ** 2)
            dW_xh += xi[t:t + 1].T @ dtanh
            dW_hh += hs[t:t + 1].T @ dtanh
            db_h += dtanh.reshape(-1)
            dh_next = dtanh @ self.W_hh.T

        # Element-wise gradient clipping to [-5, 5], then a plain SGD step.
        # ``-=`` updates each parameter array in place.
        updates = (
            (self.W_xh, dW_xh),
            (self.W_hh, dW_hh),
            (self.b_h, db_h),
            (self.W_hy, dW_hy),
            (self.b_y, db_y),
        )
        for param, grad in updates:
            param -= lr * np.clip(grad, -5, 5)

        return loss

    def predict(self, X: np.ndarray) -> np.ndarray:
        """Run forward pass on X."""
        return self.forward(X)
|
|
238
|
+
|
|
239
|
+
|
|
240
|
+
# ============================================================
|
|
241
|
+
# LSTMCell
|
|
242
|
+
# ============================================================
|
|
243
|
+
|
|
244
|
+
class LSTMCell:
    """
    A single LSTM cell — stateful, processes one timestep at a time.

    The four gates share one stacked weight matrix ``W`` of shape
    ``(4 * hidden_size, input_size + hidden_size)`` whose row blocks are
    ordered ``[i, f, g, o]``.  The hidden and cell state live on the
    instance (``h_t``, ``c_t``) and are advanced by every ``forward`` call.

    Parameters
    ----------
    input_size : int
    hidden_size : int
    random_state : int or None
    """

    def __init__(
        self,
        input_size: int,
        hidden_size: int,
        random_state: int | None = None,
    ) -> None:
        self.input_size = input_size
        self.hidden_size = hidden_size
        self._rng = np.random.default_rng(random_state)

        H, D = hidden_size, input_size
        scale = np.sqrt(2.0 / (D + H))
        # Single stacked weight matrix for efficiency: [i, f, g, o]
        self.W = self._rng.normal(0, scale, (4 * H, D + H))
        self.b = np.zeros(4 * H)

        self.reset_state()

    def reset_state(self) -> None:
        """Reset hidden and cell state to zeros."""
        H = self.hidden_size
        self.h_t = np.zeros((1, H))
        self.c_t = np.zeros((1, H))

    def forward(self, x_t: np.ndarray) -> np.ndarray:
        """
        Process one timestep and advance the stored state.

        Parameters
        ----------
        x_t : ndarray of shape (input_size,) or (1, input_size)

        Returns
        -------
        h_t : ndarray of shape (hidden_size,)
            Always 1-D, including when ``hidden_size == 1``.
        """
        x_t = np.atleast_2d(x_t)  # (1, D)
        xh = np.concatenate([x_t, self.h_t], axis=1)  # (1, D+H)
        gates = xh @ self.W.T + self.b  # (1, 4H)

        H = self.hidden_size
        i_t = _sigmoid(gates[:, :H])
        f_t = _sigmoid(gates[:, H:2 * H])
        g_t = np.tanh(gates[:, 2 * H:3 * H])
        o_t = _sigmoid(gates[:, 3 * H:])

        self.c_t = f_t * self.c_t + i_t * g_t
        self.h_t = o_t * np.tanh(self.c_t)

        # reshape(-1) rather than squeeze(): squeeze would also drop the
        # feature axis when hidden_size == 1 and hand back a 0-d array.
        return self.h_t.reshape(-1)
|
|
305
|
+
|
|
306
|
+
|
|
307
|
+
# ============================================================
|
|
308
|
+
# LSTM (multi-layer, with optional linear output head)
|
|
309
|
+
# ============================================================
|
|
310
|
+
|
|
311
|
+
class LSTM:
    """
    Multi-layer LSTM for sequence modelling.

    Layers are ``LSTMCell`` instances run one after another over the whole
    sequence; layer ``k`` consumes the hidden-state sequence of layer
    ``k - 1``.  Only a forward pass is implemented here.

    Parameters
    ----------
    input_size : int
    hidden_size : int
    num_layers : int
        Number of stacked LSTM layers (at least 1).
    output_size : int or None
        If set, a linear projection layer is added on top of the returned
        hidden state(s).
    return_sequences : bool
        Return all hidden states (True) or just the final one (False).
    dropout : float
        Dropout probability applied between LSTM layers (0 = no dropout).
    random_state : int or None
        With an int, layer ``i`` is seeded with ``random_state + i``.
        With None, every layer is initialised from fresh entropy.

    Raises
    ------
    ValueError
        If ``num_layers`` is smaller than 1.
    """

    def __init__(
        self,
        input_size: int,
        hidden_size: int,
        num_layers: int = 1,
        output_size: int | None = None,
        return_sequences: bool = False,
        dropout: float = 0.0,
        random_state: int | None = None,
    ) -> None:
        if num_layers < 1:
            # With no cells the forward pass has nothing to return.
            raise ValueError("num_layers must be at least 1.")

        self.input_size = input_size
        self.hidden_size = hidden_size
        self.num_layers = num_layers
        self.output_size = output_size
        self.return_sequences = return_sequences
        self.dropout = dropout
        self._rng = np.random.default_rng(random_state)

        # Build one cell per layer
        layer_input = input_size
        self.cells: list[LSTMCell] = []
        for i in range(num_layers):
            # `(random_state or 0) + i` would pin the cells to seeds 0, 1, ...
            # even when the caller asked for unseeded initialisation.
            seed = None if random_state is None else random_state + i
            self.cells.append(LSTMCell(layer_input, hidden_size, seed))
            layer_input = hidden_size

        # Optional linear output head
        if output_size is not None:
            scale = np.sqrt(2.0 / hidden_size)
            self.W_out = self._rng.normal(0, scale, (hidden_size, output_size))
            self.b_out = np.zeros(output_size)
        else:
            self.W_out = None
            self.b_out = None

    def reset_states(self) -> None:
        """Reset all cell hidden and cell states."""
        for cell in self.cells:
            cell.reset_state()

    def forward(self, X: np.ndarray, training: bool = False) -> np.ndarray:
        """
        Forward pass through the stacked LSTM.

        Parameters
        ----------
        X : array-like of shape (seq_len, input_size) or
            (batch, seq_len, input_size)
        training : bool
            If True and dropout > 0, apply dropout between layers.

        Returns
        -------
        ndarray — shape depends on return_sequences and output_size

        Raises
        ------
        ValueError
            If a sequence has no timesteps.
        """
        X = np.asarray(X)
        if X.ndim == 3:
            # Process each sequence in batch independently
            return np.stack([self._forward_single(seq, training) for seq in X])
        return self._forward_single(X, training)

    def _forward_single(self, X: np.ndarray, training: bool) -> np.ndarray:
        """Forward pass for a single (unbatched) sequence (T, D)."""
        T = len(X)
        if T == 0:
            raise ValueError("X must contain at least one timestep.")
        H = self.hidden_size

        # Reset states for fresh inference
        self.reset_states()

        current_input = X  # (T, D)
        last_layer = self.num_layers - 1

        for layer_idx, cell in enumerate(self.cells):
            # np.stack copies each step's output; the reshape restores the
            # feature axis if a cell returned 0-d states (hidden_size == 1).
            layer_outputs = np.stack(
                [cell.forward(x_t) for x_t in current_input]
            ).reshape(T, H)

            # Dropout between layers (not on last layer)
            if training and self.dropout > 0 and layer_idx < last_layer:
                mask = (self._rng.random(layer_outputs.shape) > self.dropout).astype(float)
                # Inverted dropout: rescale the kept units; 1e-8 avoids a
                # division by zero when dropout == 1.
                layer_outputs = layer_outputs * mask / (1.0 - self.dropout + 1e-8)

            current_input = layer_outputs

        # current_input now holds the last layer's states, shape (T, H).
        out = current_input if self.return_sequences else current_input[-1]

        if self.W_out is not None:
            out = out @ self.W_out + self.b_out

        return out
|
|
431
|
+
|
|
432
|
+
|
|
433
|
+
# ============================================================
|
|
434
|
+
# Encoder-Decoder (Seq2Seq)
|
|
435
|
+
# ============================================================
|
|
436
|
+
|
|
437
|
+
class EncoderDecoder:
    """
    Sequence-to-sequence Encoder-Decoder with RNN encoder and decoder.

    The encoder is a tanh RNN whose final hidden state is the context
    vector.  The decoder is a second tanh RNN that starts from a zero hidden
    state and receives the context vector as its input at every step; each
    decoder state is projected and passed through a softmax over the output
    vocabulary.  Only the forward pass is implemented here.

    Parameters
    ----------
    input_vocab_size : int
        Vocabulary size of the input sequence (one-hot encoded).
    output_vocab_size : int
        Vocabulary size of the output sequence.
    hidden_size : int
        Hidden state size for both encoder and decoder.
    random_state : int or None
    """

    def __init__(
        self,
        input_vocab_size: int,
        output_vocab_size: int,
        hidden_size: int,
        random_state: int | None = None,
    ) -> None:
        self.input_vocab_size = input_vocab_size
        self.output_vocab_size = output_vocab_size
        self.hidden_size = hidden_size
        self._rng = np.random.default_rng(random_state)

        self._init_params()

    def _init_params(self) -> None:
        """Draw Gaussian weights with std ``sqrt(2 / fan_in)``; zero the biases."""
        D_in = self.input_vocab_size
        D_out = self.output_vocab_size
        H = self.hidden_size

        def std(fan_in: int) -> float:
            return np.sqrt(2.0 / fan_in)

        # The draw order below is part of the seeded behaviour; keep it fixed.

        # Encoder
        self.We_xh = self._rng.normal(0, std(D_in), (D_in, H))
        self.We_hh = self._rng.normal(0, std(H), (H, H))
        self.be_h = np.zeros(H)

        # Decoder
        self.Wd_xh = self._rng.normal(0, std(H), (H, H))
        self.Wd_hh = self._rng.normal(0, std(H), (H, H))
        self.bd_h = np.zeros(H)

        # Decoder output projection
        self.Wd_hy = self._rng.normal(0, std(H), (H, D_out))
        self.bd_y = np.zeros(D_out)

    def _encode(self, X_one_hot: np.ndarray) -> np.ndarray:
        """
        Encode input sequence into a context vector.

        Parameters
        ----------
        X_one_hot : array-like (seq_len, input_vocab_size)

        Returns
        -------
        context : ndarray (hidden_size,) — final hidden state
            All zeros when the sequence is empty.
        """
        h = np.zeros(self.hidden_size)
        for x_t in np.asarray(X_one_hot):
            h = np.tanh(x_t @ self.We_xh + h @ self.We_hh + self.be_h)
        return h

    def _decode(self, context: np.ndarray, output_len: int) -> np.ndarray:
        """
        Decode context vector into an output sequence.

        Parameters
        ----------
        context : ndarray (hidden_size,)
        output_len : int

        Returns
        -------
        outputs : ndarray (output_len, output_vocab_size)
            Each row is a softmax distribution; the array has zero rows when
            ``output_len`` is 0.
        """
        outputs = np.zeros((output_len, self.output_vocab_size))
        h = np.zeros(self.hidden_size)
        # The context is the decoder input at every step, so its projection
        # is loop-invariant.
        context_proj = context @ self.Wd_xh

        for t in range(output_len):
            h = np.tanh(context_proj + h @ self.Wd_hh + self.bd_h)
            logits = h @ self.Wd_hy + self.bd_y
            # [0] (not squeeze) so a one-word vocabulary still yields a 1-D row.
            outputs[t] = _softmax(logits.reshape(1, -1))[0]

        return outputs

    def forward(
        self,
        input_sequence: np.ndarray,
        output_len: int | None = None,
    ) -> np.ndarray:
        """
        Encode input_sequence and decode to output_len tokens.

        Parameters
        ----------
        input_sequence : array-like (seq_len, input_vocab_size)
            One-hot encoded input.
        output_len : int or None
            Target sequence length. Defaults to len(input_sequence).

        Returns
        -------
        outputs : ndarray (output_len, output_vocab_size)
        """
        input_sequence = np.asarray(input_sequence)
        if output_len is None:
            output_len = len(input_sequence)
        context = self._encode(input_sequence)
        return self._decode(context, output_len)

    def predict_sequence(self, input_sequence: np.ndarray) -> np.ndarray:
        """
        Predict token indices for an input sequence.

        Returns
        -------
        ndarray of shape (output_len,) — integer token indices
        """
        return np.argmax(self.forward(input_sequence), axis=1)
|
|
@@ -0,0 +1,38 @@
|
|
|
1
|
+
"""
|
|
2
|
+
mlscratch.preprocessing
|
|
3
|
+
=========================
|
|
4
|
+
Feature scaling, categorical encoding, polynomial feature expansion,
|
|
5
|
+
and train/test splitting utilities — pure numpy, sklearn-familiar API.
|
|
6
|
+
|
|
7
|
+
Scalers
|
|
8
|
+
-------
|
|
9
|
+
StandardScaler, MinMaxScaler, RobustScaler, Normalizer
|
|
10
|
+
|
|
11
|
+
Encoders
|
|
12
|
+
--------
|
|
13
|
+
LabelEncoder, OneHotEncoder
|
|
14
|
+
|
|
15
|
+
Feature expansion
|
|
16
|
+
------------------
|
|
17
|
+
PolynomialFeatures
|
|
18
|
+
|
|
19
|
+
Splitting
|
|
20
|
+
---------
|
|
21
|
+
train_test_split
|
|
22
|
+
"""
|
|
23
|
+
|
|
24
|
+
from .encoders import LabelEncoder, OneHotEncoder # noqa: F401
|
|
25
|
+
from .model_selection import train_test_split # noqa: F401
|
|
26
|
+
from .polynomial import PolynomialFeatures # noqa: F401
|
|
27
|
+
from .scalers import MinMaxScaler, Normalizer, RobustScaler, StandardScaler # noqa: F401
|
|
28
|
+
|
|
29
|
+
__all__ = [
|
|
30
|
+
"StandardScaler",
|
|
31
|
+
"MinMaxScaler",
|
|
32
|
+
"RobustScaler",
|
|
33
|
+
"Normalizer",
|
|
34
|
+
"LabelEncoder",
|
|
35
|
+
"OneHotEncoder",
|
|
36
|
+
"PolynomialFeatures",
|
|
37
|
+
"train_test_split",
|
|
38
|
+
]
|