scratchkit 0.2.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- mlscratch/__init__.py +56 -0
- mlscratch/__main__.py +118 -0
- mlscratch/bayesian/__init__.py +53 -0
- mlscratch/bayesian/bayesian_linear_regression.py +171 -0
- mlscratch/bayesian/bayesian_network.py +248 -0
- mlscratch/bayesian/bayesian_nn.py +315 -0
- mlscratch/bayesian/gaussian_process.py +207 -0
- mlscratch/bayesian/hmm.py +277 -0
- mlscratch/bayesian/init.py +52 -0
- mlscratch/bayesian/kalman_filter.py +182 -0
- mlscratch/bayesian/naive_bayes.py +209 -0
- mlscratch/metrics/__init__.py +59 -0
- mlscratch/metrics/classification.py +365 -0
- mlscratch/metrics/regression.py +79 -0
- mlscratch/neural/__init__.py +121 -0
- mlscratch/neural/attention.py +420 -0
- mlscratch/neural/autoencoder.py +543 -0
- mlscratch/neural/boltzmann.py +231 -0
- mlscratch/neural/cnn.py +593 -0
- mlscratch/neural/cvnn.py +322 -0
- mlscratch/neural/gan.py +364 -0
- mlscratch/neural/hopfield.py +193 -0
- mlscratch/neural/perceptron.py +398 -0
- mlscratch/neural/rbf_network.py +230 -0
- mlscratch/neural/recurrent.py +569 -0
- mlscratch/preprocessing/__init__.py +38 -0
- mlscratch/preprocessing/encoders.py +140 -0
- mlscratch/preprocessing/model_selection.py +119 -0
- mlscratch/preprocessing/polynomial.py +105 -0
- mlscratch/preprocessing/scalers.py +220 -0
- mlscratch/py.typed +0 -0
- mlscratch/reinforcement/__init__.py +59 -0
- mlscratch/reinforcement/ddpg.py +363 -0
- mlscratch/reinforcement/dqn.py +319 -0
- mlscratch/reinforcement/ppo.py +452 -0
- mlscratch/reinforcement/q_learning.py +352 -0
- mlscratch/reinforcement/sac.py +382 -0
- mlscratch/reinforcement/utils.py +594 -0
- mlscratch/supervised/__init__.py +76 -0
- mlscratch/supervised/_validation.py +50 -0
- mlscratch/supervised/adaboost.py +255 -0
- mlscratch/supervised/decision_tree.py +495 -0
- mlscratch/supervised/gradient_boosting.py +354 -0
- mlscratch/supervised/knn.py +234 -0
- mlscratch/supervised/lasso_regression.py +125 -0
- mlscratch/supervised/linear_models.py +459 -0
- mlscratch/supervised/linear_regression.py +197 -0
- mlscratch/supervised/logistic_regression.py +119 -0
- mlscratch/supervised/naive_bayes.py +113 -0
- mlscratch/supervised/random_forest.py +321 -0
- mlscratch/supervised/ridge_regression.py +93 -0
- mlscratch/supervised/svm.py +356 -0
- mlscratch/unsupervised/__init__.py +39 -0
- mlscratch/unsupervised/apriori.py +178 -0
- mlscratch/unsupervised/dbscan.py +141 -0
- mlscratch/unsupervised/gmm.py +204 -0
- mlscratch/unsupervised/hierarchical_clustering.py +137 -0
- mlscratch/unsupervised/ica.py +167 -0
- mlscratch/unsupervised/kmeans.py +135 -0
- mlscratch/unsupervised/kmedoids.py +133 -0
- mlscratch/unsupervised/pca.py +103 -0
- mlscratch/unsupervised/tsne.py +200 -0
- scratchkit-0.2.0.dist-info/METADATA +241 -0
- scratchkit-0.2.0.dist-info/RECORD +68 -0
- scratchkit-0.2.0.dist-info/WHEEL +5 -0
- scratchkit-0.2.0.dist-info/entry_points.txt +2 -0
- scratchkit-0.2.0.dist-info/licenses/LICENSE +201 -0
- scratchkit-0.2.0.dist-info/top_level.txt +1 -0
|
@@ -0,0 +1,352 @@
|
|
|
1
|
+
"""
|
|
2
|
+
Q-Learning
|
|
3
|
+
===========
|
|
4
|
+
Tabular and linear-function-approximation variants of the classic
|
|
5
|
+
off-policy TD control algorithm (Watkins & Dayan, 1992).
|
|
6
|
+
|
|
7
|
+
Tabular Q-Learning
|
|
8
|
+
------------------
|
|
9
|
+
Maintains a Q-table Q[s, a] and updates via:
|
|
10
|
+
|
|
11
|
+
Q(s,a) ← Q(s,a) + α [r + γ max_a' Q(s',a') - Q(s,a)]
|
|
12
|
+
|
|
13
|
+
Supports ε-greedy exploration; ε is multiplied by a decay factor after each episode and never drops below epsilon_min.
|
|
14
|
+
|
|
15
|
+
Linear Q-Learning (Linear Function Approximation)
|
|
16
|
+
--------------------------------------------------
|
|
17
|
+
Represents Q(s,a) = φ(s,a)^T w where φ is a hand-crafted feature
|
|
18
|
+
vector and w are learned weights — useful for larger state spaces.
|
|
19
|
+
|
|
20
|
+
All agent classes share the same select_action() / update() / train() API and expose
|
|
21
|
+
episode-level training via train_episode().
|
|
22
|
+
|
|
23
|
+
Only numpy and Python stdlib are used.
|
|
24
|
+
"""
|
|
25
|
+
|
|
26
|
+
from __future__ import annotations
|
|
27
|
+
import numpy as np
|
|
28
|
+
|
|
29
|
+
|
|
30
|
+
# ============================================================
|
|
31
|
+
# Tabular Q-Learning
|
|
32
|
+
# ============================================================
|
|
33
|
+
|
|
34
|
+
class QLearning:
    """
    Tabular Q-Learning agent with ε-greedy exploration.

    One value per (state, action) pair is stored in ``Q`` and refined
    with the off-policy TD rule

        Q(s,a) ← Q(s,a) + α [r + γ max_a' Q(s',a') - Q(s,a)]

    Parameters
    ----------
    n_states : int
        Number of rows of the Q-table.
    n_actions : int
        Number of columns of the Q-table.
    alpha : float
        Learning rate.
    gamma : float
        Discount factor.
    epsilon : float
        Initial exploration probability.
    epsilon_min : float
        Floor below which epsilon is never decayed.
    epsilon_decay : float
        Factor epsilon is multiplied by after every episode.
    random_state : int | None
        Seed handed to ``np.random.default_rng``.
    """

    def __init__(
        self,
        n_states: int,
        n_actions: int,
        alpha: float = 0.1,
        gamma: float = 0.99,
        epsilon: float = 1.0,
        epsilon_min: float = 0.01,
        epsilon_decay: float = 0.995,
        random_state: int | None = None,
    ):
        # Problem dimensions.
        self.n_states = n_states
        self.n_actions = n_actions

        # Learning hyper-parameters.
        self.alpha = alpha
        self.gamma = gamma

        # Exploration schedule.
        self.epsilon = epsilon
        self.epsilon_min = epsilon_min
        self.epsilon_decay = epsilon_decay

        # Private generator used for every random draw of this agent.
        self._rng = np.random.default_rng(random_state)

        # All action values start at zero.
        self.Q: np.ndarray = np.zeros((n_states, n_actions))

        # Per-episode history, appended to by train_episode().
        self.episode_rewards_: list[float] = []
        self.epsilons_: list[float] = []

    # ------------------------------------------------------------------
    # Action selection
    # ------------------------------------------------------------------

    def select_action(self, state: int, greedy: bool = False) -> int:
        """
        Pick an action for ``state``.

        With ``greedy=True`` the arg-max action is returned and no random
        number is drawn. Otherwise a uniform random action is returned
        with probability ``epsilon`` and the arg-max action is returned
        the rest of the time.
        """
        if greedy:
            return int(np.argmax(self.Q[state]))

        explore = self._rng.random() < self.epsilon
        if explore:
            return int(self._rng.integers(self.n_actions))
        return int(np.argmax(self.Q[state]))

    # ------------------------------------------------------------------
    # Single update step
    # ------------------------------------------------------------------

    def update(
        self,
        state: int,
        action: int,
        reward: float,
        next_state: int,
        done: bool,
    ) -> float:
        """
        Apply one Q-learning update for a single transition.

        On a terminal transition (``done`` true) the target is the bare
        reward and ``next_state`` is not looked up.

        Returns
        -------
        td_error : float
            Target minus the value of ``Q[state, action]`` before the update.
        """
        if done:
            target = reward
        else:
            best_next = np.max(self.Q[next_state])
            target = reward + self.gamma * best_next

        td_error = target - self.Q[state, action]
        self.Q[state, action] += self.alpha * td_error
        return float(td_error)

    # ------------------------------------------------------------------
    # Episode training
    # ------------------------------------------------------------------

    def train_episode(self, env) -> float:
        """
        Play one episode to termination, learning after every step.

        Parameters
        ----------
        env : object with .reset() → int and .step(a) → (int, float, bool)

        Returns
        -------
        float
            Sum of the rewards collected during the episode.
        """
        current = env.reset()
        episode_return = 0.0

        # At least one step is always taken; the loop ends when the
        # environment reports a terminal transition.
        while True:
            chosen = self.select_action(current)
            following, reward, done = env.step(chosen)
            self.update(current, chosen, reward, following, done)
            current = following
            episode_return += reward
            if done:
                break

        # Multiplicative decay, clipped at the configured floor.
        decayed = self.epsilon * self.epsilon_decay
        self.epsilon = max(self.epsilon_min, decayed)

        self.episode_rewards_.append(episode_return)
        self.epsilons_.append(self.epsilon)
        return episode_return

    def train(self, env, n_episodes: int) -> "QLearning":
        """Run ``train_episode`` ``n_episodes`` times and return ``self``."""
        for _episode in range(n_episodes):
            self.train_episode(env)
        return self

    # ------------------------------------------------------------------
    # Value / policy helpers
    # ------------------------------------------------------------------

    def value_function(self) -> np.ndarray:
        """Return V(s) = max_a Q(s,a) for every state."""
        return np.max(self.Q, axis=1)

    def policy(self) -> np.ndarray:
        """Return the greedy policy π(s) = argmax_a Q(s,a) for every state."""
        return np.argmax(self.Q, axis=1)
|
|
157
|
+
|
|
158
|
+
|
|
159
|
+
# ============================================================
|
|
160
|
+
# Double Q-Learning
|
|
161
|
+
# ============================================================
|
|
162
|
+
|
|
163
|
+
class DoubleQLearning:
    """
    Double Q-Learning (van Hasselt, 2010).

    Maintains two Q-tables, ``Q_A`` and ``Q_B``. On each update one table
    is picked at random to be learned; its own arg-max action at the next
    state is then *evaluated* with the other table, which removes the
    maximisation bias of plain Q-learning.

    Same API as QLearning.

    Parameters
    ----------
    n_states : int
        Number of rows of each Q-table.
    n_actions : int
        Number of columns of each Q-table.
    alpha : float
        Learning rate.
    gamma : float
        Discount factor.
    epsilon : float
        Initial exploration probability.
    epsilon_min : float
        Floor below which epsilon is never decayed.
    epsilon_decay : float
        Factor epsilon is multiplied by after every episode.
    random_state : int | None
        Seed handed to ``np.random.default_rng``.
    """

    def __init__(
        self,
        n_states: int,
        n_actions: int,
        alpha: float = 0.1,
        gamma: float = 0.99,
        epsilon: float = 1.0,
        epsilon_min: float = 0.01,
        epsilon_decay: float = 0.995,
        random_state: int | None = None,
    ):
        self.n_states = n_states
        self.n_actions = n_actions
        self.alpha = alpha
        self.gamma = gamma
        self.epsilon = epsilon
        self.epsilon_min = epsilon_min
        self.epsilon_decay = epsilon_decay
        self._rng = np.random.default_rng(random_state)

        # Two value tables, both initialised to zero.
        self.Q_A: np.ndarray = np.zeros((n_states, n_actions))
        self.Q_B: np.ndarray = np.zeros((n_states, n_actions))

        # Per-episode history, appended to by train_episode().
        self.episode_rewards_: list[float] = []
        self.epsilons_: list[float] = []

    @property
    def Q(self) -> np.ndarray:
        """Combined Q estimate (average of both tables), as a new array."""
        return (self.Q_A + self.Q_B) / 2.0

    def select_action(self, state: int, greedy: bool = False) -> int:
        """
        ε-greedy action selection on the averaged Q estimate.

        With ``greedy=True`` no random number is drawn and the arg-max
        action of the averaged row is returned.
        """
        if not greedy and self._rng.random() < self.epsilon:
            return int(self._rng.integers(self.n_actions))
        # Average only the row that is needed instead of building the full
        # combined table through the ``Q`` property on every step.
        row = (self.Q_A[state] + self.Q_B[state]) / 2.0
        return int(np.argmax(row))

    def update(
        self,
        state: int,
        action: int,
        reward: float,
        next_state: int,
        done: bool,
    ) -> float:
        """
        Apply one Double Q-learning update for a single transition.

        On a terminal transition (``done`` true) the target is the bare
        reward and ``next_state`` is never used to index either table,
        matching ``QLearning.update``.

        Returns
        -------
        td_error : float
            Target minus the pre-update value in the table that was learned.
        """
        # A fair coin decides which table is learned; the other evaluates.
        if self._rng.random() < 0.5:
            learner, evaluator = self.Q_A, self.Q_B
        else:
            learner, evaluator = self.Q_B, self.Q_A

        if done:
            target = reward
        else:
            # Action chosen by the learner, value read from the evaluator.
            a_star = int(np.argmax(learner[next_state]))
            target = reward + self.gamma * evaluator[next_state, a_star]

        td_error = target - learner[state, action]
        # ``learner`` aliases Q_A or Q_B, so this writes into the real table.
        learner[state, action] += self.alpha * td_error
        return float(td_error)

    def train_episode(self, env) -> float:
        """
        Run one full episode and return its total reward.

        Parameters
        ----------
        env : object with .reset() → int and .step(a) → (int, float, bool)
        """
        state = env.reset()
        total_reward = 0.0
        done = False
        while not done:
            action = self.select_action(state)
            next_state, reward, done = env.step(action)
            self.update(state, action, reward, next_state, done)
            state = next_state
            total_reward += reward

        # Multiplicative decay, clipped at the configured floor.
        self.epsilon = max(self.epsilon_min, self.epsilon * self.epsilon_decay)
        self.episode_rewards_.append(total_reward)
        self.epsilons_.append(self.epsilon)
        return total_reward

    def train(self, env, n_episodes: int) -> "DoubleQLearning":
        """Train for ``n_episodes`` episodes and return ``self``."""
        for _ in range(n_episodes):
            self.train_episode(env)
        return self

    def value_function(self) -> np.ndarray:
        """V(s) = max_a Q(s,a) computed on the averaged table."""
        return self.Q.max(axis=1)

    def policy(self) -> np.ndarray:
        """Greedy policy π(s) = argmax_a Q(s,a) on the averaged table."""
        return self.Q.argmax(axis=1)
|
|
257
|
+
|
|
258
|
+
|
|
259
|
+
# ============================================================
|
|
260
|
+
# Linear Function Approximation Q-Learning
|
|
261
|
+
# ============================================================
|
|
262
|
+
|
|
263
|
+
class LinearQLearning:
    """
    Q-Learning with linear function approximation.

        Q(s, a) ≈ φ(s, a)^T w

    Feature construction: φ(s, a) is a one-hot vector of length
    ``n_states * n_actions`` with its single 1 at index
    ``s * n_actions + a``. Works with integer state/action spaces.

    Because φ is one-hot, φ(s, a)^T w is simply ``w[s * n_actions + a]``;
    the implementation reads and writes that entry directly instead of
    building the feature vector for every lookup.

    Parameters
    ----------
    n_states : int
        Number of discrete states.
    n_actions : int
        Number of discrete actions.
    alpha : float
        Learning rate.
    gamma : float
        Discount factor.
    epsilon : float
        Initial exploration probability.
    epsilon_min : float
        Floor below which epsilon is never decayed.
    epsilon_decay : float
        Factor epsilon is multiplied by after every episode.
    random_state : int | None
        Seed handed to ``np.random.default_rng``.
    """

    def __init__(
        self,
        n_states: int,
        n_actions: int,
        alpha: float = 0.01,
        gamma: float = 0.99,
        epsilon: float = 1.0,
        epsilon_min: float = 0.01,
        epsilon_decay: float = 0.995,
        random_state: int | None = None,
    ):
        self.n_states = n_states
        self.n_actions = n_actions
        self.alpha = alpha
        self.gamma = gamma
        self.epsilon = epsilon
        self.epsilon_min = epsilon_min
        self.epsilon_decay = epsilon_decay
        self._rng = np.random.default_rng(random_state)

        # One weight per (state, action) pair, all starting at zero.
        self.n_features = n_states * n_actions
        self.w = np.zeros(self.n_features)

        # Per-episode history, appended to by train_episode().
        self.episode_rewards_: list[float] = []
        self.epsilons_: list[float] = []

    def _index(self, state: int, action: int) -> int:
        """Position of the (state, action) pair inside the weight vector."""
        return state * self.n_actions + action

    def _features(self, state: int, action: int) -> np.ndarray:
        """One-hot feature vector for (state, action) pair."""
        phi = np.zeros(self.n_features)
        phi[self._index(state, action)] = 1.0
        return phi

    def _q(self, state: int, action: int) -> float:
        """Q(state, action); equals ``w @ φ(state, action)`` for one-hot φ."""
        return float(self.w[self._index(state, action)])

    def select_action(self, state: int, greedy: bool = False) -> int:
        """
        ε-greedy action selection.

        With ``greedy=True`` no random number is drawn and the arg-max
        action for ``state`` is returned.
        """
        if not greedy and self._rng.random() < self.epsilon:
            return int(self._rng.integers(self.n_actions))
        q_vals = [self._q(state, a) for a in range(self.n_actions)]
        return int(np.argmax(q_vals))

    def update(self, state, action, reward, next_state, done) -> float:
        """
        Apply one semi-gradient Q-learning update for a single transition.

        On a terminal transition (``done`` true) the target is the bare
        reward and ``next_state`` is never looked up, matching
        ``QLearning.update``.

        Returns
        -------
        td_error : float
            Target minus the pre-update estimate of Q(state, action).
        """
        if done:
            target = reward
        else:
            q_next = max(self._q(next_state, a) for a in range(self.n_actions))
            target = reward + self.gamma * q_next

        idx = self._index(state, action)
        td_error = target - float(self.w[idx])
        # The gradient of φ^T w w.r.t. w is the one-hot φ, so only one
        # weight changes.
        self.w[idx] += self.alpha * td_error
        return float(td_error)

    def train_episode(self, env) -> float:
        """
        Run one full episode and return its total reward.

        Parameters
        ----------
        env : object with .reset() → int and .step(a) → (int, float, bool)
        """
        state = env.reset()
        total_reward = 0.0
        done = False
        while not done:
            action = self.select_action(state)
            next_state, reward, done = env.step(action)
            self.update(state, action, reward, next_state, done)
            state = next_state
            total_reward += reward

        # Multiplicative decay, clipped at the configured floor.
        self.epsilon = max(self.epsilon_min, self.epsilon * self.epsilon_decay)
        self.episode_rewards_.append(total_reward)
        self.epsilons_.append(self.epsilon)
        return total_reward

    def train(self, env, n_episodes: int) -> "LinearQLearning":
        """Train for ``n_episodes`` episodes and return ``self``."""
        for _ in range(n_episodes):
            self.train_episode(env)
        return self

    @property
    def Q(self) -> np.ndarray:
        """Recover the (n_states, n_actions) Q-table from the weight vector.

        A copy is returned, so modifying the result does not alter ``w``.
        """
        return self.w.reshape(self.n_states, self.n_actions).copy()

    def value_function(self) -> np.ndarray:
        """V(s) = max_a Q(s,a) for all states."""
        return self.Q.max(axis=1)

    def policy(self) -> np.ndarray:
        """Greedy policy: π(s) = argmax_a Q(s,a)."""
        return self.Q.argmax(axis=1)
|