scratchkit 0.2.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (68) hide show
  1. mlscratch/__init__.py +56 -0
  2. mlscratch/__main__.py +118 -0
  3. mlscratch/bayesian/__init__.py +53 -0
  4. mlscratch/bayesian/bayesian_linear_regression.py +171 -0
  5. mlscratch/bayesian/bayesian_network.py +248 -0
  6. mlscratch/bayesian/bayesian_nn.py +315 -0
  7. mlscratch/bayesian/gaussian_process.py +207 -0
  8. mlscratch/bayesian/hmm.py +277 -0
  9. mlscratch/bayesian/init.py +52 -0
  10. mlscratch/bayesian/kalman_filter.py +182 -0
  11. mlscratch/bayesian/naive_bayes.py +209 -0
  12. mlscratch/metrics/__init__.py +59 -0
  13. mlscratch/metrics/classification.py +365 -0
  14. mlscratch/metrics/regression.py +79 -0
  15. mlscratch/neural/__init__.py +121 -0
  16. mlscratch/neural/attention.py +420 -0
  17. mlscratch/neural/autoencoder.py +543 -0
  18. mlscratch/neural/boltzmann.py +231 -0
  19. mlscratch/neural/cnn.py +593 -0
  20. mlscratch/neural/cvnn.py +322 -0
  21. mlscratch/neural/gan.py +364 -0
  22. mlscratch/neural/hopfield.py +193 -0
  23. mlscratch/neural/perceptron.py +398 -0
  24. mlscratch/neural/rbf_network.py +230 -0
  25. mlscratch/neural/recurrent.py +569 -0
  26. mlscratch/preprocessing/__init__.py +38 -0
  27. mlscratch/preprocessing/encoders.py +140 -0
  28. mlscratch/preprocessing/model_selection.py +119 -0
  29. mlscratch/preprocessing/polynomial.py +105 -0
  30. mlscratch/preprocessing/scalers.py +220 -0
  31. mlscratch/py.typed +0 -0
  32. mlscratch/reinforcement/__init__.py +59 -0
  33. mlscratch/reinforcement/ddpg.py +363 -0
  34. mlscratch/reinforcement/dqn.py +319 -0
  35. mlscratch/reinforcement/ppo.py +452 -0
  36. mlscratch/reinforcement/q_learning.py +352 -0
  37. mlscratch/reinforcement/sac.py +382 -0
  38. mlscratch/reinforcement/utils.py +594 -0
  39. mlscratch/supervised/__init__.py +76 -0
  40. mlscratch/supervised/_validation.py +50 -0
  41. mlscratch/supervised/adaboost.py +255 -0
  42. mlscratch/supervised/decision_tree.py +495 -0
  43. mlscratch/supervised/gradient_boosting.py +354 -0
  44. mlscratch/supervised/knn.py +234 -0
  45. mlscratch/supervised/lasso_regression.py +125 -0
  46. mlscratch/supervised/linear_models.py +459 -0
  47. mlscratch/supervised/linear_regression.py +197 -0
  48. mlscratch/supervised/logistic_regression.py +119 -0
  49. mlscratch/supervised/naive_bayes.py +113 -0
  50. mlscratch/supervised/random_forest.py +321 -0
  51. mlscratch/supervised/ridge_regression.py +93 -0
  52. mlscratch/supervised/svm.py +356 -0
  53. mlscratch/unsupervised/__init__.py +39 -0
  54. mlscratch/unsupervised/apriori.py +178 -0
  55. mlscratch/unsupervised/dbscan.py +141 -0
  56. mlscratch/unsupervised/gmm.py +204 -0
  57. mlscratch/unsupervised/hierarchical_clustering.py +137 -0
  58. mlscratch/unsupervised/ica.py +167 -0
  59. mlscratch/unsupervised/kmeans.py +135 -0
  60. mlscratch/unsupervised/kmedoids.py +133 -0
  61. mlscratch/unsupervised/pca.py +103 -0
  62. mlscratch/unsupervised/tsne.py +200 -0
  63. scratchkit-0.2.0.dist-info/METADATA +241 -0
  64. scratchkit-0.2.0.dist-info/RECORD +68 -0
  65. scratchkit-0.2.0.dist-info/WHEEL +5 -0
  66. scratchkit-0.2.0.dist-info/entry_points.txt +2 -0
  67. scratchkit-0.2.0.dist-info/licenses/LICENSE +201 -0
  68. scratchkit-0.2.0.dist-info/top_level.txt +1 -0
@@ -0,0 +1,363 @@
1
+ """
2
+ Deep Deterministic Policy Gradient (DDPG) and TD3
3
+ ===================================================
4
+ DDPG (Lillicrap et al., 2015) extends DQN to continuous action spaces:
5
+ - Deterministic actor π_θ(s) → a (output activation: tanh → scaled)
6
+ - Critic Q_φ(s, a) approximates action-value function
7
+ - Target networks (soft update) for both actor and critic
8
+ - Ornstein-Uhlenbeck or Gaussian noise for exploration
9
+
10
+ TD3 — Twin Delayed Deep Deterministic Policy Gradient (Fujimoto et al., 2018)
11
+ -----------------------------------------------------------------------
12
+ Three key improvements over DDPG:
13
+ 1. Twin critics — two independent Q-networks; use min for targets
14
+ 2. Delayed policy update — actor updated every `policy_delay` critic steps
15
+ 3. Target policy noise — smoothed noisy targets prevent over-fitting to peaks
16
+
17
+ Update equations
18
+ ----------------
19
+ Critic targets (TD3):
20
+ ã = π_θ'(s') + clip(N(0,σ̃), -c, c) # smoothed target action
21
+ y = r + γ(1-d) min(Q_1'(s',ã), Q_2'(s',ã))
22
+
23
+ Actor loss (DDPG / TD3):
24
+ L_π = -E[Q_1(s, π_θ(s))]
25
+
26
+ Only numpy and Python stdlib are used.
27
+ """
28
+
29
+ from __future__ import annotations
30
+ import numpy as np
31
+ from .utils import ReplayBuffer, MLP, OrnsteinUhlenbeckNoise, GaussianNoise
32
+
33
+
34
+ # ============================================================
35
+ # DDPG
36
+ # ============================================================
37
+
38
+ class DDPG:
39
+ """
40
+ Deep Deterministic Policy Gradient agent.
41
+
42
+ Parameters
43
+ ----------
44
+ state_dim : int
45
+ action_dim : int
46
+ action_low : float lower bound of action space
47
+ action_high : float upper bound of action space
48
+ hidden_sizes : list[int]
49
+ actor_lr : float
50
+ critic_lr : float
51
+ gamma : float
52
+ tau : float soft update coefficient
53
+ buffer_capacity : int
54
+ batch_size : int
55
+ noise_type : str 'ou' | 'gaussian'
56
+ noise_sigma : float exploration noise scale
57
+ warmup_steps : int random actions before learning starts
58
+ random_state : int | None
59
+ """
60
+
61
+ def __init__(
62
+ self,
63
+ state_dim: int,
64
+ action_dim: int,
65
+ action_low: float = -1.0,
66
+ action_high: float = 1.0,
67
+ hidden_sizes: list[int] | None = None,
68
+ actor_lr: float = 1e-3,
69
+ critic_lr: float = 1e-3,
70
+ gamma: float = 0.99,
71
+ tau: float = 0.005,
72
+ buffer_capacity: int = 100_000,
73
+ batch_size: int = 64,
74
+ noise_type: str = "ou",
75
+ noise_sigma: float = 0.1,
76
+ warmup_steps: int = 1000,
77
+ random_state: int | None = None,
78
+ ):
79
+ self.action_dim = action_dim
80
+ self.action_low = action_low
81
+ self.action_high = action_high
82
+ self.gamma = gamma
83
+ self.tau = tau
84
+ self.batch_size = batch_size
85
+ self.warmup_steps = warmup_steps
86
+ self._rng = np.random.default_rng(random_state)
87
+ self._step = 0
88
+
89
+ hidden = hidden_sizes or [256, 256]
90
+ act_scale = (action_high - action_low) / 2.0
91
+ act_bias = (action_high + action_low) / 2.0
92
+ self._act_scale = act_scale
93
+ self._act_bias = act_bias
94
+
95
+ # Actor: s → a (tanh output scaled to [low, high])
96
+ self.actor = MLP([state_dim] + hidden + [action_dim],
97
+ output_activation="tanh", lr=actor_lr,
98
+ random_state=random_state)
99
+ self.actor_target = MLP([state_dim] + hidden + [action_dim],
100
+ output_activation="tanh", lr=actor_lr,
101
+ random_state=random_state)
102
+ self.actor.hard_update(self.actor_target)
103
+
104
+ # Critic: (s, a) → Q
105
+ self.critic = MLP([state_dim + action_dim] + hidden + [1],
106
+ output_activation="linear", lr=critic_lr,
107
+ random_state=random_state)
108
+ self.critic_target = MLP([state_dim + action_dim] + hidden + [1],
109
+ output_activation="linear", lr=critic_lr,
110
+ random_state=random_state)
111
+ self.critic.hard_update(self.critic_target)
112
+
113
+ # Replay
114
+ self.buffer = ReplayBuffer(buffer_capacity)
115
+
116
+ # Exploration noise
117
+ if noise_type == "ou":
118
+ self.noise = OrnsteinUhlenbeckNoise(
119
+ action_dim, sigma=noise_sigma, random_state=random_state
120
+ )
121
+ else:
122
+ self.noise = GaussianNoise(action_dim, sigma=noise_sigma,
123
+ random_state=random_state)
124
+
125
+ # Logging
126
+ self.actor_losses_: list[float] = []
127
+ self.critic_losses_: list[float] = []
128
+ self.episode_rewards_: list[float] = []
129
+
130
+ # ------------------------------------------------------------------
131
+ # Action
132
+ # ------------------------------------------------------------------
133
+
134
+ def _scale_action(self, a_tanh: np.ndarray) -> np.ndarray:
135
+ return a_tanh * self._act_scale + self._act_bias
136
+
137
+ def select_action(self, state: np.ndarray, add_noise: bool = True) -> np.ndarray:
138
+ a = self.actor.forward(state) # tanh in [-1,1]
139
+ if add_noise:
140
+ a = a + self.noise.sample()
141
+ a = np.clip(a, -1.0, 1.0)
142
+ return self._scale_action(a)
143
+
144
+ # ------------------------------------------------------------------
145
+ # Learning
146
+ # ------------------------------------------------------------------
147
+
148
+ def _learn(self) -> tuple[float, float] | tuple[None, None]:
149
+ if len(self.buffer) < self.batch_size:
150
+ return None, None
151
+
152
+ states, actions, rewards, next_states, dones = \
153
+ self.buffer.sample(self.batch_size, self._rng)
154
+
155
+ # ── Critic update ──────────────────────────────────────────────
156
+ # Target action from actor_target
157
+ a_next = self.actor_target.forward(next_states) # (B, A_dim)
158
+ a_next = np.clip(a_next, -1.0, 1.0)
159
+ sa_next = np.concatenate([next_states,
160
+ a_next * self._act_scale + self._act_bias], axis=1)
161
+
162
+ q_next = self.critic_target.forward(sa_next).ravel() # (B,)
163
+ y = rewards + self.gamma * (1.0 - dones) * q_next # (B,)
164
+
165
+ # Normalise stored actions back to [-1,1] for concat
166
+ a_norm = (actions - self._act_bias) / self._act_scale
167
+ sa = np.concatenate([states, actions], axis=1)
168
+ q_pred = self.critic.forward(sa, training=True).ravel() # (B,)
169
+
170
+ td_errors = y - q_pred
171
+ critic_loss = float(np.mean(td_errors ** 2))
172
+
173
+ d_critic = -2.0 * td_errors[:, np.newaxis] / self.batch_size
174
+ self.critic.backward(d_critic)
175
+
176
+ # ── Actor update ───────────────────────────────────────────────
177
+ a_pred = self.actor.forward(states, training=True) # (B, A_dim)
178
+ a_scaled = a_pred * self._act_scale + self._act_bias
179
+ sa_pred = np.concatenate([states, a_scaled], axis=1)
180
+
181
+ q_actor = self.critic.forward(sa_pred, training=True).ravel()
182
+ actor_loss = float(-np.mean(q_actor))
183
+
184
+ # dL/da = -dQ/da (chain through critic → actor)
185
+ d_q_wrt_sa = np.ones((self.batch_size, 1)) / self.batch_size
186
+ # Gradient w.r.t. action part only
187
+ d_a = d_q_wrt_sa * (-1.0) * self._act_scale # (B, A_dim)
188
+ self.actor.backward(d_a)
189
+
190
+ # ── Soft target updates ────────────────────────────────────────
191
+ self.actor.soft_update(self.actor_target, self.tau)
192
+ self.critic.soft_update(self.critic_target, self.tau)
193
+
194
+ return actor_loss, critic_loss
195
+
196
+ # ------------------------------------------------------------------
197
+ # Step
198
+ # ------------------------------------------------------------------
199
+
200
+ def step(
201
+ self,
202
+ state: np.ndarray,
203
+ action: np.ndarray,
204
+ reward: float,
205
+ next_state: np.ndarray,
206
+ done: bool,
207
+ ) -> tuple[float | None, float | None]:
208
+ self.buffer.push(state, action, reward, next_state, done)
209
+ self._step += 1
210
+
211
+ if self._step < self.warmup_steps:
212
+ return None, None
213
+
214
+ actor_loss, critic_loss = self._learn()
215
+ if actor_loss is not None:
216
+ self.actor_losses_.append(actor_loss)
217
+ self.critic_losses_.append(critic_loss)
218
+ return actor_loss, critic_loss
219
+
220
+ def train_episode(self, env) -> float:
221
+ state = env.reset(self._rng)
222
+ self.noise.reset() if hasattr(self.noise, 'reset') else None
223
+ total_reward = 0.0
224
+ done = False
225
+
226
+ while not done:
227
+ if self._step < self.warmup_steps:
228
+ action = self._rng.uniform(self.action_low, self.action_high,
229
+ self.action_dim)
230
+ else:
231
+ action = self.select_action(state)
232
+ next_state, reward, done = env.step(action)
233
+ self.step(state, action, reward, next_state, done)
234
+ state = next_state
235
+ total_reward += reward
236
+
237
+ self.episode_rewards_.append(total_reward)
238
+ return total_reward
239
+
240
+ def train(self, env, n_episodes: int) -> "DDPG":
241
+ for _ in range(n_episodes):
242
+ self.train_episode(env)
243
+ return self
244
+
245
+
246
+ # ============================================================
247
+ # TD3
248
+ # ============================================================
249
+
250
class TD3(DDPG):
    """
    Twin Delayed Deep Deterministic Policy Gradient (TD3).

    Inherits from DDPG and adds:
    - Second critic (critic2 + critic2_target)
    - Policy delay: actor updated every `policy_delay` critic steps
    - Target policy smoothing: Gaussian noise clipped to ±noise_clip

    Parameters (additional to DDPG)
    --------------------------------
    policy_delay : int    critic updates per actor update (default 2, >= 1)
    target_noise : float  std of smoothing noise on target actions
    noise_clip   : float  clipping bound for smoothing noise

    Raises
    ------
    ValueError  if ``policy_delay`` is smaller than 1.
    """

    def __init__(
        self,
        state_dim: int,
        action_dim: int,
        action_low: float = -1.0,
        action_high: float = 1.0,
        hidden_sizes: list[int] | None = None,
        actor_lr: float = 1e-3,
        critic_lr: float = 1e-3,
        gamma: float = 0.99,
        tau: float = 0.005,
        buffer_capacity: int = 100_000,
        batch_size: int = 64,
        noise_type: str = "gaussian",
        noise_sigma: float = 0.1,
        warmup_steps: int = 1000,
        policy_delay: int = 2,
        target_noise: float = 0.2,
        noise_clip: float = 0.5,
        random_state: int | None = None,
    ):
        if policy_delay < 1:
            # `_critic_steps % policy_delay` would otherwise divide by zero.
            raise ValueError("policy_delay must be >= 1")

        super().__init__(
            state_dim, action_dim, action_low, action_high, hidden_sizes,
            actor_lr, critic_lr, gamma, tau, buffer_capacity, batch_size,
            noise_type, noise_sigma, warmup_steps, random_state,
        )
        self.policy_delay = policy_delay
        self.target_noise = target_noise
        self.noise_clip = noise_clip
        self._critic_steps = 0

        hidden = list(hidden_sizes) if hidden_sizes else [256, 256]
        # The second critic needs its own seed: with the same integer seed as
        # the first critic the two networks would start identical, receive
        # identical updates, and min(Q1, Q2) would have no effect.
        critic2_seed = None if random_state is None else random_state + 1
        critic_layers = [state_dim + action_dim] + hidden + [1]
        self.critic2 = MLP(critic_layers, output_activation="linear",
                           lr=critic_lr, random_state=critic2_seed)
        self.critic2_target = MLP(critic_layers, output_activation="linear",
                                  lr=critic_lr, random_state=critic2_seed)
        self.critic2.hard_update(self.critic2_target)

    def _q1_action_gradient(
        self,
        states: np.ndarray,
        actions: np.ndarray,
        eps: float = 1e-3,
    ) -> np.ndarray:
        """
        Estimate dQ_1/da of the first online critic by central differences.

        Returns a (B, action_dim) array; only ``critic.forward`` is used.
        """
        states = np.asarray(states, dtype=float)
        actions = np.asarray(actions, dtype=float)
        n_act = actions.shape[1]
        grad = np.empty_like(actions)
        for j in range(n_act):
            shift = np.zeros(n_act)
            shift[j] = eps
            q_plus = self.critic.forward(
                np.concatenate([states, actions + shift], axis=1)).ravel()
            q_minus = self.critic.forward(
                np.concatenate([states, actions - shift], axis=1)).ravel()
            grad[:, j] = (q_plus - q_minus) / (2.0 * eps)
        return grad

    def _learn(self) -> tuple[float | None, float | None]:
        """
        Update both critics and, every ``policy_delay`` calls, the actor.

        Returns
        -------
        (actor_loss, critic_loss). ``actor_loss`` is None on steps where the
        actor is not updated; both are None while the buffer holds fewer than
        ``batch_size`` transitions. ``critic_loss`` is the mean of the two
        critics' MSE losses.
        """
        if len(self.buffer) < self.batch_size:
            return None, None

        states, actions, rewards, next_states, dones = \
            self.buffer.sample(self.batch_size, self._rng)
        # Flatten so a (B, 1) layout cannot broadcast targets into (B, B).
        rewards = np.asarray(rewards, dtype=float).ravel()
        dones = np.asarray(dones, dtype=float).ravel()

        self._critic_steps += 1

        # ── Target action with smoothing noise ────────────────────────
        a_next = self.actor_target.forward(next_states)
        noise = np.clip(
            self._rng.normal(0, self.target_noise, a_next.shape),
            -self.noise_clip, self.noise_clip
        )
        a_next = np.clip(a_next + noise, -1.0, 1.0)
        a_next_scaled = a_next * self._act_scale + self._act_bias
        sa_next = np.concatenate([next_states, a_next_scaled], axis=1)

        # ── Twin critics targets (take min) ───────────────────────────
        q1_next = self.critic_target.forward(sa_next).ravel()
        q2_next = self.critic2_target.forward(sa_next).ravel()
        y = rewards + self.gamma * (1.0 - dones) * np.minimum(q1_next, q2_next)

        # ── Update both critics ───────────────────────────────────────
        sa = np.concatenate([states, actions], axis=1)
        critic_losses = []
        for net in (self.critic, self.critic2):
            td = y - net.forward(sa, training=True).ravel()
            critic_losses.append(float(np.mean(td ** 2)))
            # d/dq of mean((y - q)^2)
            net.backward(-2.0 * td[:, np.newaxis] / td.shape[0])
        critic_loss = (critic_losses[0] + critic_losses[1]) / 2.0

        actor_loss = None

        # ── Delayed actor update ──────────────────────────────────────
        if self._critic_steps % self.policy_delay == 0:
            a_pred = self.actor.forward(states, training=True)
            a_scaled = a_pred * self._act_scale + self._act_bias
            sa_pred = np.concatenate([states, a_scaled], axis=1)
            q_actor = self.critic.forward(sa_pred).ravel()
            actor_loss = float(-np.mean(q_actor))

            # L = -mean(Q1(s, scale*a + bias)) => dL/da = -(dQ1/da_scaled)*scale/B
            dq_da = self._q1_action_gradient(states, a_scaled)
            self.actor.backward(
                -dq_da * self._act_scale / q_actor.shape[0])
            self.actor.soft_update(self.actor_target, self.tau)

        self.critic.soft_update(self.critic_target, self.tau)
        self.critic2.soft_update(self.critic2_target, self.tau)

        return actor_loss, critic_loss
@@ -0,0 +1,319 @@
1
+ """
2
+ Deep Q-Network (DQN)
3
+ =====================
4
+ Neural-network function approximator for Q-learning, with three
5
+ production-grade enhancements:
6
+
7
+ 1. Experience Replay — breaks temporal correlations (Mnih et al., 2013)
8
+ 2. Target Network — stabilises training targets (Mnih et al., 2015)
9
+ 3. Double DQN — removes maximisation bias (van Hasselt et al., 2016)
10
+
11
+ Optional:
12
+ 4. Dueling Network — separate V(s) and A(s,a) streams
13
+ (Wang et al., 2016)
14
+ 5. Prioritised Replay — focuses on high-TD-error transitions
15
+ (Schaul et al., 2015)
16
+
17
+ Update rule (Double DQN):
18
+ a* = argmax_a Q_online(s', a)
19
+ y = r + γ (1-done) Q_target(s', a*)
20
+ L = (y - Q_online(s, a))²
21
+
22
+ Only numpy and Python stdlib are used.
23
+ """
24
+
25
+ from __future__ import annotations
26
+ import numpy as np
27
+ from copy import deepcopy
28
+
29
+ from .utils import ReplayBuffer, PrioritizedReplayBuffer, MLP
30
+
31
+
32
+ # ============================================================
33
+ # Dueling MLP
34
+ # ============================================================
35
+
36
+ class DuelingMLP:
37
+ """
38
+ Dueling network: two heads sharing a common feature trunk.
39
+
40
+ Q(s,a) = V(s) + A(s,a) - mean_a A(s,a)
41
+
42
+ Parameters
43
+ ----------
44
+ state_dim : int
45
+ n_actions : int
46
+ hidden_sizes: list[int] size of shared hidden layers
47
+ lr : float
48
+ """
49
+
50
+ def __init__(
51
+ self,
52
+ state_dim: int,
53
+ n_actions: int,
54
+ hidden_sizes: list[int] | None = None,
55
+ lr: float = 1e-3,
56
+ random_state: int | None = None,
57
+ ):
58
+ hidden_sizes = hidden_sizes or [128, 128]
59
+ self.n_actions = n_actions
60
+ rng = np.random.default_rng(random_state)
61
+
62
+ # Shared trunk
63
+ trunk_sizes = [state_dim] + hidden_sizes
64
+ self._trunk = MLP(trunk_sizes, output_activation="linear",
65
+ lr=lr, random_state=random_state)
66
+
67
+ # Value head: hidden[-1] → 1
68
+ self._value_head = MLP([hidden_sizes[-1], 64, 1],
69
+ output_activation="linear", lr=lr,
70
+ random_state=random_state)
71
+ # Advantage head: hidden[-1] → n_actions
72
+ self._adv_head = MLP([hidden_sizes[-1], 64, n_actions],
73
+ output_activation="linear", lr=lr,
74
+ random_state=random_state)
75
+
76
+ def forward(self, x: np.ndarray, training: bool = False) -> np.ndarray:
77
+ scalar = x.ndim == 1
78
+ if scalar:
79
+ x = x[np.newaxis, :]
80
+ h = self._trunk.forward(x, training=training)
81
+ V = self._value_head.forward(h, training=training) # (B,1)
82
+ A = self._adv_head.forward(h, training=training) # (B,A)
83
+ Q = V + A - A.mean(axis=1, keepdims=True)
84
+ return Q[0] if scalar else Q
85
+
86
+ def soft_update(self, target: "DuelingMLP", tau: float) -> None:
87
+ self._trunk.soft_update(target._trunk, tau)
88
+ self._value_head.soft_update(target._value_head, tau)
89
+ self._adv_head.soft_update(target._adv_head, tau)
90
+
91
+ def hard_update(self, target: "DuelingMLP") -> None:
92
+ self._trunk.hard_update(target._trunk)
93
+ self._value_head.hard_update(target._value_head)
94
+ self._adv_head.hard_update(target._adv_head)
95
+
96
+ def copy_weights_from(self, source: "DuelingMLP") -> None:
97
+ self._trunk.copy_weights_from(source._trunk)
98
+ self._value_head.copy_weights_from(source._value_head)
99
+ self._adv_head.copy_weights_from(source._adv_head)
100
+
101
+ def backward(self, d_out: np.ndarray) -> None:
102
+ # Simplified backward: treat Q output as direct loss gradient
103
+ # into the advantage head (standard approach for DQN)
104
+ d_A = d_out - d_out.mean(axis=1, keepdims=True)
105
+ d_V = d_out.mean(axis=1, keepdims=True) * np.ones((d_out.shape[0], 1))
106
+ self._adv_head.backward(d_A)
107
+ self._value_head.backward(d_V)
108
+ # Trunk gradient = sum of both heads (simplified)
109
+ self._trunk.backward(d_out.mean(axis=1, keepdims=True) *
110
+ np.ones((d_out.shape[0],
111
+ self._trunk.layer_sizes[-1])))
112
+
113
+
114
+ # ============================================================
115
+ # DQN Agent
116
+ # ============================================================
117
+
118
+ class DQN:
119
+ """
120
+ Deep Q-Network agent.
121
+
122
+ Parameters
123
+ ----------
124
+ state_dim : int
125
+ n_actions : int
126
+ hidden_sizes : list[int]
127
+ lr : float learning rate
128
+ gamma : float discount factor
129
+ epsilon : float initial exploration ε
130
+ epsilon_min : float minimum ε
131
+ epsilon_decay : float multiplicative decay per step
132
+ batch_size : int
133
+ buffer_capacity : int
134
+ target_update : int hard target update every N steps
135
+ tau : float | None soft update coeff; None → hard update
136
+ double_dqn : bool use Double DQN
137
+ dueling : bool use Dueling Network
138
+ prioritized : bool use Prioritised Replay
139
+ random_state : int | None
140
+ """
141
+
142
+ def __init__(
143
+ self,
144
+ state_dim: int,
145
+ n_actions: int,
146
+ hidden_sizes: list[int] | None = None,
147
+ lr: float = 1e-3,
148
+ gamma: float = 0.99,
149
+ epsilon: float = 1.0,
150
+ epsilon_min: float = 0.01,
151
+ epsilon_decay: float = 0.995,
152
+ batch_size: int = 64,
153
+ buffer_capacity: int = 50_000,
154
+ target_update: int = 100,
155
+ tau: float | None = None,
156
+ double_dqn: bool = True,
157
+ dueling: bool = False,
158
+ prioritized: bool = False,
159
+ random_state: int | None = None,
160
+ ):
161
+ self.n_actions = n_actions
162
+ self.gamma = gamma
163
+ self.epsilon = epsilon
164
+ self.epsilon_min = epsilon_min
165
+ self.epsilon_decay = epsilon_decay
166
+ self.batch_size = batch_size
167
+ self.target_update = target_update
168
+ self.tau = tau
169
+ self.double_dqn = double_dqn
170
+ self.prioritized = prioritized
171
+ self._rng = np.random.default_rng(random_state)
172
+ self._step = 0
173
+
174
+ hidden = hidden_sizes or [128, 128]
175
+
176
+ # Online and target networks
177
+ if dueling:
178
+ self.online_net = DuelingMLP(state_dim, n_actions, hidden, lr, random_state)
179
+ self.target_net = DuelingMLP(state_dim, n_actions, hidden, lr, random_state)
180
+ else:
181
+ self.online_net = MLP([state_dim] + hidden + [n_actions],
182
+ output_activation="linear", lr=lr,
183
+ random_state=random_state)
184
+ self.target_net = MLP([state_dim] + hidden + [n_actions],
185
+ output_activation="linear", lr=lr,
186
+ random_state=random_state)
187
+
188
+ # Sync target = online at init
189
+ self.online_net.hard_update(self.target_net)
190
+
191
+ # Replay buffer
192
+ if prioritized:
193
+ self.buffer = PrioritizedReplayBuffer(buffer_capacity)
194
+ else:
195
+ self.buffer = ReplayBuffer(buffer_capacity)
196
+
197
+ # Logging
198
+ self.losses_: list[float] = []
199
+ self.episode_rewards_: list[float] = []
200
+ self.epsilons_: list[float] = []
201
+
202
+ # ------------------------------------------------------------------
203
+ # Action selection
204
+ # ------------------------------------------------------------------
205
+
206
+ def select_action(self, state: np.ndarray, greedy: bool = False) -> int:
207
+ """ε-greedy action with linear annealing."""
208
+ if not greedy and self._rng.random() < self.epsilon:
209
+ return int(self._rng.integers(self.n_actions))
210
+ q = self.online_net.forward(state)
211
+ return int(np.argmax(q))
212
+
213
+ # ------------------------------------------------------------------
214
+ # Learning step
215
+ # ------------------------------------------------------------------
216
+
217
+ def _learn(self) -> float | None:
218
+ if len(self.buffer) < self.batch_size:
219
+ return None
220
+
221
+ if self.prioritized:
222
+ states, actions, rewards, next_states, dones, weights, idxs = \
223
+ self.buffer.sample(self.batch_size, self._rng)
224
+ else:
225
+ states, actions, rewards, next_states, dones = \
226
+ self.buffer.sample(self.batch_size, self._rng)
227
+ weights = np.ones(self.batch_size)
228
+
229
+ actions = actions.ravel().astype(int)
230
+
231
+ # Compute targets
232
+ with_no_grad = True # conceptually; we don't call backward on target_net
233
+ q_next_target = self.target_net.forward(next_states) # (B, A)
234
+
235
+ if self.double_dqn:
236
+ q_next_online = self.online_net.forward(next_states)
237
+ a_star = np.argmax(q_next_online, axis=1) # online selects
238
+ q_next_val = q_next_target[np.arange(self.batch_size), a_star]
239
+ else:
240
+ q_next_val = q_next_target.max(axis=1)
241
+
242
+ targets = rewards + self.gamma * (1.0 - dones) * q_next_val # (B,)
243
+
244
+ # Compute predictions and loss gradient
245
+ q_pred_all = self.online_net.forward(states, training=True) # (B, A)
246
+ q_pred = q_pred_all[np.arange(self.batch_size), actions] # (B,)
247
+
248
+ td_errors = targets - q_pred # (B,)
249
+ loss = float(np.mean(weights * td_errors ** 2))
250
+
251
+ # Gradient: dL/dQ_pred = -2 * w * td_error (averaged in backward)
252
+ d_out = np.zeros_like(q_pred_all)
253
+ d_out[np.arange(self.batch_size), actions] = (
254
+ -2.0 * weights * td_errors / self.batch_size
255
+ )
256
+ self.online_net.backward(d_out)
257
+
258
+ # Update priorities
259
+ if self.prioritized:
260
+ self.buffer.update_priorities(idxs, td_errors)
261
+
262
+ return loss
263
+
264
+ # ------------------------------------------------------------------
265
+ # Step
266
+ # ------------------------------------------------------------------
267
+
268
+ def step(
269
+ self,
270
+ state: np.ndarray,
271
+ action: int,
272
+ reward: float,
273
+ next_state: np.ndarray,
274
+ done: bool,
275
+ ) -> float | None:
276
+ """Store transition, learn, update target, decay ε."""
277
+ self.buffer.push(state, np.array([action]), reward, next_state, done)
278
+ self._step += 1
279
+
280
+ loss = self._learn()
281
+ if loss is not None:
282
+ self.losses_.append(loss)
283
+
284
+ # Target update
285
+ if self.tau is not None:
286
+ self.online_net.soft_update(self.target_net, self.tau)
287
+ elif self._step % self.target_update == 0:
288
+ self.online_net.hard_update(self.target_net)
289
+
290
+ # Epsilon decay
291
+ self.epsilon = max(self.epsilon_min, self.epsilon * self.epsilon_decay)
292
+
293
+ return loss
294
+
295
+ # ------------------------------------------------------------------
296
+ # Episode training
297
+ # ------------------------------------------------------------------
298
+
299
+ def train_episode(self, env) -> float:
300
+ """Run one episode and return total reward."""
301
+ state = env.reset()
302
+ total_reward = 0.0
303
+ done = False
304
+
305
+ while not done:
306
+ action = self.select_action(state)
307
+ next_state, reward, done = env.step(action)
308
+ self.step(state, action, reward, next_state, done)
309
+ state = next_state
310
+ total_reward += reward
311
+
312
+ self.episode_rewards_.append(total_reward)
313
+ self.epsilons_.append(self.epsilon)
314
+ return total_reward
315
+
316
+ def train(self, env, n_episodes: int) -> "DQN":
317
+ for _ in range(n_episodes):
318
+ self.train_episode(env)
319
+ return self