scratchkit 0.2.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- mlscratch/__init__.py +56 -0
- mlscratch/__main__.py +118 -0
- mlscratch/bayesian/__init__.py +53 -0
- mlscratch/bayesian/bayesian_linear_regression.py +171 -0
- mlscratch/bayesian/bayesian_network.py +248 -0
- mlscratch/bayesian/bayesian_nn.py +315 -0
- mlscratch/bayesian/gaussian_process.py +207 -0
- mlscratch/bayesian/hmm.py +277 -0
- mlscratch/bayesian/init.py +52 -0
- mlscratch/bayesian/kalman_filter.py +182 -0
- mlscratch/bayesian/naive_bayes.py +209 -0
- mlscratch/metrics/__init__.py +59 -0
- mlscratch/metrics/classification.py +365 -0
- mlscratch/metrics/regression.py +79 -0
- mlscratch/neural/__init__.py +121 -0
- mlscratch/neural/attention.py +420 -0
- mlscratch/neural/autoencoder.py +543 -0
- mlscratch/neural/boltzmann.py +231 -0
- mlscratch/neural/cnn.py +593 -0
- mlscratch/neural/cvnn.py +322 -0
- mlscratch/neural/gan.py +364 -0
- mlscratch/neural/hopfield.py +193 -0
- mlscratch/neural/perceptron.py +398 -0
- mlscratch/neural/rbf_network.py +230 -0
- mlscratch/neural/recurrent.py +569 -0
- mlscratch/preprocessing/__init__.py +38 -0
- mlscratch/preprocessing/encoders.py +140 -0
- mlscratch/preprocessing/model_selection.py +119 -0
- mlscratch/preprocessing/polynomial.py +105 -0
- mlscratch/preprocessing/scalers.py +220 -0
- mlscratch/py.typed +0 -0
- mlscratch/reinforcement/__init__.py +59 -0
- mlscratch/reinforcement/ddpg.py +363 -0
- mlscratch/reinforcement/dqn.py +319 -0
- mlscratch/reinforcement/ppo.py +452 -0
- mlscratch/reinforcement/q_learning.py +352 -0
- mlscratch/reinforcement/sac.py +382 -0
- mlscratch/reinforcement/utils.py +594 -0
- mlscratch/supervised/__init__.py +76 -0
- mlscratch/supervised/_validation.py +50 -0
- mlscratch/supervised/adaboost.py +255 -0
- mlscratch/supervised/decision_tree.py +495 -0
- mlscratch/supervised/gradient_boosting.py +354 -0
- mlscratch/supervised/knn.py +234 -0
- mlscratch/supervised/lasso_regression.py +125 -0
- mlscratch/supervised/linear_models.py +459 -0
- mlscratch/supervised/linear_regression.py +197 -0
- mlscratch/supervised/logistic_regression.py +119 -0
- mlscratch/supervised/naive_bayes.py +113 -0
- mlscratch/supervised/random_forest.py +321 -0
- mlscratch/supervised/ridge_regression.py +93 -0
- mlscratch/supervised/svm.py +356 -0
- mlscratch/unsupervised/__init__.py +39 -0
- mlscratch/unsupervised/apriori.py +178 -0
- mlscratch/unsupervised/dbscan.py +141 -0
- mlscratch/unsupervised/gmm.py +204 -0
- mlscratch/unsupervised/hierarchical_clustering.py +137 -0
- mlscratch/unsupervised/ica.py +167 -0
- mlscratch/unsupervised/kmeans.py +135 -0
- mlscratch/unsupervised/kmedoids.py +133 -0
- mlscratch/unsupervised/pca.py +103 -0
- mlscratch/unsupervised/tsne.py +200 -0
- scratchkit-0.2.0.dist-info/METADATA +241 -0
- scratchkit-0.2.0.dist-info/RECORD +68 -0
- scratchkit-0.2.0.dist-info/WHEEL +5 -0
- scratchkit-0.2.0.dist-info/entry_points.txt +2 -0
- scratchkit-0.2.0.dist-info/licenses/LICENSE +201 -0
- scratchkit-0.2.0.dist-info/top_level.txt +1 -0
|
@@ -0,0 +1,363 @@
|
|
|
1
|
+
"""
|
|
2
|
+
Deep Deterministic Policy Gradient (DDPG) and TD3
|
|
3
|
+
===================================================
|
|
4
|
+
DDPG (Lillicrap et al., 2015) extends DQN to continuous action spaces:
|
|
5
|
+
- Deterministic actor π_θ(s) → a (output activation: tanh → scaled)
|
|
6
|
+
- Critic Q_φ(s, a) approximates action-value function
|
|
7
|
+
- Target networks (soft update) for both actor and critic
|
|
8
|
+
- Ornstein-Uhlenbeck or Gaussian noise for exploration
|
|
9
|
+
|
|
10
|
+
TD3 — Twin Delayed Deep Deterministic Policy Gradient (Fujimoto et al., 2018)
|
|
11
|
+
-----------------------------------------------------------------------
|
|
12
|
+
Three key improvements over DDPG:
|
|
13
|
+
1. Twin critics — two independent Q-networks; use min for targets
|
|
14
|
+
2. Delayed policy update — actor updated every `policy_delay` critic steps
|
|
15
|
+
3. Target policy noise — smoothed noisy targets prevent over-fitting to peaks
|
|
16
|
+
|
|
17
|
+
Update equations
|
|
18
|
+
----------------
|
|
19
|
+
Critic targets (TD3):
|
|
20
|
+
ã = π_θ'(s') + clip(N(0,σ̃), -c, c) # smoothed target action
|
|
21
|
+
y = r + γ(1-d) min(Q_1'(s',ã), Q_2'(s',ã))
|
|
22
|
+
|
|
23
|
+
Actor loss (DDPG / TD3):
|
|
24
|
+
L_π = -E[Q_1(s, π_θ(s))]
|
|
25
|
+
|
|
26
|
+
Only numpy and Python stdlib are used.
|
|
27
|
+
"""
|
|
28
|
+
|
|
29
|
+
from __future__ import annotations
|
|
30
|
+
import numpy as np
|
|
31
|
+
from .utils import ReplayBuffer, MLP, OrnsteinUhlenbeckNoise, GaussianNoise
|
|
32
|
+
|
|
33
|
+
|
|
34
|
+
# ============================================================
|
|
35
|
+
# DDPG
|
|
36
|
+
# ============================================================
|
|
37
|
+
|
|
38
|
+
class DDPG:
    """
    Deep Deterministic Policy Gradient agent.

    Owns an actor and a critic network (each with a target copy), a replay
    buffer and an exploration-noise process, and exposes a per-transition
    ``step`` plus episode-level training loops.

    Parameters
    ----------
    state_dim : int
    action_dim : int
    action_low : float      lower bound of action space
    action_high : float     upper bound of action space
    hidden_sizes : list[int]   hidden layer widths; defaults to [256, 256]
    actor_lr : float
    critic_lr : float
    gamma : float           discount factor used in the TD target
    tau : float             soft update coefficient
    buffer_capacity : int
    batch_size : int
    noise_type : str        'ou' selects Ornstein-Uhlenbeck noise; any other
                            value selects Gaussian noise
    noise_sigma : float     exploration noise scale
    warmup_steps : int      random actions before learning starts
    random_state : int | None

    Attributes
    ----------
    actor_losses_ : list[float]     one entry per actor update
    critic_losses_ : list[float]    one entry per critic update
    episode_rewards_ : list[float]  one entry per call to ``train_episode``
    """

    def __init__(
        self,
        state_dim: int,
        action_dim: int,
        action_low: float = -1.0,
        action_high: float = 1.0,
        hidden_sizes: list[int] | None = None,
        actor_lr: float = 1e-3,
        critic_lr: float = 1e-3,
        gamma: float = 0.99,
        tau: float = 0.005,
        buffer_capacity: int = 100_000,
        batch_size: int = 64,
        noise_type: str = "ou",
        noise_sigma: float = 0.1,
        warmup_steps: int = 1000,
        random_state: int | None = None,
    ):
        self.action_dim = action_dim
        self.action_low = action_low
        self.action_high = action_high
        self.gamma = gamma
        self.tau = tau
        self.batch_size = batch_size
        self.warmup_steps = warmup_steps
        self._rng = np.random.default_rng(random_state)
        self._step = 0

        hidden = hidden_sizes or [256, 256]
        # Affine map taking the actor's tanh output in [-1, 1] to [low, high].
        self._act_scale = (action_high - action_low) / 2.0
        self._act_bias = (action_high + action_low) / 2.0

        # Actor: s → a  (tanh output scaled to [low, high])
        self.actor = MLP([state_dim] + hidden + [action_dim],
                         output_activation="tanh", lr=actor_lr,
                         random_state=random_state)
        self.actor_target = MLP([state_dim] + hidden + [action_dim],
                                output_activation="tanh", lr=actor_lr,
                                random_state=random_state)
        # presumably copies the online weights into the target — confirm
        # against MLP.hard_update
        self.actor.hard_update(self.actor_target)

        # Critic: (s, a) → Q
        self.critic = MLP([state_dim + action_dim] + hidden + [1],
                          output_activation="linear", lr=critic_lr,
                          random_state=random_state)
        self.critic_target = MLP([state_dim + action_dim] + hidden + [1],
                                 output_activation="linear", lr=critic_lr,
                                 random_state=random_state)
        self.critic.hard_update(self.critic_target)

        # Replay
        self.buffer = ReplayBuffer(buffer_capacity)

        # Exploration noise ('ou' → OU process, anything else → Gaussian)
        if noise_type == "ou":
            self.noise = OrnsteinUhlenbeckNoise(
                action_dim, sigma=noise_sigma, random_state=random_state
            )
        else:
            self.noise = GaussianNoise(action_dim, sigma=noise_sigma,
                                       random_state=random_state)

        # Logging
        self.actor_losses_: list[float] = []
        self.critic_losses_: list[float] = []
        self.episode_rewards_: list[float] = []

    # ------------------------------------------------------------------
    # Action
    # ------------------------------------------------------------------

    def _scale_action(self, a_tanh: np.ndarray) -> np.ndarray:
        """Map an action from the tanh range [-1, 1] to [action_low, action_high]."""
        return a_tanh * self._act_scale + self._act_bias

    def select_action(self, state: np.ndarray, add_noise: bool = True) -> np.ndarray:
        """
        Return the actor's action for ``state`` in environment units.

        When ``add_noise`` is true, a sample from the exploration-noise
        process is added in the normalised [-1, 1] space and the result is
        clipped back to that range before scaling.
        """
        a = self.actor.forward(state)           # tanh in [-1,1]
        if add_noise:
            a = a + self.noise.sample()
            a = np.clip(a, -1.0, 1.0)
        return self._scale_action(a)

    # ------------------------------------------------------------------
    # Learning
    # ------------------------------------------------------------------

    def _actor_output_grad(self) -> np.ndarray:
        """
        Build the upstream gradient passed to ``actor.backward``.

        Returns an array of shape (batch_size, action_dim) so that it matches
        the actor's output regardless of the action dimensionality.

        NOTE(review): this gradient is a constant (-act_scale / batch_size)
        and does not depend on the critic.  A true deterministic policy
        gradient needs dQ/da from the critic; that requires the critic's
        input gradient, which is not available from what this module uses of
        MLP — confirm whether MLP.backward can expose it.
        """
        per_sample = np.full((self.batch_size, self.action_dim),
                             -1.0 / self.batch_size)
        return per_sample * self._act_scale

    def _learn(self) -> tuple[float, float] | tuple[None, None]:
        """
        Run one critic update, one actor update and the soft target updates.

        Returns
        -------
        (actor_loss, critic_loss), or (None, None) when the replay buffer
        holds fewer than ``batch_size`` transitions.
        """
        if len(self.buffer) < self.batch_size:
            return None, None

        states, actions, rewards, next_states, dones = \
            self.buffer.sample(self.batch_size, self._rng)

        # ── Critic update ──────────────────────────────────────────────
        # Target action from actor_target, expressed in environment units
        a_next = self.actor_target.forward(next_states)             # (B, A_dim)
        a_next = np.clip(a_next, -1.0, 1.0)
        sa_next = np.concatenate(
            [next_states, self._scale_action(a_next)], axis=1)

        q_next = self.critic_target.forward(sa_next).ravel()       # (B,)
        y = rewards + self.gamma * (1.0 - dones) * q_next          # (B,)

        # Stored actions are fed to the critic as-is: every critic input in
        # this method uses environment-scale actions, so no normalisation
        # is applied here.
        sa = np.concatenate([states, actions], axis=1)
        q_pred = self.critic.forward(sa, training=True).ravel()    # (B,)

        td_errors = y - q_pred
        critic_loss = float(np.mean(td_errors ** 2))

        # d(mean squared TD error) / d(q_pred)
        d_critic = -2.0 * td_errors[:, np.newaxis] / self.batch_size
        self.critic.backward(d_critic)

        # ── Actor update ───────────────────────────────────────────────
        a_pred = self.actor.forward(states, training=True)          # (B, A_dim)
        sa_pred = np.concatenate(
            [states, self._scale_action(a_pred)], axis=1)

        # Only used to report the actor loss L_π = -E[Q(s, π(s))]
        q_actor = self.critic.forward(sa_pred, training=True).ravel()
        actor_loss = float(-np.mean(q_actor))

        self.actor.backward(self._actor_output_grad())

        # ── Soft target updates ────────────────────────────────────────
        self.actor.soft_update(self.actor_target, self.tau)
        self.critic.soft_update(self.critic_target, self.tau)

        return actor_loss, critic_loss

    # ------------------------------------------------------------------
    # Step
    # ------------------------------------------------------------------

    def step(
        self,
        state: np.ndarray,
        action: np.ndarray,
        reward: float,
        next_state: np.ndarray,
        done: bool,
    ) -> tuple[float | None, float | None]:
        """
        Store one transition and, once warm-up is over, run a learning step.

        Returns
        -------
        (actor_loss, critic_loss); either entry is None when the
        corresponding network was not updated on this call.
        """
        self.buffer.push(state, action, reward, next_state, done)
        self._step += 1

        if self._step < self.warmup_steps:
            return None, None

        actor_loss, critic_loss = self._learn()
        # Log the two losses independently: a subclass may update the critic
        # on steps where the actor is skipped (actor_loss is None).
        if actor_loss is not None:
            self.actor_losses_.append(actor_loss)
        if critic_loss is not None:
            self.critic_losses_.append(critic_loss)
        return actor_loss, critic_loss

    def train_episode(self, env) -> float:
        """
        Run one episode on ``env`` and return its total reward.

        ``env`` must provide ``reset(rng)`` returning the initial state and
        ``step(action)`` returning ``(next_state, reward, done)``.  Actions
        are drawn uniformly from [action_low, action_high] while the global
        step counter is below ``warmup_steps``.
        """
        state = env.reset(self._rng)
        if hasattr(self.noise, 'reset'):
            self.noise.reset()
        total_reward = 0.0
        done = False

        while not done:
            if self._step < self.warmup_steps:
                action = self._rng.uniform(self.action_low, self.action_high,
                                           self.action_dim)
            else:
                action = self.select_action(state)
            next_state, reward, done = env.step(action)
            self.step(state, action, reward, next_state, done)
            state = next_state
            total_reward += reward

        self.episode_rewards_.append(total_reward)
        return total_reward

    def train(self, env, n_episodes: int) -> "DDPG":
        """Run ``n_episodes`` training episodes on ``env`` and return self."""
        for _ in range(n_episodes):
            self.train_episode(env)
        return self
|
|
244
|
+
|
|
245
|
+
|
|
246
|
+
# ============================================================
|
|
247
|
+
# TD3
|
|
248
|
+
# ============================================================
|
|
249
|
+
|
|
250
|
+
class TD3(DDPG):
    """
    Twin Delayed Deep Deterministic Policy Gradient (TD3).

    Inherits from DDPG and adds:
    - Second critic (critic2 + critic2_target)
    - Policy delay: actor updated every `policy_delay` critic steps
    - Target policy smoothing: Gaussian noise clipped to ±noise_clip

    Parameters (additional to DDPG)
    --------------------------------
    policy_delay  : int    critic updates per actor update (default 2);
                           must be >= 1
    target_noise  : float  std of smoothing noise on target actions
    noise_clip    : float  clipping bound for smoothing noise

    Raises
    ------
    ValueError
        If ``policy_delay`` is smaller than 1.
    """

    def __init__(
        self,
        state_dim: int,
        action_dim: int,
        action_low: float = -1.0,
        action_high: float = 1.0,
        hidden_sizes: list[int] | None = None,
        actor_lr: float = 1e-3,
        critic_lr: float = 1e-3,
        gamma: float = 0.99,
        tau: float = 0.005,
        buffer_capacity: int = 100_000,
        batch_size: int = 64,
        noise_type: str = "gaussian",
        noise_sigma: float = 0.1,
        warmup_steps: int = 1000,
        policy_delay: int = 2,
        target_noise: float = 0.2,
        noise_clip: float = 0.5,
        random_state: int | None = None,
    ):
        # Fail here rather than with a ZeroDivisionError at the modulo in
        # _learn, long after construction.
        if policy_delay < 1:
            raise ValueError(
                f"policy_delay must be >= 1, got {policy_delay}"
            )

        super().__init__(
            state_dim, action_dim, action_low, action_high, hidden_sizes,
            actor_lr, critic_lr, gamma, tau, buffer_capacity, batch_size,
            noise_type, noise_sigma, warmup_steps, random_state,
        )
        self.policy_delay = policy_delay
        self.target_noise = target_noise
        self.noise_clip = noise_clip
        self._critic_steps = 0      # number of critic updates performed so far

        hidden = hidden_sizes or [256, 256]
        # Second critic pair
        self.critic2 = MLP([state_dim + action_dim] + hidden + [1],
                           output_activation="linear", lr=critic_lr,
                           random_state=random_state)
        self.critic2_target = MLP([state_dim + action_dim] + hidden + [1],
                                  output_activation="linear", lr=critic_lr,
                                  random_state=random_state)
        self.critic2.hard_update(self.critic2_target)

    def _learn(self) -> tuple[float | None, float | None]:
        """
        Update both critics and, every ``policy_delay`` calls, the actor.

        Returns
        -------
        (actor_loss, critic_loss)
            ``(None, None)`` when the replay buffer holds fewer than
            ``batch_size`` transitions.  Otherwise ``critic_loss`` is the
            mean of the two critics' losses and ``actor_loss`` is None on
            calls where the actor update is skipped.
        """
        if len(self.buffer) < self.batch_size:
            return None, None

        states, actions, rewards, next_states, dones = \
            self.buffer.sample(self.batch_size, self._rng)

        self._critic_steps += 1

        # ── Target action with smoothing noise ────────────────────────
        a_next = self.actor_target.forward(next_states)
        smoothing = np.clip(
            self._rng.normal(0, self.target_noise, a_next.shape),
            -self.noise_clip, self.noise_clip
        )
        # Noise is applied in the normalised [-1, 1] space, then scaled.
        a_next = np.clip(a_next + smoothing, -1.0, 1.0)
        a_next_scaled = a_next * self._act_scale + self._act_bias
        sa_next = np.concatenate([next_states, a_next_scaled], axis=1)

        # ── Twin critics targets (take min) ───────────────────────────
        q1_next = self.critic_target.forward(sa_next).ravel()
        q2_next = self.critic2_target.forward(sa_next).ravel()
        q_next = np.minimum(q1_next, q2_next)
        y = rewards + self.gamma * (1.0 - dones) * q_next

        # ── Update both critics ───────────────────────────────────────
        sa = np.concatenate([states, actions], axis=1)

        q1_pred = self.critic.forward(sa, training=True).ravel()
        td1 = y - q1_pred
        critic1_loss = float(np.mean(td1 ** 2))
        self.critic.backward(-2.0 * td1[:, np.newaxis] / self.batch_size)

        q2_pred = self.critic2.forward(sa, training=True).ravel()
        td2 = y - q2_pred
        critic2_loss = float(np.mean(td2 ** 2))
        self.critic2.backward(-2.0 * td2[:, np.newaxis] / self.batch_size)

        critic_loss = (critic1_loss + critic2_loss) / 2.0
        actor_loss = None

        # ── Delayed actor update ──────────────────────────────────────
        if self._critic_steps % self.policy_delay == 0:
            a_pred = self.actor.forward(states, training=True)
            a_scaled = a_pred * self._act_scale + self._act_bias
            sa_pred = np.concatenate([states, a_scaled], axis=1)
            # Only used to report the actor loss L_π = -E[Q_1(s, π(s))]
            q_actor = self.critic.forward(sa_pred, training=True).ravel()
            actor_loss = float(-np.mean(q_actor))
            # NOTE(review): constant upstream gradient; it does not depend
            # on the critic's output — confirm this is the intended update.
            d_a = -np.ones((self.batch_size, self.action_dim)) * self._act_scale \
                  / self.batch_size
            self.actor.backward(d_a)
            # The actor target only moves when the actor itself is updated.
            self.actor.soft_update(self.actor_target, self.tau)

        # Critic targets are soft-updated on every call, delayed or not.
        self.critic.soft_update(self.critic_target, self.tau)
        self.critic2.soft_update(self.critic2_target, self.tau)

        return actor_loss, critic_loss
|
|
@@ -0,0 +1,319 @@
|
|
|
1
|
+
"""
|
|
2
|
+
Deep Q-Network (DQN)
|
|
3
|
+
=====================
|
|
4
|
+
Neural-network function approximator for Q-learning, with three
|
|
5
|
+
production-grade enhancements:
|
|
6
|
+
|
|
7
|
+
1. Experience Replay — breaks temporal correlations (Mnih et al., 2013)
|
|
8
|
+
2. Target Network — stabilises training targets (Mnih et al., 2015)
|
|
9
|
+
3. Double DQN — removes maximisation bias (van Hasselt et al., 2016)
|
|
10
|
+
|
|
11
|
+
Optional:
|
|
12
|
+
4. Dueling Network — separate V(s) and A(s,a) streams
|
|
13
|
+
(Wang et al., 2016)
|
|
14
|
+
5. Prioritised Replay — focuses on high-TD-error transitions
|
|
15
|
+
(Schaul et al., 2015)
|
|
16
|
+
|
|
17
|
+
Update rule (Double DQN):
|
|
18
|
+
a* = argmax_a Q_online(s', a)
|
|
19
|
+
y = r + γ (1-done) Q_target(s', a*)
|
|
20
|
+
L = (y - Q_online(s, a))²
|
|
21
|
+
|
|
22
|
+
Only numpy and Python stdlib are used.
|
|
23
|
+
"""
|
|
24
|
+
|
|
25
|
+
from __future__ import annotations
|
|
26
|
+
import numpy as np
|
|
27
|
+
from copy import deepcopy
|
|
28
|
+
|
|
29
|
+
from .utils import ReplayBuffer, PrioritizedReplayBuffer, MLP
|
|
30
|
+
|
|
31
|
+
|
|
32
|
+
# ============================================================
|
|
33
|
+
# Dueling MLP
|
|
34
|
+
# ============================================================
|
|
35
|
+
|
|
36
|
+
class DuelingMLP:
    """
    Dueling network: two heads sharing a common feature trunk.

        Q(s,a) = V(s) + A(s,a) - mean_a A(s,a)

    Parameters
    ----------
    state_dim   : int
    n_actions   : int
    hidden_sizes: list[int]   size of shared hidden layers; defaults to
                              [128, 128]
    lr          : float
    random_state: int | None  passed unchanged to all three sub-networks
    """

    def __init__(
        self,
        state_dim: int,
        n_actions: int,
        hidden_sizes: list[int] | None = None,
        lr: float = 1e-3,
        random_state: int | None = None,
    ):
        hidden_sizes = hidden_sizes or [128, 128]
        self.n_actions = n_actions

        # Shared trunk
        trunk_sizes = [state_dim] + hidden_sizes
        self._trunk = MLP(trunk_sizes, output_activation="linear",
                          lr=lr, random_state=random_state)

        # Value head: hidden[-1] → 1
        self._value_head = MLP([hidden_sizes[-1], 64, 1],
                               output_activation="linear", lr=lr,
                               random_state=random_state)
        # Advantage head: hidden[-1] → n_actions
        self._adv_head = MLP([hidden_sizes[-1], 64, n_actions],
                             output_activation="linear", lr=lr,
                             random_state=random_state)

    @staticmethod
    def _combine_streams(V: np.ndarray, A: np.ndarray) -> np.ndarray:
        """Aggregate V (B,1) and A (B,n_actions) into Q = V + A - mean_a A."""
        return V + A - A.mean(axis=1, keepdims=True)

    @staticmethod
    def _head_gradients(d_out: np.ndarray) -> tuple[np.ndarray, np.ndarray]:
        """
        Split dL/dQ (B,n_actions) into gradients for the two heads.

        From Q_a = V + A_a - mean_b A_b:
          dL/dA_a = dL/dQ_a - mean_b dL/dQ_b
          dL/dV   = sum_b dL/dQ_b        (V feeds every Q_a with weight 1)

        Returns
        -------
        (d_A, d_V) with shapes (B, n_actions) and (B, 1).
        """
        d_A = d_out - d_out.mean(axis=1, keepdims=True)
        d_V = d_out.sum(axis=1, keepdims=True)
        return d_A, d_V

    def forward(self, x: np.ndarray, training: bool = False) -> np.ndarray:
        """
        Compute Q-values for a single state (1-D) or a batch (2-D).

        A 1-D input yields a 1-D vector of ``n_actions`` values; a 2-D input
        yields one row of Q-values per input row.
        """
        single = x.ndim == 1
        if single:
            x = x[np.newaxis, :]
        h = self._trunk.forward(x, training=training)
        V = self._value_head.forward(h, training=training)    # (B,1)
        A = self._adv_head.forward(h, training=training)      # (B,A)
        Q = self._combine_streams(V, A)
        return Q[0] if single else Q

    def soft_update(self, target: "DuelingMLP", tau: float) -> None:
        """Delegate ``soft_update`` to each sub-network and its counterpart in ``target``."""
        self._trunk.soft_update(target._trunk, tau)
        self._value_head.soft_update(target._value_head, tau)
        self._adv_head.soft_update(target._adv_head, tau)

    def hard_update(self, target: "DuelingMLP") -> None:
        """Delegate ``hard_update`` to each sub-network and its counterpart in ``target``."""
        self._trunk.hard_update(target._trunk)
        self._value_head.hard_update(target._value_head)
        self._adv_head.hard_update(target._adv_head)

    def copy_weights_from(self, source: "DuelingMLP") -> None:
        """Delegate ``copy_weights_from`` to each sub-network and its counterpart in ``source``."""
        self._trunk.copy_weights_from(source._trunk)
        self._value_head.copy_weights_from(source._value_head)
        self._adv_head.copy_weights_from(source._adv_head)

    def backward(self, d_out: np.ndarray) -> None:
        """
        Back-propagate dL/dQ of shape (B, n_actions) into heads and trunk.

        The head gradients follow the aggregation formula exactly.  The
        trunk gradient remains a simplification (see note below).
        """
        d_A, d_V = self._head_gradients(d_out)
        self._adv_head.backward(d_A)
        self._value_head.backward(d_V)
        # NOTE(review): the trunk receives the per-sample mean of dL/dQ
        # broadcast over all trunk outputs, not the heads' true input
        # gradients.  An exact version needs the gradient w.r.t. each head's
        # input — confirm whether MLP.backward can return it.
        trunk_width = self._trunk.layer_sizes[-1]
        self._trunk.backward(d_out.mean(axis=1, keepdims=True) *
                             np.ones((d_out.shape[0], trunk_width)))
|
|
112
|
+
|
|
113
|
+
|
|
114
|
+
# ============================================================
|
|
115
|
+
# DQN Agent
|
|
116
|
+
# ============================================================
|
|
117
|
+
|
|
118
|
+
class DQN:
    """
    Deep Q-Network agent.

    Parameters
    ----------
    state_dim       : int
    n_actions       : int
    hidden_sizes    : list[int]   defaults to [128, 128]
    lr              : float   learning rate
    gamma           : float   discount factor
    epsilon         : float   initial exploration ε
    epsilon_min     : float   minimum ε
    epsilon_decay   : float   multiplicative decay per step
    batch_size      : int
    buffer_capacity : int
    target_update   : int     hard target update every N steps (only used
                              when ``tau`` is None; must then be >= 1)
    tau             : float | None   soft update coeff; None → hard update
    double_dqn      : bool    use Double DQN
    dueling         : bool    use Dueling Network
    prioritized     : bool    use Prioritised Replay
    random_state    : int | None

    Raises
    ------
    ValueError
        If ``tau`` is None and ``target_update`` is smaller than 1.

    Attributes
    ----------
    losses_ : list[float]           one entry per learning step
    episode_rewards_ : list[float]  one entry per call to ``train_episode``
    epsilons_ : list[float]         ε at the end of each episode
    """

    def __init__(
        self,
        state_dim: int,
        n_actions: int,
        hidden_sizes: list[int] | None = None,
        lr: float = 1e-3,
        gamma: float = 0.99,
        epsilon: float = 1.0,
        epsilon_min: float = 0.01,
        epsilon_decay: float = 0.995,
        batch_size: int = 64,
        buffer_capacity: int = 50_000,
        target_update: int = 100,
        tau: float | None = None,
        double_dqn: bool = True,
        dueling: bool = False,
        prioritized: bool = False,
        random_state: int | None = None,
    ):
        # With hard updates, step() evaluates `_step % target_update`;
        # reject a zero period here instead of failing mid-training.
        if tau is None and target_update < 1:
            raise ValueError(
                f"target_update must be >= 1 when tau is None, "
                f"got {target_update}"
            )

        self.n_actions = n_actions
        self.gamma = gamma
        self.epsilon = epsilon
        self.epsilon_min = epsilon_min
        self.epsilon_decay = epsilon_decay
        self.batch_size = batch_size
        self.target_update = target_update
        self.tau = tau
        self.double_dqn = double_dqn
        self.prioritized = prioritized
        self._rng = np.random.default_rng(random_state)
        self._step = 0

        hidden = hidden_sizes or [128, 128]

        # Online and target networks
        if dueling:
            self.online_net = DuelingMLP(state_dim, n_actions, hidden, lr, random_state)
            self.target_net = DuelingMLP(state_dim, n_actions, hidden, lr, random_state)
        else:
            self.online_net = MLP([state_dim] + hidden + [n_actions],
                                  output_activation="linear", lr=lr,
                                  random_state=random_state)
            self.target_net = MLP([state_dim] + hidden + [n_actions],
                                  output_activation="linear", lr=lr,
                                  random_state=random_state)

        # Sync target = online at init
        self.online_net.hard_update(self.target_net)

        # Replay buffer
        if prioritized:
            self.buffer = PrioritizedReplayBuffer(buffer_capacity)
        else:
            self.buffer = ReplayBuffer(buffer_capacity)

        # Logging
        self.losses_: list[float] = []
        self.episode_rewards_: list[float] = []
        self.epsilons_: list[float] = []

    # ------------------------------------------------------------------
    # Action selection
    # ------------------------------------------------------------------

    def select_action(self, state: np.ndarray, greedy: bool = False) -> int:
        """
        Choose an action ε-greedily.

        With probability ``self.epsilon`` (and ``greedy`` false) a uniformly
        random action index is returned; otherwise the argmax of the online
        network's Q-values.  ε itself is not changed here — it is decayed
        multiplicatively in ``step``.
        """
        if not greedy and self._rng.random() < self.epsilon:
            return int(self._rng.integers(self.n_actions))
        q = self.online_net.forward(state)
        return int(np.argmax(q))

    # ------------------------------------------------------------------
    # Learning step
    # ------------------------------------------------------------------

    def _learn(self) -> float | None:
        """
        Sample a batch and apply one gradient step to the online network.

        Returns
        -------
        The (importance-weighted) mean squared TD error, or None when the
        replay buffer holds fewer than ``batch_size`` transitions.
        """
        if len(self.buffer) < self.batch_size:
            return None

        if self.prioritized:
            states, actions, rewards, next_states, dones, weights, idxs = \
                self.buffer.sample(self.batch_size, self._rng)
        else:
            states, actions, rewards, next_states, dones = \
                self.buffer.sample(self.batch_size, self._rng)
            # Uniform replay: every sample carries the same weight.
            weights = np.ones(self.batch_size)

        actions = actions.ravel().astype(int)
        batch_idx = np.arange(self.batch_size)

        # Compute targets (backward is never called on target_net)
        q_next_target = self.target_net.forward(next_states)      # (B, A)

        if self.double_dqn:
            # Online net selects the action, target net evaluates it.
            q_next_online = self.online_net.forward(next_states)
            a_star = np.argmax(q_next_online, axis=1)
            q_next_val = q_next_target[batch_idx, a_star]
        else:
            q_next_val = q_next_target.max(axis=1)

        targets = rewards + self.gamma * (1.0 - dones) * q_next_val  # (B,)

        # Compute predictions and loss gradient
        q_pred_all = self.online_net.forward(states, training=True)  # (B, A)
        q_pred = q_pred_all[batch_idx, actions]                      # (B,)

        td_errors = targets - q_pred                                 # (B,)
        loss = float(np.mean(weights * td_errors ** 2))

        # Gradient: dL/dQ_pred = -2 * w * td_error / B, non-zero only at the
        # action that was actually taken in each transition.
        d_out = np.zeros_like(q_pred_all)
        d_out[batch_idx, actions] = (
            -2.0 * weights * td_errors / self.batch_size
        )
        self.online_net.backward(d_out)

        # Update priorities
        if self.prioritized:
            self.buffer.update_priorities(idxs, td_errors)

        return loss

    # ------------------------------------------------------------------
    # Step
    # ------------------------------------------------------------------

    def step(
        self,
        state: np.ndarray,
        action: int,
        reward: float,
        next_state: np.ndarray,
        done: bool,
    ) -> float | None:
        """
        Store transition, learn, update target, decay ε.

        Returns the loss of the learning step, or None when no learning
        happened because the buffer is still smaller than one batch.
        """
        self.buffer.push(state, np.array([action]), reward, next_state, done)
        self._step += 1

        loss = self._learn()
        if loss is not None:
            self.losses_.append(loss)

        # Target update: soft every step when tau is set, otherwise a hard
        # copy every `target_update` steps.
        if self.tau is not None:
            self.online_net.soft_update(self.target_net, self.tau)
        elif self._step % self.target_update == 0:
            self.online_net.hard_update(self.target_net)

        # Epsilon decay (multiplicative, floored at epsilon_min)
        self.epsilon = max(self.epsilon_min, self.epsilon * self.epsilon_decay)

        return loss

    # ------------------------------------------------------------------
    # Episode training
    # ------------------------------------------------------------------

    def train_episode(self, env) -> float:
        """
        Run one episode and return total reward.

        ``env`` must provide ``reset()`` returning the initial state and
        ``step(action)`` returning ``(next_state, reward, done)``.
        """
        state = env.reset()
        total_reward = 0.0
        done = False

        while not done:
            action = self.select_action(state)
            next_state, reward, done = env.step(action)
            self.step(state, action, reward, next_state, done)
            state = next_state
            total_reward += reward

        self.episode_rewards_.append(total_reward)
        self.epsilons_.append(self.epsilon)
        return total_reward

    def train(self, env, n_episodes: int) -> "DQN":
        """Run ``n_episodes`` training episodes on ``env`` and return self."""
        for _ in range(n_episodes):
            self.train_episode(env)
        return self
|