PyPI - scratchkit - Versions diffs - 0.2.0__py3-none-any.whl - Mend

scratchkit 0.2.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (68) hide show

mlscratch/__init__.py +56 -0
mlscratch/__main__.py +118 -0
mlscratch/bayesian/__init__.py +53 -0
mlscratch/bayesian/bayesian_linear_regression.py +171 -0
mlscratch/bayesian/bayesian_network.py +248 -0
mlscratch/bayesian/bayesian_nn.py +315 -0
mlscratch/bayesian/gaussian_process.py +207 -0
mlscratch/bayesian/hmm.py +277 -0
mlscratch/bayesian/init.py +52 -0
mlscratch/bayesian/kalman_filter.py +182 -0
mlscratch/bayesian/naive_bayes.py +209 -0
mlscratch/metrics/__init__.py +59 -0
mlscratch/metrics/classification.py +365 -0
mlscratch/metrics/regression.py +79 -0
mlscratch/neural/__init__.py +121 -0
mlscratch/neural/attention.py +420 -0
mlscratch/neural/autoencoder.py +543 -0
mlscratch/neural/boltzmann.py +231 -0
mlscratch/neural/cnn.py +593 -0
mlscratch/neural/cvnn.py +322 -0
mlscratch/neural/gan.py +364 -0
mlscratch/neural/hopfield.py +193 -0
mlscratch/neural/perceptron.py +398 -0
mlscratch/neural/rbf_network.py +230 -0
mlscratch/neural/recurrent.py +569 -0
mlscratch/preprocessing/__init__.py +38 -0
mlscratch/preprocessing/encoders.py +140 -0
mlscratch/preprocessing/model_selection.py +119 -0
mlscratch/preprocessing/polynomial.py +105 -0
mlscratch/preprocessing/scalers.py +220 -0
mlscratch/py.typed +0 -0
mlscratch/reinforcement/__init__.py +59 -0
mlscratch/reinforcement/ddpg.py +363 -0
mlscratch/reinforcement/dqn.py +319 -0
mlscratch/reinforcement/ppo.py +452 -0
mlscratch/reinforcement/q_learning.py +352 -0
mlscratch/reinforcement/sac.py +382 -0
mlscratch/reinforcement/utils.py +594 -0
mlscratch/supervised/__init__.py +76 -0
mlscratch/supervised/_validation.py +50 -0
mlscratch/supervised/adaboost.py +255 -0
mlscratch/supervised/decision_tree.py +495 -0
mlscratch/supervised/gradient_boosting.py +354 -0
mlscratch/supervised/knn.py +234 -0
mlscratch/supervised/lasso_regression.py +125 -0
mlscratch/supervised/linear_models.py +459 -0
mlscratch/supervised/linear_regression.py +197 -0
mlscratch/supervised/logistic_regression.py +119 -0
mlscratch/supervised/naive_bayes.py +113 -0
mlscratch/supervised/random_forest.py +321 -0
mlscratch/supervised/ridge_regression.py +93 -0
mlscratch/supervised/svm.py +356 -0
mlscratch/unsupervised/__init__.py +39 -0
mlscratch/unsupervised/apriori.py +178 -0
mlscratch/unsupervised/dbscan.py +141 -0
mlscratch/unsupervised/gmm.py +204 -0
mlscratch/unsupervised/hierarchical_clustering.py +137 -0
mlscratch/unsupervised/ica.py +167 -0
mlscratch/unsupervised/kmeans.py +135 -0
mlscratch/unsupervised/kmedoids.py +133 -0
mlscratch/unsupervised/pca.py +103 -0
mlscratch/unsupervised/tsne.py +200 -0
scratchkit-0.2.0.dist-info/METADATA +241 -0
scratchkit-0.2.0.dist-info/RECORD +68 -0
scratchkit-0.2.0.dist-info/WHEEL +5 -0
scratchkit-0.2.0.dist-info/entry_points.txt +2 -0
scratchkit-0.2.0.dist-info/licenses/LICENSE +201 -0
scratchkit-0.2.0.dist-info/top_level.txt +1 -0

mlscratch/reinforcement/ddpg.py ADDED Viewed

@@ -0,0 +1,363 @@
+"""
+Deep Deterministic Policy Gradient (DDPG) and TD3
+===================================================
+DDPG (Lillicrap et al., 2015) extends DQN to continuous action spaces:
+  - Deterministic actor  π_θ(s) → a   (output activation: tanh → scaled)
+  - Critic Q_φ(s, a)     approximates action-value function
+  - Target networks (soft update) for both actor and critic
+  - Ornstein-Uhlenbeck or Gaussian noise for exploration
+TD3 — Twin Delayed Deep Deterministic Policy Gradient (Fujimoto et al., 2018)
+-----------------------------------------------------------------------
+Three key improvements over DDPG:
+  1. Twin critics          — two independent Q-networks; use min for targets
+  2. Delayed policy update — actor updated every `policy_delay` critic steps
+  3. Target policy noise   — smoothed noisy targets prevent over-fitting to peaks
+Update equations
+----------------
+Critic targets (TD3):
+    ã = π_θ'(s') + clip(N(0,σ̃), -c, c)          # smoothed target action
+    y = r + γ(1-d) min(Q_1'(s',ã), Q_2'(s',ã))
+Actor loss (DDPG / TD3):
+    L_π = -E[Q_1(s, π_θ(s))]
+Only numpy and Python stdlib are used.
+"""
+from __future__ import annotations
+import numpy as np
+from .utils import ReplayBuffer, MLP, OrnsteinUhlenbeckNoise, GaussianNoise
+# ============================================================
+# DDPG
+# ============================================================
+class DDPG:
+    """
+    Deep Deterministic Policy Gradient agent.
+    Owns an actor and a critic (each with a target copy), a replay buffer and
+    an exploration-noise process.  Actions returned by ``select_action`` and
+    stored via ``step`` are in environment units ([action_low, action_high]);
+    the actor itself emits tanh outputs in [-1, 1] that are rescaled.
+    Parameters
+    ----------
+    state_dim       : int
+    action_dim      : int
+    action_low      : float    lower bound of action space
+    action_high     : float    upper bound of action space
+    hidden_sizes    : list[int] | None   hidden layer widths; None → [256, 256]
+    actor_lr        : float
+    critic_lr       : float
+    gamma           : float    discount factor
+    tau             : float    soft update coefficient
+    buffer_capacity : int
+    batch_size      : int
+    noise_type      : str      'ou' → Ornstein-Uhlenbeck; any other value → Gaussian
+    noise_sigma     : float    exploration noise scale (in normalised [-1, 1] action units)
+    warmup_steps    : int      random actions before learning starts
+    random_state    : int | None
+    Attributes
+    ----------
+    actor_losses_    : list[float]   one entry per actor update
+    critic_losses_   : list[float]   one entry per critic update
+    episode_rewards_ : list[float]   total reward of each finished episode
+    """
+    def __init__(
+        self,
+        state_dim: int,
+        action_dim: int,
+        action_low: float = -1.0,
+        action_high: float = 1.0,
+        hidden_sizes: list[int] | None = None,
+        actor_lr: float = 1e-3,
+        critic_lr: float = 1e-3,
+        gamma: float = 0.99,
+        tau: float = 0.005,
+        buffer_capacity: int = 100_000,
+        batch_size: int = 64,
+        noise_type: str = "ou",
+        noise_sigma: float = 0.1,
+        warmup_steps: int = 1000,
+        random_state: int | None = None,
+    ):
+        self.action_dim  = action_dim
+        self.action_low  = action_low
+        self.action_high = action_high
+        self.gamma       = gamma
+        self.tau         = tau
+        self.batch_size  = batch_size
+        self.warmup_steps = warmup_steps
+        self._rng        = np.random.default_rng(random_state)
+        self._step       = 0          # number of transitions seen via step()
+        hidden = hidden_sizes or [256, 256]
+        # Affine map from tanh range [-1, 1] to [action_low, action_high]
+        self._act_scale = (action_high - action_low) / 2.0
+        self._act_bias  = (action_high + action_low) / 2.0
+        # Actor: s → a  (tanh output scaled to [low, high])
+        self.actor        = MLP([state_dim] + hidden + [action_dim],
+                                 output_activation="tanh", lr=actor_lr,
+                                 random_state=random_state)
+        self.actor_target = MLP([state_dim] + hidden + [action_dim],
+                                 output_activation="tanh", lr=actor_lr,
+                                 random_state=random_state)
+        # presumably copies the online weights into the target — confirm in utils.MLP
+        self.actor.hard_update(self.actor_target)
+        # Critic: (s, a) → Q
+        self.critic        = MLP([state_dim + action_dim] + hidden + [1],
+                                  output_activation="linear", lr=critic_lr,
+                                  random_state=random_state)
+        self.critic_target = MLP([state_dim + action_dim] + hidden + [1],
+                                  output_activation="linear", lr=critic_lr,
+                                  random_state=random_state)
+        self.critic.hard_update(self.critic_target)
+        # Replay
+        self.buffer = ReplayBuffer(buffer_capacity)
+        # Exploration noise ('ou' is the only special-cased value)
+        if noise_type == "ou":
+            self.noise = OrnsteinUhlenbeckNoise(
+                action_dim, sigma=noise_sigma, random_state=random_state
+            )
+        else:
+            self.noise = GaussianNoise(action_dim, sigma=noise_sigma,
+                                       random_state=random_state)
+        # Logging
+        self.actor_losses_: list[float] = []
+        self.critic_losses_: list[float] = []
+        self.episode_rewards_: list[float] = []
+    # ------------------------------------------------------------------
+    # Action
+    # ------------------------------------------------------------------
+    def _scale_action(self, a_tanh: np.ndarray) -> np.ndarray:
+        """Map a normalised action in [-1, 1] to environment units."""
+        return a_tanh * self._act_scale + self._act_bias
+    def select_action(self, state: np.ndarray, add_noise: bool = True) -> np.ndarray:
+        """
+        Return the actor's action for ``state`` in environment units.
+        When ``add_noise`` is true, exploration noise is added in the
+        normalised [-1, 1] space and the result is clipped before scaling.
+        """
+        a = self.actor.forward(state)                      # tanh in [-1,1]
+        if add_noise:
+            a = a + self.noise.sample()
+            a = np.clip(a, -1.0, 1.0)
+        return self._scale_action(a)
+    # ------------------------------------------------------------------
+    # Learning
+    # ------------------------------------------------------------------
+    def _learn(self) -> tuple[float, float] | tuple[None, None]:
+        """
+        Run one critic update, one actor update and the soft target updates.
+        Returns
+        -------
+        (actor_loss, critic_loss), or (None, None) while the buffer holds
+        fewer than ``batch_size`` transitions.
+        """
+        if len(self.buffer) < self.batch_size:
+            return None, None
+        states, actions, rewards, next_states, dones = \
+            self.buffer.sample(self.batch_size, self._rng)
+        # ── Critic update ──────────────────────────────────────────────
+        # Target action from actor_target
+        a_next = self.actor_target.forward(next_states)            # (B, A_dim)
+        a_next = np.clip(a_next, -1.0, 1.0)
+        sa_next = np.concatenate([next_states, self._scale_action(a_next)], axis=1)
+        q_next = self.critic_target.forward(sa_next).ravel()       # (B,)
+        y = rewards + self.gamma * (1.0 - dones) * q_next          # (B,)
+        # Stored actions are already in environment units (same units the
+        # target branch feeds the critic), so they are concatenated as-is.
+        sa = np.concatenate([states, actions], axis=1)
+        q_pred = self.critic.forward(sa, training=True).ravel()    # (B,)
+        td_errors = y - q_pred
+        critic_loss = float(np.mean(td_errors ** 2))
+        # d(mean squared TD error)/d(q_pred)
+        d_critic = -2.0 * td_errors[:, np.newaxis] / self.batch_size
+        self.critic.backward(d_critic)
+        # ── Actor update ───────────────────────────────────────────────
+        a_pred = self.actor.forward(states, training=True)         # (B, A_dim)
+        sa_pred = np.concatenate([states, self._scale_action(a_pred)], axis=1)
+        q_actor = self.critic.forward(sa_pred, training=True).ravel()
+        actor_loss = float(-np.mean(q_actor))
+        # NOTE(review): this is a constant surrogate for dL/da = -dQ/da; the
+        # critic's input gradient is never propagated into the actor.  A true
+        # deterministic policy gradient needs the critic's gradient w.r.t. its
+        # action inputs — confirm what utils.MLP.backward exposes.
+        d_a = (np.ones((self.batch_size, self.action_dim)) / self.batch_size
+               * (-1.0) * self._act_scale)                         # (B, A_dim)
+        self.actor.backward(d_a)
+        # ── Soft target updates ────────────────────────────────────────
+        self.actor.soft_update(self.actor_target, self.tau)
+        self.critic.soft_update(self.critic_target, self.tau)
+        return actor_loss, critic_loss
+    # ------------------------------------------------------------------
+    # Step
+    # ------------------------------------------------------------------
+    def step(
+        self,
+        state: np.ndarray,
+        action: np.ndarray,
+        reward: float,
+        next_state: np.ndarray,
+        done: bool,
+    ) -> tuple[float | None, float | None]:
+        """
+        Store one transition and, once warm-up is over, perform a learning step.
+        Returns
+        -------
+        (actor_loss, critic_loss); either entry is None when the
+        corresponding network was not updated on this call.
+        """
+        self.buffer.push(state, action, reward, next_state, done)
+        self._step += 1
+        if self._step < self.warmup_steps:
+            return None, None
+        actor_loss, critic_loss = self._learn()
+        # Log each loss independently: a subclass may update the critic
+        # without updating the actor on the same step.
+        if actor_loss is not None:
+            self.actor_losses_.append(actor_loss)
+        if critic_loss is not None:
+            self.critic_losses_.append(critic_loss)
+        return actor_loss, critic_loss
+    def train_episode(self, env) -> float:
+        """
+        Run one episode on ``env`` and return its total reward.
+        ``env`` must provide ``reset(rng)`` returning the initial state and
+        ``step(action)`` returning ``(next_state, reward, done)``.
+        """
+        state = env.reset(self._rng)
+        if hasattr(self.noise, "reset"):
+            self.noise.reset()
+        total_reward = 0.0
+        done = False
+        while not done:
+            if self._step < self.warmup_steps:
+                # Uniform random exploration during warm-up
+                action = self._rng.uniform(self.action_low, self.action_high,
+                                           self.action_dim)
+            else:
+                action = self.select_action(state)
+            next_state, reward, done = env.step(action)
+            self.step(state, action, reward, next_state, done)
+            state = next_state
+            total_reward += reward
+        self.episode_rewards_.append(total_reward)
+        return total_reward
+    def train(self, env, n_episodes: int) -> "DDPG":
+        """Train for ``n_episodes`` episodes and return ``self``."""
+        for _ in range(n_episodes):
+            self.train_episode(env)
+        return self
+# ============================================================
+# TD3
+# ============================================================
+class TD3(DDPG):
+    """
+    Twin Delayed Deep Deterministic Policy Gradient (TD3).
+    Inherits from DDPG and adds:
+    - Second critic (critic2 + critic2_target)
+    - Policy delay: actor updated every `policy_delay` critic steps
+    - Target policy smoothing: Gaussian noise clipped to ±noise_clip
+    Parameters (additional to DDPG)
+    --------------------------------
+    policy_delay     : int    critic updates per actor update (default 2, must be >= 1)
+    target_noise     : float  std of smoothing noise on target actions
+    noise_clip       : float  clipping bound for smoothing noise
+    Raises
+    ------
+    ValueError  if ``policy_delay`` is smaller than 1.
+    """
+    def __init__(
+        self,
+        state_dim: int,
+        action_dim: int,
+        action_low: float = -1.0,
+        action_high: float = 1.0,
+        hidden_sizes: list[int] | None = None,
+        actor_lr: float = 1e-3,
+        critic_lr: float = 1e-3,
+        gamma: float = 0.99,
+        tau: float = 0.005,
+        buffer_capacity: int = 100_000,
+        batch_size: int = 64,
+        noise_type: str = "gaussian",
+        noise_sigma: float = 0.1,
+        warmup_steps: int = 1000,
+        policy_delay: int = 2,
+        target_noise: float = 0.2,
+        noise_clip: float = 0.5,
+        random_state: int | None = None,
+    ):
+        # _learn takes `critic_steps % policy_delay`; fail early and clearly
+        # instead of with ZeroDivisionError in the middle of training.
+        if policy_delay < 1:
+            raise ValueError("policy_delay must be >= 1")
+        super().__init__(
+            state_dim, action_dim, action_low, action_high, hidden_sizes,
+            actor_lr, critic_lr, gamma, tau, buffer_capacity, batch_size,
+            noise_type, noise_sigma, warmup_steps, random_state,
+        )
+        self.policy_delay  = policy_delay
+        self.target_noise  = target_noise
+        self.noise_clip    = noise_clip
+        self._critic_steps = 0        # number of critic updates performed
+        hidden = hidden_sizes or [256, 256]
+        # Second critic pair
+        self.critic2        = MLP([state_dim + action_dim] + hidden + [1],
+                                   output_activation="linear", lr=critic_lr,
+                                   random_state=random_state)
+        self.critic2_target = MLP([state_dim + action_dim] + hidden + [1],
+                                   output_activation="linear", lr=critic_lr,
+                                   random_state=random_state)
+        self.critic2.hard_update(self.critic2_target)
+    def _learn(self) -> tuple[float | None, float | None]:
+        """
+        Update both critics and, every ``policy_delay`` calls, the actor.
+        Returns
+        -------
+        (actor_loss, critic_loss).  Both are None while the buffer holds
+        fewer than ``batch_size`` transitions; ``actor_loss`` alone is None
+        on steps where the actor update is skipped.  ``critic_loss`` is the
+        mean of the two critics' MSE losses.
+        """
+        if len(self.buffer) < self.batch_size:
+            return None, None
+        states, actions, rewards, next_states, dones = \
+            self.buffer.sample(self.batch_size, self._rng)
+        self._critic_steps += 1
+        # ── Target action with smoothing noise ────────────────────────
+        # Noise is added in the normalised [-1, 1] action space.
+        a_next = self.actor_target.forward(next_states)
+        noise  = np.clip(
+            self._rng.normal(0, self.target_noise, a_next.shape),
+            -self.noise_clip, self.noise_clip
+        )
+        a_next = np.clip(a_next + noise, -1.0, 1.0)
+        a_next_scaled = a_next * self._act_scale + self._act_bias
+        sa_next = np.concatenate([next_states, a_next_scaled], axis=1)
+        # ── Twin critics targets (take min) ───────────────────────────
+        q1_next = self.critic_target.forward(sa_next).ravel()
+        q2_next = self.critic2_target.forward(sa_next).ravel()
+        q_next  = np.minimum(q1_next, q2_next)
+        y       = rewards + self.gamma * (1.0 - dones) * q_next
+        # ── Update both critics ───────────────────────────────────────
+        sa = np.concatenate([states, actions], axis=1)
+        q1_pred = self.critic.forward(sa, training=True).ravel()
+        td1 = y - q1_pred
+        critic1_loss = float(np.mean(td1 ** 2))
+        self.critic.backward(-2.0 * td1[:, np.newaxis] / self.batch_size)
+        q2_pred = self.critic2.forward(sa, training=True).ravel()
+        td2 = y - q2_pred
+        critic2_loss = float(np.mean(td2 ** 2))
+        self.critic2.backward(-2.0 * td2[:, np.newaxis] / self.batch_size)
+        critic_loss = (critic1_loss + critic2_loss) / 2.0
+        actor_loss = None
+        # ── Delayed actor update ──────────────────────────────────────
+        if self._critic_steps % self.policy_delay == 0:
+            a_pred   = self.actor.forward(states, training=True)
+            a_scaled = a_pred * self._act_scale + self._act_bias
+            sa_pred  = np.concatenate([states, a_scaled], axis=1)
+            q_actor  = self.critic.forward(sa_pred, training=True).ravel()
+            actor_loss = float(-np.mean(q_actor))
+            # NOTE(review): constant surrogate gradient; the critic's
+            # gradient w.r.t. the action is not propagated into the actor.
+            d_a = -np.ones((self.batch_size, self.action_dim)) * self._act_scale \
+                  / self.batch_size
+            self.actor.backward(d_a)
+            # The actor target only moves when the actor itself was updated
+            self.actor.soft_update(self.actor_target, self.tau)
+        self.critic.soft_update(self.critic_target, self.tau)
+        self.critic2.soft_update(self.critic2_target, self.tau)
+        return actor_loss, critic_loss

mlscratch/reinforcement/dqn.py ADDED Viewed

@@ -0,0 +1,319 @@
+"""
+Deep Q-Network (DQN)
+=====================
+Neural-network function approximator for Q-learning, with three
+production-grade enhancements:
+  1. Experience Replay         — breaks temporal correlations (Mnih et al., 2013)
+  2. Target Network            — stabilises training targets  (Mnih et al., 2015)
+  3. Double DQN                — removes maximisation bias    (van Hasselt et al., 2016)
+Optional:
+  4. Dueling Network           — separate V(s) and A(s,a) streams
+                                 (Wang et al., 2016)
+  5. Prioritised Replay        — focuses on high-TD-error transitions
+                                 (Schaul et al., 2015)
+Update rule (Double DQN):
+    a* = argmax_a  Q_online(s', a)
+    y  = r + γ (1-done) Q_target(s', a*)
+    L  = (y - Q_online(s, a))²
+Only numpy and Python stdlib are used.
+"""
+from __future__ import annotations
+import numpy as np
+from copy import deepcopy
+from .utils import ReplayBuffer, PrioritizedReplayBuffer, MLP
+# ============================================================
+# Dueling MLP
+# ============================================================
+class DuelingMLP:
+    """
+    Dueling network: two heads sharing a common feature trunk.
+    Q(s,a) = V(s) + A(s,a) - mean_a A(s,a)
+    Parameters
+    ----------
+    state_dim   : int
+    n_actions   : int
+    hidden_sizes: list[int] | None   size of shared hidden layers; None → [128, 128]
+    lr          : float
+    random_state: int | None         seed passed to the trunk and to both heads
+    """
+    def __init__(
+        self,
+        state_dim: int,
+        n_actions: int,
+        hidden_sizes: list[int] | None = None,
+        lr: float = 1e-3,
+        random_state: int | None = None,
+    ):
+        hidden_sizes = hidden_sizes or [128, 128]
+        self.n_actions = n_actions
+        # Shared trunk
+        trunk_sizes = [state_dim] + hidden_sizes
+        self._trunk = MLP(trunk_sizes, output_activation="linear",
+                          lr=lr, random_state=random_state)
+        # Value head: hidden[-1] → 1
+        self._value_head = MLP([hidden_sizes[-1], 64, 1],
+                                output_activation="linear", lr=lr,
+                                random_state=random_state)
+        # Advantage head: hidden[-1] → n_actions
+        self._adv_head   = MLP([hidden_sizes[-1], 64, n_actions],
+                                output_activation="linear", lr=lr,
+                                random_state=random_state)
+    def forward(self, x: np.ndarray, training: bool = False) -> np.ndarray:
+        """
+        Return Q-values for ``x``.
+        A 1-D input is treated as a single state and yields a 1-D result;
+        a 2-D input (B, state_dim) yields (B, n_actions).
+        """
+        scalar = x.ndim == 1
+        if scalar:
+            x = x[np.newaxis, :]
+        h   = self._trunk.forward(x, training=training)
+        V   = self._value_head.forward(h, training=training)          # (B,1)
+        A   = self._adv_head.forward(h, training=training)            # (B,A)
+        # Subtracting the mean advantage makes the V/A split identifiable
+        Q   = V + A - A.mean(axis=1, keepdims=True)
+        return Q[0] if scalar else Q
+    def soft_update(self, target: "DuelingMLP", tau: float) -> None:
+        """Apply ``soft_update`` with coefficient ``tau`` to trunk and both heads."""
+        self._trunk.soft_update(target._trunk, tau)
+        self._value_head.soft_update(target._value_head, tau)
+        self._adv_head.soft_update(target._adv_head, tau)
+    def hard_update(self, target: "DuelingMLP") -> None:
+        """Apply ``hard_update`` to trunk and both heads."""
+        self._trunk.hard_update(target._trunk)
+        self._value_head.hard_update(target._value_head)
+        self._adv_head.hard_update(target._adv_head)
+    def copy_weights_from(self, source: "DuelingMLP") -> None:
+        """Apply ``copy_weights_from`` to trunk and both heads."""
+        self._trunk.copy_weights_from(source._trunk)
+        self._value_head.copy_weights_from(source._value_head)
+        self._adv_head.copy_weights_from(source._adv_head)
+    def backward(self, d_out: np.ndarray) -> None:
+        """
+        Back-propagate ``d_out`` = dL/dQ of shape (B, n_actions).
+        With Q = V + A - mean_a(A):
+            dL/dA_a = dL/dQ_a - mean_a(dL/dQ)
+            dL/dV   = sum_a dL/dQ_a
+        """
+        d_A = d_out - d_out.mean(axis=1, keepdims=True)
+        # V feeds every Q_a with weight 1, so its gradient is the SUM over
+        # actions (the mean would under-scale it by 1/n_actions).
+        d_V = d_out.sum(axis=1, keepdims=True)
+        self._adv_head.backward(d_A)
+        self._value_head.backward(d_V)
+        # NOTE(review): simplified trunk gradient — the per-sample mean of
+        # d_out broadcast over all trunk outputs.  The exact gradient is the
+        # sum of both heads' input gradients; confirm whether
+        # utils.MLP.backward exposes them.
+        self._trunk.backward(d_out.mean(axis=1, keepdims=True) *
+                             np.ones((d_out.shape[0],
+                                      self._trunk.layer_sizes[-1])))
+# ============================================================
+# DQN Agent
+# ============================================================
+class DQN:
+    """
+    Deep Q-Network agent.
+    Parameters
+    ----------
+    state_dim       : int
+    n_actions       : int
+    hidden_sizes    : list[int] | None   hidden layer widths; None → [128, 128]
+    lr              : float          learning rate
+    gamma           : float          discount factor
+    epsilon         : float          initial exploration ε
+    epsilon_min     : float          minimum ε
+    epsilon_decay   : float          multiplicative decay per step
+    batch_size      : int
+    buffer_capacity : int
+    target_update   : int            hard target update every N steps
+    tau             : float | None   soft update coeff; None → hard update
+    double_dqn      : bool           use Double DQN
+    dueling         : bool           use Dueling Network
+    prioritized     : bool           use Prioritised Replay
+    random_state    : int | None
+    Attributes
+    ----------
+    losses_          : list[float]   loss of every learning step
+    episode_rewards_ : list[float]   total reward of each finished episode
+    epsilons_        : list[float]   ε recorded at the end of each episode
+    """
+    def __init__(
+        self,
+        state_dim: int,
+        n_actions: int,
+        hidden_sizes: list[int] | None = None,
+        lr: float = 1e-3,
+        gamma: float = 0.99,
+        epsilon: float = 1.0,
+        epsilon_min: float = 0.01,
+        epsilon_decay: float = 0.995,
+        batch_size: int = 64,
+        buffer_capacity: int = 50_000,
+        target_update: int = 100,
+        tau: float | None = None,
+        double_dqn: bool = True,
+        dueling: bool = False,
+        prioritized: bool = False,
+        random_state: int | None = None,
+    ):
+        self.n_actions     = n_actions
+        self.gamma         = gamma
+        self.epsilon       = epsilon
+        self.epsilon_min   = epsilon_min
+        self.epsilon_decay = epsilon_decay
+        self.batch_size    = batch_size
+        self.target_update = target_update
+        self.tau           = tau
+        self.double_dqn    = double_dqn
+        self.prioritized   = prioritized
+        self._rng          = np.random.default_rng(random_state)
+        self._step         = 0        # number of transitions seen via step()
+        hidden = hidden_sizes or [128, 128]
+        # Online and target networks
+        if dueling:
+            self.online_net = DuelingMLP(state_dim, n_actions, hidden, lr, random_state)
+            self.target_net = DuelingMLP(state_dim, n_actions, hidden, lr, random_state)
+        else:
+            self.online_net = MLP([state_dim] + hidden + [n_actions],
+                                   output_activation="linear", lr=lr,
+                                   random_state=random_state)
+            self.target_net = MLP([state_dim] + hidden + [n_actions],
+                                   output_activation="linear", lr=lr,
+                                   random_state=random_state)
+        # Sync target = online at init
+        self.online_net.hard_update(self.target_net)
+        # Replay buffer
+        if prioritized:
+            self.buffer = PrioritizedReplayBuffer(buffer_capacity)
+        else:
+            self.buffer = ReplayBuffer(buffer_capacity)
+        # Logging
+        self.losses_: list[float] = []
+        self.episode_rewards_: list[float] = []
+        self.epsilons_: list[float] = []
+    # ------------------------------------------------------------------
+    # Action selection
+    # ------------------------------------------------------------------
+    def select_action(self, state: np.ndarray, greedy: bool = False) -> int:
+        """
+        ε-greedy action selection.
+        With probability ``self.epsilon`` (and ``greedy`` false) a uniformly
+        random action is returned; otherwise the argmax of the online
+        network's Q-values.  ε itself is decayed multiplicatively in ``step``.
+        """
+        if not greedy and self._rng.random() < self.epsilon:
+            return int(self._rng.integers(self.n_actions))
+        q = self.online_net.forward(state)
+        return int(np.argmax(q))
+    # ------------------------------------------------------------------
+    # Learning step
+    # ------------------------------------------------------------------
+    def _learn(self) -> float | None:
+        """
+        Sample a batch and take one gradient step on the online network.
+        Returns the (importance-weighted) mean squared TD error, or None
+        while the buffer holds fewer than ``batch_size`` transitions.
+        """
+        if len(self.buffer) < self.batch_size:
+            return None
+        if self.prioritized:
+            states, actions, rewards, next_states, dones, weights, idxs = \
+                self.buffer.sample(self.batch_size, self._rng)
+        else:
+            states, actions, rewards, next_states, dones = \
+                self.buffer.sample(self.batch_size, self._rng)
+            weights = np.ones(self.batch_size)
+        actions = actions.ravel().astype(int)
+        # Compute targets (no backward pass is ever run on target_net)
+        q_next_target = self.target_net.forward(next_states)  # (B, A)
+        if self.double_dqn:
+            # Online net selects the action, target net evaluates it
+            q_next_online = self.online_net.forward(next_states)
+            a_star = np.argmax(q_next_online, axis=1)           # online selects
+            q_next_val = q_next_target[np.arange(self.batch_size), a_star]
+        else:
+            q_next_val = q_next_target.max(axis=1)
+        targets = rewards + self.gamma * (1.0 - dones) * q_next_val  # (B,)
+        # Compute predictions and loss gradient
+        q_pred_all = self.online_net.forward(states, training=True)   # (B, A)
+        q_pred = q_pred_all[np.arange(self.batch_size), actions]      # (B,)
+        td_errors = targets - q_pred                                   # (B,)
+        loss = float(np.mean(weights * td_errors ** 2))
+        # Gradient: dL/dQ_pred = -2 * w * td_error / B, non-zero only at the
+        # action actually taken in each transition
+        d_out = np.zeros_like(q_pred_all)
+        d_out[np.arange(self.batch_size), actions] = (
+            -2.0 * weights * td_errors / self.batch_size
+        )
+        self.online_net.backward(d_out)
+        # Update priorities
+        if self.prioritized:
+            self.buffer.update_priorities(idxs, td_errors)
+        return loss
+    # ------------------------------------------------------------------
+    # Step
+    # ------------------------------------------------------------------
+    def step(
+        self,
+        state: np.ndarray,
+        action: int,
+        reward: float,
+        next_state: np.ndarray,
+        done: bool,
+    ) -> float | None:
+        """
+        Store transition, learn, update target, decay ε.
+        The target update and the ε decay run on every call, including
+        calls where the buffer is still too small for a learning step.
+        Returns the loss of the learning step, or None if none was taken.
+        """
+        self.buffer.push(state, np.array([action]), reward, next_state, done)
+        self._step += 1
+        loss = self._learn()
+        if loss is not None:
+            self.losses_.append(loss)
+        # Target update
+        if self.tau is not None:
+            self.online_net.soft_update(self.target_net, self.tau)
+        elif self._step % self.target_update == 0:
+            self.online_net.hard_update(self.target_net)
+        # Epsilon decay (multiplicative, floored at epsilon_min)
+        self.epsilon = max(self.epsilon_min, self.epsilon * self.epsilon_decay)
+        return loss
+    # ------------------------------------------------------------------
+    # Episode training
+    # ------------------------------------------------------------------
+    def train_episode(self, env) -> float:
+        """
+        Run one episode and return total reward.
+        ``env`` must provide ``reset()`` returning the initial state and
+        ``step(action)`` returning ``(next_state, reward, done)``.
+        """
+        state = env.reset()
+        total_reward = 0.0
+        done = False
+        while not done:
+            action = self.select_action(state)
+            next_state, reward, done = env.step(action)
+            self.step(state, action, reward, next_state, done)
+            state = next_state
+            total_reward += reward
+        self.episode_rewards_.append(total_reward)
+        self.epsilons_.append(self.epsilon)
+        return total_reward
+    def train(self, env, n_episodes: int) -> "DQN":
+        """Train for ``n_episodes`` episodes and return ``self``."""
+        for _ in range(n_episodes):
+            self.train_episode(env)
+        return self