agilerl 2.4.2.dev0__tar.gz → 2.4.2.dev1__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- {agilerl-2.4.2.dev0 → agilerl-2.4.2.dev1}/PKG-INFO +2 -2
- {agilerl-2.4.2.dev0 → agilerl-2.4.2.dev1}/pyproject.toml +2 -2
- {agilerl-2.4.2.dev0 → agilerl-2.4.2.dev1}/uv.lock +21 -21
- agilerl-2.4.2.dev0/DQN_LEARNING_ALGORITHM_ANALYSIS.md +0 -309
- agilerl-2.4.2.dev0/DQN_LEARNING_ANALYSIS.md +0 -168
- agilerl-2.4.2.dev0/GPU_CLEANUP_ANALYSIS.md +0 -541
- agilerl-2.4.2.dev0/find_dqn_commit.sh +0 -82
- {agilerl-2.4.2.dev0 → agilerl-2.4.2.dev1}/.coveragerc +0 -0
- {agilerl-2.4.2.dev0 → agilerl-2.4.2.dev1}/.github/ISSUE_TEMPLATE/bug_report.md +0 -0
- {agilerl-2.4.2.dev0 → agilerl-2.4.2.dev1}/.github/ISSUE_TEMPLATE/feature_request.md +0 -0
- {agilerl-2.4.2.dev0 → agilerl-2.4.2.dev1}/.github/PULL_REQUEST_TEMPLATE/pull_request_template.md +0 -0
- {agilerl-2.4.2.dev0 → agilerl-2.4.2.dev1}/.github/badges/arena-github-badge.svg +0 -0
- {agilerl-2.4.2.dev0 → agilerl-2.4.2.dev1}/.github/workflows/codeql.yml +0 -0
- {agilerl-2.4.2.dev0 → agilerl-2.4.2.dev1}/.github/workflows/python-app.yml +0 -0
- {agilerl-2.4.2.dev0 → agilerl-2.4.2.dev1}/.gitignore +0 -0
- {agilerl-2.4.2.dev0 → agilerl-2.4.2.dev1}/.pre-commit-config.yaml +0 -0
- {agilerl-2.4.2.dev0 → agilerl-2.4.2.dev1}/.readthedocs.yaml +0 -0
- {agilerl-2.4.2.dev0 → agilerl-2.4.2.dev1}/CITATION.cff +0 -0
- {agilerl-2.4.2.dev0 → agilerl-2.4.2.dev1}/CODE_OF_CONDUCT.md +0 -0
- {agilerl-2.4.2.dev0 → agilerl-2.4.2.dev1}/CONTRIBUTING.md +0 -0
- {agilerl-2.4.2.dev0 → agilerl-2.4.2.dev1}/LICENSE +0 -0
- {agilerl-2.4.2.dev0 → agilerl-2.4.2.dev1}/README.md +0 -0
- {agilerl-2.4.2.dev0 → agilerl-2.4.2.dev1}/agilerl/__init__.py +0 -0
- {agilerl-2.4.2.dev0 → agilerl-2.4.2.dev1}/agilerl/algorithms/__init__.py +0 -0
- {agilerl-2.4.2.dev0 → agilerl-2.4.2.dev1}/agilerl/algorithms/bc_lm.py +0 -0
- {agilerl-2.4.2.dev0 → agilerl-2.4.2.dev1}/agilerl/algorithms/core/__init__.py +0 -0
- {agilerl-2.4.2.dev0 → agilerl-2.4.2.dev1}/agilerl/algorithms/core/base.py +0 -0
- {agilerl-2.4.2.dev0 → agilerl-2.4.2.dev1}/agilerl/algorithms/core/optimizer_wrapper.py +0 -0
- {agilerl-2.4.2.dev0 → agilerl-2.4.2.dev1}/agilerl/algorithms/core/registry.py +0 -0
- {agilerl-2.4.2.dev0 → agilerl-2.4.2.dev1}/agilerl/algorithms/cqn.py +0 -0
- {agilerl-2.4.2.dev0 → agilerl-2.4.2.dev1}/agilerl/algorithms/ddpg.py +0 -0
- {agilerl-2.4.2.dev0 → agilerl-2.4.2.dev1}/agilerl/algorithms/dpo.py +0 -0
- {agilerl-2.4.2.dev0 → agilerl-2.4.2.dev1}/agilerl/algorithms/dqn.py +0 -0
- {agilerl-2.4.2.dev0 → agilerl-2.4.2.dev1}/agilerl/algorithms/dqn_rainbow.py +0 -0
- {agilerl-2.4.2.dev0 → agilerl-2.4.2.dev1}/agilerl/algorithms/grpo.py +0 -0
- {agilerl-2.4.2.dev0 → agilerl-2.4.2.dev1}/agilerl/algorithms/ilql.py +0 -0
- {agilerl-2.4.2.dev0 → agilerl-2.4.2.dev1}/agilerl/algorithms/ippo.py +0 -0
- {agilerl-2.4.2.dev0 → agilerl-2.4.2.dev1}/agilerl/algorithms/maddpg.py +0 -0
- {agilerl-2.4.2.dev0 → agilerl-2.4.2.dev1}/agilerl/algorithms/matd3.py +0 -0
- {agilerl-2.4.2.dev0 → agilerl-2.4.2.dev1}/agilerl/algorithms/neural_ts_bandit.py +0 -0
- {agilerl-2.4.2.dev0 → agilerl-2.4.2.dev1}/agilerl/algorithms/neural_ucb_bandit.py +0 -0
- {agilerl-2.4.2.dev0 → agilerl-2.4.2.dev1}/agilerl/algorithms/ppo.py +0 -0
- {agilerl-2.4.2.dev0 → agilerl-2.4.2.dev1}/agilerl/algorithms/td3.py +0 -0
- {agilerl-2.4.2.dev0 → agilerl-2.4.2.dev1}/agilerl/components/__init__.py +0 -0
- {agilerl-2.4.2.dev0 → agilerl-2.4.2.dev1}/agilerl/components/data.py +0 -0
- {agilerl-2.4.2.dev0 → agilerl-2.4.2.dev1}/agilerl/components/multi_agent_replay_buffer.py +0 -0
- {agilerl-2.4.2.dev0 → agilerl-2.4.2.dev1}/agilerl/components/replay_buffer.py +0 -0
- {agilerl-2.4.2.dev0 → agilerl-2.4.2.dev1}/agilerl/components/rollout_buffer.py +0 -0
- {agilerl-2.4.2.dev0 → agilerl-2.4.2.dev1}/agilerl/components/sampler.py +0 -0
- {agilerl-2.4.2.dev0 → agilerl-2.4.2.dev1}/agilerl/components/segment_tree.py +0 -0
- {agilerl-2.4.2.dev0 → agilerl-2.4.2.dev1}/agilerl/data/__init__.py +0 -0
- {agilerl-2.4.2.dev0 → agilerl-2.4.2.dev1}/agilerl/data/language_environment.py +0 -0
- {agilerl-2.4.2.dev0 → agilerl-2.4.2.dev1}/agilerl/data/rl_data.py +0 -0
- {agilerl-2.4.2.dev0 → agilerl-2.4.2.dev1}/agilerl/data/tokenizer.py +0 -0
- {agilerl-2.4.2.dev0 → agilerl-2.4.2.dev1}/agilerl/data/torch_datasets.py +0 -0
- {agilerl-2.4.2.dev0 → agilerl-2.4.2.dev1}/agilerl/hpo/__init__.py +0 -0
- {agilerl-2.4.2.dev0 → agilerl-2.4.2.dev1}/agilerl/hpo/mutation.py +0 -0
- {agilerl-2.4.2.dev0 → agilerl-2.4.2.dev1}/agilerl/hpo/tournament.py +0 -0
- {agilerl-2.4.2.dev0 → agilerl-2.4.2.dev1}/agilerl/modules/__init__.py +0 -0
- {agilerl-2.4.2.dev0 → agilerl-2.4.2.dev1}/agilerl/modules/base.py +0 -0
- {agilerl-2.4.2.dev0 → agilerl-2.4.2.dev1}/agilerl/modules/bert.py +0 -0
- {agilerl-2.4.2.dev0 → agilerl-2.4.2.dev1}/agilerl/modules/cnn.py +0 -0
- {agilerl-2.4.2.dev0 → agilerl-2.4.2.dev1}/agilerl/modules/configs.py +0 -0
- {agilerl-2.4.2.dev0 → agilerl-2.4.2.dev1}/agilerl/modules/custom_components.py +0 -0
- {agilerl-2.4.2.dev0 → agilerl-2.4.2.dev1}/agilerl/modules/dummy.py +0 -0
- {agilerl-2.4.2.dev0 → agilerl-2.4.2.dev1}/agilerl/modules/gpt.py +0 -0
- {agilerl-2.4.2.dev0 → agilerl-2.4.2.dev1}/agilerl/modules/lstm.py +0 -0
- {agilerl-2.4.2.dev0 → agilerl-2.4.2.dev1}/agilerl/modules/mlp.py +0 -0
- {agilerl-2.4.2.dev0 → agilerl-2.4.2.dev1}/agilerl/modules/multi_input.py +0 -0
- {agilerl-2.4.2.dev0 → agilerl-2.4.2.dev1}/agilerl/modules/resnet.py +0 -0
- {agilerl-2.4.2.dev0 → agilerl-2.4.2.dev1}/agilerl/modules/simba.py +0 -0
- {agilerl-2.4.2.dev0 → agilerl-2.4.2.dev1}/agilerl/networks/__init__.py +0 -0
- {agilerl-2.4.2.dev0 → agilerl-2.4.2.dev1}/agilerl/networks/actors.py +0 -0
- {agilerl-2.4.2.dev0 → agilerl-2.4.2.dev1}/agilerl/networks/base.py +0 -0
- {agilerl-2.4.2.dev0 → agilerl-2.4.2.dev1}/agilerl/networks/custom_modules.py +0 -0
- {agilerl-2.4.2.dev0 → agilerl-2.4.2.dev1}/agilerl/networks/distributions.py +0 -0
- {agilerl-2.4.2.dev0 → agilerl-2.4.2.dev1}/agilerl/networks/distributions_experimental.py +0 -0
- {agilerl-2.4.2.dev0 → agilerl-2.4.2.dev1}/agilerl/networks/q_networks.py +0 -0
- {agilerl-2.4.2.dev0 → agilerl-2.4.2.dev1}/agilerl/networks/value_networks.py +0 -0
- {agilerl-2.4.2.dev0 → agilerl-2.4.2.dev1}/agilerl/protocols.py +0 -0
- {agilerl-2.4.2.dev0 → agilerl-2.4.2.dev1}/agilerl/rollouts/__init__.py +0 -0
- {agilerl-2.4.2.dev0 → agilerl-2.4.2.dev1}/agilerl/rollouts/on_policy.py +0 -0
- {agilerl-2.4.2.dev0 → agilerl-2.4.2.dev1}/agilerl/training/__init__.py +0 -0
- {agilerl-2.4.2.dev0 → agilerl-2.4.2.dev1}/agilerl/training/train_bandits.py +0 -0
- {agilerl-2.4.2.dev0 → agilerl-2.4.2.dev1}/agilerl/training/train_llm.py +0 -0
- {agilerl-2.4.2.dev0 → agilerl-2.4.2.dev1}/agilerl/training/train_multi_agent_off_policy.py +0 -0
- {agilerl-2.4.2.dev0 → agilerl-2.4.2.dev1}/agilerl/training/train_multi_agent_on_policy.py +0 -0
- {agilerl-2.4.2.dev0 → agilerl-2.4.2.dev1}/agilerl/training/train_off_policy.py +0 -0
- {agilerl-2.4.2.dev0 → agilerl-2.4.2.dev1}/agilerl/training/train_offline.py +0 -0
- {agilerl-2.4.2.dev0 → agilerl-2.4.2.dev1}/agilerl/training/train_on_policy.py +0 -0
- {agilerl-2.4.2.dev0 → agilerl-2.4.2.dev1}/agilerl/typing.py +0 -0
- {agilerl-2.4.2.dev0 → agilerl-2.4.2.dev1}/agilerl/utils/__init__.py +0 -0
- {agilerl-2.4.2.dev0 → agilerl-2.4.2.dev1}/agilerl/utils/algo_utils.py +0 -0
- {agilerl-2.4.2.dev0 → agilerl-2.4.2.dev1}/agilerl/utils/cache.py +0 -0
- {agilerl-2.4.2.dev0 → agilerl-2.4.2.dev1}/agilerl/utils/evolvable_networks.py +0 -0
- {agilerl-2.4.2.dev0 → agilerl-2.4.2.dev1}/agilerl/utils/ilql_utils.py +0 -0
- {agilerl-2.4.2.dev0 → agilerl-2.4.2.dev1}/agilerl/utils/llm_utils.py +0 -0
- {agilerl-2.4.2.dev0 → agilerl-2.4.2.dev1}/agilerl/utils/log_utils.py +0 -0
- {agilerl-2.4.2.dev0 → agilerl-2.4.2.dev1}/agilerl/utils/minari_utils.py +0 -0
- {agilerl-2.4.2.dev0 → agilerl-2.4.2.dev1}/agilerl/utils/probe_envs.py +0 -0
- {agilerl-2.4.2.dev0 → agilerl-2.4.2.dev1}/agilerl/utils/probe_envs_ma.py +0 -0
- {agilerl-2.4.2.dev0 → agilerl-2.4.2.dev1}/agilerl/utils/sampling_utils.py +0 -0
- {agilerl-2.4.2.dev0 → agilerl-2.4.2.dev1}/agilerl/utils/torch_utils.py +0 -0
- {agilerl-2.4.2.dev0 → agilerl-2.4.2.dev1}/agilerl/utils/utils.py +0 -0
- {agilerl-2.4.2.dev0 → agilerl-2.4.2.dev1}/agilerl/vector/__init__.py +0 -0
- {agilerl-2.4.2.dev0 → agilerl-2.4.2.dev1}/agilerl/vector/pz_async_vec_env.py +0 -0
- {agilerl-2.4.2.dev0 → agilerl-2.4.2.dev1}/agilerl/vector/pz_vec_env.py +0 -0
- {agilerl-2.4.2.dev0 → agilerl-2.4.2.dev1}/agilerl/wrappers/__init__.py +0 -0
- {agilerl-2.4.2.dev0 → agilerl-2.4.2.dev1}/agilerl/wrappers/agent.py +0 -0
- {agilerl-2.4.2.dev0 → agilerl-2.4.2.dev1}/agilerl/wrappers/learning.py +0 -0
- {agilerl-2.4.2.dev0 → agilerl-2.4.2.dev1}/agilerl/wrappers/make_evolvable.py +0 -0
- {agilerl-2.4.2.dev0 → agilerl-2.4.2.dev1}/agilerl/wrappers/pettingzoo_wrappers.py +0 -0
- {agilerl-2.4.2.dev0 → agilerl-2.4.2.dev1}/agilerl/wrappers/utils.py +0 -0
- {agilerl-2.4.2.dev0 → agilerl-2.4.2.dev1}/benchmarking/benchmarking_bandits.py +0 -0
- {agilerl-2.4.2.dev0 → agilerl-2.4.2.dev1}/benchmarking/benchmarking_dpo.py +0 -0
- {agilerl-2.4.2.dev0 → agilerl-2.4.2.dev1}/benchmarking/benchmarking_grpo.py +0 -0
- {agilerl-2.4.2.dev0 → agilerl-2.4.2.dev1}/benchmarking/benchmarking_multi_agent_off_policy.py +0 -0
- {agilerl-2.4.2.dev0 → agilerl-2.4.2.dev1}/benchmarking/benchmarking_multi_agent_on_policy.py +0 -0
- {agilerl-2.4.2.dev0 → agilerl-2.4.2.dev1}/benchmarking/benchmarking_off_policy.py +0 -0
- {agilerl-2.4.2.dev0 → agilerl-2.4.2.dev1}/benchmarking/benchmarking_off_policy_distributed.py +0 -0
- {agilerl-2.4.2.dev0 → agilerl-2.4.2.dev1}/benchmarking/benchmarking_offline.py +0 -0
- {agilerl-2.4.2.dev0 → agilerl-2.4.2.dev1}/benchmarking/benchmarking_offline_distributed.py +0 -0
- {agilerl-2.4.2.dev0 → agilerl-2.4.2.dev1}/benchmarking/benchmarking_on_policy.py +0 -0
- {agilerl-2.4.2.dev0 → agilerl-2.4.2.dev1}/benchmarking/benchmarking_rainbow.py +0 -0
- {agilerl-2.4.2.dev0 → agilerl-2.4.2.dev1}/benchmarking/benchmarking_recurrent.py +0 -0
- {agilerl-2.4.2.dev0 → agilerl-2.4.2.dev1}/benchmarking/benchmarking_resnet.py +0 -0
- {agilerl-2.4.2.dev0 → agilerl-2.4.2.dev1}/benchmarking/benchmarking_simba.py +0 -0
- {agilerl-2.4.2.dev0 → agilerl-2.4.2.dev1}/benchmarking/configs/ds_config.json +0 -0
- {agilerl-2.4.2.dev0 → agilerl-2.4.2.dev1}/benchmarking/make_evolvable_benchmarking.py +0 -0
- {agilerl-2.4.2.dev0 → agilerl-2.4.2.dev1}/benchmarking/networks.py +0 -0
- {agilerl-2.4.2.dev0 → agilerl-2.4.2.dev1}/configs/accelerate/accelerate.yaml +0 -0
- {agilerl-2.4.2.dev0 → agilerl-2.4.2.dev1}/configs/accelerate/grpo_accelerate_config.yaml +0 -0
- {agilerl-2.4.2.dev0 → agilerl-2.4.2.dev1}/configs/training/bandit/neural_ts.yaml +0 -0
- {agilerl-2.4.2.dev0 → agilerl-2.4.2.dev1}/configs/training/bandit/neural_ucb.yaml +0 -0
- {agilerl-2.4.2.dev0 → agilerl-2.4.2.dev1}/configs/training/cqn.yaml +0 -0
- {agilerl-2.4.2.dev0 → agilerl-2.4.2.dev1}/configs/training/ddpg/ddpg.yaml +0 -0
- {agilerl-2.4.2.dev0 → agilerl-2.4.2.dev1}/configs/training/ddpg/ddpg_lstm.yaml +0 -0
- {agilerl-2.4.2.dev0 → agilerl-2.4.2.dev1}/configs/training/ddpg/ddpg_simba.yaml +0 -0
- {agilerl-2.4.2.dev0 → agilerl-2.4.2.dev1}/configs/training/dpo.yaml +0 -0
- {agilerl-2.4.2.dev0 → agilerl-2.4.2.dev1}/configs/training/dqn/dqn.yaml +0 -0
- {agilerl-2.4.2.dev0 → agilerl-2.4.2.dev1}/configs/training/dqn/dqn_lstm.yaml +0 -0
- {agilerl-2.4.2.dev0 → agilerl-2.4.2.dev1}/configs/training/dqn/dqn_rainbow.yaml +0 -0
- {agilerl-2.4.2.dev0 → agilerl-2.4.2.dev1}/configs/training/grpo.yaml +0 -0
- {agilerl-2.4.2.dev0 → agilerl-2.4.2.dev1}/configs/training/multi_agent/ippo.yaml +0 -0
- {agilerl-2.4.2.dev0 → agilerl-2.4.2.dev1}/configs/training/multi_agent/ippo_pong.yaml +0 -0
- {agilerl-2.4.2.dev0 → agilerl-2.4.2.dev1}/configs/training/multi_agent/maddpg.yaml +0 -0
- {agilerl-2.4.2.dev0 → agilerl-2.4.2.dev1}/configs/training/multi_agent/matd3.yaml +0 -0
- {agilerl-2.4.2.dev0 → agilerl-2.4.2.dev1}/configs/training/multi_input.yaml +0 -0
- {agilerl-2.4.2.dev0 → agilerl-2.4.2.dev1}/configs/training/ppo/ppo.yaml +0 -0
- {agilerl-2.4.2.dev0 → agilerl-2.4.2.dev1}/configs/training/ppo/ppo_image.yaml +0 -0
- {agilerl-2.4.2.dev0 → agilerl-2.4.2.dev1}/configs/training/ppo/ppo_recurrent.yaml +0 -0
- {agilerl-2.4.2.dev0 → agilerl-2.4.2.dev1}/configs/training/td3.yaml +0 -0
- {agilerl-2.4.2.dev0 → agilerl-2.4.2.dev1}/data/cartpole/cartpole_random_v1.1.0.h5 +0 -0
- {agilerl-2.4.2.dev0 → agilerl-2.4.2.dev1}/data/cartpole/cartpole_v1.1.0.h5 +0 -0
- {agilerl-2.4.2.dev0 → agilerl-2.4.2.dev1}/data/pendulum/pendulum_random_v1.1.0.h5 +0 -0
- {agilerl-2.4.2.dev0 → agilerl-2.4.2.dev1}/data/pendulum/pendulum_v1.1.0.h5 +0 -0
- {agilerl-2.4.2.dev0 → agilerl-2.4.2.dev1}/demos/demo_bandit.py +0 -0
- {agilerl-2.4.2.dev0 → agilerl-2.4.2.dev1}/demos/demo_custom_network.py +0 -0
- {agilerl-2.4.2.dev0 → agilerl-2.4.2.dev1}/demos/demo_multi_agent.py +0 -0
- {agilerl-2.4.2.dev0 → agilerl-2.4.2.dev1}/demos/demo_off_policy.py +0 -0
- {agilerl-2.4.2.dev0 → agilerl-2.4.2.dev1}/demos/demo_off_policy_distributed.py +0 -0
- {agilerl-2.4.2.dev0 → agilerl-2.4.2.dev1}/demos/demo_offline.py +0 -0
- {agilerl-2.4.2.dev0 → agilerl-2.4.2.dev1}/demos/demo_offline_distributed.py +0 -0
- {agilerl-2.4.2.dev0 → agilerl-2.4.2.dev1}/demos/demo_on_policy.py +0 -0
- {agilerl-2.4.2.dev0 → agilerl-2.4.2.dev1}/demos/demo_on_policy_rnn_cartpole.py +0 -0
- {agilerl-2.4.2.dev0 → agilerl-2.4.2.dev1}/demos/demo_on_policy_rnn_memory.py +0 -0
- {agilerl-2.4.2.dev0 → agilerl-2.4.2.dev1}/demos/demo_on_policy_rnn_minigrid.py +0 -0
- {agilerl-2.4.2.dev0 → agilerl-2.4.2.dev1}/demos/performance_flamegraph_cartpole.py +0 -0
- {agilerl-2.4.2.dev0 → agilerl-2.4.2.dev1}/demos/performance_flamegraph_lunar_lander.py +0 -0
- {agilerl-2.4.2.dev0 → agilerl-2.4.2.dev1}/demos/performance_flamegraph_lunar_lander_rnn.py +0 -0
- {agilerl-2.4.2.dev0 → agilerl-2.4.2.dev1}/demos/performance_flamegraph_rnn_memory.py +0 -0
- {agilerl-2.4.2.dev0 → agilerl-2.4.2.dev1}/dependabot.yml +0 -0
- {agilerl-2.4.2.dev0 → agilerl-2.4.2.dev1}/docs/Makefile +0 -0
- {agilerl-2.4.2.dev0 → agilerl-2.4.2.dev1}/docs/_static/arena-github-badge.svg +0 -0
- {agilerl-2.4.2.dev0 → agilerl-2.4.2.dev1}/docs/_static/css/custom.css +0 -0
- {agilerl-2.4.2.dev0 → agilerl-2.4.2.dev1}/docs/_static/favicon.ico +0 -0
- {agilerl-2.4.2.dev0 → agilerl-2.4.2.dev1}/docs/_static/js/expand_sidebar.js +0 -0
- {agilerl-2.4.2.dev0 → agilerl-2.4.2.dev1}/docs/_static/logo_teal.png +0 -0
- {agilerl-2.4.2.dev0 → agilerl-2.4.2.dev1}/docs/_static/logo_white.png +0 -0
- {agilerl-2.4.2.dev0 → agilerl-2.4.2.dev1}/docs/_static/module.png +0 -0
- {agilerl-2.4.2.dev0 → agilerl-2.4.2.dev1}/docs/_static/network.png +0 -0
- {agilerl-2.4.2.dev0 → agilerl-2.4.2.dev1}/docs/_static/thumbnails/iris-thumbnail.png +0 -0
- {agilerl-2.4.2.dev0 → agilerl-2.4.2.dev1}/docs/_static/thumbnails/pendigits-thumbnail.png +0 -0
- {agilerl-2.4.2.dev0 → agilerl-2.4.2.dev1}/docs/_static/thumbnails/rainbow_performance.png +0 -0
- {agilerl-2.4.2.dev0 → agilerl-2.4.2.dev1}/docs/_static/thumbnails/simba_thumbnail.png +0 -0
- {agilerl-2.4.2.dev0 → agilerl-2.4.2.dev1}/docs/api/algorithms/base.rst +0 -0
- {agilerl-2.4.2.dev0 → agilerl-2.4.2.dev1}/docs/api/algorithms/cql.rst +0 -0
- {agilerl-2.4.2.dev0 → agilerl-2.4.2.dev1}/docs/api/algorithms/ddpg.rst +0 -0
- {agilerl-2.4.2.dev0 → agilerl-2.4.2.dev1}/docs/api/algorithms/dpo.rst +0 -0
- {agilerl-2.4.2.dev0 → agilerl-2.4.2.dev1}/docs/api/algorithms/dqn.rst +0 -0
- {agilerl-2.4.2.dev0 → agilerl-2.4.2.dev1}/docs/api/algorithms/dqn_rainbow.rst +0 -0
- {agilerl-2.4.2.dev0 → agilerl-2.4.2.dev1}/docs/api/algorithms/grpo.rst +0 -0
- {agilerl-2.4.2.dev0 → agilerl-2.4.2.dev1}/docs/api/algorithms/ilql.rst +0 -0
- {agilerl-2.4.2.dev0 → agilerl-2.4.2.dev1}/docs/api/algorithms/index.rst +0 -0
- {agilerl-2.4.2.dev0 → agilerl-2.4.2.dev1}/docs/api/algorithms/ippo.rst +0 -0
- {agilerl-2.4.2.dev0 → agilerl-2.4.2.dev1}/docs/api/algorithms/maddpg.rst +0 -0
- {agilerl-2.4.2.dev0 → agilerl-2.4.2.dev1}/docs/api/algorithms/matd3.rst +0 -0
- {agilerl-2.4.2.dev0 → agilerl-2.4.2.dev1}/docs/api/algorithms/neural_ts.rst +0 -0
- {agilerl-2.4.2.dev0 → agilerl-2.4.2.dev1}/docs/api/algorithms/neural_ucb.rst +0 -0
- {agilerl-2.4.2.dev0 → agilerl-2.4.2.dev1}/docs/api/algorithms/ppo.rst +0 -0
- {agilerl-2.4.2.dev0 → agilerl-2.4.2.dev1}/docs/api/algorithms/registry.rst +0 -0
- {agilerl-2.4.2.dev0 → agilerl-2.4.2.dev1}/docs/api/algorithms/td3.rst +0 -0
- {agilerl-2.4.2.dev0 → agilerl-2.4.2.dev1}/docs/api/algorithms/wrappers.rst +0 -0
- {agilerl-2.4.2.dev0 → agilerl-2.4.2.dev1}/docs/api/components/data.rst +0 -0
- {agilerl-2.4.2.dev0 → agilerl-2.4.2.dev1}/docs/api/components/index.rst +0 -0
- {agilerl-2.4.2.dev0 → agilerl-2.4.2.dev1}/docs/api/components/multi_agent_replay_buffer.rst +0 -0
- {agilerl-2.4.2.dev0 → agilerl-2.4.2.dev1}/docs/api/components/replay_buffer.rst +0 -0
- {agilerl-2.4.2.dev0 → agilerl-2.4.2.dev1}/docs/api/components/rollout_buffer.rst +0 -0
- {agilerl-2.4.2.dev0 → agilerl-2.4.2.dev1}/docs/api/components/sampler.rst +0 -0
- {agilerl-2.4.2.dev0 → agilerl-2.4.2.dev1}/docs/api/components/segment_tree.rst +0 -0
- {agilerl-2.4.2.dev0 → agilerl-2.4.2.dev1}/docs/api/hpo/index.rst +0 -0
- {agilerl-2.4.2.dev0 → agilerl-2.4.2.dev1}/docs/api/hpo/mutation.rst +0 -0
- {agilerl-2.4.2.dev0 → agilerl-2.4.2.dev1}/docs/api/hpo/tournament.rst +0 -0
- {agilerl-2.4.2.dev0 → agilerl-2.4.2.dev1}/docs/api/modules/base.rst +0 -0
- {agilerl-2.4.2.dev0 → agilerl-2.4.2.dev1}/docs/api/modules/bert.rst +0 -0
- {agilerl-2.4.2.dev0 → agilerl-2.4.2.dev1}/docs/api/modules/cnn.rst +0 -0
- {agilerl-2.4.2.dev0 → agilerl-2.4.2.dev1}/docs/api/modules/custom_activation.rst +0 -0
- {agilerl-2.4.2.dev0 → agilerl-2.4.2.dev1}/docs/api/modules/dummy.rst +0 -0
- {agilerl-2.4.2.dev0 → agilerl-2.4.2.dev1}/docs/api/modules/gpt.rst +0 -0
- {agilerl-2.4.2.dev0 → agilerl-2.4.2.dev1}/docs/api/modules/index.rst +0 -0
- {agilerl-2.4.2.dev0 → agilerl-2.4.2.dev1}/docs/api/modules/lstm.rst +0 -0
- {agilerl-2.4.2.dev0 → agilerl-2.4.2.dev1}/docs/api/modules/mlp.rst +0 -0
- {agilerl-2.4.2.dev0 → agilerl-2.4.2.dev1}/docs/api/modules/multi_input.rst +0 -0
- {agilerl-2.4.2.dev0 → agilerl-2.4.2.dev1}/docs/api/modules/resnet.rst +0 -0
- {agilerl-2.4.2.dev0 → agilerl-2.4.2.dev1}/docs/api/modules/simba.rst +0 -0
- {agilerl-2.4.2.dev0 → agilerl-2.4.2.dev1}/docs/api/networks/actors.rst +0 -0
- {agilerl-2.4.2.dev0 → agilerl-2.4.2.dev1}/docs/api/networks/base.rst +0 -0
- {agilerl-2.4.2.dev0 → agilerl-2.4.2.dev1}/docs/api/networks/index.rst +0 -0
- {agilerl-2.4.2.dev0 → agilerl-2.4.2.dev1}/docs/api/networks/q_networks.rst +0 -0
- {agilerl-2.4.2.dev0 → agilerl-2.4.2.dev1}/docs/api/networks/value_networks.rst +0 -0
- {agilerl-2.4.2.dev0 → agilerl-2.4.2.dev1}/docs/api/rollouts/index.rst +0 -0
- {agilerl-2.4.2.dev0 → agilerl-2.4.2.dev1}/docs/api/rollouts/on_policy.rst +0 -0
- {agilerl-2.4.2.dev0 → agilerl-2.4.2.dev1}/docs/api/train.rst +0 -0
- {agilerl-2.4.2.dev0 → agilerl-2.4.2.dev1}/docs/api/utils/algo_utils.rst +0 -0
- {agilerl-2.4.2.dev0 → agilerl-2.4.2.dev1}/docs/api/utils/cache.rst +0 -0
- {agilerl-2.4.2.dev0 → agilerl-2.4.2.dev1}/docs/api/utils/evolvable_networks.rst +0 -0
- {agilerl-2.4.2.dev0 → agilerl-2.4.2.dev1}/docs/api/utils/ilql_utils.rst +0 -0
- {agilerl-2.4.2.dev0 → agilerl-2.4.2.dev1}/docs/api/utils/index.rst +0 -0
- {agilerl-2.4.2.dev0 → agilerl-2.4.2.dev1}/docs/api/utils/llm_utils.rst +0 -0
- {agilerl-2.4.2.dev0 → agilerl-2.4.2.dev1}/docs/api/utils/log_utils.rst +0 -0
- {agilerl-2.4.2.dev0 → agilerl-2.4.2.dev1}/docs/api/utils/minari_utils.rst +0 -0
- {agilerl-2.4.2.dev0 → agilerl-2.4.2.dev1}/docs/api/utils/probe_envs.rst +0 -0
- {agilerl-2.4.2.dev0 → agilerl-2.4.2.dev1}/docs/api/utils/torch_utils.rst +0 -0
- {agilerl-2.4.2.dev0 → agilerl-2.4.2.dev1}/docs/api/utils/utils.rst +0 -0
- {agilerl-2.4.2.dev0 → agilerl-2.4.2.dev1}/docs/api/vector/index.rst +0 -0
- {agilerl-2.4.2.dev0 → agilerl-2.4.2.dev1}/docs/api/vector/petting_zoo_async_vector_env.rst +0 -0
- {agilerl-2.4.2.dev0 → agilerl-2.4.2.dev1}/docs/api/vector/petting_zoo_vector_env.rst +0 -0
- {agilerl-2.4.2.dev0 → agilerl-2.4.2.dev1}/docs/api/wrappers/agent.rst +0 -0
- {agilerl-2.4.2.dev0 → agilerl-2.4.2.dev1}/docs/api/wrappers/index.rst +0 -0
- {agilerl-2.4.2.dev0 → agilerl-2.4.2.dev1}/docs/api/wrappers/learning.rst +0 -0
- {agilerl-2.4.2.dev0 → agilerl-2.4.2.dev1}/docs/api/wrappers/make_evolvable.rst +0 -0
- {agilerl-2.4.2.dev0 → agilerl-2.4.2.dev1}/docs/api/wrappers/pettingzoo.rst +0 -0
- {agilerl-2.4.2.dev0 → agilerl-2.4.2.dev1}/docs/bandits/index.rst +0 -0
- {agilerl-2.4.2.dev0 → agilerl-2.4.2.dev1}/docs/conf.py +0 -0
- {agilerl-2.4.2.dev0 → agilerl-2.4.2.dev1}/docs/custom_algorithms/index.rst +0 -0
- {agilerl-2.4.2.dev0 → agilerl-2.4.2.dev1}/docs/debugging_rl/index.rst +0 -0
- {agilerl-2.4.2.dev0 → agilerl-2.4.2.dev1}/docs/distributed_training/index.rst +0 -0
- {agilerl-2.4.2.dev0 → agilerl-2.4.2.dev1}/docs/evo_hyperparam_opt/index.rst +0 -0
- {agilerl-2.4.2.dev0 → agilerl-2.4.2.dev1}/docs/evolvable_networks/index.rst +0 -0
- {agilerl-2.4.2.dev0 → agilerl-2.4.2.dev1}/docs/get_started/agilerl2changes.rst +0 -0
- {agilerl-2.4.2.dev0 → agilerl-2.4.2.dev1}/docs/get_started/index.rst +0 -0
- {agilerl-2.4.2.dev0 → agilerl-2.4.2.dev1}/docs/index.rst +0 -0
- {agilerl-2.4.2.dev0 → agilerl-2.4.2.dev1}/docs/llm_finetuning/index.rst +0 -0
- {agilerl-2.4.2.dev0 → agilerl-2.4.2.dev1}/docs/make.bat +0 -0
- {agilerl-2.4.2.dev0 → agilerl-2.4.2.dev1}/docs/multi_agent_training/index.rst +0 -0
- {agilerl-2.4.2.dev0 → agilerl-2.4.2.dev1}/docs/off_policy/index.rst +0 -0
- {agilerl-2.4.2.dev0 → agilerl-2.4.2.dev1}/docs/offline_training/index.rst +0 -0
- {agilerl-2.4.2.dev0 → agilerl-2.4.2.dev1}/docs/on_policy/index.rst +0 -0
- {agilerl-2.4.2.dev0 → agilerl-2.4.2.dev1}/docs/pomdp/index.rst +0 -0
- {agilerl-2.4.2.dev0 → agilerl-2.4.2.dev1}/docs/releases/index.rst +0 -0
- {agilerl-2.4.2.dev0 → agilerl-2.4.2.dev1}/docs/requirements.txt +0 -0
- {agilerl-2.4.2.dev0 → agilerl-2.4.2.dev1}/pytest.ini +0 -0
- {agilerl-2.4.2.dev0 → agilerl-2.4.2.dev1}/tests/__init__.py +0 -0
- {agilerl-2.4.2.dev0 → agilerl-2.4.2.dev1}/tests/conftest.py +0 -0
- {agilerl-2.4.2.dev0 → agilerl-2.4.2.dev1}/tests/helper_functions.py +0 -0
- {agilerl-2.4.2.dev0 → agilerl-2.4.2.dev1}/tests/pz_vector_test_utils.py +0 -0
- {agilerl-2.4.2.dev0 → agilerl-2.4.2.dev1}/tests/test_algorithms/__init__.py +0 -0
- {agilerl-2.4.2.dev0 → agilerl-2.4.2.dev1}/tests/test_algorithms/test_bandits/__init__.py +0 -0
- {agilerl-2.4.2.dev0 → agilerl-2.4.2.dev1}/tests/test_algorithms/test_bandits/test_neural_ts.py +0 -0
- {agilerl-2.4.2.dev0 → agilerl-2.4.2.dev1}/tests/test_algorithms/test_bandits/test_neural_ucb.py +0 -0
- {agilerl-2.4.2.dev0 → agilerl-2.4.2.dev1}/tests/test_algorithms/test_base.py +0 -0
- {agilerl-2.4.2.dev0 → agilerl-2.4.2.dev1}/tests/test_algorithms/test_bc_lm.py +0 -0
- {agilerl-2.4.2.dev0 → agilerl-2.4.2.dev1}/tests/test_algorithms/test_llms/__init__.py +0 -0
- {agilerl-2.4.2.dev0 → agilerl-2.4.2.dev1}/tests/test_algorithms/test_llms/conftest.py +0 -0
- {agilerl-2.4.2.dev0 → agilerl-2.4.2.dev1}/tests/test_algorithms/test_llms/test_dpo.py +0 -0
- {agilerl-2.4.2.dev0 → agilerl-2.4.2.dev1}/tests/test_algorithms/test_llms/test_grpo.py +0 -0
- {agilerl-2.4.2.dev0 → agilerl-2.4.2.dev1}/tests/test_algorithms/test_multi_agent/__init__.py +0 -0
- {agilerl-2.4.2.dev0 → agilerl-2.4.2.dev1}/tests/test_algorithms/test_multi_agent/test_ippo.py +0 -0
- {agilerl-2.4.2.dev0 → agilerl-2.4.2.dev1}/tests/test_algorithms/test_multi_agent/test_maddpg.py +0 -0
- {agilerl-2.4.2.dev0 → agilerl-2.4.2.dev1}/tests/test_algorithms/test_multi_agent/test_matd3.py +0 -0
- {agilerl-2.4.2.dev0 → agilerl-2.4.2.dev1}/tests/test_algorithms/test_optimizer_wrapper.py +0 -0
- {agilerl-2.4.2.dev0 → agilerl-2.4.2.dev1}/tests/test_algorithms/test_registry.py +0 -0
- {agilerl-2.4.2.dev0 → agilerl-2.4.2.dev1}/tests/test_algorithms/test_single_agent/__init__.py +0 -0
- {agilerl-2.4.2.dev0 → agilerl-2.4.2.dev1}/tests/test_algorithms/test_single_agent/test_cqn.py +0 -0
- {agilerl-2.4.2.dev0 → agilerl-2.4.2.dev1}/tests/test_algorithms/test_single_agent/test_ddpg.py +0 -0
- {agilerl-2.4.2.dev0 → agilerl-2.4.2.dev1}/tests/test_algorithms/test_single_agent/test_dqn.py +0 -0
- {agilerl-2.4.2.dev0 → agilerl-2.4.2.dev1}/tests/test_algorithms/test_single_agent/test_dqn_rainbow.py +0 -0
- {agilerl-2.4.2.dev0 → agilerl-2.4.2.dev1}/tests/test_algorithms/test_single_agent/test_ilql.py +0 -0
- {agilerl-2.4.2.dev0 → agilerl-2.4.2.dev1}/tests/test_algorithms/test_single_agent/test_ppo.py +0 -0
- {agilerl-2.4.2.dev0 → agilerl-2.4.2.dev1}/tests/test_algorithms/test_single_agent/test_td3.py +0 -0
- {agilerl-2.4.2.dev0 → agilerl-2.4.2.dev1}/tests/test_components/__init__.py +0 -0
- {agilerl-2.4.2.dev0 → agilerl-2.4.2.dev1}/tests/test_components/test_multi_agent_replay_buffer.py +0 -0
- {agilerl-2.4.2.dev0 → agilerl-2.4.2.dev1}/tests/test_components/test_replay_buffer.py +0 -0
- {agilerl-2.4.2.dev0 → agilerl-2.4.2.dev1}/tests/test_components/test_replay_data.py +0 -0
- {agilerl-2.4.2.dev0 → agilerl-2.4.2.dev1}/tests/test_components/test_rollout_buffer.py +0 -0
- {agilerl-2.4.2.dev0 → agilerl-2.4.2.dev1}/tests/test_components/test_sampler.py +0 -0
- {agilerl-2.4.2.dev0 → agilerl-2.4.2.dev1}/tests/test_components/test_segment_tree.py +0 -0
- {agilerl-2.4.2.dev0 → agilerl-2.4.2.dev1}/tests/test_data.py +0 -0
- {agilerl-2.4.2.dev0 → agilerl-2.4.2.dev1}/tests/test_hpo/__init__.py +0 -0
- {agilerl-2.4.2.dev0 → agilerl-2.4.2.dev1}/tests/test_hpo/test_mutation.py +0 -0
- {agilerl-2.4.2.dev0 → agilerl-2.4.2.dev1}/tests/test_hpo/test_tournament.py +0 -0
- {agilerl-2.4.2.dev0 → agilerl-2.4.2.dev1}/tests/test_modules/__init__.py +0 -0
- {agilerl-2.4.2.dev0 → agilerl-2.4.2.dev1}/tests/test_modules/test_base.py +0 -0
- {agilerl-2.4.2.dev0 → agilerl-2.4.2.dev1}/tests/test_modules/test_bert.py +0 -0
- {agilerl-2.4.2.dev0 → agilerl-2.4.2.dev1}/tests/test_modules/test_cnn.py +0 -0
- {agilerl-2.4.2.dev0 → agilerl-2.4.2.dev1}/tests/test_modules/test_custom_activation.py +0 -0
- {agilerl-2.4.2.dev0 → agilerl-2.4.2.dev1}/tests/test_modules/test_dummy.py +0 -0
- {agilerl-2.4.2.dev0 → agilerl-2.4.2.dev1}/tests/test_modules/test_gpt.py +0 -0
- {agilerl-2.4.2.dev0 → agilerl-2.4.2.dev1}/tests/test_modules/test_lstm.py +0 -0
- {agilerl-2.4.2.dev0 → agilerl-2.4.2.dev1}/tests/test_modules/test_make_evolvable.py +0 -0
- {agilerl-2.4.2.dev0 → agilerl-2.4.2.dev1}/tests/test_modules/test_mlp.py +0 -0
- {agilerl-2.4.2.dev0 → agilerl-2.4.2.dev1}/tests/test_modules/test_multi_input.py +0 -0
- {agilerl-2.4.2.dev0 → agilerl-2.4.2.dev1}/tests/test_modules/test_resnet.py +0 -0
- {agilerl-2.4.2.dev0 → agilerl-2.4.2.dev1}/tests/test_modules/test_simba.py +0 -0
- {agilerl-2.4.2.dev0 → agilerl-2.4.2.dev1}/tests/test_networks/__init__.py +0 -0
- {agilerl-2.4.2.dev0 → agilerl-2.4.2.dev1}/tests/test_networks/test_actors.py +0 -0
- {agilerl-2.4.2.dev0 → agilerl-2.4.2.dev1}/tests/test_networks/test_base.py +0 -0
- {agilerl-2.4.2.dev0 → agilerl-2.4.2.dev1}/tests/test_networks/test_q_networks.py +0 -0
- {agilerl-2.4.2.dev0 → agilerl-2.4.2.dev1}/tests/test_networks/test_value_functions.py +0 -0
- {agilerl-2.4.2.dev0 → agilerl-2.4.2.dev1}/tests/test_protocols.py +0 -0
- {agilerl-2.4.2.dev0 → agilerl-2.4.2.dev1}/tests/test_train/test_train.py +0 -0
- {agilerl-2.4.2.dev0 → agilerl-2.4.2.dev1}/tests/test_train/test_train_llm.py +0 -0
- {agilerl-2.4.2.dev0 → agilerl-2.4.2.dev1}/tests/test_utils/__init__.py +0 -0
- {agilerl-2.4.2.dev0 → agilerl-2.4.2.dev1}/tests/test_utils/test_algo_utils.py +0 -0
- {agilerl-2.4.2.dev0 → agilerl-2.4.2.dev1}/tests/test_utils/test_cache.py +0 -0
- {agilerl-2.4.2.dev0 → agilerl-2.4.2.dev1}/tests/test_utils/test_ilql_utils.py +0 -0
- {agilerl-2.4.2.dev0 → agilerl-2.4.2.dev1}/tests/test_utils/test_llm_utils.py +0 -0
- {agilerl-2.4.2.dev0 → agilerl-2.4.2.dev1}/tests/test_utils/test_log_utils.py +0 -0
- {agilerl-2.4.2.dev0 → agilerl-2.4.2.dev1}/tests/test_utils/test_minari_utils.py +0 -0
- {agilerl-2.4.2.dev0 → agilerl-2.4.2.dev1}/tests/test_utils/test_probe_envs.py +0 -0
- {agilerl-2.4.2.dev0 → agilerl-2.4.2.dev1}/tests/test_utils/test_probe_envs_ma.py +0 -0
- {agilerl-2.4.2.dev0 → agilerl-2.4.2.dev1}/tests/test_utils/test_sampling_utils.py +0 -0
- {agilerl-2.4.2.dev0 → agilerl-2.4.2.dev1}/tests/test_utils/test_torch_utils.py +0 -0
- {agilerl-2.4.2.dev0 → agilerl-2.4.2.dev1}/tests/test_utils/test_utils.py +0 -0
- {agilerl-2.4.2.dev0 → agilerl-2.4.2.dev1}/tests/test_utils/test_utils_evolvable.py +0 -0
- {agilerl-2.4.2.dev0 → agilerl-2.4.2.dev1}/tests/test_vector/test_vector.py +0 -0
- {agilerl-2.4.2.dev0 → agilerl-2.4.2.dev1}/tests/test_wrappers/__init__.py +0 -0
- {agilerl-2.4.2.dev0 → agilerl-2.4.2.dev1}/tests/test_wrappers/test_agent.py +0 -0
- {agilerl-2.4.2.dev0 → agilerl-2.4.2.dev1}/tests/test_wrappers/test_autoreset.py +0 -0
- {agilerl-2.4.2.dev0 → agilerl-2.4.2.dev1}/tests/test_wrappers/test_bandit_env.py +0 -0
- {agilerl-2.4.2.dev0 → agilerl-2.4.2.dev1}/tests/test_wrappers/test_skills.py +0 -0

{agilerl-2.4.2.dev0 → agilerl-2.4.2.dev1}/PKG-INFO

@@ -1,6 +1,6 @@
 Metadata-Version: 2.4
 Name: agilerl
-Version: 2.4.2.dev0
+Version: 2.4.2.dev1
 Summary: AgileRL is a deep reinforcement learning library focused on improving RL development through RLOps.
 Author-email: Nick Ustaran-Anderegg <dev@agilerl.com>
 License-Expression: Apache-2.0

@@ -22,7 +22,7 @@ Requires-Dist: omegaconf~=2.3.0
 Requires-Dist: packaging>=20.0
 Requires-Dist: pandas~=2.2.3
 Requires-Dist: pettingzoo~=1.23.1
-Requires-Dist: pre-commit~=3.
+Requires-Dist: pre-commit~=3.8.0
 Requires-Dist: pygame~=2.6.0
 Requires-Dist: pymunk~=6.2.0
 Requires-Dist: redis~=4.4.4

{agilerl-2.4.2.dev0 → agilerl-2.4.2.dev1}/pyproject.toml

@@ -1,6 +1,6 @@
 [project]
 name = "agilerl"
-version = "2.4.2.dev0"
+version = "2.4.2.dev1"
 description = "AgileRL is a deep reinforcement learning library focused on improving RL development through RLOps."
 authors = [{ name = "Nick Ustaran-Anderegg", email = "dev@agilerl.com" }]
 license = "Apache-2.0"

@@ -24,7 +24,7 @@ dependencies = [
     "pettingzoo~=1.23.1",
     "jax[cpu]~=0.4.31",
     "packaging>=20.0",
-    "pre-commit~=3.
+    "pre-commit~=3.8.0",
     "pygame~=2.6.0",
     "pymunk~=6.2.0",
     "redis~=4.4.4",
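
As an aside on the dependency pin tightened above (not part of the diff itself): `~=3.8.0` is a PEP 440 compatible-release specifier, which a quick check with the `packaging` library (already a project dependency) makes concrete:

```python
from packaging.specifiers import SpecifierSet

spec = SpecifierSet("~=3.8.0")  # compatible release: >=3.8.0, ==3.8.*
print("3.8.2" in spec)  # True  - patch releases still satisfy the pin
print("3.9.0" in spec)  # False - the next minor release does not
print("3.7.1" in spec)  # False - older releases do not
```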

{agilerl-2.4.2.dev0 → agilerl-2.4.2.dev1}/uv.lock

@@ -53,7 +53,7 @@ wheels = [
 
 [[package]]
 name = "agilerl"
-version = "2.4.2.dev0"
+version = "2.4.2.dev1"
 source = { editable = "." }
 dependencies = [
     { name = "accelerate" },

@@ -139,7 +139,7 @@ requires-dist = [
     { name = "peft", marker = "extra == 'all'", specifier = "~=0.18.0" },
     { name = "peft", marker = "extra == 'llm'", specifier = "~=0.18.0" },
     { name = "pettingzoo", specifier = "~=1.23.1" },
-    { name = "pre-commit", specifier = "~=3.
+    { name = "pre-commit", specifier = "~=3.8.0" },
     { name = "pygame", specifier = "~=2.6.0" },
     { name = "pymunk", specifier = "~=6.2.0" },
     { name = "redis", specifier = "~=4.4.4" },

@@ -2261,13 +2261,13 @@ name = "mlx-lm"
 version = "0.29.1"
 source = { registry = "https://pypi.org/simple" }
 dependencies = [
-    { name = "jinja2", marker = "
+    { name = "jinja2", marker = "sys_platform == 'darwin'" },
     { name = "mlx", marker = "sys_platform == 'darwin'" },
-    { name = "numpy", marker = "
-    { name = "protobuf", marker = "
-    { name = "pyyaml", marker = "
-    { name = "sentencepiece", marker = "
-    { name = "transformers", marker = "
+    { name = "numpy", marker = "sys_platform == 'darwin'" },
+    { name = "protobuf", marker = "sys_platform == 'darwin'" },
+    { name = "pyyaml", marker = "sys_platform == 'darwin'" },
+    { name = "sentencepiece", marker = "sys_platform == 'darwin'" },
+    { name = "transformers", marker = "sys_platform == 'darwin'" },
 ]
 sdist = { url = "https://files.pythonhosted.org/packages/e3/62/f46e1355256a114808517947f8e83ad6be310c7288c551db0fa678f47923/mlx_lm-0.29.1.tar.gz", hash = "sha256:b99180d8f33d33a077b814e550bfb2d8a59ae003d668fd1f4b3fff62a381d34b", size = 232302, upload-time = "2025-12-16T16:58:27.959Z" }
 wheels = [

@@ -2634,7 +2634,7 @@ name = "nvidia-cudnn-cu12"
 version = "9.5.1.17"
 source = { registry = "https://pypi.org/simple" }
 dependencies = [
-    { name = "nvidia-cublas-cu12", marker = "sys_platform == 'linux'" },
+    { name = "nvidia-cublas-cu12", marker = "platform_machine != 'aarch64' and sys_platform == 'linux'" },
 ]
 wheels = [
     { url = "https://files.pythonhosted.org/packages/2a/78/4535c9c7f859a64781e43c969a3a7e84c54634e319a996d43ef32ce46f83/nvidia_cudnn_cu12-9.5.1.17-py3-none-manylinux_2_28_x86_64.whl", hash = "sha256:30ac3869f6db17d170e0e556dd6cc5eee02647abc31ca856634d5a40f82c15b2", size = 570988386, upload-time = "2024-10-25T19:54:26.39Z" },

@@ -2645,7 +2645,7 @@ name = "nvidia-cufft-cu12"
 version = "11.3.0.4"
 source = { registry = "https://pypi.org/simple" }
 dependencies = [
-    { name = "nvidia-nvjitlink-cu12", marker = "sys_platform == 'linux'" },
+    { name = "nvidia-nvjitlink-cu12", marker = "platform_machine != 'aarch64' and sys_platform == 'linux'" },
 ]
 wheels = [
     { url = "https://files.pythonhosted.org/packages/8f/16/73727675941ab8e6ffd86ca3a4b7b47065edcca7a997920b831f8147c99d/nvidia_cufft_cu12-11.3.0.4-py3-none-manylinux2014_x86_64.manylinux_2_17_x86_64.whl", hash = "sha256:ccba62eb9cef5559abd5e0d54ceed2d9934030f51163df018532142a8ec533e5", size = 200221632, upload-time = "2024-11-20T17:41:32.357Z" },

@@ -2674,9 +2674,9 @@ name = "nvidia-cusolver-cu12"
 version = "11.7.1.2"
 source = { registry = "https://pypi.org/simple" }
 dependencies = [
-    { name = "nvidia-cublas-cu12", marker = "sys_platform == 'linux'" },
-    { name = "nvidia-cusparse-cu12", marker = "sys_platform == 'linux'" },
-    { name = "nvidia-nvjitlink-cu12", marker = "sys_platform == 'linux'" },
+    { name = "nvidia-cublas-cu12", marker = "platform_machine != 'aarch64' and sys_platform == 'linux'" },
+    { name = "nvidia-cusparse-cu12", marker = "platform_machine != 'aarch64' and sys_platform == 'linux'" },
+    { name = "nvidia-nvjitlink-cu12", marker = "platform_machine != 'aarch64' and sys_platform == 'linux'" },
 ]
 wheels = [
     { url = "https://files.pythonhosted.org/packages/f0/6e/c2cf12c9ff8b872e92b4a5740701e51ff17689c4d726fca91875b07f655d/nvidia_cusolver_cu12-11.7.1.2-py3-none-manylinux2014_x86_64.manylinux_2_17_x86_64.whl", hash = "sha256:e9e49843a7707e42022babb9bcfa33c29857a93b88020c4e4434656a655b698c", size = 158229790, upload-time = "2024-11-20T17:43:43.211Z" },

@@ -2688,7 +2688,7 @@ name = "nvidia-cusparse-cu12"
 version = "12.5.4.2"
 source = { registry = "https://pypi.org/simple" }
 dependencies = [
-    { name = "nvidia-nvjitlink-cu12", marker = "sys_platform == 'linux'" },
+    { name = "nvidia-nvjitlink-cu12", marker = "platform_machine != 'aarch64' and sys_platform == 'linux'" },
 ]
 wheels = [
     { url = "https://files.pythonhosted.org/packages/06/1e/b8b7c2f4099a37b96af5c9bb158632ea9e5d9d27d7391d7eb8fc45236674/nvidia_cusparse_cu12-12.5.4.2-py3-none-manylinux2014_x86_64.manylinux_2_17_x86_64.whl", hash = "sha256:7556d9eca156e18184b94947ade0fba5bb47d69cec46bf8660fd2c71a4b48b73", size = 216561367, upload-time = "2024-11-20T17:44:54.824Z" },

@@ -3024,7 +3024,7 @@ wheels = [
 
 [[package]]
 name = "pre-commit"
-version = "3.
+version = "3.8.0"
 source = { registry = "https://pypi.org/simple" }
 dependencies = [
     { name = "cfgv" },

@@ -3033,9 +3033,9 @@ dependencies = [
     { name = "pyyaml" },
     { name = "virtualenv" },
 ]
-sdist = { url = "https://files.pythonhosted.org/packages/
+sdist = { url = "https://files.pythonhosted.org/packages/64/10/97ee2fa54dff1e9da9badbc5e35d0bbaef0776271ea5907eccf64140f72f/pre_commit-3.8.0.tar.gz", hash = "sha256:8bb6494d4a20423842e198980c9ecf9f96607a07ea29549e180eef9ae80fe7af", size = 177815, upload-time = "2024-07-28T19:59:01.538Z" }
 wheels = [
-    { url = "https://files.pythonhosted.org/packages/
+    { url = "https://files.pythonhosted.org/packages/07/92/caae8c86e94681b42c246f0bca35c059a2f0529e5b92619f6aba4cf7e7b6/pre_commit-3.8.0-py2.py3-none-any.whl", hash = "sha256:9a90a53bf82fdd8778d58085faf8d83df56e40dfe18f45b19446e26bf1b3a63f", size = 204643, upload-time = "2024-07-28T19:58:59.335Z" },
 ]
 
 [[package]]

@@ -5030,8 +5030,8 @@ name = "triton"
 version = "3.3.1"
 source = { registry = "https://pypi.org/simple" }
 dependencies = [
-    { name = "setuptools", version = "79.0.1", source = { registry = "https://pypi.org/simple" }, marker = "python_full_version >= '3.12' and sys_platform == 'linux'" },
-    { name = "setuptools", version = "80.10.2", source = { registry = "https://pypi.org/simple" }, marker = "python_full_version < '3.12' and sys_platform == 'linux'" },
+    { name = "setuptools", version = "79.0.1", source = { registry = "https://pypi.org/simple" }, marker = "python_full_version >= '3.12' and platform_machine != 'aarch64' and sys_platform == 'linux'" },
+    { name = "setuptools", version = "80.10.2", source = { registry = "https://pypi.org/simple" }, marker = "python_full_version < '3.12' and platform_machine != 'aarch64' and sys_platform == 'linux'" },
 ]
 wheels = [
     { url = "https://files.pythonhosted.org/packages/8d/a9/549e51e9b1b2c9b854fd761a1d23df0ba2fbc60bd0c13b489ffa518cfcb7/triton-3.3.1-cp310-cp310-manylinux_2_27_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:b74db445b1c562844d3cfad6e9679c72e93fdfb1a90a24052b03bb5c49d1242e", size = 155600257, upload-time = "2025-05-29T23:39:36.085Z" },

@@ -5378,8 +5378,8 @@ name = "xformers"
 version = "0.0.31"
 source = { registry = "https://pypi.org/simple" }
 dependencies = [
-    { name = "numpy", marker = "sys_platform == 'linux'" },
-    { name = "torch", marker = "sys_platform == 'linux'" },
+    { name = "numpy", marker = "platform_machine != 'aarch64' and sys_platform == 'linux'" },
+    { name = "torch", marker = "platform_machine != 'aarch64' and sys_platform == 'linux'" },
 ]
 sdist = { url = "https://files.pythonhosted.org/packages/33/35/91c172a57681e1c03de5ad1ca654dc87c282279b941052ed04e616ae5bcd/xformers-0.0.31.tar.gz", hash = "sha256:3fccb159c6327c13fc1b08f8b963c2779ca526e2e50755dee9bcc1bac67d20c6", size = 12102740, upload-time = "2025-06-25T15:12:10.241Z" }
 wheels = [
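
Most of the uv.lock churn above adds `platform_machine != 'aarch64'` to existing environment markers on CUDA-related wheels. As an illustrative aside (not part of the lock file), marker strings like these can be evaluated directly with the `packaging` library:

```python
from packaging.markers import Marker

marker = Marker("platform_machine != 'aarch64' and sys_platform == 'linux'")

# Evaluate against explicit environments instead of the current interpreter
print(marker.evaluate({"platform_machine": "x86_64", "sys_platform": "linux"}))   # True
print(marker.evaluate({"platform_machine": "aarch64", "sys_platform": "linux"}))  # False
```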

agilerl-2.4.2.dev0/DQN_LEARNING_ALGORITHM_ANALYSIS.md (removed in 2.4.2.dev1)
@@ -1,309 +0,0 @@

# DQN Learning Algorithm Analysis

## Overview
Detailed analysis of the DQN learning algorithm implementation, focusing on the `learn()`, `update()`, and `soft_update()` methods.

## Algorithm Flow

### 1. `learn()` Method (lines 338-359)

```python
def learn(self, experiences: ExperiencesType) -> float:
    obs = experiences["obs"]
    actions = experiences["action"]
    rewards = experiences["reward"]
    next_obs = experiences["next_obs"]
    dones = experiences["done"]

    obs = self.preprocess_observation(obs)
    next_obs = self.preprocess_observation(next_obs)

    loss = self.update(obs, actions, rewards, next_obs, dones)

    # soft update target network
    self.soft_update()
    return loss.item()
```

**Analysis**: ✅ Looks correct
- Extracts experiences correctly
- Preprocesses observations
- Calls `update()` to compute the loss and backpropagate
- Calls `soft_update()` after each learning step
- Returns a scalar loss value

### 2. `update()` Method (lines 286-336)

```python
def update(self, obs, actions, rewards, next_obs, dones) -> torch.Tensor:
    with torch.no_grad():
        if self.double:  # Double Q-learning
            q_idx = self.actor(next_obs).argmax(dim=1).unsqueeze(1)
            q_target = (
                self.actor_target(next_obs).gather(dim=1, index=q_idx).detach()
            )
        else:
            q_target = self.actor_target(next_obs).max(axis=1)[0].unsqueeze(1)

        # target, if terminal then y_j = rewards
        y_j = rewards + self.gamma * q_target * (1 - dones)

    if actions.ndim == 1:
        actions = actions.unsqueeze(-1)

    # Compute Q-values for actions taken and loss
    q_eval = self.actor(obs).gather(1, actions.long())
    loss: torch.Tensor = self.criterion(q_eval, y_j)

    # zero gradients, perform a backward pass, and update the weights
    self.optimizer.zero_grad()
    if self.accelerator is not None:
        self.accelerator.backward(loss)
    else:
        loss.backward()

    self.optimizer.step()
    return loss.detach()
```

## Issues Found

### ⚠️ Issue 1: Inconsistent `max()` Usage (Line 316)

**Problem**: Uses `axis=1` instead of `dim=1`

```python
q_target = self.actor_target(next_obs).max(axis=1)[0].unsqueeze(1)
```

**Impact**:
- PyTorch's `max()` accepts `axis`, but it is deprecated
- Should use `dim=1` for consistency
- **However**: This shouldn't prevent learning; it just causes a deprecation warning

**Comparison**:
- Line 311: Uses `.argmax(dim=1)` ✅ (correct)
- Line 316: Uses `.max(axis=1)` ❌ (should be `dim=1`)

**Fix**:
```python
q_target = self.actor_target(next_obs).max(dim=1)[0].unsqueeze(1)
```

### ⚠️ Issue 2: Target Network Initialization Method

**Problem**: DQN uses a complex TensorDict-based initialization via `init_hook()`, while other algorithms use a simple `load_state_dict()`

**DQN Approach** (lines 185-203):
```python
def init_hook(self) -> None:
    param_vals: TensorDict = from_module(self.actor).detach()
    target_params: TensorDict = param_vals.clone().lock_()
    try:
        target_params.to_module(self.actor_target)
    except KeyError:
        pass
    finally:
        self.param_vals = param_vals
        self.target_params = target_params
```

**RainbowDQN/CQN Approach**:
```python
self.actor_target.load_state_dict(self.actor.state_dict())
```

**Potential Issues**:
1. `lock_()` creates a locked TensorDict that is detached from the computation graph
2. If `to_module()` fails silently (caught by `except KeyError: pass`), the target network might not be initialized
3. The locked TensorDict might interfere with `soft_update()` parameter updates

**Impact**:
- If `to_module()` fails, the target network starts with random weights instead of copying from the actor
- This would cause incorrect Q-targets and prevent learning
- The silent exception handling makes this hard to detect

**Recommendation**: Add logging or an assertion to verify the target network is initialized:
```python
def init_hook(self) -> None:
    param_vals: TensorDict = from_module(self.actor).detach()
    target_params: TensorDict = param_vals.clone().lock_()
    try:
        target_params.to_module(self.actor_target)
    except KeyError as e:
        # Log the error instead of silently passing
        warnings.warn(f"Failed to initialize target network: {e}. Using load_state_dict fallback.")
        self.actor_target.load_state_dict(self.actor.state_dict())
    finally:
        self.param_vals = param_vals
        self.target_params = target_params
```

### ⚠️ Issue 3: Missing Gradient Clipping

**Problem**: DQN doesn't clip gradients, while RainbowDQN does

**RainbowDQN** (line 442):
```python
clip_grad_norm_(self.actor.parameters(), 10.0)
```

**DQN**: No gradient clipping

**Impact**:
- Could lead to gradient explosion in some cases
- Not necessarily a bug, but could cause instability

**Recommendation**: Consider adding gradient clipping:
```python
from torch.nn.utils import clip_grad_norm_

# After loss.backward(), before optimizer.step()
clip_grad_norm_(self.actor.parameters(), max_norm=10.0)
self.optimizer.step()
```

### ✅ Correct Implementations

1. **Q-Learning Update Formula** (line 319): ✅ Correct
```python
y_j = rewards + self.gamma * q_target * (1 - dones)
```

2. **Double Q-Learning** (lines 310-314): ✅ Correct
   - Uses the actor to select the action and the target network to evaluate it

3. **Loss Computation** (line 326): ✅ Correct
```python
q_eval = self.actor(obs).gather(1, actions.long())
loss = self.criterion(q_eval, y_j)
```

4. **Gradient Flow** (lines 329-335): ✅ Correct
   - Zero gradients
   - Backward pass
   - Optimizer step

5. **Soft Update** (lines 361-368): ✅ Correct formula
```python
target_param.data.copy_(
    self.tau * eval_param.data + (1.0 - self.tau) * target_param.data
)
```

## Potential Learning Issues

### 1. Target Network Not Initialized Properly

**Most Likely Issue**: If `init_hook()` fails silently, the target network has random weights, causing:
- Incorrect Q-targets
- No learning signal
- Random behavior

**How to Verify**:
```python
# After initialization, check whether the target network matches the actor
actor_params = list(agent.actor.parameters())
target_params = list(agent.actor_target.parameters())
for a, t in zip(actor_params, target_params):
    if not torch.allclose(a.data, t.data, atol=1e-6):
        print("WARNING: Target network not initialized correctly!")
```

### 2. Tau Too Small

**Config**: `TAU: 0.001` (line 18)

**Impact**:
- Very slow target network updates
- The target network stays close to its initial values for a long time
- Slower learning convergence

**Typical Values**:
- DQN papers often use `tau=0.01` or `tau=0.005`
- `tau=0.001` means only a 0.1% update per step

**Recommendation**: Try `tau=0.01` or `tau=0.005`

### 3. Learning Rate

**Config**: `LR: 0.001` (line 12)

**Impact**:
- Might be too high for some environments
- Could cause instability

**Typical Values**:
- DQN often uses `lr=1e-4` to `lr=5e-4`
- `lr=0.001` is on the higher side

**Recommendation**: Try `lr=5e-4` or `lr=1e-4`

### 4. Learn Step Frequency

**Config**: `LEARN_STEP: 1` (line 17)

**Impact**:
- Learning happens every step (and with 16 parallel envs, each vectorised step adds 16 transitions)
- Very frequent learning might cause instability
- A typical DQN learns every 4-5 steps

**Recommendation**: Try `LEARN_STEP: 4` or `LEARN_STEP: 5`

## Summary of Critical Issues

1. **🔴 HIGH PRIORITY**: Target network initialization might fail silently
   - Check whether `init_hook()` actually initializes the target network
   - Add a fallback to `load_state_dict()` if the TensorDict method fails

2. **🟡 MEDIUM PRIORITY**: Inconsistent `max()` usage
   - Change `axis=1` to `dim=1` for consistency

3. **🟡 MEDIUM PRIORITY**: Consider adding gradient clipping
   - Prevents gradient explosion

4. **🟡 MEDIUM PRIORITY**: Hyperparameter tuning
   - `tau=0.001` might be too small
   - `lr=0.001` might be too high
   - `learn_step=1` might be too frequent

## Recommended Fixes

### Fix 1: Improve Target Network Initialization

```python
def init_hook(self) -> None:
    """Resets module parameters for the detached and target networks."""
    param_vals: TensorDict = from_module(self.actor).detach()
    target_params: TensorDict = param_vals.clone().lock_()

    try:
        target_params.to_module(self.actor_target)
        # Verify initialization succeeded
        actor_first_param = next(self.actor.parameters()).data
        target_first_param = next(self.actor_target.parameters()).data
        if not torch.allclose(actor_first_param, target_first_param, atol=1e-5):
            raise RuntimeError("Target network initialization verification failed")
    except (KeyError, RuntimeError) as e:
        warnings.warn(f"TensorDict initialization failed ({e}), using load_state_dict fallback")
        self.actor_target.load_state_dict(self.actor.state_dict())
    finally:
        self.param_vals = param_vals
        self.target_params = target_params
```

### Fix 2: Fix max() Usage

```python
# Line 316
q_target = self.actor_target(next_obs).max(dim=1)[0].unsqueeze(1)
```

### Fix 3: Add Gradient Clipping (Optional)

```python
# After line 333 (loss.backward())
from torch.nn.utils import clip_grad_norm_
clip_grad_norm_(self.actor.parameters(), max_norm=10.0)
self.optimizer.step()
```
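
To pull together the hyperparameter advice this removed analysis gives, a small illustrative recap follows; the key names mirror the `TAU`, `LR`, and `LEARN_STEP` entries it quotes from the DQN training config, and the values are the document's suggested starting points rather than benchmarked results:

```python
# Illustrative recap of the suggested starting points discussed above.
# Key names follow the YAML keys quoted in the analysis; the real config may differ.
SUGGESTED_DQN_HP = {
    "TAU": 0.005,     # was 0.001 - faster target-network tracking
    "LR": 5e-4,       # was 0.001 - more conservative optimizer steps
    "LEARN_STEP": 4,  # was 1 - learn every 4 environment steps instead of every step
}
```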
agilerl-2.4.2.dev0/DQN_LEARNING_ANALYSIS.md (removed in 2.4.2.dev1)
@@ -1,168 +0,0 @@

# DQN Learning Analysis

## Summary
Analysis of the DQN algorithm and its parent class `RLAlgorithm` to identify potential reasons why the algorithm might not be learning.

## Critical Finding: `set_training_mode()` Does NOT Affect DQN Training

**Important Discovery**: After analyzing the DQN network architecture, **`set_training_mode()` realistically does NOT affect DQN training** because:

1. **DQN uses `nn.LayerNorm`, NOT `nn.BatchNorm`**:
   - LayerNorm normalizes across features (not batches), so it behaves identically in train and eval modes
   - LayerNorm does NOT use running statistics like BatchNorm does
   - See `agilerl/utils/evolvable_networks.py`, lines 560-562: `nn.LayerNorm` is used

2. **DQN does NOT use Dropout layers**:
   - No dropout layers are present in the QNetwork architecture
   - Dropout is the other main layer affected by train/eval mode

3. **`set_training_mode()` doesn't even call `.train()` or `.eval()`**:
   - The method only sets the `self.training = training` flag
   - It doesn't actually change the networks' training mode

**Conclusion**: The missing `set_training_mode(True)` call, and the fact that it doesn't set the network mode, are **NOT the root cause** of DQN not learning. These are code inconsistencies but don't affect functionality.

## Code Inconsistencies Found (Non-Critical)

### 1. **Missing Training Mode Setting in `train_off_policy.py`**

**Location**: `agilerl/training/train_off_policy.py`, line 238

**Issue**: The training loop does NOT call `agent.set_training_mode(True)` for DQN agents before training begins.

**Comparison**:
- `train_on_policy.py` (line 203): ✅ Calls `agent.set_training_mode(True)`
- `train_multi_agent_off_policy.py` (line 210): ✅ Calls `agent.set_training_mode(True)`
- `train_off_policy.py` (line 238): ❌ **MISSING** `agent.set_training_mode(True)`

**Impact**: **NONE** - This doesn't affect training because DQN networks don't have layers that are affected by train/eval mode.

### 2. **`set_training_mode()` Doesn't Actually Set Network Training Mode**

**Location**: `agilerl/algorithms/core/base.py`, lines 648-654

**Current Implementation**:
```python
def set_training_mode(self, training: bool) -> None:
    """Sets the training mode of the algorithm."""
    self.training = training
```

**Issue**: This method only sets the flag `self.training` but does NOT call `.train()` or `.eval()` on the actual neural networks (`self.actor`, `self.actor_target`).

**Impact**: **NONE for DQN** - Since DQN uses LayerNorm (not BatchNorm) and no Dropout, this doesn't affect training.

**Comparison with RainbowDQN**:
- RainbowDQN explicitly calls `self.actor.train()` and `self.actor_target.train()` in `__init__` (lines 221-222)
- RainbowDQN also calls `self.actor.train(mode=training)` in the `get_action()` method (line 252)
- DQN does NOT set its networks to train mode in `__init__`

**Note**: While this doesn't affect DQN, it's still inconsistent code that could cause issues if BatchNorm or Dropout are added in the future.

## Additional Observations

### 5. **Soft Update Implementation**

**Location**: `agilerl/algorithms/dqn.py`, lines 361-368

The `soft_update()` method looks correct:
```python
def soft_update(self) -> None:
    """Soft updates target network."""
    for eval_param, target_param in zip(
        self.actor.parameters(), self.actor_target.parameters()
    ):
        target_param.data.copy_(
            self.tau * eval_param.data + (1.0 - self.tau) * target_param.data
        )
```

This is called after each learning step (line 358), which is correct.

### 6. **Loss Computation**

**Location**: `agilerl/algorithms/dqn.py`, lines 309-326

The loss computation looks correct:
- Target Q-values are computed with `torch.no_grad()` ✅
- The Q-learning update formula is correct: `y_j = rewards + self.gamma * q_target * (1 - dones)` ✅
- The loss is computed and backpropagated correctly ✅

### 7. **Optimizer Setup**

**Location**: `agilerl/algorithms/dqn.py`, lines 151-156

The optimizer is set up correctly with `OptimizerWrapper` and includes the actor network.

## Recommended Fixes

### Fix 1: Add `set_training_mode(True)` to the Training Loop

**File**: `agilerl/training/train_off_policy.py`

**Change**: Add after line 238:
```python
for agent_idx, agent in enumerate(pop):  # Loop through population
    agent.set_training_mode(True)  # ADD THIS LINE
    state, info = env.reset()  # Reset environment at start of episode
```

### Fix 2: Override `set_training_mode()` in the DQN Class

**File**: `agilerl/algorithms/dqn.py`

**Change**: Add this method after `init_hook()`:
```python
def set_training_mode(self, training: bool) -> None:
    """Sets the training mode of the algorithm and networks."""
    super().set_training_mode(training)
    if training:
        self.actor.train()
        self.actor_target.train()
    else:
        self.actor.eval()
        self.actor_target.eval()
```

### Fix 3: Initialize Networks to Train Mode in `__init__`

**File**: `agilerl/algorithms/dqn.py`

**Change**: Add after line 159 (after `self.criterion = nn.MSELoss()`):
```python
# Put the nets into training mode
self.actor.train()
self.actor_target.train()
```

## What Could Actually Be Causing Learning Issues?

Since `set_training_mode()` doesn't affect DQN training, the real issues might be:

1. **Learning rate too high/low**: Check whether `lr=0.001` in the config is appropriate
2. **Target network update frequency**: `tau=0.001` means very slow updates - is this intentional?
3. **Replay buffer issues**: Is the buffer being populated correctly? Is `learning_delay=0` appropriate?
4. **Epsilon decay**: Check whether exploration is decaying too fast or too slow
5. **Network initialization**: Are weights initialized properly?
6. **Gradient clipping**: Is there any gradient explosion/vanishing?
7. **Loss computation**: Verify the loss is actually being computed and backpropagated correctly
8. **Experience sampling**: Are experiences being sampled correctly from the replay buffer?

## Priority

1. **LOW**: Fix 1 (add `set_training_mode(True)` to the training loop) - for code consistency only
2. **LOW**: Fix 2 (override `set_training_mode()` to actually set the network mode) - for future-proofing only
3. **LOW**: Fix 3 (initialize networks to train mode) - for code consistency only

**These fixes are NOT urgent** - they won't fix the learning issue, but they're good practice for code consistency and future-proofing.

## Testing Recommendations

To debug why DQN isn't learning, check the following (a minimal diagnostic sketch follows this list):
1. **Loss values**: Are they decreasing? Are they NaN or exploding?
2. **Q-values**: Are they reasonable? Are they converging?
3. **Gradients**: Are gradients flowing? Check the gradients on `agent.actor.parameters()`
4. **Replay buffer**: Is it filling up? Are experiences diverse?
5. **Target network**: Is it updating? Check whether `soft_update()` is being called
6. **Exploration**: Is epsilon decaying appropriately? Are actions diverse?
7. **Rewards**: Are rewards being collected? Are they normalized/scaled appropriately?