agilerl 2.4.2.dev0__tar.gz → 2.4.3.dev0__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- {agilerl-2.4.2.dev0 → agilerl-2.4.3.dev0}/.pre-commit-config.yaml +3 -3
- {agilerl-2.4.2.dev0 → agilerl-2.4.3.dev0}/PKG-INFO +2 -2
- {agilerl-2.4.2.dev0 → agilerl-2.4.3.dev0}/agilerl/algorithms/core/base.py +1 -2
- {agilerl-2.4.2.dev0 → agilerl-2.4.3.dev0}/agilerl/algorithms/ippo.py +1 -1
- {agilerl-2.4.2.dev0 → agilerl-2.4.3.dev0}/agilerl/wrappers/agent.py +2 -2
- {agilerl-2.4.2.dev0 → agilerl-2.4.3.dev0}/pyproject.toml +2 -2
- {agilerl-2.4.2.dev0 → agilerl-2.4.3.dev0}/tests/test_modules/test_make_evolvable.py +1 -4
- {agilerl-2.4.2.dev0 → agilerl-2.4.3.dev0}/tests/test_train/test_train.py +18 -6
- {agilerl-2.4.2.dev0 → agilerl-2.4.3.dev0}/uv.lock +21 -21
- agilerl-2.4.2.dev0/DQN_LEARNING_ALGORITHM_ANALYSIS.md +0 -309
- agilerl-2.4.2.dev0/DQN_LEARNING_ANALYSIS.md +0 -168
- agilerl-2.4.2.dev0/GPU_CLEANUP_ANALYSIS.md +0 -541
- agilerl-2.4.2.dev0/find_dqn_commit.sh +0 -82
- {agilerl-2.4.2.dev0 → agilerl-2.4.3.dev0}/.coveragerc +0 -0
- {agilerl-2.4.2.dev0 → agilerl-2.4.3.dev0}/.github/ISSUE_TEMPLATE/bug_report.md +0 -0
- {agilerl-2.4.2.dev0 → agilerl-2.4.3.dev0}/.github/ISSUE_TEMPLATE/feature_request.md +0 -0
- {agilerl-2.4.2.dev0 → agilerl-2.4.3.dev0}/.github/PULL_REQUEST_TEMPLATE/pull_request_template.md +0 -0
- {agilerl-2.4.2.dev0 → agilerl-2.4.3.dev0}/.github/badges/arena-github-badge.svg +0 -0
- {agilerl-2.4.2.dev0 → agilerl-2.4.3.dev0}/.github/workflows/codeql.yml +0 -0
- {agilerl-2.4.2.dev0 → agilerl-2.4.3.dev0}/.github/workflows/python-app.yml +0 -0
- {agilerl-2.4.2.dev0 → agilerl-2.4.3.dev0}/.gitignore +0 -0
- {agilerl-2.4.2.dev0 → agilerl-2.4.3.dev0}/.readthedocs.yaml +0 -0
- {agilerl-2.4.2.dev0 → agilerl-2.4.3.dev0}/CITATION.cff +0 -0
- {agilerl-2.4.2.dev0 → agilerl-2.4.3.dev0}/CODE_OF_CONDUCT.md +0 -0
- {agilerl-2.4.2.dev0 → agilerl-2.4.3.dev0}/CONTRIBUTING.md +0 -0
- {agilerl-2.4.2.dev0 → agilerl-2.4.3.dev0}/LICENSE +0 -0
- {agilerl-2.4.2.dev0 → agilerl-2.4.3.dev0}/README.md +0 -0
- {agilerl-2.4.2.dev0 → agilerl-2.4.3.dev0}/agilerl/__init__.py +0 -0
- {agilerl-2.4.2.dev0 → agilerl-2.4.3.dev0}/agilerl/algorithms/__init__.py +0 -0
- {agilerl-2.4.2.dev0 → agilerl-2.4.3.dev0}/agilerl/algorithms/bc_lm.py +0 -0
- {agilerl-2.4.2.dev0 → agilerl-2.4.3.dev0}/agilerl/algorithms/core/__init__.py +0 -0
- {agilerl-2.4.2.dev0 → agilerl-2.4.3.dev0}/agilerl/algorithms/core/optimizer_wrapper.py +0 -0
- {agilerl-2.4.2.dev0 → agilerl-2.4.3.dev0}/agilerl/algorithms/core/registry.py +0 -0
- {agilerl-2.4.2.dev0 → agilerl-2.4.3.dev0}/agilerl/algorithms/cqn.py +0 -0
- {agilerl-2.4.2.dev0 → agilerl-2.4.3.dev0}/agilerl/algorithms/ddpg.py +0 -0
- {agilerl-2.4.2.dev0 → agilerl-2.4.3.dev0}/agilerl/algorithms/dpo.py +0 -0
- {agilerl-2.4.2.dev0 → agilerl-2.4.3.dev0}/agilerl/algorithms/dqn.py +0 -0
- {agilerl-2.4.2.dev0 → agilerl-2.4.3.dev0}/agilerl/algorithms/dqn_rainbow.py +0 -0
- {agilerl-2.4.2.dev0 → agilerl-2.4.3.dev0}/agilerl/algorithms/grpo.py +0 -0
- {agilerl-2.4.2.dev0 → agilerl-2.4.3.dev0}/agilerl/algorithms/ilql.py +0 -0
- {agilerl-2.4.2.dev0 → agilerl-2.4.3.dev0}/agilerl/algorithms/maddpg.py +0 -0
- {agilerl-2.4.2.dev0 → agilerl-2.4.3.dev0}/agilerl/algorithms/matd3.py +0 -0
- {agilerl-2.4.2.dev0 → agilerl-2.4.3.dev0}/agilerl/algorithms/neural_ts_bandit.py +0 -0
- {agilerl-2.4.2.dev0 → agilerl-2.4.3.dev0}/agilerl/algorithms/neural_ucb_bandit.py +0 -0
- {agilerl-2.4.2.dev0 → agilerl-2.4.3.dev0}/agilerl/algorithms/ppo.py +0 -0
- {agilerl-2.4.2.dev0 → agilerl-2.4.3.dev0}/agilerl/algorithms/td3.py +0 -0
- {agilerl-2.4.2.dev0 → agilerl-2.4.3.dev0}/agilerl/components/__init__.py +0 -0
- {agilerl-2.4.2.dev0 → agilerl-2.4.3.dev0}/agilerl/components/data.py +0 -0
- {agilerl-2.4.2.dev0 → agilerl-2.4.3.dev0}/agilerl/components/multi_agent_replay_buffer.py +0 -0
- {agilerl-2.4.2.dev0 → agilerl-2.4.3.dev0}/agilerl/components/replay_buffer.py +0 -0
- {agilerl-2.4.2.dev0 → agilerl-2.4.3.dev0}/agilerl/components/rollout_buffer.py +0 -0
- {agilerl-2.4.2.dev0 → agilerl-2.4.3.dev0}/agilerl/components/sampler.py +0 -0
- {agilerl-2.4.2.dev0 → agilerl-2.4.3.dev0}/agilerl/components/segment_tree.py +0 -0
- {agilerl-2.4.2.dev0 → agilerl-2.4.3.dev0}/agilerl/data/__init__.py +0 -0
- {agilerl-2.4.2.dev0 → agilerl-2.4.3.dev0}/agilerl/data/language_environment.py +0 -0
- {agilerl-2.4.2.dev0 → agilerl-2.4.3.dev0}/agilerl/data/rl_data.py +0 -0
- {agilerl-2.4.2.dev0 → agilerl-2.4.3.dev0}/agilerl/data/tokenizer.py +0 -0
- {agilerl-2.4.2.dev0 → agilerl-2.4.3.dev0}/agilerl/data/torch_datasets.py +0 -0
- {agilerl-2.4.2.dev0 → agilerl-2.4.3.dev0}/agilerl/hpo/__init__.py +0 -0
- {agilerl-2.4.2.dev0 → agilerl-2.4.3.dev0}/agilerl/hpo/mutation.py +0 -0
- {agilerl-2.4.2.dev0 → agilerl-2.4.3.dev0}/agilerl/hpo/tournament.py +0 -0
- {agilerl-2.4.2.dev0 → agilerl-2.4.3.dev0}/agilerl/modules/__init__.py +0 -0
- {agilerl-2.4.2.dev0 → agilerl-2.4.3.dev0}/agilerl/modules/base.py +0 -0
- {agilerl-2.4.2.dev0 → agilerl-2.4.3.dev0}/agilerl/modules/bert.py +0 -0
- {agilerl-2.4.2.dev0 → agilerl-2.4.3.dev0}/agilerl/modules/cnn.py +0 -0
- {agilerl-2.4.2.dev0 → agilerl-2.4.3.dev0}/agilerl/modules/configs.py +0 -0
- {agilerl-2.4.2.dev0 → agilerl-2.4.3.dev0}/agilerl/modules/custom_components.py +0 -0
- {agilerl-2.4.2.dev0 → agilerl-2.4.3.dev0}/agilerl/modules/dummy.py +0 -0
- {agilerl-2.4.2.dev0 → agilerl-2.4.3.dev0}/agilerl/modules/gpt.py +0 -0
- {agilerl-2.4.2.dev0 → agilerl-2.4.3.dev0}/agilerl/modules/lstm.py +0 -0
- {agilerl-2.4.2.dev0 → agilerl-2.4.3.dev0}/agilerl/modules/mlp.py +0 -0
- {agilerl-2.4.2.dev0 → agilerl-2.4.3.dev0}/agilerl/modules/multi_input.py +0 -0
- {agilerl-2.4.2.dev0 → agilerl-2.4.3.dev0}/agilerl/modules/resnet.py +0 -0
- {agilerl-2.4.2.dev0 → agilerl-2.4.3.dev0}/agilerl/modules/simba.py +0 -0
- {agilerl-2.4.2.dev0 → agilerl-2.4.3.dev0}/agilerl/networks/__init__.py +0 -0
- {agilerl-2.4.2.dev0 → agilerl-2.4.3.dev0}/agilerl/networks/actors.py +0 -0
- {agilerl-2.4.2.dev0 → agilerl-2.4.3.dev0}/agilerl/networks/base.py +0 -0
- {agilerl-2.4.2.dev0 → agilerl-2.4.3.dev0}/agilerl/networks/custom_modules.py +0 -0
- {agilerl-2.4.2.dev0 → agilerl-2.4.3.dev0}/agilerl/networks/distributions.py +0 -0
- {agilerl-2.4.2.dev0 → agilerl-2.4.3.dev0}/agilerl/networks/distributions_experimental.py +0 -0
- {agilerl-2.4.2.dev0 → agilerl-2.4.3.dev0}/agilerl/networks/q_networks.py +0 -0
- {agilerl-2.4.2.dev0 → agilerl-2.4.3.dev0}/agilerl/networks/value_networks.py +0 -0
- {agilerl-2.4.2.dev0 → agilerl-2.4.3.dev0}/agilerl/protocols.py +0 -0
- {agilerl-2.4.2.dev0 → agilerl-2.4.3.dev0}/agilerl/rollouts/__init__.py +0 -0
- {agilerl-2.4.2.dev0 → agilerl-2.4.3.dev0}/agilerl/rollouts/on_policy.py +0 -0
- {agilerl-2.4.2.dev0 → agilerl-2.4.3.dev0}/agilerl/training/__init__.py +0 -0
- {agilerl-2.4.2.dev0 → agilerl-2.4.3.dev0}/agilerl/training/train_bandits.py +0 -0
- {agilerl-2.4.2.dev0 → agilerl-2.4.3.dev0}/agilerl/training/train_llm.py +0 -0
- {agilerl-2.4.2.dev0 → agilerl-2.4.3.dev0}/agilerl/training/train_multi_agent_off_policy.py +0 -0
- {agilerl-2.4.2.dev0 → agilerl-2.4.3.dev0}/agilerl/training/train_multi_agent_on_policy.py +0 -0
- {agilerl-2.4.2.dev0 → agilerl-2.4.3.dev0}/agilerl/training/train_off_policy.py +0 -0
- {agilerl-2.4.2.dev0 → agilerl-2.4.3.dev0}/agilerl/training/train_offline.py +0 -0
- {agilerl-2.4.2.dev0 → agilerl-2.4.3.dev0}/agilerl/training/train_on_policy.py +0 -0
- {agilerl-2.4.2.dev0 → agilerl-2.4.3.dev0}/agilerl/typing.py +0 -0
- {agilerl-2.4.2.dev0 → agilerl-2.4.3.dev0}/agilerl/utils/__init__.py +0 -0
- {agilerl-2.4.2.dev0 → agilerl-2.4.3.dev0}/agilerl/utils/algo_utils.py +0 -0
- {agilerl-2.4.2.dev0 → agilerl-2.4.3.dev0}/agilerl/utils/cache.py +0 -0
- {agilerl-2.4.2.dev0 → agilerl-2.4.3.dev0}/agilerl/utils/evolvable_networks.py +0 -0
- {agilerl-2.4.2.dev0 → agilerl-2.4.3.dev0}/agilerl/utils/ilql_utils.py +0 -0
- {agilerl-2.4.2.dev0 → agilerl-2.4.3.dev0}/agilerl/utils/llm_utils.py +0 -0
- {agilerl-2.4.2.dev0 → agilerl-2.4.3.dev0}/agilerl/utils/log_utils.py +0 -0
- {agilerl-2.4.2.dev0 → agilerl-2.4.3.dev0}/agilerl/utils/minari_utils.py +0 -0
- {agilerl-2.4.2.dev0 → agilerl-2.4.3.dev0}/agilerl/utils/probe_envs.py +0 -0
- {agilerl-2.4.2.dev0 → agilerl-2.4.3.dev0}/agilerl/utils/probe_envs_ma.py +0 -0
- {agilerl-2.4.2.dev0 → agilerl-2.4.3.dev0}/agilerl/utils/sampling_utils.py +0 -0
- {agilerl-2.4.2.dev0 → agilerl-2.4.3.dev0}/agilerl/utils/torch_utils.py +0 -0
- {agilerl-2.4.2.dev0 → agilerl-2.4.3.dev0}/agilerl/utils/utils.py +0 -0
- {agilerl-2.4.2.dev0 → agilerl-2.4.3.dev0}/agilerl/vector/__init__.py +0 -0
- {agilerl-2.4.2.dev0 → agilerl-2.4.3.dev0}/agilerl/vector/pz_async_vec_env.py +0 -0
- {agilerl-2.4.2.dev0 → agilerl-2.4.3.dev0}/agilerl/vector/pz_vec_env.py +0 -0
- {agilerl-2.4.2.dev0 → agilerl-2.4.3.dev0}/agilerl/wrappers/__init__.py +0 -0
- {agilerl-2.4.2.dev0 → agilerl-2.4.3.dev0}/agilerl/wrappers/learning.py +0 -0
- {agilerl-2.4.2.dev0 → agilerl-2.4.3.dev0}/agilerl/wrappers/make_evolvable.py +0 -0
- {agilerl-2.4.2.dev0 → agilerl-2.4.3.dev0}/agilerl/wrappers/pettingzoo_wrappers.py +0 -0
- {agilerl-2.4.2.dev0 → agilerl-2.4.3.dev0}/agilerl/wrappers/utils.py +0 -0
- {agilerl-2.4.2.dev0 → agilerl-2.4.3.dev0}/benchmarking/benchmarking_bandits.py +0 -0
- {agilerl-2.4.2.dev0 → agilerl-2.4.3.dev0}/benchmarking/benchmarking_dpo.py +0 -0
- {agilerl-2.4.2.dev0 → agilerl-2.4.3.dev0}/benchmarking/benchmarking_grpo.py +0 -0
- {agilerl-2.4.2.dev0 → agilerl-2.4.3.dev0}/benchmarking/benchmarking_multi_agent_off_policy.py +0 -0
- {agilerl-2.4.2.dev0 → agilerl-2.4.3.dev0}/benchmarking/benchmarking_multi_agent_on_policy.py +0 -0
- {agilerl-2.4.2.dev0 → agilerl-2.4.3.dev0}/benchmarking/benchmarking_off_policy.py +0 -0
- {agilerl-2.4.2.dev0 → agilerl-2.4.3.dev0}/benchmarking/benchmarking_off_policy_distributed.py +0 -0
- {agilerl-2.4.2.dev0 → agilerl-2.4.3.dev0}/benchmarking/benchmarking_offline.py +0 -0
- {agilerl-2.4.2.dev0 → agilerl-2.4.3.dev0}/benchmarking/benchmarking_offline_distributed.py +0 -0
- {agilerl-2.4.2.dev0 → agilerl-2.4.3.dev0}/benchmarking/benchmarking_on_policy.py +0 -0
- {agilerl-2.4.2.dev0 → agilerl-2.4.3.dev0}/benchmarking/benchmarking_rainbow.py +0 -0
- {agilerl-2.4.2.dev0 → agilerl-2.4.3.dev0}/benchmarking/benchmarking_recurrent.py +0 -0
- {agilerl-2.4.2.dev0 → agilerl-2.4.3.dev0}/benchmarking/benchmarking_resnet.py +0 -0
- {agilerl-2.4.2.dev0 → agilerl-2.4.3.dev0}/benchmarking/benchmarking_simba.py +0 -0
- {agilerl-2.4.2.dev0 → agilerl-2.4.3.dev0}/benchmarking/configs/ds_config.json +0 -0
- {agilerl-2.4.2.dev0 → agilerl-2.4.3.dev0}/benchmarking/make_evolvable_benchmarking.py +0 -0
- {agilerl-2.4.2.dev0 → agilerl-2.4.3.dev0}/benchmarking/networks.py +0 -0
- {agilerl-2.4.2.dev0 → agilerl-2.4.3.dev0}/configs/accelerate/accelerate.yaml +0 -0
- {agilerl-2.4.2.dev0 → agilerl-2.4.3.dev0}/configs/accelerate/grpo_accelerate_config.yaml +0 -0
- {agilerl-2.4.2.dev0 → agilerl-2.4.3.dev0}/configs/training/bandit/neural_ts.yaml +0 -0
- {agilerl-2.4.2.dev0 → agilerl-2.4.3.dev0}/configs/training/bandit/neural_ucb.yaml +0 -0
- {agilerl-2.4.2.dev0 → agilerl-2.4.3.dev0}/configs/training/cqn.yaml +0 -0
- {agilerl-2.4.2.dev0 → agilerl-2.4.3.dev0}/configs/training/ddpg/ddpg.yaml +0 -0
- {agilerl-2.4.2.dev0 → agilerl-2.4.3.dev0}/configs/training/ddpg/ddpg_lstm.yaml +0 -0
- {agilerl-2.4.2.dev0 → agilerl-2.4.3.dev0}/configs/training/ddpg/ddpg_simba.yaml +0 -0
- {agilerl-2.4.2.dev0 → agilerl-2.4.3.dev0}/configs/training/dpo.yaml +0 -0
- {agilerl-2.4.2.dev0 → agilerl-2.4.3.dev0}/configs/training/dqn/dqn.yaml +0 -0
- {agilerl-2.4.2.dev0 → agilerl-2.4.3.dev0}/configs/training/dqn/dqn_lstm.yaml +0 -0
- {agilerl-2.4.2.dev0 → agilerl-2.4.3.dev0}/configs/training/dqn/dqn_rainbow.yaml +0 -0
- {agilerl-2.4.2.dev0 → agilerl-2.4.3.dev0}/configs/training/grpo.yaml +0 -0
- {agilerl-2.4.2.dev0 → agilerl-2.4.3.dev0}/configs/training/multi_agent/ippo.yaml +0 -0
- {agilerl-2.4.2.dev0 → agilerl-2.4.3.dev0}/configs/training/multi_agent/ippo_pong.yaml +0 -0
- {agilerl-2.4.2.dev0 → agilerl-2.4.3.dev0}/configs/training/multi_agent/maddpg.yaml +0 -0
- {agilerl-2.4.2.dev0 → agilerl-2.4.3.dev0}/configs/training/multi_agent/matd3.yaml +0 -0
- {agilerl-2.4.2.dev0 → agilerl-2.4.3.dev0}/configs/training/multi_input.yaml +0 -0
- {agilerl-2.4.2.dev0 → agilerl-2.4.3.dev0}/configs/training/ppo/ppo.yaml +0 -0
- {agilerl-2.4.2.dev0 → agilerl-2.4.3.dev0}/configs/training/ppo/ppo_image.yaml +0 -0
- {agilerl-2.4.2.dev0 → agilerl-2.4.3.dev0}/configs/training/ppo/ppo_recurrent.yaml +0 -0
- {agilerl-2.4.2.dev0 → agilerl-2.4.3.dev0}/configs/training/td3.yaml +0 -0
- {agilerl-2.4.2.dev0 → agilerl-2.4.3.dev0}/data/cartpole/cartpole_random_v1.1.0.h5 +0 -0
- {agilerl-2.4.2.dev0 → agilerl-2.4.3.dev0}/data/cartpole/cartpole_v1.1.0.h5 +0 -0
- {agilerl-2.4.2.dev0 → agilerl-2.4.3.dev0}/data/pendulum/pendulum_random_v1.1.0.h5 +0 -0
- {agilerl-2.4.2.dev0 → agilerl-2.4.3.dev0}/data/pendulum/pendulum_v1.1.0.h5 +0 -0
- {agilerl-2.4.2.dev0 → agilerl-2.4.3.dev0}/demos/demo_bandit.py +0 -0
- {agilerl-2.4.2.dev0 → agilerl-2.4.3.dev0}/demos/demo_custom_network.py +0 -0
- {agilerl-2.4.2.dev0 → agilerl-2.4.3.dev0}/demos/demo_multi_agent.py +0 -0
- {agilerl-2.4.2.dev0 → agilerl-2.4.3.dev0}/demos/demo_off_policy.py +0 -0
- {agilerl-2.4.2.dev0 → agilerl-2.4.3.dev0}/demos/demo_off_policy_distributed.py +0 -0
- {agilerl-2.4.2.dev0 → agilerl-2.4.3.dev0}/demos/demo_offline.py +0 -0
- {agilerl-2.4.2.dev0 → agilerl-2.4.3.dev0}/demos/demo_offline_distributed.py +0 -0
- {agilerl-2.4.2.dev0 → agilerl-2.4.3.dev0}/demos/demo_on_policy.py +0 -0
- {agilerl-2.4.2.dev0 → agilerl-2.4.3.dev0}/demos/demo_on_policy_rnn_cartpole.py +0 -0
- {agilerl-2.4.2.dev0 → agilerl-2.4.3.dev0}/demos/demo_on_policy_rnn_memory.py +0 -0
- {agilerl-2.4.2.dev0 → agilerl-2.4.3.dev0}/demos/demo_on_policy_rnn_minigrid.py +0 -0
- {agilerl-2.4.2.dev0 → agilerl-2.4.3.dev0}/demos/performance_flamegraph_cartpole.py +0 -0
- {agilerl-2.4.2.dev0 → agilerl-2.4.3.dev0}/demos/performance_flamegraph_lunar_lander.py +0 -0
- {agilerl-2.4.2.dev0 → agilerl-2.4.3.dev0}/demos/performance_flamegraph_lunar_lander_rnn.py +0 -0
- {agilerl-2.4.2.dev0 → agilerl-2.4.3.dev0}/demos/performance_flamegraph_rnn_memory.py +0 -0
- {agilerl-2.4.2.dev0 → agilerl-2.4.3.dev0}/dependabot.yml +0 -0
- {agilerl-2.4.2.dev0 → agilerl-2.4.3.dev0}/docs/Makefile +0 -0
- {agilerl-2.4.2.dev0 → agilerl-2.4.3.dev0}/docs/_static/arena-github-badge.svg +0 -0
- {agilerl-2.4.2.dev0 → agilerl-2.4.3.dev0}/docs/_static/css/custom.css +0 -0
- {agilerl-2.4.2.dev0 → agilerl-2.4.3.dev0}/docs/_static/favicon.ico +0 -0
- {agilerl-2.4.2.dev0 → agilerl-2.4.3.dev0}/docs/_static/js/expand_sidebar.js +0 -0
- {agilerl-2.4.2.dev0 → agilerl-2.4.3.dev0}/docs/_static/logo_teal.png +0 -0
- {agilerl-2.4.2.dev0 → agilerl-2.4.3.dev0}/docs/_static/logo_white.png +0 -0
- {agilerl-2.4.2.dev0 → agilerl-2.4.3.dev0}/docs/_static/module.png +0 -0
- {agilerl-2.4.2.dev0 → agilerl-2.4.3.dev0}/docs/_static/network.png +0 -0
- {agilerl-2.4.2.dev0 → agilerl-2.4.3.dev0}/docs/_static/thumbnails/iris-thumbnail.png +0 -0
- {agilerl-2.4.2.dev0 → agilerl-2.4.3.dev0}/docs/_static/thumbnails/pendigits-thumbnail.png +0 -0
- {agilerl-2.4.2.dev0 → agilerl-2.4.3.dev0}/docs/_static/thumbnails/rainbow_performance.png +0 -0
- {agilerl-2.4.2.dev0 → agilerl-2.4.3.dev0}/docs/_static/thumbnails/simba_thumbnail.png +0 -0
- {agilerl-2.4.2.dev0 → agilerl-2.4.3.dev0}/docs/api/algorithms/base.rst +0 -0
- {agilerl-2.4.2.dev0 → agilerl-2.4.3.dev0}/docs/api/algorithms/cql.rst +0 -0
- {agilerl-2.4.2.dev0 → agilerl-2.4.3.dev0}/docs/api/algorithms/ddpg.rst +0 -0
- {agilerl-2.4.2.dev0 → agilerl-2.4.3.dev0}/docs/api/algorithms/dpo.rst +0 -0
- {agilerl-2.4.2.dev0 → agilerl-2.4.3.dev0}/docs/api/algorithms/dqn.rst +0 -0
- {agilerl-2.4.2.dev0 → agilerl-2.4.3.dev0}/docs/api/algorithms/dqn_rainbow.rst +0 -0
- {agilerl-2.4.2.dev0 → agilerl-2.4.3.dev0}/docs/api/algorithms/grpo.rst +0 -0
- {agilerl-2.4.2.dev0 → agilerl-2.4.3.dev0}/docs/api/algorithms/ilql.rst +0 -0
- {agilerl-2.4.2.dev0 → agilerl-2.4.3.dev0}/docs/api/algorithms/index.rst +0 -0
- {agilerl-2.4.2.dev0 → agilerl-2.4.3.dev0}/docs/api/algorithms/ippo.rst +0 -0
- {agilerl-2.4.2.dev0 → agilerl-2.4.3.dev0}/docs/api/algorithms/maddpg.rst +0 -0
- {agilerl-2.4.2.dev0 → agilerl-2.4.3.dev0}/docs/api/algorithms/matd3.rst +0 -0
- {agilerl-2.4.2.dev0 → agilerl-2.4.3.dev0}/docs/api/algorithms/neural_ts.rst +0 -0
- {agilerl-2.4.2.dev0 → agilerl-2.4.3.dev0}/docs/api/algorithms/neural_ucb.rst +0 -0
- {agilerl-2.4.2.dev0 → agilerl-2.4.3.dev0}/docs/api/algorithms/ppo.rst +0 -0
- {agilerl-2.4.2.dev0 → agilerl-2.4.3.dev0}/docs/api/algorithms/registry.rst +0 -0
- {agilerl-2.4.2.dev0 → agilerl-2.4.3.dev0}/docs/api/algorithms/td3.rst +0 -0
- {agilerl-2.4.2.dev0 → agilerl-2.4.3.dev0}/docs/api/algorithms/wrappers.rst +0 -0
- {agilerl-2.4.2.dev0 → agilerl-2.4.3.dev0}/docs/api/components/data.rst +0 -0
- {agilerl-2.4.2.dev0 → agilerl-2.4.3.dev0}/docs/api/components/index.rst +0 -0
- {agilerl-2.4.2.dev0 → agilerl-2.4.3.dev0}/docs/api/components/multi_agent_replay_buffer.rst +0 -0
- {agilerl-2.4.2.dev0 → agilerl-2.4.3.dev0}/docs/api/components/replay_buffer.rst +0 -0
- {agilerl-2.4.2.dev0 → agilerl-2.4.3.dev0}/docs/api/components/rollout_buffer.rst +0 -0
- {agilerl-2.4.2.dev0 → agilerl-2.4.3.dev0}/docs/api/components/sampler.rst +0 -0
- {agilerl-2.4.2.dev0 → agilerl-2.4.3.dev0}/docs/api/components/segment_tree.rst +0 -0
- {agilerl-2.4.2.dev0 → agilerl-2.4.3.dev0}/docs/api/hpo/index.rst +0 -0
- {agilerl-2.4.2.dev0 → agilerl-2.4.3.dev0}/docs/api/hpo/mutation.rst +0 -0
- {agilerl-2.4.2.dev0 → agilerl-2.4.3.dev0}/docs/api/hpo/tournament.rst +0 -0
- {agilerl-2.4.2.dev0 → agilerl-2.4.3.dev0}/docs/api/modules/base.rst +0 -0
- {agilerl-2.4.2.dev0 → agilerl-2.4.3.dev0}/docs/api/modules/bert.rst +0 -0
- {agilerl-2.4.2.dev0 → agilerl-2.4.3.dev0}/docs/api/modules/cnn.rst +0 -0
- {agilerl-2.4.2.dev0 → agilerl-2.4.3.dev0}/docs/api/modules/custom_activation.rst +0 -0
- {agilerl-2.4.2.dev0 → agilerl-2.4.3.dev0}/docs/api/modules/dummy.rst +0 -0
- {agilerl-2.4.2.dev0 → agilerl-2.4.3.dev0}/docs/api/modules/gpt.rst +0 -0
- {agilerl-2.4.2.dev0 → agilerl-2.4.3.dev0}/docs/api/modules/index.rst +0 -0
- {agilerl-2.4.2.dev0 → agilerl-2.4.3.dev0}/docs/api/modules/lstm.rst +0 -0
- {agilerl-2.4.2.dev0 → agilerl-2.4.3.dev0}/docs/api/modules/mlp.rst +0 -0
- {agilerl-2.4.2.dev0 → agilerl-2.4.3.dev0}/docs/api/modules/multi_input.rst +0 -0
- {agilerl-2.4.2.dev0 → agilerl-2.4.3.dev0}/docs/api/modules/resnet.rst +0 -0
- {agilerl-2.4.2.dev0 → agilerl-2.4.3.dev0}/docs/api/modules/simba.rst +0 -0
- {agilerl-2.4.2.dev0 → agilerl-2.4.3.dev0}/docs/api/networks/actors.rst +0 -0
- {agilerl-2.4.2.dev0 → agilerl-2.4.3.dev0}/docs/api/networks/base.rst +0 -0
- {agilerl-2.4.2.dev0 → agilerl-2.4.3.dev0}/docs/api/networks/index.rst +0 -0
- {agilerl-2.4.2.dev0 → agilerl-2.4.3.dev0}/docs/api/networks/q_networks.rst +0 -0
- {agilerl-2.4.2.dev0 → agilerl-2.4.3.dev0}/docs/api/networks/value_networks.rst +0 -0
- {agilerl-2.4.2.dev0 → agilerl-2.4.3.dev0}/docs/api/rollouts/index.rst +0 -0
- {agilerl-2.4.2.dev0 → agilerl-2.4.3.dev0}/docs/api/rollouts/on_policy.rst +0 -0
- {agilerl-2.4.2.dev0 → agilerl-2.4.3.dev0}/docs/api/train.rst +0 -0
- {agilerl-2.4.2.dev0 → agilerl-2.4.3.dev0}/docs/api/utils/algo_utils.rst +0 -0
- {agilerl-2.4.2.dev0 → agilerl-2.4.3.dev0}/docs/api/utils/cache.rst +0 -0
- {agilerl-2.4.2.dev0 → agilerl-2.4.3.dev0}/docs/api/utils/evolvable_networks.rst +0 -0
- {agilerl-2.4.2.dev0 → agilerl-2.4.3.dev0}/docs/api/utils/ilql_utils.rst +0 -0
- {agilerl-2.4.2.dev0 → agilerl-2.4.3.dev0}/docs/api/utils/index.rst +0 -0
- {agilerl-2.4.2.dev0 → agilerl-2.4.3.dev0}/docs/api/utils/llm_utils.rst +0 -0
- {agilerl-2.4.2.dev0 → agilerl-2.4.3.dev0}/docs/api/utils/log_utils.rst +0 -0
- {agilerl-2.4.2.dev0 → agilerl-2.4.3.dev0}/docs/api/utils/minari_utils.rst +0 -0
- {agilerl-2.4.2.dev0 → agilerl-2.4.3.dev0}/docs/api/utils/probe_envs.rst +0 -0
- {agilerl-2.4.2.dev0 → agilerl-2.4.3.dev0}/docs/api/utils/torch_utils.rst +0 -0
- {agilerl-2.4.2.dev0 → agilerl-2.4.3.dev0}/docs/api/utils/utils.rst +0 -0
- {agilerl-2.4.2.dev0 → agilerl-2.4.3.dev0}/docs/api/vector/index.rst +0 -0
- {agilerl-2.4.2.dev0 → agilerl-2.4.3.dev0}/docs/api/vector/petting_zoo_async_vector_env.rst +0 -0
- {agilerl-2.4.2.dev0 → agilerl-2.4.3.dev0}/docs/api/vector/petting_zoo_vector_env.rst +0 -0
- {agilerl-2.4.2.dev0 → agilerl-2.4.3.dev0}/docs/api/wrappers/agent.rst +0 -0
- {agilerl-2.4.2.dev0 → agilerl-2.4.3.dev0}/docs/api/wrappers/index.rst +0 -0
- {agilerl-2.4.2.dev0 → agilerl-2.4.3.dev0}/docs/api/wrappers/learning.rst +0 -0
- {agilerl-2.4.2.dev0 → agilerl-2.4.3.dev0}/docs/api/wrappers/make_evolvable.rst +0 -0
- {agilerl-2.4.2.dev0 → agilerl-2.4.3.dev0}/docs/api/wrappers/pettingzoo.rst +0 -0
- {agilerl-2.4.2.dev0 → agilerl-2.4.3.dev0}/docs/bandits/index.rst +0 -0
- {agilerl-2.4.2.dev0 → agilerl-2.4.3.dev0}/docs/conf.py +0 -0
- {agilerl-2.4.2.dev0 → agilerl-2.4.3.dev0}/docs/custom_algorithms/index.rst +0 -0
- {agilerl-2.4.2.dev0 → agilerl-2.4.3.dev0}/docs/debugging_rl/index.rst +0 -0
- {agilerl-2.4.2.dev0 → agilerl-2.4.3.dev0}/docs/distributed_training/index.rst +0 -0
- {agilerl-2.4.2.dev0 → agilerl-2.4.3.dev0}/docs/evo_hyperparam_opt/index.rst +0 -0
- {agilerl-2.4.2.dev0 → agilerl-2.4.3.dev0}/docs/evolvable_networks/index.rst +0 -0
- {agilerl-2.4.2.dev0 → agilerl-2.4.3.dev0}/docs/get_started/agilerl2changes.rst +0 -0
- {agilerl-2.4.2.dev0 → agilerl-2.4.3.dev0}/docs/get_started/index.rst +0 -0
- {agilerl-2.4.2.dev0 → agilerl-2.4.3.dev0}/docs/index.rst +0 -0
- {agilerl-2.4.2.dev0 → agilerl-2.4.3.dev0}/docs/llm_finetuning/index.rst +0 -0
- {agilerl-2.4.2.dev0 → agilerl-2.4.3.dev0}/docs/make.bat +0 -0
- {agilerl-2.4.2.dev0 → agilerl-2.4.3.dev0}/docs/multi_agent_training/index.rst +0 -0
- {agilerl-2.4.2.dev0 → agilerl-2.4.3.dev0}/docs/off_policy/index.rst +0 -0
- {agilerl-2.4.2.dev0 → agilerl-2.4.3.dev0}/docs/offline_training/index.rst +0 -0
- {agilerl-2.4.2.dev0 → agilerl-2.4.3.dev0}/docs/on_policy/index.rst +0 -0
- {agilerl-2.4.2.dev0 → agilerl-2.4.3.dev0}/docs/pomdp/index.rst +0 -0
- {agilerl-2.4.2.dev0 → agilerl-2.4.3.dev0}/docs/releases/index.rst +0 -0
- {agilerl-2.4.2.dev0 → agilerl-2.4.3.dev0}/docs/requirements.txt +0 -0
- {agilerl-2.4.2.dev0 → agilerl-2.4.3.dev0}/pytest.ini +0 -0
- {agilerl-2.4.2.dev0 → agilerl-2.4.3.dev0}/tests/__init__.py +0 -0
- {agilerl-2.4.2.dev0 → agilerl-2.4.3.dev0}/tests/conftest.py +0 -0
- {agilerl-2.4.2.dev0 → agilerl-2.4.3.dev0}/tests/helper_functions.py +0 -0
- {agilerl-2.4.2.dev0 → agilerl-2.4.3.dev0}/tests/pz_vector_test_utils.py +0 -0
- {agilerl-2.4.2.dev0 → agilerl-2.4.3.dev0}/tests/test_algorithms/__init__.py +0 -0
- {agilerl-2.4.2.dev0 → agilerl-2.4.3.dev0}/tests/test_algorithms/test_bandits/__init__.py +0 -0
- {agilerl-2.4.2.dev0 → agilerl-2.4.3.dev0}/tests/test_algorithms/test_bandits/test_neural_ts.py +0 -0
- {agilerl-2.4.2.dev0 → agilerl-2.4.3.dev0}/tests/test_algorithms/test_bandits/test_neural_ucb.py +0 -0
- {agilerl-2.4.2.dev0 → agilerl-2.4.3.dev0}/tests/test_algorithms/test_base.py +0 -0
- {agilerl-2.4.2.dev0 → agilerl-2.4.3.dev0}/tests/test_algorithms/test_bc_lm.py +0 -0
- {agilerl-2.4.2.dev0 → agilerl-2.4.3.dev0}/tests/test_algorithms/test_llms/__init__.py +0 -0
- {agilerl-2.4.2.dev0 → agilerl-2.4.3.dev0}/tests/test_algorithms/test_llms/conftest.py +0 -0
- {agilerl-2.4.2.dev0 → agilerl-2.4.3.dev0}/tests/test_algorithms/test_llms/test_dpo.py +0 -0
- {agilerl-2.4.2.dev0 → agilerl-2.4.3.dev0}/tests/test_algorithms/test_llms/test_grpo.py +0 -0
- {agilerl-2.4.2.dev0 → agilerl-2.4.3.dev0}/tests/test_algorithms/test_multi_agent/__init__.py +0 -0
- {agilerl-2.4.2.dev0 → agilerl-2.4.3.dev0}/tests/test_algorithms/test_multi_agent/test_ippo.py +0 -0
- {agilerl-2.4.2.dev0 → agilerl-2.4.3.dev0}/tests/test_algorithms/test_multi_agent/test_maddpg.py +0 -0
- {agilerl-2.4.2.dev0 → agilerl-2.4.3.dev0}/tests/test_algorithms/test_multi_agent/test_matd3.py +0 -0
- {agilerl-2.4.2.dev0 → agilerl-2.4.3.dev0}/tests/test_algorithms/test_optimizer_wrapper.py +0 -0
- {agilerl-2.4.2.dev0 → agilerl-2.4.3.dev0}/tests/test_algorithms/test_registry.py +0 -0
- {agilerl-2.4.2.dev0 → agilerl-2.4.3.dev0}/tests/test_algorithms/test_single_agent/__init__.py +0 -0
- {agilerl-2.4.2.dev0 → agilerl-2.4.3.dev0}/tests/test_algorithms/test_single_agent/test_cqn.py +0 -0
- {agilerl-2.4.2.dev0 → agilerl-2.4.3.dev0}/tests/test_algorithms/test_single_agent/test_ddpg.py +0 -0
- {agilerl-2.4.2.dev0 → agilerl-2.4.3.dev0}/tests/test_algorithms/test_single_agent/test_dqn.py +0 -0
- {agilerl-2.4.2.dev0 → agilerl-2.4.3.dev0}/tests/test_algorithms/test_single_agent/test_dqn_rainbow.py +0 -0
- {agilerl-2.4.2.dev0 → agilerl-2.4.3.dev0}/tests/test_algorithms/test_single_agent/test_ilql.py +0 -0
- {agilerl-2.4.2.dev0 → agilerl-2.4.3.dev0}/tests/test_algorithms/test_single_agent/test_ppo.py +0 -0
- {agilerl-2.4.2.dev0 → agilerl-2.4.3.dev0}/tests/test_algorithms/test_single_agent/test_td3.py +0 -0
- {agilerl-2.4.2.dev0 → agilerl-2.4.3.dev0}/tests/test_components/__init__.py +0 -0
- {agilerl-2.4.2.dev0 → agilerl-2.4.3.dev0}/tests/test_components/test_multi_agent_replay_buffer.py +0 -0
- {agilerl-2.4.2.dev0 → agilerl-2.4.3.dev0}/tests/test_components/test_replay_buffer.py +0 -0
- {agilerl-2.4.2.dev0 → agilerl-2.4.3.dev0}/tests/test_components/test_replay_data.py +0 -0
- {agilerl-2.4.2.dev0 → agilerl-2.4.3.dev0}/tests/test_components/test_rollout_buffer.py +0 -0
- {agilerl-2.4.2.dev0 → agilerl-2.4.3.dev0}/tests/test_components/test_sampler.py +0 -0
- {agilerl-2.4.2.dev0 → agilerl-2.4.3.dev0}/tests/test_components/test_segment_tree.py +0 -0
- {agilerl-2.4.2.dev0 → agilerl-2.4.3.dev0}/tests/test_data.py +0 -0
- {agilerl-2.4.2.dev0 → agilerl-2.4.3.dev0}/tests/test_hpo/__init__.py +0 -0
- {agilerl-2.4.2.dev0 → agilerl-2.4.3.dev0}/tests/test_hpo/test_mutation.py +0 -0
- {agilerl-2.4.2.dev0 → agilerl-2.4.3.dev0}/tests/test_hpo/test_tournament.py +0 -0
- {agilerl-2.4.2.dev0 → agilerl-2.4.3.dev0}/tests/test_modules/__init__.py +0 -0
- {agilerl-2.4.2.dev0 → agilerl-2.4.3.dev0}/tests/test_modules/test_base.py +0 -0
- {agilerl-2.4.2.dev0 → agilerl-2.4.3.dev0}/tests/test_modules/test_bert.py +0 -0
- {agilerl-2.4.2.dev0 → agilerl-2.4.3.dev0}/tests/test_modules/test_cnn.py +0 -0
- {agilerl-2.4.2.dev0 → agilerl-2.4.3.dev0}/tests/test_modules/test_custom_activation.py +0 -0
- {agilerl-2.4.2.dev0 → agilerl-2.4.3.dev0}/tests/test_modules/test_dummy.py +0 -0
- {agilerl-2.4.2.dev0 → agilerl-2.4.3.dev0}/tests/test_modules/test_gpt.py +0 -0
- {agilerl-2.4.2.dev0 → agilerl-2.4.3.dev0}/tests/test_modules/test_lstm.py +0 -0
- {agilerl-2.4.2.dev0 → agilerl-2.4.3.dev0}/tests/test_modules/test_mlp.py +0 -0
- {agilerl-2.4.2.dev0 → agilerl-2.4.3.dev0}/tests/test_modules/test_multi_input.py +0 -0
- {agilerl-2.4.2.dev0 → agilerl-2.4.3.dev0}/tests/test_modules/test_resnet.py +0 -0
- {agilerl-2.4.2.dev0 → agilerl-2.4.3.dev0}/tests/test_modules/test_simba.py +0 -0
- {agilerl-2.4.2.dev0 → agilerl-2.4.3.dev0}/tests/test_networks/__init__.py +0 -0
- {agilerl-2.4.2.dev0 → agilerl-2.4.3.dev0}/tests/test_networks/test_actors.py +0 -0
- {agilerl-2.4.2.dev0 → agilerl-2.4.3.dev0}/tests/test_networks/test_base.py +0 -0
- {agilerl-2.4.2.dev0 → agilerl-2.4.3.dev0}/tests/test_networks/test_q_networks.py +0 -0
- {agilerl-2.4.2.dev0 → agilerl-2.4.3.dev0}/tests/test_networks/test_value_functions.py +0 -0
- {agilerl-2.4.2.dev0 → agilerl-2.4.3.dev0}/tests/test_protocols.py +0 -0
- {agilerl-2.4.2.dev0 → agilerl-2.4.3.dev0}/tests/test_train/test_train_llm.py +0 -0
- {agilerl-2.4.2.dev0 → agilerl-2.4.3.dev0}/tests/test_utils/__init__.py +0 -0
- {agilerl-2.4.2.dev0 → agilerl-2.4.3.dev0}/tests/test_utils/test_algo_utils.py +0 -0
- {agilerl-2.4.2.dev0 → agilerl-2.4.3.dev0}/tests/test_utils/test_cache.py +0 -0
- {agilerl-2.4.2.dev0 → agilerl-2.4.3.dev0}/tests/test_utils/test_ilql_utils.py +0 -0
- {agilerl-2.4.2.dev0 → agilerl-2.4.3.dev0}/tests/test_utils/test_llm_utils.py +0 -0
- {agilerl-2.4.2.dev0 → agilerl-2.4.3.dev0}/tests/test_utils/test_log_utils.py +0 -0
- {agilerl-2.4.2.dev0 → agilerl-2.4.3.dev0}/tests/test_utils/test_minari_utils.py +0 -0
- {agilerl-2.4.2.dev0 → agilerl-2.4.3.dev0}/tests/test_utils/test_probe_envs.py +0 -0
- {agilerl-2.4.2.dev0 → agilerl-2.4.3.dev0}/tests/test_utils/test_probe_envs_ma.py +0 -0
- {agilerl-2.4.2.dev0 → agilerl-2.4.3.dev0}/tests/test_utils/test_sampling_utils.py +0 -0
- {agilerl-2.4.2.dev0 → agilerl-2.4.3.dev0}/tests/test_utils/test_torch_utils.py +0 -0
- {agilerl-2.4.2.dev0 → agilerl-2.4.3.dev0}/tests/test_utils/test_utils.py +0 -0
- {agilerl-2.4.2.dev0 → agilerl-2.4.3.dev0}/tests/test_utils/test_utils_evolvable.py +0 -0
- {agilerl-2.4.2.dev0 → agilerl-2.4.3.dev0}/tests/test_vector/test_vector.py +0 -0
- {agilerl-2.4.2.dev0 → agilerl-2.4.3.dev0}/tests/test_wrappers/__init__.py +0 -0
- {agilerl-2.4.2.dev0 → agilerl-2.4.3.dev0}/tests/test_wrappers/test_agent.py +0 -0
- {agilerl-2.4.2.dev0 → agilerl-2.4.3.dev0}/tests/test_wrappers/test_autoreset.py +0 -0
- {agilerl-2.4.2.dev0 → agilerl-2.4.3.dev0}/tests/test_wrappers/test_bandit_env.py +0 -0
- {agilerl-2.4.2.dev0 → agilerl-2.4.3.dev0}/tests/test_wrappers/test_skills.py +0 -0
.pre-commit-config.yaml

```diff
@@ -24,7 +24,7 @@ repos:
       - id: mixed-line-ending
         args: [--fix=lf]
   - repo: https://github.com/psf/black-pre-commit-mirror
-    rev:
+    rev: 26.1.0
     hooks:
       - id: black
   - repo: https://github.com/codespell-project/codespell
@@ -35,7 +35,7 @@ repos:
           - --skip=*.css,*.js,*.map,*.scss,*.svg
           - --ignore-words-list=magent,pres,roate
   - repo: https://github.com/astral-sh/ruff-pre-commit
-    rev: v0.14.
+    rev: v0.14.14
     hooks:
       - id: ruff-check
         args:
@@ -53,6 +53,6 @@ repos:
       - id: yamlfmt
   - repo: https://github.com/astral-sh/uv-pre-commit
     # uv version.
-    rev: 0.9.
+    rev: 0.9.28
     hooks:
       - id: uv-lock
```
PKG-INFO

```diff
@@ -1,6 +1,6 @@
 Metadata-Version: 2.4
 Name: agilerl
-Version: 2.4.2.dev0
+Version: 2.4.3.dev0
 Summary: AgileRL is a deep reinforcement learning library focused on improving RL development through RLOps.
 Author-email: Nick Ustaran-Anderegg <dev@agilerl.com>
 License-Expression: Apache-2.0
@@ -22,7 +22,7 @@ Requires-Dist: omegaconf~=2.3.0
 Requires-Dist: packaging>=20.0
 Requires-Dist: pandas~=2.2.3
 Requires-Dist: pettingzoo~=1.23.1
-Requires-Dist: pre-commit~=3.
+Requires-Dist: pre-commit~=3.8.0
 Requires-Dist: pygame~=2.6.0
 Requires-Dist: pymunk~=6.2.0
 Requires-Dist: redis~=4.4.4
```
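The metadata changes above are just the version bump and the updated pre-commit pin. As a quick reference, a minimal sketch (standard library only, assuming agilerl is installed in the current environment) of how these metadata fields can be read back at runtime:

```python
from importlib.metadata import requires, version

# Reads the Version field of the installed distribution -- the same
# PKG-INFO value changed in the hunk above.
print(version("agilerl"))

# Requires-Dist entries (e.g. "pre-commit~=3.8.0") are exposed the same way.
for requirement in requires("agilerl") or []:
    print(requirement)
```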
agilerl/algorithms/core/base.py

```diff
@@ -2066,8 +2066,7 @@ class LLMAlgorithm(EvolvableAlgorithm, ABC):
         accelerator: Optional[Accelerator] = None,
     ) -> None:
         raise NotImplementedError(
-            "The load class method is not supported for this algorithm class."
-            """
+            "The load class method is not supported for this algorithm class." """
             To load a saved LLM, please load the model as follows, and then re-instantiate the GRPO
             class, using the pre-trained model.

```
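The base.py hunk only reflows the `NotImplementedError` message: the short string literal and the triple-quoted literal that follows it now sit on one line, and Python concatenates adjacent string literals at compile time either way. A small standalone sketch of that mechanism (the message text here is abridged, not the library's exact wording):

```python
# Adjacent string literals -- here a plain string followed by a triple-quoted
# string -- are joined into a single string constant at compile time.
message = "The load class method is not supported for this algorithm class." """
To load a saved LLM, load the pre-trained model yourself and re-instantiate the algorithm with it.
"""

try:
    raise NotImplementedError(message)
except NotImplementedError as err:
    print(err)  # one concatenated message, however the literals were laid out
```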
agilerl/algorithms/ippo.py

```diff
@@ -671,7 +671,7 @@ class IPPO(MultiAgentRLAlgorithm):
         :param action_space: Action space for the agent
         :type action_space: gymnasium.spaces
         """
-
+        states, actions, log_probs, rewards, dones, values, next_state, next_done = (
             experiences
         )

```
agilerl/wrappers/agent.py

```diff
@@ -597,8 +597,8 @@ class AsyncAgentsWrapper(AgentWrapper[MultiAgentRLAlgorithm]):
         :return: Learning information
         :rtype: Any
         """
-
-
+        states, actions, log_probs, rewards, dones, values, next_state, next_done = map(
+            self.stack_experiences, experiences
         )

         # Handle case where we haven't collected a next state for each sub-agent
```
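This hunk and the ippo.py hunk above both unpack the experiences tuple into named fields; the wrapper version additionally maps a stacking function over every field first. A minimal sketch of that pattern with NumPy stand-ins (the `stack` helper below is a placeholder, not agilerl's `stack_experiences`):

```python
import numpy as np

def stack(field):
    # Placeholder stacking function: turn a list of per-step arrays
    # into one array with a leading batch axis.
    return np.stack(field)

# Toy experiences tuple with two fields, each holding 4 time steps.
experiences = ([np.zeros(3)] * 4, [np.ones(2)] * 4)

# map() applies the stacking function to each field before unpacking.
states, actions = map(stack, experiences)
print(states.shape, actions.shape)  # (4, 3) (4, 2)
```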
pyproject.toml

```diff
@@ -1,6 +1,6 @@
 [project]
 name = "agilerl"
-version = "2.4.2.dev0"
+version = "2.4.3.dev0"
 description = "AgileRL is a deep reinforcement learning library focused on improving RL development through RLOps."
 authors = [{ name = "Nick Ustaran-Anderegg", email = "dev@agilerl.com" }]
 license = "Apache-2.0"
@@ -24,7 +24,7 @@ dependencies = [
     "pettingzoo~=1.23.1",
     "jax[cpu]~=0.4.31",
     "packaging>=20.0",
-    "pre-commit~=3.
+    "pre-commit~=3.8.0",
     "pygame~=2.6.0",
     "pymunk~=6.2.0",
     "redis~=4.4.4",
```
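The dependency change pins pre-commit with a compatible-release specifier. A short sketch of what `~=3.8.0` accepts, using the `packaging` library that is already listed as a dependency above:

```python
from packaging.specifiers import SpecifierSet

# "~=3.8.0" is equivalent to ">=3.8.0, ==3.8.*": patch releases only.
spec = SpecifierSet("~=3.8.0")

for candidate in ["3.8.0", "3.8.2", "3.9.0", "4.0.0"]:
    print(candidate, spec.contains(candidate))
# 3.8.0 True / 3.8.2 True / 3.9.0 False / 4.0.0 False
```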
tests/test_modules/test_make_evolvable.py

```diff
@@ -194,9 +194,7 @@ def test_instantiation_with_rainbow():
         network, input_tensor, support=support, rainbow=True
     )
     assert isinstance(evolvable_network, MakeEvolvable)
-    assert (
-        str(evolvable_network)
-        == """MakeEvolvable(
+    assert str(evolvable_network) == """MakeEvolvable(
   (feature_net): Sequential(
     (feature_linear_layer_0): Linear(in_features=3, out_features=128, bias=True)
     (feature_activation_0): ReLU()
@@ -212,7 +210,6 @@ def test_instantiation_with_rainbow():
     (advantage_linear_layer_output): NoisyLinear(in_features=8, out_features=102)
   )
 )"""
-    )
     del network, evolvable_network

```
tests/test_train/test_train.py

```diff
@@ -1498,8 +1498,10 @@ def test_train_off_policy_agent_calls_made_rainbow(
 def test_train_off_policy_save_elite_warning(
     env, population_off_policy, tournament, mutations, memory
 ):
-    warning_string =
+    warning_string = (
+        "'save_elite' set to False but 'elite_path' has been defined, elite will not\
         be saved unless 'save_elite' is set to True."
+    )
     with pytest.warns(match=warning_string):
         pop, pop_fitnesses = train_off_policy(
             env,
@@ -2137,8 +2139,10 @@ def test_train_on_policy_save_elite_warning(
     tournament,
     mutations,
 ):
-    warning_string =
+    warning_string = (
+        "'save_elite' set to False but 'elite_path' has been defined, elite will not\
         be saved unless 'save_elite' is set to True."
+    )
     with pytest.warns(match=warning_string):
         pop, pop_fitnesses = train_on_policy(
             env,
@@ -2703,8 +2707,10 @@ def test_train_multi_agent_on_policy_rgb_vectorized(
 def test_train_multi_save_elite_warning(
     multi_env, population_multi_agent, on_policy, multi_memory, tournament, mutations
 ):
-    warning_string =
+    warning_string = (
+        "'save_elite' set to False but 'elite_path' has been defined, elite will not\
         be saved unless 'save_elite' is set to True."
+    )
     with pytest.warns(match=warning_string):
         pop, pop_fitnesses = train_multi_agent_off_policy(
             multi_env,
@@ -2730,8 +2736,10 @@ def test_train_multi_save_elite_warning(
 def test_train_multi_save_elite_warning_on_policy(
     multi_env, population_multi_agent, on_policy, multi_memory, tournament, mutations
 ):
-    warning_string =
+    warning_string = (
+        "'save_elite' set to False but 'elite_path' has been defined, elite will not\
         be saved unless 'save_elite' is set to True."
+    )
     with pytest.warns(match=warning_string):
         pop, pop_fitnesses = train_multi_agent_on_policy(
             multi_env,
@@ -3567,8 +3575,10 @@ def test_train_offline_save_elite_warning(
     offline_init_hp,
     dummy_h5py_data,
 ):
-    warning_string =
+    warning_string = (
+        "'save_elite' set to False but 'elite_path' has been defined, elite will not\
         be saved unless 'save_elite' is set to True."
+    )
     with pytest.warns(match=warning_string):
         pop, pop_fitness = train_offline(
             env,
@@ -4057,8 +4067,10 @@ def test_train_bandit_agent_calls_made(
 def test_train_bandit_save_elite_warning(
     bandit_env, population_bandit, tournament, mutations, bandit_memory
 ):
-    warning_string =
+    warning_string = (
+        "'save_elite' set to False but 'elite_path' has been defined, elite will not\
         be saved unless 'save_elite' is set to True."
+    )
     with pytest.warns(match=warning_string):
         pop, pop_fitnesses = train_bandits(
             bandit_env,
```
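Every hunk in this file makes the same change: the long warning string is wrapped in parentheses so the assignment reads as a single expression rather than a bare continuation. A self-contained sketch of the pattern these tests exercise (the `maybe_save_elite` function is a hypothetical stand-in for the trainers' behaviour):

```python
import warnings
from typing import Optional

import pytest

def maybe_save_elite(save_elite: bool, elite_path: Optional[str]) -> None:
    # Hypothetical stand-in: warn when an elite path is given but saving is off.
    if not save_elite and elite_path is not None:
        warnings.warn(
            "'save_elite' set to False but 'elite_path' has been defined, "
            "elite will not be saved unless 'save_elite' is set to True."
        )

def test_save_elite_warning():
    # Parentheses let a long string span lines without a trailing backslash.
    warning_string = (
        "'save_elite' set to False but 'elite_path' has been defined"
    )
    # match= is treated as a regular expression searched in the warning message.
    with pytest.warns(UserWarning, match=warning_string):
        maybe_save_elite(save_elite=False, elite_path="elite.pt")
```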
uv.lock

```diff
@@ -53,7 +53,7 @@ wheels = [

 [[package]]
 name = "agilerl"
-version = "2.4.2.dev0"
+version = "2.4.3.dev0"
 source = { editable = "." }
 dependencies = [
     { name = "accelerate" },
@@ -139,7 +139,7 @@ requires-dist = [
     { name = "peft", marker = "extra == 'all'", specifier = "~=0.18.0" },
     { name = "peft", marker = "extra == 'llm'", specifier = "~=0.18.0" },
     { name = "pettingzoo", specifier = "~=1.23.1" },
-    { name = "pre-commit", specifier = "~=3.
+    { name = "pre-commit", specifier = "~=3.8.0" },
     { name = "pygame", specifier = "~=2.6.0" },
     { name = "pymunk", specifier = "~=6.2.0" },
     { name = "redis", specifier = "~=4.4.4" },
@@ -2261,13 +2261,13 @@ name = "mlx-lm"
 version = "0.29.1"
 source = { registry = "https://pypi.org/simple" }
 dependencies = [
-    { name = "jinja2", marker = "
+    { name = "jinja2", marker = "sys_platform == 'darwin'" },
     { name = "mlx", marker = "sys_platform == 'darwin'" },
-    { name = "numpy", marker = "
-    { name = "protobuf", marker = "
-    { name = "pyyaml", marker = "
-    { name = "sentencepiece", marker = "
-    { name = "transformers", marker = "
+    { name = "numpy", marker = "sys_platform == 'darwin'" },
+    { name = "protobuf", marker = "sys_platform == 'darwin'" },
+    { name = "pyyaml", marker = "sys_platform == 'darwin'" },
+    { name = "sentencepiece", marker = "sys_platform == 'darwin'" },
+    { name = "transformers", marker = "sys_platform == 'darwin'" },
 ]
 sdist = { url = "https://files.pythonhosted.org/packages/e3/62/f46e1355256a114808517947f8e83ad6be310c7288c551db0fa678f47923/mlx_lm-0.29.1.tar.gz", hash = "sha256:b99180d8f33d33a077b814e550bfb2d8a59ae003d668fd1f4b3fff62a381d34b", size = 232302, upload-time = "2025-12-16T16:58:27.959Z" }
 wheels = [
@@ -2634,7 +2634,7 @@ name = "nvidia-cudnn-cu12"
 version = "9.5.1.17"
 source = { registry = "https://pypi.org/simple" }
 dependencies = [
-    { name = "nvidia-cublas-cu12", marker = "sys_platform == 'linux'" },
+    { name = "nvidia-cublas-cu12", marker = "platform_machine != 'aarch64' and sys_platform == 'linux'" },
 ]
 wheels = [
     { url = "https://files.pythonhosted.org/packages/2a/78/4535c9c7f859a64781e43c969a3a7e84c54634e319a996d43ef32ce46f83/nvidia_cudnn_cu12-9.5.1.17-py3-none-manylinux_2_28_x86_64.whl", hash = "sha256:30ac3869f6db17d170e0e556dd6cc5eee02647abc31ca856634d5a40f82c15b2", size = 570988386, upload-time = "2024-10-25T19:54:26.39Z" },
@@ -2645,7 +2645,7 @@ name = "nvidia-cufft-cu12"
 version = "11.3.0.4"
 source = { registry = "https://pypi.org/simple" }
 dependencies = [
-    { name = "nvidia-nvjitlink-cu12", marker = "sys_platform == 'linux'" },
+    { name = "nvidia-nvjitlink-cu12", marker = "platform_machine != 'aarch64' and sys_platform == 'linux'" },
 ]
 wheels = [
     { url = "https://files.pythonhosted.org/packages/8f/16/73727675941ab8e6ffd86ca3a4b7b47065edcca7a997920b831f8147c99d/nvidia_cufft_cu12-11.3.0.4-py3-none-manylinux2014_x86_64.manylinux_2_17_x86_64.whl", hash = "sha256:ccba62eb9cef5559abd5e0d54ceed2d9934030f51163df018532142a8ec533e5", size = 200221632, upload-time = "2024-11-20T17:41:32.357Z" },
@@ -2674,9 +2674,9 @@ name = "nvidia-cusolver-cu12"
 version = "11.7.1.2"
 source = { registry = "https://pypi.org/simple" }
 dependencies = [
-    { name = "nvidia-cublas-cu12", marker = "sys_platform == 'linux'" },
-    { name = "nvidia-cusparse-cu12", marker = "sys_platform == 'linux'" },
-    { name = "nvidia-nvjitlink-cu12", marker = "sys_platform == 'linux'" },
+    { name = "nvidia-cublas-cu12", marker = "platform_machine != 'aarch64' and sys_platform == 'linux'" },
+    { name = "nvidia-cusparse-cu12", marker = "platform_machine != 'aarch64' and sys_platform == 'linux'" },
+    { name = "nvidia-nvjitlink-cu12", marker = "platform_machine != 'aarch64' and sys_platform == 'linux'" },
 ]
 wheels = [
     { url = "https://files.pythonhosted.org/packages/f0/6e/c2cf12c9ff8b872e92b4a5740701e51ff17689c4d726fca91875b07f655d/nvidia_cusolver_cu12-11.7.1.2-py3-none-manylinux2014_x86_64.manylinux_2_17_x86_64.whl", hash = "sha256:e9e49843a7707e42022babb9bcfa33c29857a93b88020c4e4434656a655b698c", size = 158229790, upload-time = "2024-11-20T17:43:43.211Z" },
@@ -2688,7 +2688,7 @@ name = "nvidia-cusparse-cu12"
 version = "12.5.4.2"
 source = { registry = "https://pypi.org/simple" }
 dependencies = [
-    { name = "nvidia-nvjitlink-cu12", marker = "sys_platform == 'linux'" },
+    { name = "nvidia-nvjitlink-cu12", marker = "platform_machine != 'aarch64' and sys_platform == 'linux'" },
 ]
 wheels = [
     { url = "https://files.pythonhosted.org/packages/06/1e/b8b7c2f4099a37b96af5c9bb158632ea9e5d9d27d7391d7eb8fc45236674/nvidia_cusparse_cu12-12.5.4.2-py3-none-manylinux2014_x86_64.manylinux_2_17_x86_64.whl", hash = "sha256:7556d9eca156e18184b94947ade0fba5bb47d69cec46bf8660fd2c71a4b48b73", size = 216561367, upload-time = "2024-11-20T17:44:54.824Z" },
@@ -3024,7 +3024,7 @@ wheels = [

 [[package]]
 name = "pre-commit"
-version = "3.
+version = "3.8.0"
 source = { registry = "https://pypi.org/simple" }
 dependencies = [
     { name = "cfgv" },
@@ -3033,9 +3033,9 @@ dependencies = [
     { name = "pyyaml" },
     { name = "virtualenv" },
 ]
-sdist = { url = "https://files.pythonhosted.org/packages/
+sdist = { url = "https://files.pythonhosted.org/packages/64/10/97ee2fa54dff1e9da9badbc5e35d0bbaef0776271ea5907eccf64140f72f/pre_commit-3.8.0.tar.gz", hash = "sha256:8bb6494d4a20423842e198980c9ecf9f96607a07ea29549e180eef9ae80fe7af", size = 177815, upload-time = "2024-07-28T19:59:01.538Z" }
 wheels = [
-    { url = "https://files.pythonhosted.org/packages/
+    { url = "https://files.pythonhosted.org/packages/07/92/caae8c86e94681b42c246f0bca35c059a2f0529e5b92619f6aba4cf7e7b6/pre_commit-3.8.0-py2.py3-none-any.whl", hash = "sha256:9a90a53bf82fdd8778d58085faf8d83df56e40dfe18f45b19446e26bf1b3a63f", size = 204643, upload-time = "2024-07-28T19:58:59.335Z" },
 ]

 [[package]]
@@ -5030,8 +5030,8 @@ name = "triton"
 version = "3.3.1"
 source = { registry = "https://pypi.org/simple" }
 dependencies = [
-    { name = "setuptools", version = "79.0.1", source = { registry = "https://pypi.org/simple" }, marker = "python_full_version >= '3.12' and sys_platform == 'linux'" },
-    { name = "setuptools", version = "80.10.2", source = { registry = "https://pypi.org/simple" }, marker = "python_full_version < '3.12' and sys_platform == 'linux'" },
+    { name = "setuptools", version = "79.0.1", source = { registry = "https://pypi.org/simple" }, marker = "python_full_version >= '3.12' and platform_machine != 'aarch64' and sys_platform == 'linux'" },
+    { name = "setuptools", version = "80.10.2", source = { registry = "https://pypi.org/simple" }, marker = "python_full_version < '3.12' and platform_machine != 'aarch64' and sys_platform == 'linux'" },
 ]
 wheels = [
     { url = "https://files.pythonhosted.org/packages/8d/a9/549e51e9b1b2c9b854fd761a1d23df0ba2fbc60bd0c13b489ffa518cfcb7/triton-3.3.1-cp310-cp310-manylinux_2_27_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:b74db445b1c562844d3cfad6e9679c72e93fdfb1a90a24052b03bb5c49d1242e", size = 155600257, upload-time = "2025-05-29T23:39:36.085Z" },
@@ -5378,8 +5378,8 @@ name = "xformers"
 version = "0.0.31"
 source = { registry = "https://pypi.org/simple" }
 dependencies = [
-    { name = "numpy", marker = "sys_platform == 'linux'" },
-    { name = "torch", marker = "sys_platform == 'linux'" },
+    { name = "numpy", marker = "platform_machine != 'aarch64' and sys_platform == 'linux'" },
+    { name = "torch", marker = "platform_machine != 'aarch64' and sys_platform == 'linux'" },
 ]
 sdist = { url = "https://files.pythonhosted.org/packages/33/35/91c172a57681e1c03de5ad1ca654dc87c282279b941052ed04e616ae5bcd/xformers-0.0.31.tar.gz", hash = "sha256:3fccb159c6327c13fc1b08f8b963c2779ca526e2e50755dee9bcc1bac67d20c6", size = 12102740, upload-time = "2025-06-25T15:12:10.241Z" }
 wheels = [
```
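Most of the lock-file churn is in PEP 508 environment markers: several CUDA-related wheels now additionally require `platform_machine != 'aarch64'` on Linux, and the mlx-lm dependency markers read `sys_platform == 'darwin'`. A small sketch of how such markers evaluate, using the `packaging` library (the environments passed below are illustrative):

```python
from packaging.markers import Marker

linux_x86_only = Marker("platform_machine != 'aarch64' and sys_platform == 'linux'")
darwin_only = Marker("sys_platform == 'darwin'")

# evaluate() overlays the given keys on the running interpreter's environment.
print(linux_x86_only.evaluate({"platform_machine": "x86_64", "sys_platform": "linux"}))   # True
print(linux_x86_only.evaluate({"platform_machine": "aarch64", "sys_platform": "linux"}))  # False
print(darwin_only.evaluate({"sys_platform": "darwin"}))  # True
print(darwin_only.evaluate({"sys_platform": "linux"}))   # False
```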
agilerl-2.4.2.dev0/DQN_LEARNING_ALGORITHM_ANALYSIS.md (entire file removed; +0 -309). The removed file read:

# DQN Learning Algorithm Analysis

## Overview
Detailed analysis of the DQN learning algorithm implementation, focusing on the `learn()`, `update()`, and `soft_update()` methods.

## Algorithm Flow

### 1. `learn()` Method (lines 338-359)

```python
def learn(self, experiences: ExperiencesType) -> float:
    obs = experiences["obs"]
    actions = experiences["action"]
    rewards = experiences["reward"]
    next_obs = experiences["next_obs"]
    dones = experiences["done"]

    obs = self.preprocess_observation(obs)
    next_obs = self.preprocess_observation(next_obs)

    loss = self.update(obs, actions, rewards, next_obs, dones)

    # soft update target network
    self.soft_update()
    return loss.item()
```

**Analysis**: ✅ Looks correct
- Extracts experiences correctly
- Preprocesses observations
- Calls `update()` to compute loss and backpropagate
- Calls `soft_update()` after each learning step
- Returns scalar loss value

### 2. `update()` Method (lines 286-336)

```python
def update(self, obs, actions, rewards, next_obs, dones) -> torch.Tensor:
    with torch.no_grad():
        if self.double:  # Double Q-learning
            q_idx = self.actor(next_obs).argmax(dim=1).unsqueeze(1)
            q_target = (
                self.actor_target(next_obs).gather(dim=1, index=q_idx).detach()
            )
        else:
            q_target = self.actor_target(next_obs).max(axis=1)[0].unsqueeze(1)

        # target, if terminal then y_j = rewards
        y_j = rewards + self.gamma * q_target * (1 - dones)

    if actions.ndim == 1:
        actions = actions.unsqueeze(-1)

    # Compute Q-values for actions taken and loss
    q_eval = self.actor(obs).gather(1, actions.long())
    loss: torch.Tensor = self.criterion(q_eval, y_j)

    # zero gradients, perform a backward pass, and update the weights
    self.optimizer.zero_grad()
    if self.accelerator is not None:
        self.accelerator.backward(loss)
    else:
        loss.backward()

    self.optimizer.step()
    return loss.detach()
```

## Issues Found

### ⚠️ Issue 1: Inconsistent `max()` Usage (Line 316)

**Problem**: Uses `axis=1` instead of `dim=1`

```python
q_target = self.actor_target(next_obs).max(axis=1)[0].unsqueeze(1)
```

**Impact**:
- PyTorch's `max()` accepts `axis` but it's deprecated
- Should use `dim=1` for consistency
- **However**: This shouldn't prevent learning, just causes deprecation warning

**Comparison**:
- Line 311: Uses `.argmax(dim=1)` ✅ (correct)
- Line 316: Uses `.max(axis=1)` ❌ (should be `dim=1`)

**Fix**:
```python
q_target = self.actor_target(next_obs).max(dim=1)[0].unsqueeze(1)
```

### ⚠️ Issue 2: Target Network Initialization Method

**Problem**: DQN uses a complex TensorDict-based initialization via `init_hook()`, while other algorithms use simple `load_state_dict()`

**DQN Approach** (lines 185-203):
```python
def init_hook(self) -> None:
    param_vals: TensorDict = from_module(self.actor).detach()
    target_params: TensorDict = param_vals.clone().lock_()
    try:
        target_params.to_module(self.actor_target)
    except KeyError:
        pass
    finally:
        self.param_vals = param_vals
        self.target_params = target_params
```

**RainbowDQN/CQN Approach**:
```python
self.actor_target.load_state_dict(self.actor.state_dict())
```

**Potential Issues**:
1. The `lock_()` creates a locked TensorDict that's detached from computation graph
2. If `to_module()` fails silently (caught by `except KeyError: pass`), target network might not be initialized
3. The locked TensorDict might interfere with `soft_update()` parameter updates

**Impact**:
- If `to_module()` fails, target network starts with random weights instead of copying from actor
- This would cause incorrect Q-targets and prevent learning
- The silent exception handling makes this hard to detect

**Recommendation**: Add logging or assertion to verify target network is initialized:
```python
def init_hook(self) -> None:
    param_vals: TensorDict = from_module(self.actor).detach()
    target_params: TensorDict = param_vals.clone().lock_()
    try:
        target_params.to_module(self.actor_target)
    except KeyError as e:
        # Log the error instead of silently passing
        warnings.warn(f"Failed to initialize target network: {e}. Using load_state_dict fallback.")
        self.actor_target.load_state_dict(self.actor.state_dict())
    finally:
        self.param_vals = param_vals
        self.target_params = target_params
```

### ⚠️ Issue 3: Missing Gradient Clipping

**Problem**: DQN doesn't clip gradients, while RainbowDQN does

**RainbowDQN** (line 442):
```python
clip_grad_norm_(self.actor.parameters(), 10.0)
```

**DQN**: No gradient clipping

**Impact**:
- Could lead to gradient explosion in some cases
- Not necessarily a bug, but could cause instability

**Recommendation**: Consider adding gradient clipping:
```python
from torch.nn.utils import clip_grad_norm_

# After loss.backward(), before optimizer.step()
clip_grad_norm_(self.actor.parameters(), max_norm=10.0)
self.optimizer.step()
```

### ✅ Correct Implementations

1. **Q-Learning Update Formula** (line 319): ✅ Correct
```python
y_j = rewards + self.gamma * q_target * (1 - dones)
```

2. **Double Q-Learning** (lines 310-314): ✅ Correct
   - Uses actor to select action, target to evaluate

3. **Loss Computation** (line 326): ✅ Correct
```python
q_eval = self.actor(obs).gather(1, actions.long())
loss = self.criterion(q_eval, y_j)
```

4. **Gradient Flow** (lines 329-335): ✅ Correct
   - Zero gradients
   - Backward pass
   - Optimizer step

5. **Soft Update** (lines 361-368): ✅ Correct formula
```python
target_param.data.copy_(
    self.tau * eval_param.data + (1.0 - self.tau) * target_param.data
)
```

## Potential Learning Issues

### 1. Target Network Not Initialized Properly

**Most Likely Issue**: If `init_hook()` fails silently, target network has random weights, causing:
- Incorrect Q-targets
- No learning signal
- Random behavior

**How to Verify**:
```python
# After initialization, check if target network matches actor
actor_params = list(agent.actor.parameters())
target_params = list(agent.actor_target.parameters())
for a, t in zip(actor_params, target_params):
    if not torch.allclose(a.data, t.data, atol=1e-6):
        print("WARNING: Target network not initialized correctly!")
```

### 2. Tau Too Small

**Config**: `TAU: 0.001` (line 18)

**Impact**:
- Very slow target network updates
- Target network stays close to initial values for a long time
- Slower learning convergence

**Typical Values**:
- DQN papers often use `tau=0.01` or `tau=0.005`
- `tau=0.001` means only 0.1% update per step

**Recommendation**: Try `tau=0.01` or `tau=0.005`

### 3. Learning Rate

**Config**: `LR: 0.001` (line 12)

**Impact**:
- Might be too high for some environments
- Could cause instability

**Typical Values**:
- DQN often uses `lr=1e-4` to `lr=5e-4`
- `lr=0.001` is on the higher side

**Recommendation**: Try `lr=5e-4` or `lr=1e-4`

### 4. Learn Step Frequency

**Config**: `LEARN_STEP: 1` (line 17)

**Impact**:
- Learning every step (with 16 parallel envs, that's 16 steps per environment step)
- Very frequent learning might cause instability
- Typical DQN learns every 4-5 steps

**Recommendation**: Try `LEARN_STEP: 4` or `LEARN_STEP: 5`

## Summary of Critical Issues

1. **🔴 HIGH PRIORITY**: Target network initialization might fail silently
   - Check if `init_hook()` actually initializes target network
   - Add fallback to `load_state_dict()` if TensorDict method fails

2. **🟡 MEDIUM PRIORITY**: Inconsistent `max()` usage
   - Change `axis=1` to `dim=1` for consistency

3. **🟡 MEDIUM PRIORITY**: Consider adding gradient clipping
   - Prevents gradient explosion

4. **🟡 MEDIUM PRIORITY**: Hyperparameter tuning
   - `tau=0.001` might be too small
   - `lr=0.001` might be too high
   - `learn_step=1` might be too frequent

## Recommended Fixes

### Fix 1: Improve Target Network Initialization

```python
def init_hook(self) -> None:
    """Resets module parameters for the detached and target networks."""
    param_vals: TensorDict = from_module(self.actor).detach()
    target_params: TensorDict = param_vals.clone().lock_()

    try:
        target_params.to_module(self.actor_target)
        # Verify initialization succeeded
        actor_first_param = next(self.actor.parameters()).data
        target_first_param = next(self.actor_target.parameters()).data
        if not torch.allclose(actor_first_param, target_first_param, atol=1e-5):
            raise RuntimeError("Target network initialization verification failed")
    except (KeyError, RuntimeError) as e:
        warnings.warn(f"TensorDict initialization failed ({e}), using load_state_dict fallback")
        self.actor_target.load_state_dict(self.actor.state_dict())
    finally:
        self.param_vals = param_vals
        self.target_params = target_params
```

### Fix 2: Fix max() Usage

```python
# Line 316
q_target = self.actor_target(next_obs).max(dim=1)[0].unsqueeze(1)
```

### Fix 3: Add Gradient Clipping (Optional)

```python
# After line 333 (loss.backward())
from torch.nn.utils import clip_grad_norm_
clip_grad_norm_(self.actor.parameters(), max_norm=10.0)
self.optimizer.step()
```