PyPI - agilerl - Versions diffs - 2.3.4.dev1__tar.gz → 2.3.5.dev0__tar.gz - Mend

agilerl 2.3.4.dev1__tar.gz → 2.3.5.dev0__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (94) hide show

{agilerl-2.3.4.dev1 → agilerl-2.3.5.dev0}/PKG-INFO RENAMED Viewed

@@ -1,6 +1,6 @@
 Metadata-Version: 2.3
 Name: agilerl
-Version: 2.3.4.dev1
+Version: 2.3.5.dev0
 Summary: AgileRL is a deep reinforcement learning library focused on improving RL development through RLOps.
 License: Apache 2.0
 Author: Nick Ustaran-Anderegg
@@ -37,14 +37,14 @@ Requires-Dist: redis (>=4.4.4,<5.0.0)
 Requires-Dist: scipy (>=1.12.0,<2.0.0)
 Requires-Dist: tensordict (>=0.8,<0.9)
 Requires-Dist: termcolor (>=1.1.0,<2.0.0)
-Requires-Dist: torch (==2.5.1)
+Requires-Dist: torch (==2.7.1)
 Requires-Dist: tqdm (>=4.66.4,<5.0.0)
 Requires-Dist: transformers (>=4.48.1,<5.0.0)
 Requires-Dist: ucimlrepo (>=0.0.3,<0.0.4)
+Requires-Dist: vllm (==0.10.0)
 Requires-Dist: wandb (>=0.17.6,<0.18.0)
 Description-Content-Type: text/markdown
-# AgileRL
 <p align="center">
   <img src=https://user-images.githubusercontent.com/47857277/222710068-e09a4e3c-368c-458a-9e01-b68674806887.png height="120">
 </p>

{agilerl-2.3.4.dev1 → agilerl-2.3.5.dev0}/README.md RENAMED Viewed

@@ -1,4 +1,3 @@
-# AgileRL
 <p align="center">
   <img src=https://user-images.githubusercontent.com/47857277/222710068-e09a4e3c-368c-458a-9e01-b68674806887.png height="120">
 </p>

{agilerl-2.3.4.dev1 → agilerl-2.3.5.dev0}/agilerl/algorithms/core/base.py RENAMED Viewed

@@ -30,13 +30,16 @@ from accelerate import Accelerator
 from accelerate.utils import broadcast_object_list
 from accelerate.utils.deepspeed import DeepSpeedOptimizerWrapper
 from deepspeed.checkpoint.utils import clone_tensors_for_torch_save
+from deepspeed.runtime.engine import DeepSpeedEngine
 from gymnasium import spaces
 from numpy.typing import ArrayLike
-from peft import PeftModel
+from peft import PeftModel, set_peft_model_state_dict
+from safetensors.torch import load_file
 from tensordict import TensorDict
 from torch._dynamo import OptimizedModule
 from torch.optim import AdamW
 from torch.optim.lr_scheduler import SequentialLR
+from vllm.distributed.parallel_state import destroy_model_parallel
 from agilerl.algorithms.core.optimizer_wrapper import OptimizerWrapper
 from agilerl.algorithms.core.registry import (
@@ -70,6 +73,7 @@ from agilerl.typing import (
 )
 from agilerl.utils.algo_utils import (
     CosineLRScheduleConfig,
+    check_supported_space,
     chkpt_attribute_to_device,
     clone_llm,
     create_warmup_cosine_scheduler,
@@ -869,7 +873,11 @@ class EvolvableAlgorithm(ABC, metaclass=RegistryMeta):
                 k: v for k, v in network_info["modules"].items() if k.startswith(name)
             }
-            module_cls = net_dict[f"{name}_cls"]
+            module_cls = net_dict.get(f"{name}_cls", None)
+            if module_cls is None:
+                # This allows us to super this method in the LLMAlgorithm class
+                # as we don't want to reinstantiate the network in this class
+                break
             init_dict = net_dict[f"{name}_init_dict"]
             module_dict_cls = net_dict.get(f"{name}_module_dict_cls", None)
@@ -1164,12 +1172,8 @@ class RLAlgorithm(EvolvableAlgorithm, ABC):
         super().__init__(index, hp_config, device, accelerator, torch_compiler, name)
-        assert isinstance(
-            observation_space, spaces.Space
-        ), "Observation space must be an instance of gymnasium.spaces.Space."
-        assert isinstance(
-            action_space, spaces.Space
-        ), "Action space must be an instance of gymnasium.spaces.Space."
+        check_supported_space(observation_space)
+        check_supported_space(action_space)
         self.observation_space = observation_space
         self.action_space = action_space
@@ -1257,12 +1261,7 @@ class MultiAgentRLAlgorithm(EvolvableAlgorithm, ABC):
             assert len(agent_ids) == len(
                 observation_spaces
             ), "Number of agent IDs must match number of observation spaces."
-            assert all(
-                isinstance(_space, spaces.Space) for _space in observation_spaces
-            ), "Observation spaces must be instances of gymnasium.spaces.Space."
-            assert all(
-                isinstance(_space, spaces.Space) for _space in action_spaces
-            ), "Action spaces must be instances of gymnasium.spaces.Space."
             self.possible_observation_spaces = spaces.Dict(
                 {
                     agent_id: space
@@ -1284,6 +1283,11 @@ class MultiAgentRLAlgorithm(EvolvableAlgorithm, ABC):
                 f"Observation spaces must be a list or dictionary of spaces.Space objects. Got {type(observation_spaces)}."
             )
+        for obs_space in self.possible_observation_spaces.values():
+            check_supported_space(obs_space)
+        for action_space in self.possible_action_spaces.values():
+            check_supported_space(action_space)
         self.agent_ids = list(self.possible_observation_spaces.keys())
         self.n_agents = len(self.agent_ids)
         self.placeholder_value = placeholder_value
@@ -1823,8 +1827,8 @@ class LLMAlgorithm(EvolvableAlgorithm, ABC):
                 and self.zero_stage > 2
                 and self.accelerator.is_main_process
             ):
-                warnings.warn(
-                    "Zero stage 3 support is nascent and has not been thoroughly tested. It may be unstable or subject to change. We recommend caution in production environments."
+                raise NotImplementedError(
+                    "DeepSpeed ZeRO Stage 3 is not yet supported in AgileRL. This feature is in development and will be available in a future release."
                 )
         seed = 42
@@ -1848,19 +1852,44 @@ class LLMAlgorithm(EvolvableAlgorithm, ABC):
     # TODO: This could hopefully be abstracted into EvolvableAlgorithm with a decorator to
     # handle _save_distributed_actor if deepspeed is used.
-    def save_checkpoint(self, path: str) -> None:
+    def save_checkpoint(self, path: str, weights_only: bool = False) -> None:
         """
         Override the save_checkpoint method to provide guidance on the correct method to use.
         :param path: Location to save checkpoint at
         :type path: string
+        :param weights_only: If True, only save the weights of the model, defaults to False
+        :type weights_only: bool, optional
         """
+        warnings.warn("weights_only default will be changed to True in the future.")
         if self.accelerator is not None:
-            self._save_distributed_actor(path, tag="save_checkpoint")
-        torch.save(
-            get_checkpoint_dict(self, using_deepspeed=self.accelerator is not None),
-            path + "/attributes.pt",
-            pickle_module=dill,
+            if not weights_only:
+                self._save_distributed_actor(path, tag="save_checkpoint")
+            else:
+                selected_adapters = (
+                    ["actor", "reference"]
+                    if self.use_separate_reference_adapter
+                    else ["actor"]
+                )
+                self.actor.save_pretrained(
+                    save_directory=path,
+                    selected_adapters=selected_adapters,
+                    is_main_process=self.accelerator.is_main_process,
+                )
+        checkpoint_dict = get_checkpoint_dict(
+            self, using_deepspeed=self.accelerator is not None
         )
+        checkpoint_dict["_weights_only"] = weights_only
+        checkpoint_dict.pop("llm", None)
+        checkpoint_dict.pop("tp_group", None)
+        if self.accelerator is None or self.accelerator.is_main_process:
+            torch.save(
+                checkpoint_dict,
+                path + "/attributes.pt",
+                pickle_module=dill,
+            )
     # TODO: This could hopefully be abstracted into EvolvableAlgorithm with a decorator to
     # handle _load_distributed_actor if deepspeed is used.
@@ -1872,8 +1901,27 @@ class LLMAlgorithm(EvolvableAlgorithm, ABC):
         :type path: string
         """
         if self.accelerator is not None:
-            self._load_distributed_actor(path, tag="save_checkpoint")
             checkpoint = torch.load(path + "/attributes.pt", weights_only=False)
+            weights_only = checkpoint.get("_weights_only", False)
+            if weights_only:
+                if self.use_separate_reference_adapter:
+                    self._update_existing_adapter(
+                        self.accelerator,
+                        self.actor,
+                        path,
+                        "reference",
+                    )
+                self._update_existing_adapter(
+                    self.accelerator,
+                    self.actor,
+                    path,
+                    "actor",
+                )
+            else:
+                self._load_distributed_actor(path, tag="save_checkpoint")
             checkpoint["accelerator"] = (
                 Accelerator() if self.accelerator is not None else None
             )
@@ -2040,6 +2088,10 @@ class LLMAlgorithm(EvolvableAlgorithm, ABC):
                 None,
                 None,
             )
+        if self.use_vllm:
+            destroy_model_parallel()
+            del self.llm.llm_engine.model_executor.driver_worker
+            self.llm = None
         gc.collect()
         torch.cuda.empty_cache()
@@ -2106,11 +2158,19 @@ class LLMAlgorithm(EvolvableAlgorithm, ABC):
             original_lr_scheduler = self.lr_scheduler
             clone.lr_scheduler = None
             self.lr_scheduler = None
+            if self.use_vllm:
+                original_llm = self.llm
+                cloned_llm = clone.llm
+                clone.llm = None
+                self.llm = None
             clone = EvolvableAlgorithm.copy_attributes(self, clone)
             clone.accelerator = accelerator
             clone.lr_scheduler = lr_scheduler
             clone.lr_scheduler = cloned_lr_scheduler
             self.lr_scheduler = original_lr_scheduler
+            if self.use_vllm:
+                clone.llm = cloned_llm
+                self.llm = original_llm
             if self.accelerator is None:
                 clone.optimizer.optimizer.load_state_dict(
@@ -2201,3 +2261,49 @@ class LLMAlgorithm(EvolvableAlgorithm, ABC):
             ] = lr
         return accelerator, None
+    def recompile(self) -> None:
+        """Recompiles the algorithm."""
+        raise NotImplementedError(
+            "Recompile method is not available for LLM finetuning algorithms."
+        )
+    @staticmethod
+    def _update_existing_adapter(
+        accelerator: Accelerator,
+        wrapped_model: DeepSpeedEngine,
+        checkpoint_dir: str,
+        adapter_name: str,
+    ) -> None:
+        """
+        Overwrite weights of an existing adapter in-place without creating new parameters.
+        :param accelerator: Accelerator
+        :type accelerator: Accelerator
+        :param wrapped_model: Wrapped model
+        :type wrapped_model: DeepSpeedEngine
+        :param checkpoint_dir: Checkpoint directory
+        :type checkpoint_dir: str
+        :param adapter_name: Adapter name
+        :type adapter_name: str
+        :return: None
+        :rtype: None
+        """
+        base_model = accelerator.unwrap_model(wrapped_model)
+        if hasattr(base_model, "module"):
+            base_model = base_model.module
+        adapter_path = f"{checkpoint_dir}/{adapter_name}/adapter_model.safetensors"
+        adapter_state = load_file(adapter_path, device="cpu")
+        with torch.no_grad():
+            set_peft_model_state_dict(
+                base_model, adapter_state, adapter_name=adapter_name
+            )
+        base_model.set_adapter(adapter_name)
+        # Make reference weights not trainable
+        for name, param in base_model.named_parameters():
+            if "reference" in name:
+                param.requires_grad = False

{agilerl-2.3.4.dev1 → agilerl-2.3.5.dev0}/agilerl/algorithms/dqn_rainbow.py RENAMED Viewed

@@ -275,23 +275,23 @@ class RainbowDQN(RLAlgorithm):
     def _dqn_loss(
         self,
-        states: TorchObsType,
+        obs: TorchObsType,
         actions: torch.Tensor,
         rewards: torch.Tensor,
-        next_states: torch.Tensor,
+        next_obs: torch.Tensor,
         dones: torch.Tensor,
         gamma: float,
     ) -> torch.Tensor:
         """Calculates the DQN loss.
-        :param states: Batch of current states
-        :type states: torch.Tensor
+        :param obs: Batch of current states
+        :type obs: torch.Tensor
         :param actions: Batch of actions taken
         :type actions: torch.Tensor
         :param rewards: Batch of rewards received
         :type rewards: torch.Tensor
-        :param next_states: Batch of next states
-        :type next_states: torch.Tensor
+        :param next_obs: Batch of next states
+        :type next_obs: torch.Tensor
         :param dones: Batch of done flags indicating episode termination
         :type dones: torch.Tensor
         :param gamma: Discount factor
@@ -299,16 +299,15 @@ class RainbowDQN(RLAlgorithm):
         :return: Element-wise loss
         :rtype: torch.Tensor
         """
-        states = self.preprocess_observation(states)
-        next_states = self.preprocess_observation(next_states)
+        obs = self.preprocess_observation(obs)
+        next_obs = self.preprocess_observation(next_obs)
         with torch.no_grad():
+            # Predict next actions from next_obs
+            next_actions = self.actor(next_obs).argmax(1)
-            # Predict next actions from next_states
-            next_actions = self.actor(next_states).argmax(1)
-            # Predict the target q distribution for the same next states
-            target_q_dist = self.actor_target(next_states, q=False)
+            # Predict the target q distribution for the same next obs
+            target_q_dist = self.actor_target(next_obs, q=False)
             # Index the target q_dist to select the distributions corresponding to next_actions
             target_q_dist = target_q_dist[range(self.batch_size), next_actions]
@@ -349,7 +348,7 @@ class RainbowDQN(RLAlgorithm):
             )
         # Calculate the current obs
-        log_q_dist = self.actor(states, q=False, log=True)
+        log_q_dist = self.actor(obs, q=False, log=True)
         log_p = log_q_dist[range(self.batch_size), actions.squeeze().long()]
         # loss
@@ -375,29 +374,29 @@ class RainbowDQN(RLAlgorithm):
         :rtype: Tuple[float, numpy.ndarray, numpy.ndarray]
         """
         n_step = n_experiences is not None
-        states = experiences["obs"]
+        obs = experiences["obs"]
         actions = experiences["action"]
         rewards = experiences["reward"]
-        next_states = experiences["next_obs"]
+        next_obs = experiences["next_obs"]
         dones = experiences["done"]
         if per:
             weights = experiences["weights"]
             idxs = experiences["idxs"]
             if n_step:
-                n_states = n_experiences["obs"]
+                n_obs = n_experiences["obs"]
                 n_actions = n_experiences["action"]
                 n_rewards = n_experiences["reward"]
-                n_next_states = n_experiences["next_obs"]
+                n_next_obs = n_experiences["next_obs"]
                 n_dones = n_experiences["done"]
             if self.combined_reward or not n_step:
                 elementwise_loss = self._dqn_loss(
-                    states, actions, rewards, next_states, dones, self.gamma
+                    obs, actions, rewards, next_obs, dones, self.gamma
                 )
             if n_step:
                 n_gamma = self.gamma**self.n_step
                 n_step_elementwise_loss = self._dqn_loss(
-                    n_states, n_actions, n_rewards, n_next_states, n_dones, n_gamma
+                    n_obs, n_actions, n_rewards, n_next_obs, n_dones, n_gamma
                 )
                 if self.combined_reward:
                     elementwise_loss += n_step_elementwise_loss
@@ -409,10 +408,10 @@ class RainbowDQN(RLAlgorithm):
         else:
             if n_step:
                 idxs = experiences["idxs"]
-                n_states = n_experiences["obs"]
+                n_obs = n_experiences["obs"]
                 n_actions = n_experiences["action"]
                 n_rewards = n_experiences["reward"]
-                n_next_states = n_experiences["next_obs"]
+                n_next_obs = n_experiences["next_obs"]
                 n_dones = n_experiences["done"]
             else:
                 idxs = None
@@ -420,13 +419,13 @@ class RainbowDQN(RLAlgorithm):
             new_priorities = None
             if self.combined_reward or not n_step:
                 elementwise_loss = self._dqn_loss(
-                    states, actions, rewards, next_states, dones, self.gamma
+                    obs, actions, rewards, next_obs, dones, self.gamma
                 )
             if n_step:
                 n_gamma = self.gamma**self.n_step
                 n_step_elementwise_loss = self._dqn_loss(
-                    n_states, n_actions, n_rewards, n_next_states, n_dones, n_gamma
+                    n_obs, n_actions, n_rewards, n_next_obs, n_dones, n_gamma
                 )
                 if self.combined_reward:
                     elementwise_loss += n_step_elementwise_loss
@@ -508,7 +507,9 @@ class RainbowDQN(RLAlgorithm):
                         ) and not finished[idx]:
                             completed_episode_scores[idx] = scores[idx]
                             finished[idx] = 1
                 rewards.append(np.mean(completed_episode_scores))
         mean_fit = np.mean(rewards)
         self.fitness.append(mean_fit)
         return mean_fit

agilerl 2.3.4.dev1__tar.gz → 2.3.5.dev0__tar.gz

agilerl 2.3.4.dev1__tar.gz → 2.3.5.dev0__tar.gz