agilerl 2.4.0.dev0__tar.gz → 2.4.1__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- {agilerl-2.4.0.dev0 → agilerl-2.4.1}/PKG-INFO +23 -10
- {agilerl-2.4.0.dev0 → agilerl-2.4.1}/README.md +12 -1
- agilerl-2.4.1/agilerl/__init__.py +18 -0
- {agilerl-2.4.0.dev0 → agilerl-2.4.1}/agilerl/algorithms/core/base.py +125 -61
- {agilerl-2.4.0.dev0 → agilerl-2.4.1}/agilerl/algorithms/core/optimizer_wrapper.py +11 -3
- {agilerl-2.4.0.dev0 → agilerl-2.4.1}/agilerl/algorithms/core/registry.py +1 -1
- {agilerl-2.4.0.dev0 → agilerl-2.4.1}/agilerl/algorithms/dpo.py +60 -10
- {agilerl-2.4.0.dev0 → agilerl-2.4.1}/agilerl/algorithms/grpo.py +34 -27
- {agilerl-2.4.0.dev0 → agilerl-2.4.1}/agilerl/algorithms/ilql.py +14 -0
- {agilerl-2.4.0.dev0 → agilerl-2.4.1}/agilerl/protocols.py +131 -0
- {agilerl-2.4.0.dev0 → agilerl-2.4.1}/agilerl/training/train_llm.py +2 -2
- {agilerl-2.4.0.dev0 → agilerl-2.4.1}/agilerl/utils/algo_utils.py +59 -5
- {agilerl-2.4.0.dev0 → agilerl-2.4.1}/agilerl/utils/llm_utils.py +94 -80
- {agilerl-2.4.0.dev0 → agilerl-2.4.1}/agilerl/utils/utils.py +23 -35
- {agilerl-2.4.0.dev0 → agilerl-2.4.1}/pyproject.toml +25 -8
- agilerl-2.4.0.dev0/agilerl/wrappers/__init__.py +0 -0
- {agilerl-2.4.0.dev0 → agilerl-2.4.1}/LICENSE +0 -0
- {agilerl-2.4.0.dev0 → agilerl-2.4.1}/agilerl/algorithms/__init__.py +0 -0
- {agilerl-2.4.0.dev0 → agilerl-2.4.1}/agilerl/algorithms/bc_lm.py +0 -0
- {agilerl-2.4.0.dev0 → agilerl-2.4.1}/agilerl/algorithms/core/__init__.py +0 -0
- {agilerl-2.4.0.dev0 → agilerl-2.4.1}/agilerl/algorithms/cqn.py +0 -0
- {agilerl-2.4.0.dev0 → agilerl-2.4.1}/agilerl/algorithms/ddpg.py +0 -0
- {agilerl-2.4.0.dev0 → agilerl-2.4.1}/agilerl/algorithms/dqn.py +0 -0
- {agilerl-2.4.0.dev0 → agilerl-2.4.1}/agilerl/algorithms/dqn_rainbow.py +0 -0
- {agilerl-2.4.0.dev0 → agilerl-2.4.1}/agilerl/algorithms/ippo.py +0 -0
- {agilerl-2.4.0.dev0 → agilerl-2.4.1}/agilerl/algorithms/maddpg.py +0 -0
- {agilerl-2.4.0.dev0 → agilerl-2.4.1}/agilerl/algorithms/matd3.py +0 -0
- {agilerl-2.4.0.dev0 → agilerl-2.4.1}/agilerl/algorithms/neural_ts_bandit.py +0 -0
- {agilerl-2.4.0.dev0 → agilerl-2.4.1}/agilerl/algorithms/neural_ucb_bandit.py +0 -0
- {agilerl-2.4.0.dev0 → agilerl-2.4.1}/agilerl/algorithms/ppo.py +0 -0
- {agilerl-2.4.0.dev0 → agilerl-2.4.1}/agilerl/algorithms/td3.py +0 -0
- {agilerl-2.4.0.dev0 → agilerl-2.4.1}/agilerl/components/__init__.py +0 -0
- {agilerl-2.4.0.dev0 → agilerl-2.4.1}/agilerl/components/data.py +0 -0
- {agilerl-2.4.0.dev0 → agilerl-2.4.1}/agilerl/components/multi_agent_replay_buffer.py +0 -0
- {agilerl-2.4.0.dev0 → agilerl-2.4.1}/agilerl/components/replay_buffer.py +0 -0
- {agilerl-2.4.0.dev0 → agilerl-2.4.1}/agilerl/components/rollout_buffer.py +0 -0
- {agilerl-2.4.0.dev0 → agilerl-2.4.1}/agilerl/components/sampler.py +0 -0
- {agilerl-2.4.0.dev0 → agilerl-2.4.1}/agilerl/components/segment_tree.py +0 -0
- {agilerl-2.4.0.dev0/agilerl → agilerl-2.4.1/agilerl/data}/__init__.py +0 -0
- {agilerl-2.4.0.dev0 → agilerl-2.4.1}/agilerl/data/language_environment.py +0 -0
- {agilerl-2.4.0.dev0 → agilerl-2.4.1}/agilerl/data/rl_data.py +0 -0
- {agilerl-2.4.0.dev0 → agilerl-2.4.1}/agilerl/data/tokenizer.py +0 -0
- {agilerl-2.4.0.dev0 → agilerl-2.4.1}/agilerl/data/torch_datasets.py +0 -0
- {agilerl-2.4.0.dev0/agilerl/data → agilerl-2.4.1/agilerl/hpo}/__init__.py +0 -0
- {agilerl-2.4.0.dev0 → agilerl-2.4.1}/agilerl/hpo/mutation.py +0 -0
- {agilerl-2.4.0.dev0 → agilerl-2.4.1}/agilerl/hpo/tournament.py +0 -0
- {agilerl-2.4.0.dev0 → agilerl-2.4.1}/agilerl/modules/__init__.py +0 -0
- {agilerl-2.4.0.dev0 → agilerl-2.4.1}/agilerl/modules/base.py +0 -0
- {agilerl-2.4.0.dev0 → agilerl-2.4.1}/agilerl/modules/bert.py +0 -0
- {agilerl-2.4.0.dev0 → agilerl-2.4.1}/agilerl/modules/cnn.py +0 -0
- {agilerl-2.4.0.dev0 → agilerl-2.4.1}/agilerl/modules/configs.py +0 -0
- {agilerl-2.4.0.dev0 → agilerl-2.4.1}/agilerl/modules/custom_components.py +0 -0
- {agilerl-2.4.0.dev0 → agilerl-2.4.1}/agilerl/modules/dummy.py +0 -0
- {agilerl-2.4.0.dev0 → agilerl-2.4.1}/agilerl/modules/gpt.py +0 -0
- {agilerl-2.4.0.dev0 → agilerl-2.4.1}/agilerl/modules/lstm.py +0 -0
- {agilerl-2.4.0.dev0 → agilerl-2.4.1}/agilerl/modules/mlp.py +0 -0
- {agilerl-2.4.0.dev0 → agilerl-2.4.1}/agilerl/modules/multi_input.py +0 -0
- {agilerl-2.4.0.dev0 → agilerl-2.4.1}/agilerl/modules/resnet.py +0 -0
- {agilerl-2.4.0.dev0 → agilerl-2.4.1}/agilerl/modules/simba.py +0 -0
- {agilerl-2.4.0.dev0 → agilerl-2.4.1}/agilerl/networks/__init__.py +0 -0
- {agilerl-2.4.0.dev0 → agilerl-2.4.1}/agilerl/networks/actors.py +0 -0
- {agilerl-2.4.0.dev0 → agilerl-2.4.1}/agilerl/networks/base.py +0 -0
- {agilerl-2.4.0.dev0 → agilerl-2.4.1}/agilerl/networks/custom_modules.py +0 -0
- {agilerl-2.4.0.dev0 → agilerl-2.4.1}/agilerl/networks/distributions.py +0 -0
- {agilerl-2.4.0.dev0 → agilerl-2.4.1}/agilerl/networks/distributions_experimental.py +0 -0
- {agilerl-2.4.0.dev0 → agilerl-2.4.1}/agilerl/networks/q_networks.py +0 -0
- {agilerl-2.4.0.dev0 → agilerl-2.4.1}/agilerl/networks/value_networks.py +0 -0
- {agilerl-2.4.0.dev0 → agilerl-2.4.1}/agilerl/rollouts/__init__.py +0 -0
- {agilerl-2.4.0.dev0 → agilerl-2.4.1}/agilerl/rollouts/on_policy.py +0 -0
- {agilerl-2.4.0.dev0/agilerl/hpo → agilerl-2.4.1/agilerl/training}/__init__.py +0 -0
- {agilerl-2.4.0.dev0 → agilerl-2.4.1}/agilerl/training/train_bandits.py +0 -0
- {agilerl-2.4.0.dev0 → agilerl-2.4.1}/agilerl/training/train_multi_agent_off_policy.py +0 -0
- {agilerl-2.4.0.dev0 → agilerl-2.4.1}/agilerl/training/train_multi_agent_on_policy.py +0 -0
- {agilerl-2.4.0.dev0 → agilerl-2.4.1}/agilerl/training/train_off_policy.py +0 -0
- {agilerl-2.4.0.dev0 → agilerl-2.4.1}/agilerl/training/train_offline.py +0 -0
- {agilerl-2.4.0.dev0 → agilerl-2.4.1}/agilerl/training/train_on_policy.py +0 -0
- {agilerl-2.4.0.dev0 → agilerl-2.4.1}/agilerl/typing.py +0 -0
- {agilerl-2.4.0.dev0/agilerl/training → agilerl-2.4.1/agilerl/utils}/__init__.py +0 -0
- {agilerl-2.4.0.dev0 → agilerl-2.4.1}/agilerl/utils/cache.py +0 -0
- {agilerl-2.4.0.dev0 → agilerl-2.4.1}/agilerl/utils/evolvable_networks.py +0 -0
- {agilerl-2.4.0.dev0 → agilerl-2.4.1}/agilerl/utils/ilql_utils.py +0 -0
- {agilerl-2.4.0.dev0 → agilerl-2.4.1}/agilerl/utils/log_utils.py +0 -0
- {agilerl-2.4.0.dev0 → agilerl-2.4.1}/agilerl/utils/minari_utils.py +0 -0
- {agilerl-2.4.0.dev0 → agilerl-2.4.1}/agilerl/utils/probe_envs.py +0 -0
- {agilerl-2.4.0.dev0 → agilerl-2.4.1}/agilerl/utils/probe_envs_ma.py +0 -0
- {agilerl-2.4.0.dev0 → agilerl-2.4.1}/agilerl/utils/sampling_utils.py +0 -0
- {agilerl-2.4.0.dev0 → agilerl-2.4.1}/agilerl/utils/torch_utils.py +0 -0
- {agilerl-2.4.0.dev0/agilerl/utils → agilerl-2.4.1/agilerl/vector}/__init__.py +0 -0
- {agilerl-2.4.0.dev0 → agilerl-2.4.1}/agilerl/vector/pz_async_vec_env.py +0 -0
- {agilerl-2.4.0.dev0 → agilerl-2.4.1}/agilerl/vector/pz_vec_env.py +0 -0
- {agilerl-2.4.0.dev0/agilerl/vector → agilerl-2.4.1/agilerl/wrappers}/__init__.py +0 -0
- {agilerl-2.4.0.dev0 → agilerl-2.4.1}/agilerl/wrappers/agent.py +0 -0
- {agilerl-2.4.0.dev0 → agilerl-2.4.1}/agilerl/wrappers/learning.py +0 -0
- {agilerl-2.4.0.dev0 → agilerl-2.4.1}/agilerl/wrappers/make_evolvable.py +0 -0
- {agilerl-2.4.0.dev0 → agilerl-2.4.1}/agilerl/wrappers/pettingzoo_wrappers.py +0 -0
- {agilerl-2.4.0.dev0 → agilerl-2.4.1}/agilerl/wrappers/utils.py +0 -0
{agilerl-2.4.0.dev0 → agilerl-2.4.1}/PKG-INFO

````diff
@@ -1,22 +1,23 @@
 Metadata-Version: 2.4
 Name: agilerl
-Version: 2.4.0.dev0
+Version: 2.4.1
 Summary: AgileRL is a deep reinforcement learning library focused on improving RL development through RLOps.
 License: Apache 2.0
 License-File: LICENSE
 Author: Nick Ustaran-Anderegg
 Author-email: dev@agilerl.com
-Requires-Python: >=3.10,<
+Requires-Python: >=3.10,<3.13
 Classifier: License :: Other/Proprietary License
 Classifier: Programming Language :: Python :: 3
 Classifier: Programming Language :: Python :: 3.10
 Classifier: Programming Language :: Python :: 3.11
 Classifier: Programming Language :: Python :: 3.12
-
-
+Provides-Extra: all
+Provides-Extra: llm
 Requires-Dist: SuperSuit (>=3.9.0,<4.0.0)
 Requires-Dist: accelerate (>=1.7.0,<2.0.0)
-Requires-Dist:
+Requires-Dist: datasets (==4.4.1) ; extra == "llm" or extra == "all"
+Requires-Dist: deepspeed (>=0.17.1,<0.18.0) ; extra == "llm" or extra == "all"
 Requires-Dist: dill (>=0.3.7,<0.4.0)
 Requires-Dist: fastrand (>=1.3.0,<2.0.0)
 Requires-Dist: flatten_dict (>=0.4.2,<0.5.0)
@@ -26,11 +27,12 @@ Requires-Dist: h5py (>=3.8.0,<4.0.0)
 Requires-Dist: hydra-core (>=1.3.2,<2.0.0)
 Requires-Dist: jax[cpu] (>=0.4.31,<0.5.0)
 Requires-Dist: matplotlib (>=3.9.4,<3.10.0)
-Requires-Dist: minari (
+Requires-Dist: minari[all] (==0.5.2)
 Requires-Dist: numpy (>=1.26.4,<2.0.0)
 Requires-Dist: omegaconf (>=2.3.0,<3.0.0)
+Requires-Dist: packaging (>=20.0)
 Requires-Dist: pandas (>=2.2.3,<3.0.0)
-Requires-Dist: peft (>=0.
+Requires-Dist: peft (>=0.18.0,<0.19.0) ; extra == "llm" or extra == "all"
 Requires-Dist: pettingzoo (>=1.23.1,<2.0.0)
 Requires-Dist: pre-commit (>=3.4.0,<4.0.0)
 Requires-Dist: pygame (>=2.6.0,<3.0.0)
@@ -41,9 +43,9 @@ Requires-Dist: tensordict (>=0.8,<0.9)
 Requires-Dist: termcolor (>=1.1.0,<2.0.0)
 Requires-Dist: torch (==2.7.1)
 Requires-Dist: tqdm (>=4.66.4,<5.0.0)
-Requires-Dist: transformers (>=4.
+Requires-Dist: transformers (>=4.57.1,<5.0.0) ; extra == "llm" or extra == "all"
 Requires-Dist: ucimlrepo (>=0.0.3,<0.0.4)
-Requires-Dist: vllm (==0.10.0)
+Requires-Dist: vllm (==0.10.0) ; extra == "llm" or extra == "all"
 Requires-Dist: wandb (>=0.17.6,<0.18.0)
 Description-Content-Type: text/markdown
 
@@ -97,6 +99,16 @@ git clone https://github.com/AgileRL/AgileRL.git && cd AgileRL
 pip install -e .
 ```
 
+If you wish to install all additional dependencies please specify `[all]` or if you want to install a specific family of dependencies specify that family directly. At present, we have just one family, `[llm]`, which contains the dependencies related to our LLM RFT algorithms (datasets, deepspeed, peft, transformers, vllm).
+
+```bash
+pip install agilerl[all]
+```
+Or in development mode:
+```bash
+pip install -e ".[all]"
+```
+
 To install the ``nightly`` version of AgileRL with the latest features, use:
 
 ```bash
@@ -155,11 +167,12 @@ We are constantly updating our tutorials to showcase the latest features of Agil
 | ---------- | --------- |
 | [Bandits](https://docs.agilerl.com/en/latest/bandits/index.html) | [Neural Contextual Bandits with UCB-based Exploration (NeuralUCB)](https://docs.agilerl.com/en/latest/api/algorithms/neural_ucb.html) <br> [Neural Contextual Bandits with Thompson Sampling (NeuralTS)](https://docs.agilerl.com/en/latest/api/algorithms/neural_ts.html) |
 
-### LLM
+### LLM Fine-tuning Algorithms
 
 | RL | Algorithm |
 | ---------- | --------- |
 | [On-Policy](https://docs.agilerl.com/en/latest/llm_finetuning/index.html) | [Group Relative Policy Optimization (GRPO)](https://docs.agilerl.com/en/latest/api/algorithms/grpo.html)
+| [Off-Policy](https://docs.agilerl.com/en/latest/llm_finetuning/index.html) | [Direct Preference Optimization (DPO)](https://docs.agilerl.com/en/latest/api/algorithms/dpo.html)
 
 
 ## Train an Agent to Beat a Gym Environment
````
{agilerl-2.4.0.dev0 → agilerl-2.4.1}/README.md

````diff
@@ -48,6 +48,16 @@ git clone https://github.com/AgileRL/AgileRL.git && cd AgileRL
 pip install -e .
 ```
 
+If you wish to install all additional dependencies please specify `[all]` or if you want to install a specific family of dependencies specify that family directly. At present, we have just one family, `[llm]`, which contains the dependencies related to our LLM RFT algorithms (datasets, deepspeed, peft, transformers, vllm).
+
+```bash
+pip install agilerl[all]
+```
+Or in development mode:
+```bash
+pip install -e ".[all]"
+```
+
 To install the ``nightly`` version of AgileRL with the latest features, use:
 
 ```bash
@@ -106,11 +116,12 @@ We are constantly updating our tutorials to showcase the latest features of Agil
 | ---------- | --------- |
 | [Bandits](https://docs.agilerl.com/en/latest/bandits/index.html) | [Neural Contextual Bandits with UCB-based Exploration (NeuralUCB)](https://docs.agilerl.com/en/latest/api/algorithms/neural_ucb.html) <br> [Neural Contextual Bandits with Thompson Sampling (NeuralTS)](https://docs.agilerl.com/en/latest/api/algorithms/neural_ts.html) |
 
-### LLM
+### LLM Fine-tuning Algorithms
 
 | RL | Algorithm |
 | ---------- | --------- |
 | [On-Policy](https://docs.agilerl.com/en/latest/llm_finetuning/index.html) | [Group Relative Policy Optimization (GRPO)](https://docs.agilerl.com/en/latest/api/algorithms/grpo.html)
+| [Off-Policy](https://docs.agilerl.com/en/latest/llm_finetuning/index.html) | [Direct Preference Optimization (DPO)](https://docs.agilerl.com/en/latest/api/algorithms/dpo.html)
 
 
 ## Train an Agent to Beat a Gym Environment
````
agilerl-2.4.1/agilerl/__init__.py (new file)

```diff
@@ -0,0 +1,18 @@
+from importlib.metadata import metadata
+from importlib.util import find_spec
+
+from packaging.requirements import Requirement
+
+
+def get_extra_dependencies(package: str, extra: str) -> list[str]:
+    requires = metadata(package).get_all("Requires-Dist") or []
+    deps = []
+    for req in requires:
+        r = Requirement(req)
+        if r.marker and r.marker.evaluate({"extra": extra}):
+            deps.append(r.name)
+    return deps
+
+
+LLM_PACKAGES = get_extra_dependencies("agilerl", "llm")
+HAS_LLM_DEPENDENCIES = all(find_spec(pkg) is not None for pkg in LLM_PACKAGES)
```
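This new top-level module resolves the `llm` extra's requirements from package metadata and exposes a single `HAS_LLM_DEPENDENCIES` flag for gating optional imports. A minimal sketch of how the flag might be consumed downstream, assuming agilerl 2.4.1 is installed; `load_llm_stack` is a hypothetical helper used only for illustration, mirroring the guarded-import pattern that `base.py` adopts below:

```python
# Minimal sketch, assuming agilerl 2.4.1 is installed.
# `load_llm_stack` is a hypothetical helper used purely for illustration.
from agilerl import HAS_LLM_DEPENDENCIES, LLM_PACKAGES


def load_llm_stack():
    if not HAS_LLM_DEPENDENCIES:
        # Same remediation message the library raises in LLMAlgorithm.__init__.
        raise ImportError(
            "LLM dependencies are not installed. "
            "Please install them using `pip install agilerl[llm]`. "
            f"Expected packages: {', '.join(LLM_PACKAGES)}"
        )
    # Heavy optional imports are only reached once the check passes.
    from peft import LoraConfig  # noqa: F401
    from vllm import LLM  # noqa: F401
```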
{agilerl-2.4.0.dev0 → agilerl-2.4.1}/agilerl/algorithms/core/base.py

```diff
@@ -27,19 +27,14 @@ import torch
 import torch.nn.functional as F
 from accelerate import Accelerator
 from accelerate.utils import broadcast_object_list, set_seed
-from accelerate.utils.deepspeed import DeepSpeedOptimizerWrapper
-from deepspeed.checkpoint.utils import clone_tensors_for_torch_save
 from gymnasium import spaces
-from peft import LoraConfig, PeftModel, get_peft_model, set_peft_model_state_dict
-from safetensors.torch import load_file
 from tensordict import TensorDict
 from torch._dynamo import OptimizedModule
 from torch.nn.utils import clip_grad_norm_
 from torch.optim import AdamW
 from torch.optim.lr_scheduler import SequentialLR
-from transformers.modeling_utils import PreTrainedModel
-from vllm import LLM, SamplingParams
 
+from agilerl import HAS_LLM_DEPENDENCIES
 from agilerl.algorithms.core.optimizer_wrapper import OptimizerWrapper
 from agilerl.algorithms.core.registry import (
     HyperparameterConfig,
@@ -54,7 +49,11 @@ from agilerl.protocols import (
     EvolvableAttributeDict,
     EvolvableAttributeType,
     EvolvableModule,
+    LoraConfigProtocol,
     ModuleDict,
+    PeftModelProtocol,
+    PretrainedConfigProtocol,
+    PreTrainedModelProtocol,
 )
 from agilerl.typing import (
     ActionType,
@@ -73,6 +72,7 @@ from agilerl.typing import (
 )
 from agilerl.utils.algo_utils import (
     CosineLRScheduleConfig,
+    DummyOptimizer,
     VLLMConfig,
     check_supported_space,
     chkpt_attribute_to_device,
@@ -95,7 +95,18 @@ from agilerl.utils.evolvable_networks import (
     is_image_space,
     is_vector_space,
 )
-
+
+if HAS_LLM_DEPENDENCIES:
+    from accelerate.utils.deepspeed import DeepSpeedOptimizerWrapper
+    from deepspeed.checkpoint.utils import clone_tensors_for_torch_save
+    from peft import LoraConfig, get_peft_model, set_peft_model_state_dict
+    from safetensors.torch import load_file
+    from vllm import LLM, SamplingParams
+
+    from agilerl.utils.llm_utils import (
+        create_model_from_name_or_path,
+        gather_if_zero3,
+    )
 
 __all__ = ["EvolvableAlgorithm", "RLAlgorithm", "MultiAgentRLAlgorithm"]
 
@@ -596,14 +607,16 @@ class EvolvableAlgorithm(ABC, metaclass=RegistryMeta):
             )
             optimizer = opt.optimizer if hasattr(opt, "optimizer") else None
 
-            if isinstance(
-                if
-
+            if isinstance(self, LLMAlgorithm):
+                if hasattr(self.actor, "optimizer"):
+                    optimizer = getattr(
                         getattr(self, "actor"), "optimizer"
                     )  # If the optimizer is defined in the deepspeed config, we do this
+                else:
+                    optimizer = opt.optimizer
 
                 self.accelerator, self.lr_scheduler = LLMAlgorithm.update_lr(
-
+                    optimizer,
                     lr=getattr(self, config.lr),
                     accelerator=self.accelerator,
                     scheduler_config=self.cosine_lr_schedule_config,
@@ -1138,6 +1151,16 @@ class EvolvableAlgorithm(ABC, metaclass=RegistryMeta):
 
         return self
 
+    def clean_up(self) -> None:
+        """
+        Clean up the algorithm by deleting the networks and optimizers.
+
+        :return: None
+        :rtype: None
+        """
+        for evo_attr in self.evolvable_attributes().values():
+            del evo_attr
+
 
 class RLAlgorithm(EvolvableAlgorithm, ABC):
     """Base object for all single-agent algorithms in the AgileRL framework.
@@ -1782,8 +1805,6 @@ class MultiAgentRLAlgorithm(EvolvableAlgorithm, ABC):
 class LLMAlgorithm(EvolvableAlgorithm, ABC):
     """Base object for all LLM algorithms in the AgileRL framework.
 
-    :param observation_space: The observation space of the environment.
-    :type observation_space: gymnasium.spaces.Space
     :param action_space: The action space of the environment.
     :type action_space: gymnasium.spaces.Space
     :param index: The index of the algorithm.
@@ -1796,13 +1817,14 @@ class LLMAlgorithm(EvolvableAlgorithm, ABC):
     :type accelerator: Optional[Accelerator]
     :param name: The name of the algorithm.
     :type name: Optional[str]
+    :param model_config: The configuration for the model.
+    :type model_config: dict[str, Any] | PretrainedConfig | None
+    :param gradient_checkpointing: Whether to use gradient checkpointing.
+    :type gradient_checkpointing: bool
     """
 
     def __init__(
         self,
-        observation_space: spaces.Space,
-        action_space: spaces.Space,
-        actor_network: PreTrainedModel,
         index: int,
         batch_size: int,
         lr: float,
@@ -1813,8 +1835,10 @@ class LLMAlgorithm(EvolvableAlgorithm, ABC):
         seed: int,
         pad_token_id: int,
         pad_token: str,
-        lora_config:
+        lora_config: LoraConfigProtocol | None,
         use_separate_reference_adapter: bool,
+        model_name: str | None = None,
+        actor_network: PreTrainedModelProtocol | None = None,
         micro_batch_size_per_gpu: int | None = None,
         cosine_lr_schedule_config: Optional[CosineLRScheduleConfig] = None,
         hp_config: Optional[HyperparameterConfig] = None,
@@ -1822,7 +1846,18 @@ class LLMAlgorithm(EvolvableAlgorithm, ABC):
         device: Union[str, torch.device] = "cpu",
         accelerator: Optional[Accelerator] = None,
         name: Optional[str] = None,
+        model_config: dict[str, Any] | PretrainedConfigProtocol | None = None,
+        gradient_checkpointing: bool = True,
     ):
+        if not HAS_LLM_DEPENDENCIES:
+            raise ImportError(
+                "LLM dependencies are not installed. Please install them using `pip install agilerl[llm]`."
+            )
+
+        if model_name is None and actor_network is None:
+            raise ValueError(
+                "At least one of model_name or actor_network must be provided."
+            )
         if (
             accelerator is not None
             and cosine_lr_schedule_config is not None
@@ -1835,20 +1870,16 @@ class LLMAlgorithm(EvolvableAlgorithm, ABC):
             cosine_lr_schedule_config = None
 
         super().__init__(index, hp_config, device, accelerator, None, name)
-
-            observation_space, spaces.Space
-        ), "Observation space must be an instance of gymnasium.spaces.Space."
-        assert isinstance(
-            action_space, spaces.Space
-        ), "Action space must be an instance of gymnasium.spaces.Space."
-
-        self.observation_space = observation_space
-        self.action_space = action_space
+        self.gradient_checkpointing = gradient_checkpointing
         self.zero_stage = None
         self.reference_update_tracker = 0  # Updated every time the reference policy is updated which is updated each time we pass through the train dataset
         self.calc_position_embeddings = calc_position_embeddings
         self.pad_token_id = pad_token_id
         self.pad_token = pad_token
+        self.pretrained_model_name_or_path = (
+            model_name if model_name is not None else actor_network.name_or_path
+        )
+        self.model_config = model_config
 
         if not clone and reduce_memory_peak and micro_batch_size_per_gpu is not None:
             raise ValueError(
@@ -1858,7 +1889,9 @@ class LLMAlgorithm(EvolvableAlgorithm, ABC):
         self._configure_batch_size(
             batch_size, clone, reduce_memory_peak, micro_batch_size_per_gpu
         )
-
+        self.batch_size = self.batch_size_per_process * (
+            self.accelerator.num_processes if self.accelerator is not None else 1
+        )
         if self.accelerator is not None:
             if (
                 self.accelerator.state.deepspeed_plugin.deepspeed_config.get(
@@ -1875,22 +1908,14 @@ class LLMAlgorithm(EvolvableAlgorithm, ABC):
             )
             lr = optim_lr
 
-        if lora_config is None and not isinstance(actor_network,
+        if lora_config is None and not isinstance(actor_network, PeftModelProtocol):
            warnings.warn(
-                "No LoRA config provided. Using default LoRA configuration for RL finetuning."
+                "No LoRA config provided. AgileRL can only be used to finetune adapters at present. Using default LoRA configuration for RL finetuning."
            )
            lora_config = LoraConfig(
                r=16,
-                lora_alpha=
-                target_modules=
-                    "q_proj",
-                    "k_proj",
-                    "v_proj",
-                    "o_proj",
-                    "up_proj",
-                    "down_proj",
-                    "gate_proj",
-                ],
+                lora_alpha=32,
+                target_modules="all-linear",
                task_type="CAUSAL_LM",
                lora_dropout=0.05,
            )
@@ -1900,15 +1925,20 @@ class LLMAlgorithm(EvolvableAlgorithm, ABC):
         self.use_separate_reference_adapter = use_separate_reference_adapter
         self.cosine_lr_schedule_config = cosine_lr_schedule_config
 
-        if max_grad_norm and (accelerator is not None)
-
-
-
-
-
-
+        if max_grad_norm and (accelerator is not None):
+            if accelerator.is_main_process:
+                warnings.warn(
+                    "Argument 'max_grad_norm' will overwrite the equivalent value set for 'gradient_clipping' in the deepspeed config."
+                )
+            self.accelerator.state.deepspeed_plugin.deepspeed_config[
+                "gradient_clipping"
+            ] = max_grad_norm
+
+        self.max_grad_norm = max_grad_norm
         self.reduce_memory_peak = reduce_memory_peak
-
+
+        if self.accelerator is not None:
+            self.register_mutation_hook(self._sync_deepspeed_gradient_clipping)
 
         if self.accelerator is not None:
             self.zero_stage = self.accelerator.state.deepspeed_plugin.deepspeed_config[
@@ -2044,7 +2074,7 @@ class LLMAlgorithm(EvolvableAlgorithm, ABC):
                 device_map="auto"
             )
             tokenizer = AutoTokenizer.from_pretrained("Qwen/Qwen2.5-3B")
-            model =
+            model = PeftModelProtocol.from_pretrained(base_model, path)
             """
         )
 
@@ -2141,19 +2171,26 @@ class LLMAlgorithm(EvolvableAlgorithm, ABC):
                 if not is_dummy_optimizer
                 else type(self.actor.optimizer)
             )
-            self.
-
-
+            if self.gradient_checkpointing:
+                self.actor.module.gradient_checkpointing_enable(
+                    gradient_checkpointing_kwargs={"use_reentrant": False}
+                )
        else:
            assert (
                self.actor is not None
            ), "Actor is set to None, please check that the actor is defined."
            self.actor = self.actor.to(self.device)
-            self.
+            if self.gradient_checkpointing:
+                self.actor.gradient_checkpointing_enable()
 
    def clean_up(self) -> None:
        """Clean up the algorithm."""
        if self.accelerator is not None:
+            # Free up GPU memory occupied by parameters
+            if hasattr(self.actor, "empty_partition_cache"):
+                self.actor.empty_partition_cache()
+            if hasattr(self.actor, "destroy"):
+                self.actor.destroy()
            (
                self.actor,
                self.optimizer,
@@ -2177,10 +2214,8 @@ class LLMAlgorithm(EvolvableAlgorithm, ABC):
         if hasattr(self, "llm"):
             del self.llm.llm_engine.model_executor
             del self.llm
-
         gc.collect()
         torch.cuda.empty_cache()
-        torch.cuda.reset_peak_memory_stats()
         torch.cuda.synchronize()
 
     def clone(self, index: Optional[int] = None, wrap: bool = True):
@@ -2215,8 +2250,8 @@ class LLMAlgorithm(EvolvableAlgorithm, ABC):
         input_args["wrap"] = False
         input_args["clone"] = True
 
-        actor:
-
+        actor: PeftModelProtocol = cast(
+            PeftModelProtocol,
             (
                 self.accelerator.unwrap_model(self.actor)
                 if self.accelerator is not None
@@ -2408,17 +2443,22 @@ class LLMAlgorithm(EvolvableAlgorithm, ABC):
         self.reference_update_tracker += 1
 
     def _initialize_actors(
-        self, base_model:
+        self, base_model: PreTrainedModelProtocol | None, add_adapters: bool = True
     ):
         """Initialize the actor network.
 
         :param base_model: Base model
-        :type base_model:
+        :type base_model: PreTrainedModelProtocol
         :param add_adapters: Flag to indicate if adapters should be added to the model, defaults to True
         :type add_adapters: bool, optional
         """
 
-        if
+        if base_model is None:
+            base_model = create_model_from_name_or_path(
+                self.pretrained_model_name_or_path
+            )
+
+        if isinstance(base_model, PeftModelProtocol) and add_adapters:
            # Handles backwards compatibility with user providing a peft model as the actor network
            if self.lora_config is None:
                adapter_name = list(base_model.peft_config.keys())
@@ -2428,7 +2468,7 @@ class LLMAlgorithm(EvolvableAlgorithm, ABC):
                 if "default" in list(base_model.peft_config.keys()):
                     base_model.peft_config.pop("default")
 
-        self.actor:
+        self.actor: PeftModelProtocol = (
             get_peft_model(base_model, self.lora_config, adapter_name="actor")
             if add_adapters
             else base_model
@@ -2577,7 +2617,6 @@ class LLMAlgorithm(EvolvableAlgorithm, ABC):
     def _move_model_to_vllm(self) -> None:
         """Move the deepspeed model to vllm."""
 
-        # TODO: Add support for ZeRO Stage 3
         if self.accelerator is not None:
             self.accelerator.wait_for_everyone()
             model_ref = self.accelerator.unwrap_model(self.actor)
@@ -2945,3 +2984,28 @@ class LLMAlgorithm(EvolvableAlgorithm, ABC):
 
         if self.accelerator is not None:
             self.accelerator.wait_for_everyone()
+
+    def _sync_deepspeed_gradient_clipping(self) -> None:
+        """Synchronizes max_grad_norm with DeepSpeed gradient_clipping config.
+        Registered as a mutation hook to ensure consistency after mutations.
+        """
+        if self.accelerator is None:
+            return
+
+        if (
+            "gradient_clipping"
+            not in self.accelerator.state.deepspeed_plugin.deepspeed_config
+        ):
+            return
+
+        ds_config = self.accelerator.state.deepspeed_plugin.deepspeed_config
+        if ds_config["gradient_clipping"] != self.max_grad_norm:
+            self.accelerator.state.deepspeed_plugin.deepspeed_config[
+                "gradient_clipping"
+            ] = self.max_grad_norm
+
+        if hasattr(self.actor, "optimizer"):
+            if hasattr(self.actor.optimizer, "grad_clip"):
+                self.actor.optimizer.grad_clip = self.max_grad_norm
+            if hasattr(self.actor.optimizer, "clip_grad"):
+                self.actor.optimizer.clip_grad = self.max_grad_norm
```
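The new `_sync_deepspeed_gradient_clipping` hook re-applies `max_grad_norm` to the DeepSpeed plugin config whenever a mutation changes it. A standalone sketch of the same rule, with a plain dict standing in for `accelerator.state.deepspeed_plugin.deepspeed_config` (the helper name here is illustrative, not an AgileRL API):

```python
# Standalone sketch of the sync rule; `ds_config` stands in for the DeepSpeed
# config dict held by accelerate, which agilerl updates via a mutation hook.
def sync_gradient_clipping(ds_config: dict, max_grad_norm: float) -> dict:
    if "gradient_clipping" in ds_config and ds_config["gradient_clipping"] != max_grad_norm:
        ds_config["gradient_clipping"] = max_grad_norm
    return ds_config


# After an HPO mutation bumps max_grad_norm from 0.1 to 0.5, the config follows it.
assert sync_gradient_clipping({"gradient_clipping": 0.1}, 0.5) == {"gradient_clipping": 0.5}
```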
{agilerl-2.4.0.dev0 → agilerl-2.4.1}/agilerl/algorithms/core/optimizer_wrapper.py

```diff
@@ -2,19 +2,27 @@ import inspect
 from typing import Any, Optional, Union
 
 import torch.nn as nn
-from peft import PeftModel
 from torch.optim import Optimizer
 
+from agilerl import HAS_LLM_DEPENDENCIES
 from agilerl.modules import EvolvableModule, ModuleDict
 from agilerl.protocols import EvolvableAlgorithm
 from agilerl.typing import OptimizerType, StateDict
-from agilerl.utils.
+from agilerl.utils.algo_utils import DummyOptimizer
+
+if HAS_LLM_DEPENDENCIES:
+    from peft import PeftModel
+
+    PeftModelType = PeftModel
+else:
+    PeftModelType = "PeftModel"
+
 
 ModuleList = list[EvolvableModule]
 _Optimizer = Union[
     type[OptimizerType], dict[str, type[OptimizerType]], type[DummyOptimizer]
 ]
-_Module = Union[EvolvableModule, ModuleDict, ModuleList,
+_Module = Union[EvolvableModule, ModuleDict, ModuleList, PeftModelType]
 
 
 def init_from_multiple(
```
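The `PeftModelType = "PeftModel"` fallback works because `typing.Union` accepts a string as a forward reference, so the `_Module` alias can still be defined when peft is absent. A small sketch of that pattern under simplified assumptions (a try/except stands in for the `HAS_LLM_DEPENDENCIES` gate, and `LocalModule` is an illustrative stand-in class, not an AgileRL type):

```python
# Sketch: when an optional dependency is missing, a string forward reference
# keeps a module-level Union alias importable without the package.
from typing import Union


class LocalModule:  # illustrative stand-in for EvolvableModule
    ...


try:
    from peft import PeftModel as PeftModelType  # optional dependency
except ImportError:
    PeftModelType = "PeftModel"  # string forward reference keeps the alias valid

_Module = Union[LocalModule, PeftModelType]  # defined either way, no hard peft import
print(_Module)
```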
{agilerl-2.4.0.dev0 → agilerl-2.4.1}/agilerl/algorithms/dpo.py

```diff
@@ -1,28 +1,76 @@
 import gc
+from typing import Any
 
 import numpy as np
 import torch
 import torch.nn.functional as F
 from accelerate import Accelerator
-from gymnasium import spaces
-from peft import LoraConfig
-from transformers import PreTrainedModel
 
 from agilerl.algorithms.core.base import LLMAlgorithm
 from agilerl.algorithms.core.registry import HyperparameterConfig, NetworkGroup
+from agilerl.protocols import LoraConfigProtocol, PreTrainedModelProtocol
 from agilerl.typing import ExperiencesType, LLMObsType
 from agilerl.utils.algo_utils import get_experiences_samples
 from agilerl.utils.llm_utils import PreferenceGym
 
 
 class DPO(LLMAlgorithm):
+    """The DPO algorithm class. DPO paper: https://arxiv.org/pdf/2305.18290
+
+    :param pad_token_id: Pad token id
+    :type pad_token_id: int
+    :param pad_token: Pad token
+    :type pad_token: str
+    :param model_name: Model name
+    :type model_name: str, optional
+    :param actor_network: HuggingFace LLM
+    :type actor_network: PreTrainedModelProtocol
+    :param model_config: Model configuration, to be used when creating the model from a name or path
+    :param hp_config: RL hyperparameter mutation configuration, defaults to None, whereby algorithm mutations are disabled.
+    :type hp_config: HyperparameterConfig, optional
+    :param index: Index to keep track of object instance during tournament selection and mutation, defaults to 0
+    :type index: int, optional
+    :param batch_size: Batch size for training, defaults to 16
+    :type batch_size: int, optional
+    :param lr: Learning rate, defaults to 0.000005
+    :type lr: float, optional
+    :param beta: Beta parameter for DPO, defaults to 0.001
+    :type beta: float, optional
+    :param max_grad_norm: Maximum gradient norm, defaults to 0.1
+    :type max_grad_norm: float, optional
+    :param update_epochs: Number of update epochs, defaults to 1
+    :type update_epochs: int, optional
+    :param calc_position_embeddings: Flag to indicate if position embeddings should be calculated, defaults to True
+    :type calc_position_embeddings: bool, optional
+    :param micro_batch_size_per_gpu: Micro batch size per GPU, defaults to None
+    :type micro_batch_size_per_gpu: int, optional
+    :param reduce_memory_peak: Flag to indicate if memory peak should be reduced, defaults to False
+    :type reduce_memory_peak: bool, optional
+    :param device: Device for accelerated computing, 'cpu' or 'cuda', defaults to 'cpu'
+    :type device: str, optional
+    :param lora_config: Config for LoRA, defaults to None
+    :type lora_config: LoraConfigProtocol, optional
+    :param accelerator: Accelerator for distributed computing, defaults to None
+    :type accelerator: accelerate.Accelerator(), optional
+    :param wrap: Wrap models for distributed training upon creation, defaults to True
+    :type wrap: bool, optional
+    :param clone: Flag to indicate if the instantiation is a cloning, defaults to False
+    :type clone: bool, optional
+    :param use_separate_reference_adapter: Flag to indicate if the reference policy should have a separate adapter, defaults to False
+    :type use_separate_reference_adapter: bool, optional
+    :param seed: Seed for the random number generator, defaults to 42
+    :type seed: int, optional
+    :param gradient_checkpointing: Flag to indicate if gradient checkpointing should be used, defaults to True
+    :type gradient_checkpointing: bool, optional
+    """
+
     def __init__(
         self,
-        observation_space: spaces.Space,
-        action_space: spaces.Space,
-        actor_network: PreTrainedModel,
         pad_token_id: int,
         pad_token: str,
+        model_name: str | None = None,
+        actor_network: PreTrainedModelProtocol | None = None,
+        model_config: dict[str, Any] | None = None,
         hp_config: HyperparameterConfig | None = None,
         index: int = 0,
         batch_size: int = 16,
@@ -34,12 +82,13 @@ class DPO(LLMAlgorithm):
         micro_batch_size_per_gpu: int | None = None,
         reduce_memory_peak: bool = False,
         device: str = "cpu",
-        lora_config:
+        lora_config: LoraConfigProtocol | None = None,
         accelerator: Accelerator | None = None,
         wrap: bool = True,
         clone: bool = False,
         use_separate_reference_adapter: bool = False,
         seed: int = 42,
+        gradient_checkpointing: bool = True,
     ):
         device = (
             f"cuda:{accelerator.process_index}"
@@ -47,9 +96,6 @@ class DPO(LLMAlgorithm):
             else ("cuda" if torch.cuda.is_available() else "cpu")
         )
         super().__init__(
-            observation_space,
-            action_space,
-            actor_network,
             index=index,
             batch_size=batch_size,
             lr=lr,
@@ -62,6 +108,9 @@ class DPO(LLMAlgorithm):
             pad_token=pad_token,
             lora_config=lora_config,
             use_separate_reference_adapter=use_separate_reference_adapter,
+            model_name=model_name,
+            actor_network=actor_network,
+            model_config=model_config,
             micro_batch_size_per_gpu=micro_batch_size_per_gpu,
             cosine_lr_schedule_config=None,
             hp_config=hp_config,
@@ -69,6 +118,7 @@ class DPO(LLMAlgorithm):
             device=device,
             accelerator=accelerator,
             name="DPO",
+            gradient_checkpointing=gradient_checkpointing,
         )
         self.beta = beta
         self.temperature = (
```
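With the reworked signature, a DPO agent can now be built from a model name instead of a pre-loaded `actor_network`. A minimal sketch using the defaults documented in the docstring above, assuming `agilerl[llm]` is installed; the model name and pad-token handling are illustrative, and the keyword names follow the documented parameters rather than a verified full signature:

```python
# Minimal sketch, assuming agilerl[llm] is installed; values are illustrative only.
from transformers import AutoTokenizer

from agilerl.algorithms.dpo import DPO

tokenizer = AutoTokenizer.from_pretrained("Qwen/Qwen2.5-3B")

agent = DPO(
    pad_token_id=tokenizer.pad_token_id,
    pad_token=tokenizer.pad_token,
    model_name="Qwen/Qwen2.5-3B",  # new in 2.4.1: build the actor from a name or path
    batch_size=16,   # documented default
    lr=5e-6,         # documented default
    beta=0.001,      # documented default
    device="cuda",
)
```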