agilerl 2.4.0.dev0__tar.gz → 2.4.1.dev0__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- {agilerl-2.4.0.dev0 → agilerl-2.4.1.dev0}/PKG-INFO +2 -4
- {agilerl-2.4.0.dev0 → agilerl-2.4.1.dev0}/agilerl/algorithms/core/base.py +37 -33
- {agilerl-2.4.0.dev0 → agilerl-2.4.1.dev0}/agilerl/algorithms/dpo.py +58 -7
- {agilerl-2.4.0.dev0 → agilerl-2.4.1.dev0}/agilerl/algorithms/grpo.py +23 -15
- {agilerl-2.4.0.dev0 → agilerl-2.4.1.dev0}/agilerl/training/train_llm.py +2 -2
- {agilerl-2.4.0.dev0 → agilerl-2.4.1.dev0}/agilerl/utils/algo_utils.py +7 -0
- {agilerl-2.4.0.dev0 → agilerl-2.4.1.dev0}/agilerl/utils/llm_utils.py +81 -33
- {agilerl-2.4.0.dev0 → agilerl-2.4.1.dev0}/agilerl/utils/utils.py +22 -34
- {agilerl-2.4.0.dev0 → agilerl-2.4.1.dev0}/pyproject.toml +1 -1
- {agilerl-2.4.0.dev0 → agilerl-2.4.1.dev0}/LICENSE +0 -0
- {agilerl-2.4.0.dev0 → agilerl-2.4.1.dev0}/README.md +0 -0
- {agilerl-2.4.0.dev0 → agilerl-2.4.1.dev0}/agilerl/__init__.py +0 -0
- {agilerl-2.4.0.dev0 → agilerl-2.4.1.dev0}/agilerl/algorithms/__init__.py +0 -0
- {agilerl-2.4.0.dev0 → agilerl-2.4.1.dev0}/agilerl/algorithms/bc_lm.py +0 -0
- {agilerl-2.4.0.dev0 → agilerl-2.4.1.dev0}/agilerl/algorithms/core/__init__.py +0 -0
- {agilerl-2.4.0.dev0 → agilerl-2.4.1.dev0}/agilerl/algorithms/core/optimizer_wrapper.py +0 -0
- {agilerl-2.4.0.dev0 → agilerl-2.4.1.dev0}/agilerl/algorithms/core/registry.py +0 -0
- {agilerl-2.4.0.dev0 → agilerl-2.4.1.dev0}/agilerl/algorithms/cqn.py +0 -0
- {agilerl-2.4.0.dev0 → agilerl-2.4.1.dev0}/agilerl/algorithms/ddpg.py +0 -0
- {agilerl-2.4.0.dev0 → agilerl-2.4.1.dev0}/agilerl/algorithms/dqn.py +0 -0
- {agilerl-2.4.0.dev0 → agilerl-2.4.1.dev0}/agilerl/algorithms/dqn_rainbow.py +0 -0
- {agilerl-2.4.0.dev0 → agilerl-2.4.1.dev0}/agilerl/algorithms/ilql.py +0 -0
- {agilerl-2.4.0.dev0 → agilerl-2.4.1.dev0}/agilerl/algorithms/ippo.py +0 -0
- {agilerl-2.4.0.dev0 → agilerl-2.4.1.dev0}/agilerl/algorithms/maddpg.py +0 -0
- {agilerl-2.4.0.dev0 → agilerl-2.4.1.dev0}/agilerl/algorithms/matd3.py +0 -0
- {agilerl-2.4.0.dev0 → agilerl-2.4.1.dev0}/agilerl/algorithms/neural_ts_bandit.py +0 -0
- {agilerl-2.4.0.dev0 → agilerl-2.4.1.dev0}/agilerl/algorithms/neural_ucb_bandit.py +0 -0
- {agilerl-2.4.0.dev0 → agilerl-2.4.1.dev0}/agilerl/algorithms/ppo.py +0 -0
- {agilerl-2.4.0.dev0 → agilerl-2.4.1.dev0}/agilerl/algorithms/td3.py +0 -0
- {agilerl-2.4.0.dev0 → agilerl-2.4.1.dev0}/agilerl/components/__init__.py +0 -0
- {agilerl-2.4.0.dev0 → agilerl-2.4.1.dev0}/agilerl/components/data.py +0 -0
- {agilerl-2.4.0.dev0 → agilerl-2.4.1.dev0}/agilerl/components/multi_agent_replay_buffer.py +0 -0
- {agilerl-2.4.0.dev0 → agilerl-2.4.1.dev0}/agilerl/components/replay_buffer.py +0 -0
- {agilerl-2.4.0.dev0 → agilerl-2.4.1.dev0}/agilerl/components/rollout_buffer.py +0 -0
- {agilerl-2.4.0.dev0 → agilerl-2.4.1.dev0}/agilerl/components/sampler.py +0 -0
- {agilerl-2.4.0.dev0 → agilerl-2.4.1.dev0}/agilerl/components/segment_tree.py +0 -0
- {agilerl-2.4.0.dev0 → agilerl-2.4.1.dev0}/agilerl/data/__init__.py +0 -0
- {agilerl-2.4.0.dev0 → agilerl-2.4.1.dev0}/agilerl/data/language_environment.py +0 -0
- {agilerl-2.4.0.dev0 → agilerl-2.4.1.dev0}/agilerl/data/rl_data.py +0 -0
- {agilerl-2.4.0.dev0 → agilerl-2.4.1.dev0}/agilerl/data/tokenizer.py +0 -0
- {agilerl-2.4.0.dev0 → agilerl-2.4.1.dev0}/agilerl/data/torch_datasets.py +0 -0
- {agilerl-2.4.0.dev0 → agilerl-2.4.1.dev0}/agilerl/hpo/__init__.py +0 -0
- {agilerl-2.4.0.dev0 → agilerl-2.4.1.dev0}/agilerl/hpo/mutation.py +0 -0
- {agilerl-2.4.0.dev0 → agilerl-2.4.1.dev0}/agilerl/hpo/tournament.py +0 -0
- {agilerl-2.4.0.dev0 → agilerl-2.4.1.dev0}/agilerl/modules/__init__.py +0 -0
- {agilerl-2.4.0.dev0 → agilerl-2.4.1.dev0}/agilerl/modules/base.py +0 -0
- {agilerl-2.4.0.dev0 → agilerl-2.4.1.dev0}/agilerl/modules/bert.py +0 -0
- {agilerl-2.4.0.dev0 → agilerl-2.4.1.dev0}/agilerl/modules/cnn.py +0 -0
- {agilerl-2.4.0.dev0 → agilerl-2.4.1.dev0}/agilerl/modules/configs.py +0 -0
- {agilerl-2.4.0.dev0 → agilerl-2.4.1.dev0}/agilerl/modules/custom_components.py +0 -0
- {agilerl-2.4.0.dev0 → agilerl-2.4.1.dev0}/agilerl/modules/dummy.py +0 -0
- {agilerl-2.4.0.dev0 → agilerl-2.4.1.dev0}/agilerl/modules/gpt.py +0 -0
- {agilerl-2.4.0.dev0 → agilerl-2.4.1.dev0}/agilerl/modules/lstm.py +0 -0
- {agilerl-2.4.0.dev0 → agilerl-2.4.1.dev0}/agilerl/modules/mlp.py +0 -0
- {agilerl-2.4.0.dev0 → agilerl-2.4.1.dev0}/agilerl/modules/multi_input.py +0 -0
- {agilerl-2.4.0.dev0 → agilerl-2.4.1.dev0}/agilerl/modules/resnet.py +0 -0
- {agilerl-2.4.0.dev0 → agilerl-2.4.1.dev0}/agilerl/modules/simba.py +0 -0
- {agilerl-2.4.0.dev0 → agilerl-2.4.1.dev0}/agilerl/networks/__init__.py +0 -0
- {agilerl-2.4.0.dev0 → agilerl-2.4.1.dev0}/agilerl/networks/actors.py +0 -0
- {agilerl-2.4.0.dev0 → agilerl-2.4.1.dev0}/agilerl/networks/base.py +0 -0
- {agilerl-2.4.0.dev0 → agilerl-2.4.1.dev0}/agilerl/networks/custom_modules.py +0 -0
- {agilerl-2.4.0.dev0 → agilerl-2.4.1.dev0}/agilerl/networks/distributions.py +0 -0
- {agilerl-2.4.0.dev0 → agilerl-2.4.1.dev0}/agilerl/networks/distributions_experimental.py +0 -0
- {agilerl-2.4.0.dev0 → agilerl-2.4.1.dev0}/agilerl/networks/q_networks.py +0 -0
- {agilerl-2.4.0.dev0 → agilerl-2.4.1.dev0}/agilerl/networks/value_networks.py +0 -0
- {agilerl-2.4.0.dev0 → agilerl-2.4.1.dev0}/agilerl/protocols.py +0 -0
- {agilerl-2.4.0.dev0 → agilerl-2.4.1.dev0}/agilerl/rollouts/__init__.py +0 -0
- {agilerl-2.4.0.dev0 → agilerl-2.4.1.dev0}/agilerl/rollouts/on_policy.py +0 -0
- {agilerl-2.4.0.dev0 → agilerl-2.4.1.dev0}/agilerl/training/__init__.py +0 -0
- {agilerl-2.4.0.dev0 → agilerl-2.4.1.dev0}/agilerl/training/train_bandits.py +0 -0
- {agilerl-2.4.0.dev0 → agilerl-2.4.1.dev0}/agilerl/training/train_multi_agent_off_policy.py +0 -0
- {agilerl-2.4.0.dev0 → agilerl-2.4.1.dev0}/agilerl/training/train_multi_agent_on_policy.py +0 -0
- {agilerl-2.4.0.dev0 → agilerl-2.4.1.dev0}/agilerl/training/train_off_policy.py +0 -0
- {agilerl-2.4.0.dev0 → agilerl-2.4.1.dev0}/agilerl/training/train_offline.py +0 -0
- {agilerl-2.4.0.dev0 → agilerl-2.4.1.dev0}/agilerl/training/train_on_policy.py +0 -0
- {agilerl-2.4.0.dev0 → agilerl-2.4.1.dev0}/agilerl/typing.py +0 -0
- {agilerl-2.4.0.dev0 → agilerl-2.4.1.dev0}/agilerl/utils/__init__.py +0 -0
- {agilerl-2.4.0.dev0 → agilerl-2.4.1.dev0}/agilerl/utils/cache.py +0 -0
- {agilerl-2.4.0.dev0 → agilerl-2.4.1.dev0}/agilerl/utils/evolvable_networks.py +0 -0
- {agilerl-2.4.0.dev0 → agilerl-2.4.1.dev0}/agilerl/utils/ilql_utils.py +0 -0
- {agilerl-2.4.0.dev0 → agilerl-2.4.1.dev0}/agilerl/utils/log_utils.py +0 -0
- {agilerl-2.4.0.dev0 → agilerl-2.4.1.dev0}/agilerl/utils/minari_utils.py +0 -0
- {agilerl-2.4.0.dev0 → agilerl-2.4.1.dev0}/agilerl/utils/probe_envs.py +0 -0
- {agilerl-2.4.0.dev0 → agilerl-2.4.1.dev0}/agilerl/utils/probe_envs_ma.py +0 -0
- {agilerl-2.4.0.dev0 → agilerl-2.4.1.dev0}/agilerl/utils/sampling_utils.py +0 -0
- {agilerl-2.4.0.dev0 → agilerl-2.4.1.dev0}/agilerl/utils/torch_utils.py +0 -0
- {agilerl-2.4.0.dev0 → agilerl-2.4.1.dev0}/agilerl/vector/__init__.py +0 -0
- {agilerl-2.4.0.dev0 → agilerl-2.4.1.dev0}/agilerl/vector/pz_async_vec_env.py +0 -0
- {agilerl-2.4.0.dev0 → agilerl-2.4.1.dev0}/agilerl/vector/pz_vec_env.py +0 -0
- {agilerl-2.4.0.dev0 → agilerl-2.4.1.dev0}/agilerl/wrappers/__init__.py +0 -0
- {agilerl-2.4.0.dev0 → agilerl-2.4.1.dev0}/agilerl/wrappers/agent.py +0 -0
- {agilerl-2.4.0.dev0 → agilerl-2.4.1.dev0}/agilerl/wrappers/learning.py +0 -0
- {agilerl-2.4.0.dev0 → agilerl-2.4.1.dev0}/agilerl/wrappers/make_evolvable.py +0 -0
- {agilerl-2.4.0.dev0 → agilerl-2.4.1.dev0}/agilerl/wrappers/pettingzoo_wrappers.py +0 -0
- {agilerl-2.4.0.dev0 → agilerl-2.4.1.dev0}/agilerl/wrappers/utils.py +0 -0
{agilerl-2.4.0.dev0 → agilerl-2.4.1.dev0}/PKG-INFO

@@ -1,9 +1,8 @@
-Metadata-Version: 2.
+Metadata-Version: 2.3
 Name: agilerl
-Version: 2.4.0.dev0
+Version: 2.4.1.dev0
 Summary: AgileRL is a deep reinforcement learning library focused on improving RL development through RLOps.
 License: Apache 2.0
-License-File: LICENSE
 Author: Nick Ustaran-Anderegg
 Author-email: dev@agilerl.com
 Requires-Python: >=3.10,<4.0

@@ -13,7 +12,6 @@ Classifier: Programming Language :: Python :: 3.10
 Classifier: Programming Language :: Python :: 3.11
 Classifier: Programming Language :: Python :: 3.12
 Classifier: Programming Language :: Python :: 3.13
-Classifier: Programming Language :: Python :: 3.14
 Requires-Dist: SuperSuit (>=3.9.0,<4.0.0)
 Requires-Dist: accelerate (>=1.7.0,<2.0.0)
 Requires-Dist: deepspeed (>=0.17.1,<0.18.0)
{agilerl-2.4.0.dev0 → agilerl-2.4.1.dev0}/agilerl/algorithms/core/base.py

@@ -37,6 +37,7 @@ from torch._dynamo import OptimizedModule
 from torch.nn.utils import clip_grad_norm_
 from torch.optim import AdamW
 from torch.optim.lr_scheduler import SequentialLR
+from transformers import PretrainedConfig
 from transformers.modeling_utils import PreTrainedModel
 from vllm import LLM, SamplingParams

@@ -95,7 +96,11 @@ from agilerl.utils.evolvable_networks import (
     is_image_space,
     is_vector_space,
 )
-from agilerl.utils.llm_utils import
+from agilerl.utils.llm_utils import (
+    DummyOptimizer,
+    create_model_from_name_or_path,
+    gather_if_zero3,
+)

 __all__ = ["EvolvableAlgorithm", "RLAlgorithm", "MultiAgentRLAlgorithm"]

@@ -1782,8 +1787,6 @@ class MultiAgentRLAlgorithm(EvolvableAlgorithm, ABC):
 class LLMAlgorithm(EvolvableAlgorithm, ABC):
     """Base object for all LLM algorithms in the AgileRL framework.

-    :param observation_space: The observation space of the environment.
-    :type observation_space: gymnasium.spaces.Space
     :param action_space: The action space of the environment.
     :type action_space: gymnasium.spaces.Space
     :param index: The index of the algorithm.

@@ -1800,9 +1803,6 @@ class LLMAlgorithm(EvolvableAlgorithm, ABC):

     def __init__(
         self,
-        observation_space: spaces.Space,
-        action_space: spaces.Space,
-        actor_network: PreTrainedModel,
         index: int,
         batch_size: int,
         lr: float,

@@ -1815,6 +1815,8 @@ class LLMAlgorithm(EvolvableAlgorithm, ABC):
         pad_token: str,
         lora_config: LoraConfig | None,
         use_separate_reference_adapter: bool,
+        model_name: str | None = None,
+        actor_network: PreTrainedModel | None = None,
         micro_batch_size_per_gpu: int | None = None,
         cosine_lr_schedule_config: Optional[CosineLRScheduleConfig] = None,
         hp_config: Optional[HyperparameterConfig] = None,

@@ -1822,7 +1824,13 @@ class LLMAlgorithm(EvolvableAlgorithm, ABC):
         device: Union[str, torch.device] = "cpu",
         accelerator: Optional[Accelerator] = None,
         name: Optional[str] = None,
+        model_config: dict[str, Any] | PretrainedConfig | None = None,
+        gradient_checkpointing: bool = True,
     ):
+        if model_name is None and actor_network is None:
+            raise ValueError(
+                "At least one of model_name or actor_network must be provided."
+            )
         if (
             accelerator is not None
             and cosine_lr_schedule_config is not None

@@ -1835,20 +1843,16 @@ class LLMAlgorithm(EvolvableAlgorithm, ABC):
             cosine_lr_schedule_config = None

         super().__init__(index, hp_config, device, accelerator, None, name)
-        assert isinstance(
-            observation_space, spaces.Space
-        ), "Observation space must be an instance of gymnasium.spaces.Space."
-        assert isinstance(
-            action_space, spaces.Space
-        ), "Action space must be an instance of gymnasium.spaces.Space."
-
-        self.observation_space = observation_space
-        self.action_space = action_space
+        self.gradient_checkpointing = gradient_checkpointing
         self.zero_stage = None
         self.reference_update_tracker = 0  # Updated every time the reference policy is updated which is updated each time we pass through the train dataset
         self.calc_position_embeddings = calc_position_embeddings
         self.pad_token_id = pad_token_id
         self.pad_token = pad_token
+        self.pretrained_model_name_or_path = (
+            model_name if model_name is not None else actor_network.name_or_path
+        )
+        self.model_config = model_config

         if not clone and reduce_memory_peak and micro_batch_size_per_gpu is not None:
             raise ValueError(

@@ -1858,7 +1862,9 @@ class LLMAlgorithm(EvolvableAlgorithm, ABC):
         self._configure_batch_size(
             batch_size, clone, reduce_memory_peak, micro_batch_size_per_gpu
         )
-
+        self.batch_size = self.batch_size_per_process * (
+            self.accelerator.num_processes if self.accelerator is not None else 1
+        )
         if self.accelerator is not None:
             if (
                 self.accelerator.state.deepspeed_plugin.deepspeed_config.get(

@@ -1877,20 +1883,12 @@ class LLMAlgorithm(EvolvableAlgorithm, ABC):

         if lora_config is None and not isinstance(actor_network, PeftModel):
             warnings.warn(
-                "No LoRA config provided. Using default LoRA configuration for RL finetuning."
+                "No LoRA config provided. AgileRL can only be used to finetune adapters at present. Using default LoRA configuration for RL finetuning."
             )
             lora_config = LoraConfig(
                 r=16,
-                lora_alpha=
-                target_modules=[
-                    "q_proj",
-                    "k_proj",
-                    "v_proj",
-                    "o_proj",
-                    "up_proj",
-                    "down_proj",
-                    "gate_proj",
-                ],
+                lora_alpha=32,
+                target_modules="all-linear",
                 task_type="CAUSAL_LM",
                 lora_dropout=0.05,
             )

@@ -1908,7 +1906,6 @@ class LLMAlgorithm(EvolvableAlgorithm, ABC):
         else:
             self.max_grad_norm = max_grad_norm
         self.reduce_memory_peak = reduce_memory_peak
-        self.pretrained_model_name_or_path = actor_network.name_or_path

         if self.accelerator is not None:
             self.zero_stage = self.accelerator.state.deepspeed_plugin.deepspeed_config[

@@ -2141,15 +2138,17 @@ class LLMAlgorithm(EvolvableAlgorithm, ABC):
                 if not is_dummy_optimizer
                 else type(self.actor.optimizer)
             )
-            self.
-
-
+            if self.gradient_checkpointing:
+                self.actor.module.gradient_checkpointing_enable(
+                    gradient_checkpointing_kwargs={"use_reentrant": False}
+                )
         else:
             assert (
                 self.actor is not None
             ), "Actor is set to None, please check that the actor is defined."
             self.actor = self.actor.to(self.device)
-            self.
+            if self.gradient_checkpointing:
+                self.actor.gradient_checkpointing_enable()

     def clean_up(self) -> None:
         """Clean up the algorithm."""

@@ -2408,7 +2407,7 @@ class LLMAlgorithm(EvolvableAlgorithm, ABC):
         self.reference_update_tracker += 1

     def _initialize_actors(
-        self, base_model: PreTrainedModel, add_adapters: bool = True
+        self, base_model: PreTrainedModel | None, add_adapters: bool = True
     ):
         """Initialize the actor network.

@@ -2418,6 +2417,11 @@ class LLMAlgorithm(EvolvableAlgorithm, ABC):
         :type add_adapters: bool, optional
         """

+        if base_model is None:
+            base_model = create_model_from_name_or_path(
+                self.pretrained_model_name_or_path
+            )
+
         if isinstance(base_model, PeftModel) and add_adapters:
             # Handles backwards compatibility with user providing a peft model as the actor network
             if self.lora_config is None:
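The reworked LLMAlgorithm.__init__ now accepts either a model_name or a preloaded actor_network and resolves the checkpoint reference from whichever is supplied. A minimal standalone sketch of that precedence rule (not AgileRL code; the helper function and checkpoint name below are illustrative placeholders):

```python
# Standalone sketch of the precedence rule added to LLMAlgorithm.__init__;
# resolve_model_reference and the checkpoint name are illustrative, not AgileRL code.
from transformers import AutoModelForCausalLM


def resolve_model_reference(model_name=None, actor_network=None) -> str:
    if model_name is None and actor_network is None:
        raise ValueError("At least one of model_name or actor_network must be provided.")
    # transformers records the checkpoint a model was loaded from in `name_or_path`
    return model_name if model_name is not None else actor_network.name_or_path


# Either pass the name directly...
print(resolve_model_reference(model_name="Qwen/Qwen2.5-0.5B-Instruct"))
# ...or let it be recovered from a preloaded model.
actor = AutoModelForCausalLM.from_pretrained("Qwen/Qwen2.5-0.5B-Instruct")
print(resolve_model_reference(actor_network=actor))
```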
{agilerl-2.4.0.dev0 → agilerl-2.4.1.dev0}/agilerl/algorithms/dpo.py

@@ -1,10 +1,10 @@
 import gc
+from typing import Any

 import numpy as np
 import torch
 import torch.nn.functional as F
 from accelerate import Accelerator
-from gymnasium import spaces
 from peft import LoraConfig
 from transformers import PreTrainedModel

@@ -16,13 +16,62 @@ from agilerl.utils.llm_utils import PreferenceGym


 class DPO(LLMAlgorithm):
+    """The DPO algorithm class. DPO paper: https://arxiv.org/pdf/2305.18290
+
+    :param pad_token_id: Pad token id
+    :type pad_token_id: int
+    :param pad_token: Pad token
+    :type pad_token: str
+    :param model_name: Model name
+    :type model_name: str, optional
+    :param actor_network: HuggingFace LLM
+    :type actor_network: PreTrainedModel
+    :param model_config: Model configuration, to be used when creating the model from a name or path
+    :param hp_config: RL hyperparameter mutation configuration, defaults to None, whereby algorithm mutations are disabled.
+    :type hp_config: HyperparameterConfig, optional
+    :param index: Index to keep track of object instance during tournament selection and mutation, defaults to 0
+    :type index: int, optional
+    :param batch_size: Batch size for training, defaults to 16
+    :type batch_size: int, optional
+    :param lr: Learning rate, defaults to 0.000005
+    :type lr: float, optional
+    :param beta: Beta parameter for DPO, defaults to 0.001
+    :type beta: float, optional
+    :param max_grad_norm: Maximum gradient norm, defaults to 0.1
+    :type max_grad_norm: float, optional
+    :param update_epochs: Number of update epochs, defaults to 1
+    :type update_epochs: int, optional
+    :param calc_position_embeddings: Flag to indicate if position embeddings should be calculated, defaults to True
+    :type calc_position_embeddings: bool, optional
+    :param micro_batch_size_per_gpu: Micro batch size per GPU, defaults to None
+    :type micro_batch_size_per_gpu: int, optional
+    :param reduce_memory_peak: Flag to indicate if memory peak should be reduced, defaults to False
+    :type reduce_memory_peak: bool, optional
+    :param device: Device for accelerated computing, 'cpu' or 'cuda', defaults to 'cpu'
+    :type device: str, optional
+    :param lora_config: Config for LoRA, defaults to None
+    :type lora_config: LoraConfig, optional
+    :param accelerator: Accelerator for distributed computing, defaults to None
+    :type accelerator: accelerate.Accelerator(), optional
+    :param wrap: Wrap models for distributed training upon creation, defaults to True
+    :type wrap: bool, optional
+    :param clone: Flag to indicate if the instantiation is a cloning, defaults to False
+    :type clone: bool, optional
+    :param use_separate_reference_adapter: Flag to indicate if the reference policy should have a separate adapter, defaults to False
+    :type use_separate_reference_adapter: bool, optional
+    :param seed: Seed for the random number generator, defaults to 42
+    :type seed: int, optional
+    :param gradient_checkpointing: Flag to indicate if gradient checkpointing should be used, defaults to True
+    :type gradient_checkpointing: bool, optional
+    """
+
     def __init__(
         self,
-        observation_space: spaces.Space,
-        action_space: spaces.Space,
-        actor_network: PreTrainedModel,
         pad_token_id: int,
         pad_token: str,
+        model_name: str | None = None,
+        actor_network: PreTrainedModel | None = None,
+        model_config: dict[str, Any] | None = None,
         hp_config: HyperparameterConfig | None = None,
         index: int = 0,
         batch_size: int = 16,

@@ -40,6 +89,7 @@ class DPO(LLMAlgorithm):
         clone: bool = False,
         use_separate_reference_adapter: bool = False,
         seed: int = 42,
+        gradient_checkpointing: bool = True,
     ):
         device = (
             f"cuda:{accelerator.process_index}"

@@ -47,9 +97,6 @@ class DPO(LLMAlgorithm):
             else ("cuda" if torch.cuda.is_available() else "cpu")
         )
         super().__init__(
-            observation_space,
-            action_space,
-            actor_network,
             index=index,
             batch_size=batch_size,
             lr=lr,

@@ -62,6 +109,9 @@ class DPO(LLMAlgorithm):
             pad_token=pad_token,
             lora_config=lora_config,
             use_separate_reference_adapter=use_separate_reference_adapter,
+            model_name=model_name,
+            actor_network=actor_network,
+            model_config=model_config,
             micro_batch_size_per_gpu=micro_batch_size_per_gpu,
             cosine_lr_schedule_config=None,
             hp_config=hp_config,

@@ -69,6 +119,7 @@ class DPO(LLMAlgorithm):
             device=device,
             accelerator=accelerator,
             name="DPO",
+            gradient_checkpointing=gradient_checkpointing,
         )
         self.beta = beta
         self.temperature = (
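With the gym spaces removed from the constructor, a DPO agent can now be built from a tokenizer and a model name alone. A hedged usage sketch of the new signature (the checkpoint name and the pad-token fallback are illustrative assumptions, not AgileRL documentation):

```python
# Hedged usage sketch of the new DPO signature; the checkpoint name is a placeholder.
from transformers import AutoTokenizer

from agilerl.algorithms.dpo import DPO

tokenizer = AutoTokenizer.from_pretrained("Qwen/Qwen2.5-0.5B-Instruct")
if tokenizer.pad_token is None:
    tokenizer.pad_token = tokenizer.eos_token  # common fallback when no pad token is defined

agent = DPO(
    pad_token_id=tokenizer.pad_token_id,
    pad_token=tokenizer.pad_token,
    model_name="Qwen/Qwen2.5-0.5B-Instruct",  # new: load the policy lazily by name
    batch_size=16,
    beta=0.001,
    gradient_checkpointing=True,  # new flag, defaults to True
)
```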
{agilerl-2.4.0.dev0 → agilerl-2.4.1.dev0}/agilerl/algorithms/grpo.py

@@ -1,12 +1,11 @@
 import gc
-from typing import Optional, Union
+from typing import Any, Optional, Union

 import numpy as np
 import torch
 from accelerate import Accelerator
 from deepspeed.runtime.zero.stage3 import DeepSpeedZeroOptimizer_Stage3
 from deepspeed.runtime.zero.stage_1_and_2 import DeepSpeedZeroOptimizer
-from gymnasium import spaces
 from peft import LoraConfig, PeftModel
 from transformers import GenerationConfig
 from transformers.modeling_utils import PreTrainedModel

@@ -33,12 +32,16 @@ DeepSpeedOptimizerType = Union[
 class GRPO(LLMAlgorithm):
     """The GRPO algorithm class. GRPO paper: https://arxiv.org/pdf/2402.03300

-    :param
-    :type
-    :param
-    :type
+    :param pad_token_id: Pad token id
+    :type pad_token_id: int
+    :param pad_token: Pad token
+    :type pad_token: str
+    :param model_name: Model name
+    :type model_name: str, optional
     :param actor_network: HuggingFace LLM
     :type actor_network: PreTrainedModel
+    :param model_config: Model configuration, to be used when creating the model from a name or path
+    :type model_config: dict[str, Any], optional
     :param hp_config: RL hyperparameter mutation configuration, defaults to None, whereby algorithm mutations are disabled.
     :type hp_config: HyperparameterConfig, optional
     :param index: Index to keep track of object instance during tournament selection and mutation, defaults to 0

@@ -93,15 +96,17 @@ class GRPO(LLMAlgorithm):
     :type vllm_config: VLLMConfig, optional
     :param seed: Seed for the random number generator, defaults to 42
     :type seed: int, optional
+    :param gradient_checkpointing: Flag to indicate if gradient checkpointing should be used, defaults to True
+    :type gradient_checkpointing: bool, optional
     """

     def __init__(
         self,
-        observation_space: spaces.Space,
-        action_space: spaces.Space,
-        actor_network: PreTrainedModel,
         pad_token_id: int,
         pad_token: str,
+        model_name: str | None = None,
+        actor_network: PreTrainedModel | None = None,
+        model_config: dict[str, Any] | None = None,
         hp_config: Optional[HyperparameterConfig] = None,
         index: int = 0,
         batch_size: int = 16,

@@ -132,6 +137,7 @@ class GRPO(LLMAlgorithm):
         use_vllm: bool = False,
         vllm_config: Optional[VLLMConfig] = None,
         seed: int = 42,
+        gradient_checkpointing: bool = True,
     ) -> None:

         device = (

@@ -140,9 +146,6 @@ class GRPO(LLMAlgorithm):
             else ("cuda" if torch.cuda.is_available() else "cpu")
         )
         super().__init__(
-            observation_space,
-            action_space,
-            actor_network,
             index=index,
             batch_size=batch_size,
             lr=lr,

@@ -155,6 +158,9 @@ class GRPO(LLMAlgorithm):
             pad_token=pad_token,
             lora_config=lora_config,
             use_separate_reference_adapter=use_separate_reference_adapter,
+            model_name=model_name,
+            actor_network=actor_network,
+            model_config=model_config,
             micro_batch_size_per_gpu=micro_batch_size_per_gpu,
             cosine_lr_schedule_config=cosine_lr_schedule_config,
             wrap=wrap,

@@ -162,6 +168,7 @@ class GRPO(LLMAlgorithm):
             device=device,
             accelerator=accelerator,
             name="GRPO",
+            gradient_checkpointing=gradient_checkpointing,
         )
         assert isinstance(batch_size, int), "Batch size must be an integer."
         assert batch_size >= 1, "Batch size must be greater than or equal to one."

@@ -179,9 +186,10 @@ class GRPO(LLMAlgorithm):
         assert (
             update_epochs >= 1
         ), "Policy update epochs must be greater than or equal to one."
-
-
-
+        if actor_network is not None:
+            assert isinstance(
+                actor_network, (PeftModel, PreTrainedModel)
+            ), "Actor network must be a PeftModel or PreTrainedModel"

         self.clip_coef = clip_coef
         self.update_epochs = update_epochs
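The preloaded-model path is still supported; when actor_network is given it must now be a PeftModel or PreTrainedModel. A hedged sketch of that path (the checkpoint name is a placeholder, not an AgileRL recommendation):

```python
# Hedged sketch of constructing GRPO from a preloaded HuggingFace model.
from transformers import AutoModelForCausalLM, AutoTokenizer

from agilerl.algorithms.grpo import GRPO

model_id = "Qwen/Qwen2.5-0.5B-Instruct"  # illustrative checkpoint
model = AutoModelForCausalLM.from_pretrained(model_id)
tokenizer = AutoTokenizer.from_pretrained(model_id)

agent = GRPO(
    pad_token_id=tokenizer.eos_token_id,
    pad_token=tokenizer.eos_token,
    actor_network=model,  # must be a PeftModel or PreTrainedModel when supplied
    gradient_checkpointing=True,
)
```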
{agilerl-2.4.0.dev0 → agilerl-2.4.1.dev0}/agilerl/training/train_llm.py

@@ -115,7 +115,7 @@ def finetune_llm_reasoning(

     if init_hp is None:
         init_hp = {}
-    init_hp["BATCH_SIZE_PER_GPU"] = pop[0].
+    init_hp["BATCH_SIZE_PER_GPU"] = pop[0].batch_size_per_process
     init_hp["ALGO"] = pop[0].algo
     data_increment = (
         getattr(dist, "get_world_size", lambda: 1)() if dist.is_initialized() else 1

@@ -463,7 +463,7 @@ def finetune_llm_preference(

     if init_hp is None:
         init_hp = {}
-    init_hp["BATCH_SIZE_PER_GPU"] = pop[0].
+    init_hp["BATCH_SIZE_PER_GPU"] = pop[0].batch_size_per_process
     init_hp["ALGO"] = pop[0].algo

     data_increment = accelerator.num_processes if accelerator is not None else 1
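Both training loops now read the per-process batch size from the agents, and the base class derives the global batch size from it. A small illustration of the relationship introduced in base.py (the world-size value is illustrative):

```python
# batch_size is now derived per agent as batch_size_per_process * num_processes
# (see the base.py hunk above); the numbers here are illustrative only.
num_processes = 4            # e.g. accelerator.num_processes under Accelerate/DeepSpeed
batch_size_per_process = 2   # what each GPU actually steps through
batch_size = batch_size_per_process * num_processes
assert batch_size == 8       # the effective global batch size reported by the agent
```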
{agilerl-2.4.0.dev0 → agilerl-2.4.1.dev0}/agilerl/utils/algo_utils.py

@@ -1328,6 +1328,13 @@ class VLLMConfig:
     max_num_seqs: int = 8
     sleep_mode: bool = False

+    def __post_init__(self):
+        if self.sleep_mode:
+            warnings.warn(
+                """VLLM sleep mode cannot be used with populations of agents on a single device. To use sleep mode, ensure,
+                you are training a single agent or, alternatively, use a different device for each agent."""
+            )
+

 def create_warmup_cosine_scheduler(
     optimizer: torch.optim.Optimizer,
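The new __post_init__ hook means the warning fires as soon as a config with sleep_mode=True is constructed. A self-contained sketch of the same pattern (a stand-in dataclass, not the AgileRL VLLMConfig itself):

```python
# Stand-in dataclass illustrating the __post_init__ validation pattern added to
# VLLMConfig; field names mirror the diff, but this is not the AgileRL class.
import warnings
from dataclasses import dataclass


@dataclass
class SleepModeConfig:
    max_num_seqs: int = 8
    sleep_mode: bool = False

    def __post_init__(self) -> None:
        if self.sleep_mode:
            warnings.warn(
                "Sleep mode cannot be used with populations of agents on a single device."
            )


cfg = SleepModeConfig(sleep_mode=True)  # the warning is emitted at construction time
```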
{agilerl-2.4.0.dev0 → agilerl-2.4.1.dev0}/agilerl/utils/llm_utils.py

@@ -11,12 +11,53 @@ import torch.nn as nn
 from accelerate import Accelerator
 from datasets import Dataset
 from torch.utils.data import DataLoader
-from transformers import AutoTokenizer
+from transformers import AutoModelForCausalLM, AutoTokenizer
+from transformers.modeling_utils import PreTrainedModel
 from transformers.tokenization_utils_base import BatchEncoding

 from agilerl.typing import PreferencePrompts, ReasoningPrompts


+def apply_chat_template(
+    conversation_template: list[dict[str, str]],
+    question: str,
+    answer: str,
+    tokenizer: AutoTokenizer,
+) -> BatchEncoding:
+    """
+    Create and tokenize a chat template for a reasoning task.
+
+    :param conversation_template: The conversation template to be tokenized.
+    :type conversation_template: list[dict[str, str]]
+    :param question: The question to be tokenized.
+    :type question: str
+    :param answer: The answer to be tokenized.
+    :type answer: str
+    :param tokenizer: The tokenizer to be used.
+    :type tokenizer: AutoTokenizer
+    :return: The tokenized prompt.
+    :rtype: BatchEncoding
+    """
+    formatted_conversation = [
+        {
+            "role": msg["role"],
+            "content": msg["content"].format(question=question, answer=answer),
+        }
+        for msg in conversation_template
+    ]
+    updated_prompt = tokenizer.apply_chat_template(
+        formatted_conversation, tokenize=False, continue_final_message=True
+    )
+    tokenized_prompt = tokenizer(
+        [updated_prompt],
+        return_tensors="pt",
+        padding=True,
+        padding_side="left",
+        return_attention_mask=True,
+    )
+    return tokenized_prompt
+
+
 class HuggingFaceGym(gym.Env, ABC):
     """Abstract base class for HuggingFace Gymnasium environments.

@@ -28,8 +69,8 @@ class HuggingFaceGym(gym.Env, ABC):
     :type tokenizer: AutoTokenizer
     :param custom_collate_fn: Custom collate function to be used for creating the batch, defaults to None
     :type custom_collate_fn: Callable, optional
-    :param
-    :type
+    :param conversation_template: A structured conversation that acts as a base pattern for each data point.
+    :type conversation_template: list[dict[str, str]]
     :param data_batch_size_per_gpu: DataLoader batch size, defaults to 8
     :type data_batch_size_per_gpu: int, optional
     :param max_context_length: Maximum context length, defaults to None

@@ -47,12 +88,7 @@ class HuggingFaceGym(gym.Env, ABC):
         train_dataset: Dataset,
         test_dataset: Dataset,
         tokenizer: AutoTokenizer,
-        custom_collate_fn: (
-            Callable[[list[dict[str, Any]]], dict[str, Any]] | None
-        ) = None,
-        apply_chat_template_fn: (
-            Callable[[str, str, AutoTokenizer], BatchEncoding] | None
-        ) = None,
+        conversation_template: list[dict[str, str]],
         data_batch_size_per_gpu: int = 8,
         max_context_length: int | None = None,
         min_completion_length: int = None,

@@ -70,11 +106,8 @@ class HuggingFaceGym(gym.Env, ABC):
         self.max_context_length = max_context_length
         self.seed = seed
         generator = torch.Generator().manual_seed(seed)
-
-
-        if apply_chat_template_fn is not None:
-            collate_kwargs["apply_chat_template_fn"] = apply_chat_template_fn
-        custom_collate_fn = self.create_collate_fn(**collate_kwargs)
+        self.conversation_template = conversation_template
+        custom_collate_fn = self.create_collate_fn(tokenizer)
         dataloader_kwargs = {"collate_fn": custom_collate_fn}
         train_dataset = self._filter_dataset_by_max_context_length(
             train_dataset, "train dataset"

@@ -107,11 +140,6 @@ class HuggingFaceGym(gym.Env, ABC):
         self.test_dataloader_iter = iter(self.test_dataloader)
         self.dataloader = self.train_dataloader_iter
         self.reset_called = False
-        self.observation_space = gym.spaces.Box(low=0, high=tokenizer.vocab_size - 1)
-        self.action_space = gym.spaces.Box(
-            low=0,
-            high=tokenizer.vocab_size - 1,
-        )
         self.evaluation_mode = False
         self.num_epochs = 0

@@ -196,9 +224,11 @@ class HuggingFaceGym(gym.Env, ABC):
         :rtype: tuple[Dataset, Dataset]
         """
         dataset_type = "dataset" if dataset_type is None else dataset_type
-        if self.max_context_length is None:
-            return dataset
         filter_keyword = "prompt" if "prompt" in dataset.features.keys() else "question"
+        if self.max_context_length is None or not isinstance(
+            dataset[0][filter_keyword], str
+        ):
+            return dataset
         filtered_dataset = dataset.filter(
             lambda x: len(self.tokenizer.encode(x[filter_keyword]))
             <= self.max_context_length - self.min_completion_length

@@ -225,10 +255,10 @@ class ReasoningGym(HuggingFaceGym):
     :type tokenizer: AutoTokenizer
     :param reward_fn: Reward function for evaluating completions.
     :type reward_fn: Callable[..., float]
+    :param conversation_template: A structured conversation that acts as a base pattern for each data point.
+    :type conversation_template: list[dict[str, str]]
     :param data_batch_size_per_gpu: DataLoader batch size, defaults to 8
     :type data_batch_size_per_gpu: int, optional
-    :param custom_collate_fn: Custom collate function to be used for creating the batch, defaults to None
-    :type custom_collate_fn: Callable, optional
     :param accelerator: Accelerator to be used for training, defaults to None
     :type accelerator: Accelerator, optional
     :param max_context_length: Maximum context length, defaults to None

@@ -245,9 +275,8 @@ class ReasoningGym(HuggingFaceGym):
         test_dataset: Dataset,
         tokenizer: AutoTokenizer,
         reward_fn: Callable[[str, str, str], float],
-
+        conversation_template: list[dict[str, str]],
         data_batch_size_per_gpu: int = 8,
-        custom_collate_fn: Callable | None = None,
         accelerator: Accelerator | None = None,
         return_raw_completions: bool = False,
         max_context_length: int | None = None,

@@ -264,8 +293,7 @@ class ReasoningGym(HuggingFaceGym):
             train_dataset=train_dataset,
             test_dataset=test_dataset,
             tokenizer=tokenizer,
-
-            apply_chat_template_fn=apply_chat_template_fn,
+            conversation_template=conversation_template,
             data_batch_size_per_gpu=data_batch_size_per_gpu,
             max_context_length=max_context_length,
             min_completion_length=0,

@@ -382,15 +410,12 @@ class ReasoningGym(HuggingFaceGym):
     def create_collate_fn(
         self,
         tokenizer: AutoTokenizer,
-        apply_chat_template_fn: Callable[[str, str, AutoTokenizer], BatchEncoding],
     ) -> Callable[[list[dict[str, Any]]], dict[str, Any]]:
         """
         Create a collate function that applies the chat template to the batch of questions and answers.

         :param tokenizer: Tokenizer to be used for encoding and decoding the prompts.
         :type tokenizer: AutoTokenizer
-        :param apply_chat_template_fn: Function to apply the chat template to the batch of questions and answers.
-        :type apply_chat_template_fn: Callable[[str, str, AutoTokenizer], BatchEncoding]
         :return: Collate function that applies the chat template to the batch of questions and answers.
         :rtype: Callable[[list[dict[str, Any]]], dict[str, Any]]
         """

@@ -402,7 +427,7 @@ class ReasoningGym(HuggingFaceGym):

             # Apply chat template to all samples
             tokenized_prompts = [
-
+                apply_chat_template(self.conversation_template, q, a, tokenizer)
                 for q, a in zip(questions, answers)
             ]

@@ -451,8 +476,7 @@ class PreferenceGym(HuggingFaceGym):
             train_dataset=train_dataset,
             test_dataset=test_dataset,
             tokenizer=tokenizer,
-
-            apply_chat_template_fn=None,
+            conversation_template=None,
             data_batch_size_per_gpu=data_batch_size_per_gpu,
             max_context_length=max_context_length,
             min_completion_length=min_completion_length,

@@ -667,3 +691,27 @@ def get_state_dict(model: nn.Module) -> dict[str, torch.Tensor]:

     with gather_if_zero3(3, list(model.parameters()), modifier_rank=0):
         return model.state_dict()
+
+
+def create_model_from_name_or_path(
+    model_name_or_path: str, model_config: dict[str, Any] | None = None
+) -> PreTrainedModel:
+    """
+    Create a model from a name or path.
+
+    :param model_name_or_path: The name or path of the model to create.
+    :type model_name_or_path: str
+    :param model_config: The configuration of the model to create.
+    :type model_config: dict[str, Any] | None
+    :return: The created model.
+    :rtype: PreTrainedModel
+    """
+    if model_config is None:
+        model_config = {
+            "torch_dtype": torch.bfloat16,
+            "attn_implementation": "sdpa",
+        }
+    model = AutoModelForCausalLM.from_pretrained(
+        pretrained_model_name_or_path=model_name_or_path, **model_config
+    )
+    return model
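The per-sample apply_chat_template_fn callback is replaced by a declarative conversation_template: a list of role/content messages whose {question} and {answer} placeholders are filled for every sample. A hedged usage sketch (the template wording and checkpoint name are illustrative assumptions, not AgileRL defaults):

```python
# Hedged sketch of the new conversation_template mechanism; template wording and
# the checkpoint name are illustrative assumptions, not AgileRL defaults.
from transformers import AutoTokenizer

from agilerl.utils.llm_utils import apply_chat_template, create_model_from_name_or_path

tokenizer = AutoTokenizer.from_pretrained("Qwen/Qwen2.5-0.5B-Instruct")

conversation_template = [
    {"role": "system", "content": "Answer the question and show your reasoning."},
    {"role": "user", "content": "{question}"},
    {"role": "assistant", "content": "<think>"},
]

batch = apply_chat_template(
    conversation_template,
    question="What is 17 * 3?",
    answer="51",  # substituted only where a template message references {answer}
    tokenizer=tokenizer,
)
print(batch["input_ids"].shape)  # (1, prompt_length), left-padded

# Models can also be created lazily by name, with a bf16 / SDPA config by default.
model = create_model_from_name_or_path("Qwen/Qwen2.5-0.5B-Instruct")
```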
{agilerl-2.4.0.dev0 → agilerl-2.4.1.dev0}/agilerl/utils/utils.py

@@ -36,7 +36,7 @@ from agilerl.hpo.mutation import Mutations
 from agilerl.hpo.tournament import TournamentSelection
 from agilerl.modules import EvolvableModule
 from agilerl.typing import BPTTSequenceType, GymSpaceType, PopulationType
-from agilerl.utils.algo_utils import CosineLRScheduleConfig,
+from agilerl.utils.algo_utils import CosineLRScheduleConfig, clone_llm
 from agilerl.utils.llm_utils import DummyOptimizer, get_state_dict
 from agilerl.vector.pz_async_vec_env import AsyncPettingZooVecEnv

@@ -213,10 +213,10 @@ def default_progress_bar(

 def create_population(
     algo: str,
-    observation_space: GymSpaceType,
-    action_space: GymSpaceType,
     net_config: Optional[dict[str, Any]],
     INIT_HP: dict[str, Any],
+    observation_space: GymSpaceType | None = None,
+    action_space: GymSpaceType | None = None,
     hp_config: Optional[HyperparameterConfig] = None,
     actor_network: Optional[EvolvableModule] = None,
     critic_network: Optional[EvolvableModule] = None,

@@ -233,14 +233,14 @@ def create_population(

     :param algo: RL algorithm
     :type algo: str
-    :param observation_space: Observation space
-    :type observation_space: spaces.Space
-    :param action_space: Action space
-    :type action_space: spaces.Space
     :param net_config: Network configuration
     :type net_config: dict or None
     :param INIT_HP: Initial hyperparameters
     :type INIT_HP: dict
+    :param observation_space: Observation space
+    :type observation_space: spaces.Space
+    :param action_space: Action space
+    :type action_space: spaces.Space
     :param hp_config: Choice of algorithm hyperparameters to mutate during training, defaults to None
     :type hp_config: HyperparameterConfig, optional
     :param actor_network: Custom actor network, defaults to None

@@ -572,23 +572,23 @@ def create_population(
     elif algo == "GRPO":
         for idx in range(population_size):
             agent = GRPO(
-                observation_space=observation_space,
-                action_space=action_space,
                 actor_network=(
-
-
-
-
-
-
-
-
+                    (
+                        clone_llm(
+                            actor_network,
+                            zero_stage=INIT_HP.get("ZERO_STAGE", 0),
+                            state_dict=(
+                                actor_network.state_dict()
+                                if accelerator is None
+                                else get_state_dict(actor_network)
+                            ),
+                        )
+                        if idx != 0
+                        else actor_network
                     )
-                    if
-                    else
+                    if actor_network is not None
+                    else None
                 ),
-                pad_token_id=INIT_HP.get("PAD_TOKEN_ID"),
-                pad_token=INIT_HP.get("PAD_TOKEN"),
                 hp_config=hp_config,
                 index=idx,
                 batch_size=INIT_HP.get("BATCH_SIZE", 2),

@@ -601,7 +601,7 @@ def create_population(
                 temperature=INIT_HP.get("TEMPERATURE", 0.9),
                 calc_position_embeddings=INIT_HP.get("CALC_POSITION_EMBEDDINGS", True),
                 reduce_memory_peak=INIT_HP.get("REDUCE_MEMORY_PEAK", False),
-                max_output_tokens=INIT_HP.get("MAX_OUTPUT_TOKENS",
+                max_output_tokens=INIT_HP.get("MAX_OUTPUT_TOKENS", None),
                 min_output_tokens=INIT_HP.get("MIN_OUTPUT_TOKENS", None),
                 cosine_lr_schedule_config=(
                     CosineLRScheduleConfig(**INIT_HP.get("COSINE_lR_SCHEDULER", None))

@@ -610,23 +610,13 @@ def create_population(
                 ),
                 accelerator=Accelerator() if accelerator else None,
                 device=device,
-                use_separate_reference_adapter=False,
                 max_model_len=INIT_HP.get("MAX_MODEL_LEN", None),
-                use_vllm=INIT_HP.get("USE_VLLM", False),
-                vllm_config=(
-                    VLLMConfig(**INIT_HP.get("VLLM_CONFIG"))
-                    if INIT_HP.get("VLLM_CONFIG", None) is not None
-                    and INIT_HP.get("USE_VLLM", False)
-                    else None
-                ),
                 **algo_kwargs,
             )
             population.append(agent)
     elif algo == "DPO":
         for idx in range(population_size):
             agent = DPO(
-                observation_space=observation_space,
-                action_space=action_space,
                 actor_network=(
                     clone_llm(
                         actor_network,

@@ -640,8 +630,6 @@ def create_population(
                     if idx != 0
                     else actor_network
                 ),
-                pad_token_id=INIT_HP.get("PAD_TOKEN_ID"),
-                pad_token=INIT_HP.get("PAD_TOKEN"),
                 hp_config=hp_config,
                 index=idx,
                 batch_size=INIT_HP.get("BATCH_SIZE", 2),
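For LLM algorithms, create_population now clones the supplied model for every agent after the first and no longer takes gym spaces or pad-token arguments. A hedged call sketch (the checkpoint name and the population_size keyword are assumptions for illustration; check the full signature before relying on them):

```python
# Hedged sketch of the updated create_population call for an LLM algorithm;
# the checkpoint name and the population_size keyword are assumptions.
from transformers import AutoModelForCausalLM

from agilerl.utils.utils import create_population

actor = AutoModelForCausalLM.from_pretrained("Qwen/Qwen2.5-0.5B-Instruct")
INIT_HP = {"BATCH_SIZE": 2, "ZERO_STAGE": 0, "REDUCE_MEMORY_PEAK": False}

population = create_population(
    algo="GRPO",
    net_config=None,
    INIT_HP=INIT_HP,
    # observation_space / action_space are now optional and unused for LLM algorithms
    actor_network=actor,   # agent 0 keeps this model; later agents receive clone_llm copies
    population_size=4,     # assumed keyword name for the number of agents
)
```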
All remaining files listed above with +0 -0 are unchanged between the two versions.