judgeval 0.16.9__py3-none-any.whl → 0.17.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Warning: this release of judgeval has been flagged as potentially problematic.

judgeval/trainer/__init__.py CHANGED
@@ -1,5 +1,14 @@
 from judgeval.trainer.trainer import JudgmentTrainer
 from judgeval.trainer.config import TrainerConfig, ModelConfig
 from judgeval.trainer.trainable_model import TrainableModel
+from judgeval.trainer.base_trainer import BaseTrainer
+from judgeval.trainer.fireworks_trainer import FireworksTrainer
 
-__all__ = ["JudgmentTrainer", "TrainerConfig", "ModelConfig", "TrainableModel"]
+__all__ = [
+    "JudgmentTrainer",
+    "TrainerConfig",
+    "ModelConfig",
+    "TrainableModel",
+    "BaseTrainer",
+    "FireworksTrainer",
+]
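
With the expanded `__all__`, both the abstract interface and the Fireworks implementation are importable straight from `judgeval.trainer`. A minimal sketch of consumer code against 0.17.0 (the `describe` helper is hypothetical, not part of the package):

```python
from judgeval.trainer import (
    BaseTrainer,       # new in 0.17.0: abstract provider interface
    FireworksTrainer,  # new in 0.17.0: Fireworks AI implementation
    JudgmentTrainer,   # now a factory function (see trainer.py below)
    TrainerConfig,
)

# Hypothetical helper: any provider-specific trainer can be handled
# uniformly through the BaseTrainer interface.
def describe(trainer: BaseTrainer) -> str:
    return f"{type(trainer).__name__} on {trainer.config.base_model_name}"
```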
judgeval/trainer/base_trainer.py ADDED
@@ -0,0 +1,117 @@
+from abc import ABC, abstractmethod
+from typing import Any, Callable, List, Optional, Union, Dict, TYPE_CHECKING
+from .config import TrainerConfig, ModelConfig
+from judgeval.scorers import ExampleScorer, ExampleAPIScorerConfig
+
+if TYPE_CHECKING:
+    from judgeval.tracer import Tracer
+    from .trainable_model import TrainableModel
+
+
+class BaseTrainer(ABC):
+    """
+    Abstract base class for training providers.
+
+    This class defines the interface that all training provider implementations
+    must follow. Each provider (Fireworks, Verifiers, etc.) will have its own
+    concrete implementation of this interface.
+    """
+
+    def __init__(
+        self,
+        config: TrainerConfig,
+        trainable_model: "TrainableModel",
+        tracer: "Tracer",
+        project_name: Optional[str] = None,
+    ):
+        """
+        Initialize the base trainer.
+
+        Args:
+            config: TrainerConfig instance with training parameters
+            trainable_model: TrainableModel instance to use for training
+            tracer: Tracer for observability
+            project_name: Project name for organizing training runs
+        """
+        self.config = config
+        self.trainable_model = trainable_model
+        self.tracer = tracer
+        self.project_name = project_name or "judgment_training"
+
+    @abstractmethod
+    async def generate_rollouts_and_rewards(
+        self,
+        agent_function: Callable[[Any], Any],
+        scorers: List[Union[ExampleAPIScorerConfig, ExampleScorer]],
+        prompts: List[Any],
+        num_prompts_per_step: Optional[int] = None,
+        num_generations_per_prompt: Optional[int] = None,
+        concurrency: Optional[int] = None,
+    ) -> Any:
+        """
+        Generate rollouts and compute rewards using the current model snapshot.
+
+        Args:
+            agent_function: Function/agent to call for generating responses
+            scorers: List of scorer objects to evaluate responses
+            prompts: List of prompts to use for training
+            num_prompts_per_step: Number of prompts to use per step
+            num_generations_per_prompt: Generations per prompt
+            concurrency: Concurrency limit
+
+        Returns:
+            Provider-specific dataset format for training
+        """
+        pass
+
+    @abstractmethod
+    async def run_reinforcement_learning(
+        self,
+        agent_function: Callable[[Any], Any],
+        scorers: List[Union[ExampleAPIScorerConfig, ExampleScorer]],
+        prompts: List[Any],
+    ) -> ModelConfig:
+        """
+        Run the iterative reinforcement learning fine-tuning loop.
+
+        Args:
+            agent_function: Function/agent to call for generating responses
+            scorers: List of scorer objects to evaluate responses
+            prompts: List of prompts to use for training
+
+        Returns:
+            ModelConfig: Configuration of the trained model
+        """
+        pass
+
+    @abstractmethod
+    async def train(
+        self,
+        agent_function: Callable[[Any], Any],
+        scorers: List[Union[ExampleAPIScorerConfig, ExampleScorer]],
+        prompts: List[Any],
+    ) -> ModelConfig:
+        """
+        Start the reinforcement learning fine-tuning process.
+
+        This is the main entry point for running the training.
+
+        Args:
+            agent_function: Function/agent to call for generating responses
+            scorers: List of scorer objects to evaluate responses
+            prompts: List of prompts to use for training
+
+        Returns:
+            ModelConfig: Configuration of the trained model
+        """
+        pass
+
+    @abstractmethod
+    def _extract_message_history_from_spans(self) -> List[Dict[str, str]]:
+        """
+        Extract message history from spans for training purposes.
+
+        Returns:
+            List of message dictionaries with 'role' and 'content' keys
+        """
+        pass
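
Because `BaseTrainer` is an ABC, adding a provider means implementing the four abstract methods above. A hypothetical skeleton, using the `VerifiersTrainer` name that appears as a placeholder later in this diff (the body is illustrative, not shipped code):

```python
from typing import Any, Dict, List

from judgeval.trainer import BaseTrainer, ModelConfig


class VerifiersTrainer(BaseTrainer):
    """Illustrative skeleton; 0.17.0 ships no Verifiers implementation."""

    async def generate_rollouts_and_rewards(
        self, agent_function, scorers, prompts,
        num_prompts_per_step=None, num_generations_per_prompt=None,
        concurrency=None,
    ) -> Any:
        raise NotImplementedError  # collect provider-specific rollouts here

    async def run_reinforcement_learning(
        self, agent_function, scorers, prompts
    ) -> ModelConfig:
        raise NotImplementedError  # drive the provider's RFT loop here

    async def train(self, agent_function, scorers, prompts) -> ModelConfig:
        return await self.run_reinforcement_learning(agent_function, scorers, prompts)

    def _extract_message_history_from_spans(self) -> List[Dict[str, str]]:
        return []  # reconstruct conversation history from trace spans
```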
judgeval/trainer/config.py CHANGED
@@ -16,7 +16,7 @@ class TrainerConfig:
     user_id: str
     model_id: str
     base_model_name: str = "qwen2p5-7b-instruct"
-    rft_provider: str = "fireworks"
+    rft_provider: str = "fireworks"  # Supported: "fireworks", "verifiers" (future)
     num_steps: int = 5
     num_generations_per_prompt: int = 4
     num_prompts_per_step: int = 4
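
The new comment on `rft_provider` documents the single switch that selects the training backend. A hedged construction sketch (field values are placeholders; `deployment_id` is taken from the factory docstring shown later in this diff):

```python
from judgeval.trainer import TrainerConfig

config = TrainerConfig(
    deployment_id="my-deployment",  # placeholder values
    user_id="my-user",
    model_id="my-model",
    rft_provider="fireworks",  # the only provider accepted in 0.17.0
)
# Unset fields fall back to the dataclass defaults visible above:
# base_model_name="qwen2p5-7b-instruct", num_steps=5,
# num_generations_per_prompt=4, num_prompts_per_step=4.
```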
judgeval/trainer/fireworks_trainer.py ADDED
@@ -0,0 +1,381 @@
+import asyncio
+import json
+from typing import Optional, Callable, Any, List, Union, Dict
+from fireworks import Dataset  # type: ignore[import-not-found]
+from .config import TrainerConfig, ModelConfig
+from .base_trainer import BaseTrainer
+from .trainable_model import TrainableModel
+from judgeval.tracer import Tracer
+from judgeval.tracer.exporters.store import SpanStore
+from judgeval.tracer.exporters import InMemorySpanExporter
+from judgeval.tracer.keys import AttributeKeys
+from judgeval import JudgmentClient
+from judgeval.scorers import ExampleScorer, ExampleAPIScorerConfig
+from judgeval.data import Example
+from .console import _spinner_progress, _print_progress, _print_progress_update
+from judgeval.exceptions import JudgmentRuntimeError
+
+
+class FireworksTrainer(BaseTrainer):
+    """
+    Fireworks AI implementation of the training provider.
+
+    This trainer uses Fireworks AI's infrastructure for reinforcement learning
+    fine-tuning (RFT) of language models.
+    """
+
+    def __init__(
+        self,
+        config: TrainerConfig,
+        trainable_model: TrainableModel,
+        tracer: Tracer,
+        project_name: Optional[str] = None,
+    ):
+        """
+        Initialize the FireworksTrainer.
+
+        Args:
+            config: TrainerConfig instance with training parameters
+            trainable_model: TrainableModel instance for Fireworks training
+            tracer: Tracer for observability
+            project_name: Project name for organizing training runs and evaluations
+        """
+        try:
+            super().__init__(config, trainable_model, tracer, project_name)
+
+            self.judgment_client = JudgmentClient()
+            self.span_store = SpanStore()
+            self.span_exporter = InMemorySpanExporter(self.span_store)
+        except Exception as e:
+            raise JudgmentRuntimeError(
+                f"Failed to initialize FireworksTrainer: {str(e)}"
+            ) from e
+
+    def _extract_message_history_from_spans(self) -> List[Dict[str, str]]:
+        """
+        Extract message history from spans in the span store for training purposes.
+
+        This method processes trace spans to reconstruct the conversation flow,
+        extracting messages in chronological order from LLM, user, and tool spans.
+
+        Returns:
+            List of message dictionaries with 'role' and 'content' keys
+        """
+        spans = self.span_store.get_all()
+        if not spans:
+            return []
+
+        messages = []
+        first_found = False
+
+        for span in sorted(spans, key=lambda s: getattr(s, "start_time", 0)):
+            span_attributes = span.attributes or {}
+            span_type = span_attributes.get(AttributeKeys.JUDGMENT_SPAN_KIND, "span")
+
+            if (
+                not span_attributes.get(AttributeKeys.JUDGMENT_OUTPUT)
+                and span_type != "llm"
+            ):
+                continue
+
+            if span_type == "llm":
+                if not first_found and span_attributes.get(
+                    AttributeKeys.JUDGMENT_INPUT
+                ):
+                    input_data: Any = span_attributes.get(
+                        AttributeKeys.JUDGMENT_INPUT, {}
+                    )
+                    if isinstance(input_data, dict) and "messages" in input_data:
+                        input_messages = input_data["messages"]
+                        if input_messages:
+                            first_found = True
+                            for msg in input_messages:
+                                if (
+                                    isinstance(msg, dict)
+                                    and "role" in msg
+                                    and "content" in msg
+                                ):
+                                    messages.append(
+                                        {"role": msg["role"], "content": msg["content"]}
+                                    )
+
+                # Add assistant response from span output
+                output = span_attributes.get(AttributeKeys.JUDGMENT_OUTPUT)
+                if output is not None:
+                    content = str(output)
+                    try:
+                        parsed = json.loads(content)
+                        if isinstance(parsed, dict) and "messages" in parsed:
+                            # Extract the actual assistant message content
+                            for msg in parsed["messages"]:
+                                if (
+                                    isinstance(msg, dict)
+                                    and msg.get("role") == "assistant"
+                                ):
+                                    content = msg.get("content", content)
+                                    break
+                    except (json.JSONDecodeError, KeyError):
+                        pass
+                    messages.append({"role": "assistant", "content": content})
+
+            elif span_type in ("user", "tool"):
+                output = span_attributes.get(AttributeKeys.JUDGMENT_OUTPUT)
+                if output is not None:
+                    content = str(output)
+                    try:
+                        parsed = json.loads(content)
+                        if isinstance(parsed, dict) and "messages" in parsed:
+                            for msg in parsed["messages"]:
+                                if isinstance(msg, dict) and msg.get("role") == "user":
+                                    content = msg.get("content", content)
+                                    break
+                    except (json.JSONDecodeError, KeyError):
+                        pass
+                    messages.append({"role": "user", "content": content})
+
+        return messages
+
+    async def generate_rollouts_and_rewards(
+        self,
+        agent_function: Callable[[Any], Any],
+        scorers: List[Union[ExampleAPIScorerConfig, ExampleScorer]],
+        prompts: List[Any],
+        num_prompts_per_step: Optional[int] = None,
+        num_generations_per_prompt: Optional[int] = None,
+        concurrency: Optional[int] = None,
+    ):
+        """
+        Generate rollouts and compute rewards using the current model snapshot.
+        Each sample contains multiple generations for reinforcement learning optimization.
+
+        Args:
+            agent_function: Function/agent to call for generating responses
+            scorers: List of scorer objects to evaluate responses
+            prompts: List of prompts to use for training
+            num_prompts_per_step: Number of prompts to use per step (defaults to config value, limited by prompts list length)
+            num_generations_per_prompt: Generations per prompt (defaults to config value)
+            concurrency: Concurrency limit (defaults to config value)
+
+        Returns:
+            List of dataset rows containing samples with messages and evaluations
+        """
+        num_prompts_per_step = min(
+            num_prompts_per_step or self.config.num_prompts_per_step, len(prompts)
+        )
+        num_generations_per_prompt = (
+            num_generations_per_prompt or self.config.num_generations_per_prompt
+        )
+        concurrency = concurrency or self.config.concurrency
+
+        semaphore = asyncio.Semaphore(concurrency)
+
+        @self.tracer.observe(span_type="function")
+        async def generate_single_response(prompt_id, generation_id):
+            async with semaphore:
+                prompt_input = prompts[prompt_id]
+                response_data = await agent_function(**prompt_input)
+                messages = response_data.get("messages", [])
+
+                try:
+                    traced_messages = self._extract_message_history_from_spans()
+                    if traced_messages:
+                        messages = traced_messages
+                except Exception as e:
+                    print(f"Warning: Failed to get message history from trace: {e}")
+                    pass
+
+                finally:
+                    self.span_store.spans = []
+
+                example = Example(
+                    input=prompt_input,
+                    messages=messages,
+                    actual_output=response_data,
+                )
+
+                scoring_results = self.judgment_client.run_evaluation(
+                    examples=[example],
+                    scorers=scorers,
+                    project_name=self.project_name,
+                    eval_run_name=f"training_step_{self.trainable_model.current_step}_prompt_{prompt_id}_gen_{generation_id}",
+                )
+
+                if scoring_results and scoring_results[0].scorers_data:
+                    scores = [
+                        scorer_data.score
+                        for scorer_data in scoring_results[0].scorers_data
+                        if scorer_data.score is not None
+                    ]
+                    reward = sum(scores) / len(scores) if scores else 0.0
+                else:
+                    reward = 0.0
+
+                return {
+                    "prompt_id": prompt_id,
+                    "generation_id": generation_id,
+                    "messages": messages,
+                    "evals": {"score": reward},
+                }
+
+        coros = []
+        for prompt_id in range(num_prompts_per_step):
+            for generation_id in range(num_generations_per_prompt):
+                coro = generate_single_response(prompt_id, generation_id)
+                coros.append(coro)
+
+        with _spinner_progress(f"Generating {len(coros)} rollouts..."):
+            num_completed = 0
+            results = []
+
+            for coro in asyncio.as_completed(coros):
+                result = await coro
+                results.append(result)
+                num_completed += 1
+
+        _print_progress(f"Generated {len(results)} rollouts successfully")
+
+        dataset_rows = []
+        for prompt_id in range(num_prompts_per_step):
+            prompt_generations = [r for r in results if r["prompt_id"] == prompt_id]
+            sample_generations = [
+                {"messages": gen["messages"], "evals": gen["evals"]}
+                for gen in prompt_generations
+            ]
+            dataset_rows.append({"samples": sample_generations})
+
+        return dataset_rows
+
+    async def run_reinforcement_learning(
+        self,
+        agent_function: Callable[[Any], Any],
+        scorers: List[Union[ExampleAPIScorerConfig, ExampleScorer]],
+        prompts: List[Any],
+    ) -> ModelConfig:
+        """
+        Run the iterative reinforcement learning fine-tuning loop.
+
+        This method performs multiple steps of reinforcement learning, where each step:
+        1. Advances to the appropriate model snapshot
+        2. Generates rollouts and computes rewards using scorers
+        3. Trains a new model using reinforcement learning
+        4. Waits for training completion
+
+        Args:
+            agent_function: Function/agent to call for generating responses
+            scorers: List of scorer objects to evaluate responses
+            prompts: List of prompts to use for training
+
+        Returns:
+            ModelConfig: Configuration of the trained model for inference and future training
+        """
+
+        _print_progress("Starting reinforcement learning training")
+
+        training_params = {
+            "num_steps": self.config.num_steps,
+            "num_prompts_per_step": self.config.num_prompts_per_step,
+            "num_generations_per_prompt": self.config.num_generations_per_prompt,
+            "epochs": self.config.epochs,
+            "learning_rate": self.config.learning_rate,
+            "accelerator_count": self.config.accelerator_count,
+            "accelerator_type": self.config.accelerator_type,
+            "temperature": self.config.temperature,
+            "max_tokens": self.config.max_tokens,
+        }
+
+        start_step = self.trainable_model.current_step
+
+        for step in range(start_step, self.config.num_steps):
+            step_num = step + 1
+            _print_progress(
+                f"Starting training step {step_num}", step_num, self.config.num_steps
+            )
+
+            self.trainable_model.advance_to_next_step(step)
+
+            dataset_rows = await self.generate_rollouts_and_rewards(
+                agent_function, scorers, prompts
+            )
+
+            with _spinner_progress(
+                "Preparing training dataset", step_num, self.config.num_steps
+            ):
+                dataset = Dataset.from_list(dataset_rows)
+                dataset.sync()
+
+            _print_progress(
+                "Starting reinforcement training", step_num, self.config.num_steps
+            )
+            job = self.trainable_model.perform_reinforcement_step(dataset, step)
+
+            last_state = None
+            with _spinner_progress(
+                "Training job in progress", step_num, self.config.num_steps
+            ):
+                while not job.is_completed:
+                    job.raise_if_bad_state()
+                    current_state = job.state
+
+                    if current_state != last_state:
+                        if current_state in ["uploading", "validating"]:
+                            _print_progress_update(
+                                f"Training job: {current_state} data"
+                            )
+                        elif current_state == "training":
+                            _print_progress_update(
+                                "Training job: model training in progress"
+                            )
+                        else:
+                            _print_progress_update(f"Training job: {current_state}")
+                        last_state = current_state
+
+                    await asyncio.sleep(10)
+                    job = job.get()
+                    if job is None:
+                        raise JudgmentRuntimeError(
+                            "Training job was deleted while waiting for completion"
+                        )
+
+            _print_progress(
+                f"Training completed! New model: {job.output_model}",
+                step_num,
+                self.config.num_steps,
+            )
+
+            dataset.delete()
+
+        _print_progress("All training steps completed!")
+
+        with _spinner_progress("Deploying final trained model"):
+            self.trainable_model.advance_to_next_step(self.config.num_steps)
+
+        return self.trainable_model.get_model_config(training_params)
+
+    async def train(
+        self,
+        agent_function: Callable[[Any], Any],
+        scorers: List[Union[ExampleAPIScorerConfig, ExampleScorer]],
+        prompts: List[Any],
+    ) -> ModelConfig:
+        """
+        Start the reinforcement learning fine-tuning process.
+
+        This is the main entry point for running the reinforcement learning training.
+
+        Args:
+            agent_function: Function/agent to call for generating responses.
+            scorers: List of scorer objects to evaluate responses
+            prompts: List of prompts to use for training
+
+        Returns:
+            ModelConfig: Configuration of the trained model for future loading
+        """
+        try:
+            return await self.run_reinforcement_learning(
+                agent_function, scorers, prompts
+            )
+        except JudgmentRuntimeError:
+            # Re-raise JudgmentRuntimeError as-is
+            raise
+        except Exception as e:
+            raise JudgmentRuntimeError(f"Training process failed: {str(e)}") from e
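
Two details of the implementation above are easy to miss: the reward for each rollout is the mean of all non-None scorer scores (falling back to 0.0), and each dataset row groups every generation for one prompt under a `samples` key. A sketch of one row as handed to `Dataset.from_list`, with illustrative values:

```python
# One dataset row with num_generations_per_prompt=2: both rollouts of the
# same prompt are grouped so the RFT step can compare them.
row = {
    "samples": [
        {
            "messages": [
                {"role": "user", "content": "Book a table for two at 7pm."},
                {"role": "assistant", "content": "Done: reserved for 7pm."},
            ],
            "evals": {"score": 0.75},  # mean of non-None scorer scores
        },
        {
            "messages": [
                {"role": "user", "content": "Book a table for two at 7pm."},
                {"role": "assistant", "content": "I cannot help with that."},
            ],
            "evals": {"score": 0.0},  # 0.0 is also the fallback when no scores return
        },
    ]
}
```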
judgeval/trainer/trainer.py CHANGED
@@ -1,405 +1,70 @@
-import asyncio
-import json
-import time
-from typing import Optional, Callable, Any, List, Union, Dict
-from fireworks import Dataset  # type: ignore[import-not-found]
-from .config import TrainerConfig, ModelConfig
+from typing import Optional
+from .config import TrainerConfig
+from .base_trainer import BaseTrainer
+from .fireworks_trainer import FireworksTrainer
 from .trainable_model import TrainableModel
 from judgeval.tracer import Tracer
-from judgeval.tracer.exporters.store import SpanStore
-from judgeval.tracer.exporters import InMemorySpanExporter
-from judgeval.tracer.keys import AttributeKeys
-from judgeval import JudgmentClient
-from judgeval.scorers import ExampleScorer, ExampleAPIScorerConfig
-from judgeval.data import Example
-from .console import _spinner_progress, _print_progress, _print_progress_update
 from judgeval.exceptions import JudgmentRuntimeError
 
 
-class JudgmentTrainer:
+def JudgmentTrainer(
+    config: TrainerConfig,
+    trainable_model: TrainableModel,
+    tracer: Tracer,
+    project_name: Optional[str] = None,
+) -> BaseTrainer:
     """
-    A reinforcement learning trainer for Judgment models using Fine-Tuning.
+    Factory function for creating reinforcement learning trainers.
 
-    This class handles the iterative training process where models are improved
-    through reinforcement learning fine-tuning based on generated rollouts and rewards.
-    """
-
-    def __init__(
-        self,
-        config: TrainerConfig,
-        trainable_model: TrainableModel,
-        tracer: Tracer,
-        project_name: Optional[str] = None,
-    ):
-        """
-        Initialize the JudgmentTrainer.
-
-        Args:
-            config: TrainerConfig instance with training parameters. If None, uses default config.
-            tracer: Optional tracer for observability
-            trainable_model: Optional trainable model instance
-            project_name: Project name for organizing training runs and evaluations
-        """
-        try:
-            self.config = config
-            self.tracer = tracer
-            self.project_name = project_name or "judgment_training"
-            self.trainable_model = trainable_model
-
-            self.judgment_client = JudgmentClient()
-            self.span_store = SpanStore()
-            self.span_exporter = InMemorySpanExporter(self.span_store)
-        except Exception as e:
-            raise JudgmentRuntimeError(
-                f"Failed to initialize JudgmentTrainer: {str(e)}"
-            ) from e
-
-    def _extract_message_history_from_spans(self) -> List[Dict[str, str]]:
-        """
-        Extract message history from spans in the span store for training purposes.
-
-        This method processes trace spans to reconstruct the conversation flow,
-        extracting messages in chronological order from LLM, user, and tool spans.
-
-        Returns:
-            List of message dictionaries with 'role' and 'content' keys
-        """
-        spans = self.span_store.get_all()
-        if not spans:
-            return []
-
-        messages = []
-        first_found = False
-
-        for span in sorted(spans, key=lambda s: getattr(s, "start_time", 0)):
-            span_attributes = span.attributes or {}
-            span_type = span_attributes.get(AttributeKeys.JUDGMENT_SPAN_KIND, "span")
-
-            if (
-                not span_attributes.get(AttributeKeys.JUDGMENT_OUTPUT)
-                and span_type != "llm"
-            ):
-                continue
-
-            if span_type == "llm":
-                if not first_found and span_attributes.get(
-                    AttributeKeys.JUDGMENT_INPUT
-                ):
-                    input_data: Any = span_attributes.get(
-                        AttributeKeys.JUDGMENT_INPUT, {}
-                    )
-                    if isinstance(input_data, dict) and "messages" in input_data:
-                        input_messages = input_data["messages"]
-                        if input_messages:
-                            first_found = True
-                            for msg in input_messages:
-                                if (
-                                    isinstance(msg, dict)
-                                    and "role" in msg
-                                    and "content" in msg
-                                ):
-                                    messages.append(
-                                        {"role": msg["role"], "content": msg["content"]}
-                                    )
-
-                # Add assistant response from span output
-                output = span_attributes.get(AttributeKeys.JUDGMENT_OUTPUT)
-                if output is not None:
-                    content = str(output)
-                    try:
-                        parsed = json.loads(content)
-                        if isinstance(parsed, dict) and "messages" in parsed:
-                            # Extract the actual assistant message content
-                            for msg in parsed["messages"]:
-                                if (
-                                    isinstance(msg, dict)
-                                    and msg.get("role") == "assistant"
-                                ):
-                                    content = msg.get("content", content)
-                                    break
-                    except (json.JSONDecodeError, KeyError):
-                        pass
-                    messages.append({"role": "assistant", "content": content})
-
-            elif span_type == "user":
-                output = span_attributes.get(AttributeKeys.JUDGMENT_OUTPUT)
-                if output is not None:
-                    content = str(output)
-                    try:
-                        parsed = json.loads(content)
-                        if isinstance(parsed, dict) and "messages" in parsed:
-                            for msg in parsed["messages"]:
-                                if isinstance(msg, dict) and msg.get("role") == "user":
-                                    content = msg.get("content", content)
-                                    break
-                    except (json.JSONDecodeError, KeyError):
-                        pass
-                    messages.append({"role": "user", "content": content})
+    This factory creates and returns provider-specific trainer implementations
+    (FireworksTrainer, VerifiersTrainer, etc.) based on the configured RFT provider.
 
-            elif span_type == "tool":
-                output = span_attributes.get(AttributeKeys.JUDGMENT_OUTPUT)
-                if output is not None:
-                    content = str(output)
-                    try:
-                        parsed = json.loads(content)
-                        if isinstance(parsed, dict) and "messages" in parsed:
-                            for msg in parsed["messages"]:
-                                if isinstance(msg, dict) and msg.get("role") == "user":
-                                    content = msg.get("content", content)
-                                    break
-                    except (json.JSONDecodeError, KeyError):
-                        pass
-                    messages.append({"role": "user", "content": content})
+    The factory pattern allows for easy extension to support multiple training
+    providers without changing the client-facing API.
 
-        return messages
-
-    async def generate_rollouts_and_rewards(
-        self,
-        agent_function: Callable[[Any], Any],
-        scorers: List[Union[ExampleAPIScorerConfig, ExampleScorer]],
-        prompts: List[Any],
-        num_prompts_per_step: Optional[int] = None,
-        num_generations_per_prompt: Optional[int] = None,
-        concurrency: Optional[int] = None,
-    ):
-        """
-        Generate rollouts and compute rewards using the current model snapshot.
-        Each sample contains multiple generations for reinforcement learning optimization.
-
-        Args:
-            agent_function: Function/agent to call for generating responses
-            scorers: List of scorer objects to evaluate responses
-            prompts: List of prompts to use for training
-            num_prompts_per_step: Number of prompts to use per step (defaults to config value, limited by prompts list length)
-            num_generations_per_prompt: Generations per prompt (defaults to config value)
-            concurrency: Concurrency limit (defaults to config value)
-
-        Returns:
-            List of dataset rows containing samples with messages and evaluations
-        """
-        num_prompts_per_step = min(
-            num_prompts_per_step or self.config.num_prompts_per_step, len(prompts)
-        )
-        num_generations_per_prompt = (
-            num_generations_per_prompt or self.config.num_generations_per_prompt
+    Example:
+        config = TrainerConfig(
+            deployment_id="my-deployment",
+            user_id="my-user",
+            model_id="my-model",
+            rft_provider="fireworks"  # or "verifiers" in the future
         )
-        concurrency = concurrency or self.config.concurrency
-
-        semaphore = asyncio.Semaphore(concurrency)
-
-        @self.tracer.observe(span_type="function")
-        async def generate_single_response(prompt_id, generation_id):
-            async with semaphore:
-                prompt_input = prompts[prompt_id]
-                response_data = await agent_function(**prompt_input)
-                messages = response_data.get("messages", [])
-
-                try:
-                    traced_messages = self._extract_message_history_from_spans()
-                    if traced_messages:
-                        messages = traced_messages
-                except Exception as e:
-                    print(f"Warning: Failed to get message history from trace: {e}")
-                    pass
-
-                finally:
-                    self.span_store.spans = []
-
-                example = Example(
-                    input=prompt_input,
-                    messages=messages,
-                    actual_output=response_data,
-                )
-
-                scoring_results = self.judgment_client.run_evaluation(
-                    examples=[example],
-                    scorers=scorers,
-                    project_name=self.project_name,
-                    eval_run_name=f"training_step_{self.trainable_model.current_step}_prompt_{prompt_id}_gen_{generation_id}",
-                )
-
-                if scoring_results and scoring_results[0].scorers_data:
-                    scores = [
-                        scorer_data.score
-                        for scorer_data in scoring_results[0].scorers_data
-                        if scorer_data.score is not None
-                    ]
-                    reward = sum(scores) / len(scores) if scores else 0.0
-                else:
-                    reward = 0.0
-
-                return {
-                    "prompt_id": prompt_id,
-                    "generation_id": generation_id,
-                    "messages": messages,
-                    "evals": {"score": reward},
-                }
-
-        coros = []
-        for prompt_id in range(num_prompts_per_step):
-            for generation_id in range(num_generations_per_prompt):
-                coro = generate_single_response(prompt_id, generation_id)
-                coros.append(coro)
 
-        with _spinner_progress(f"Generating {len(coros)} rollouts..."):
-            num_completed = 0
-            results = []
+        # User creates and configures the trainable model
+        trainable_model = TrainableModel(config)
+        tracer = Tracer()
 
-            for coro in asyncio.as_completed(coros):
-                result = await coro
-                results.append(result)
-                num_completed += 1
+        # JudgmentTrainer automatically creates the appropriate provider-specific trainer
+        trainer = JudgmentTrainer(config, trainable_model, tracer)
 
-        _print_progress(f"Generated {len(results)} rollouts successfully")
-
-        dataset_rows = []
-        for prompt_id in range(num_prompts_per_step):
-            prompt_generations = [r for r in results if r["prompt_id"] == prompt_id]
-            sample_generations = [
-                {"messages": gen["messages"], "evals": gen["evals"]}
-                for gen in prompt_generations
-            ]
-            dataset_rows.append({"samples": sample_generations})
-
-        return dataset_rows
-
-    async def run_reinforcement_learning(
-        self,
-        agent_function: Callable[[Any], Any],
-        scorers: List[Union[ExampleAPIScorerConfig, ExampleScorer]],
-        prompts: List[Any],
-    ) -> ModelConfig:
-        """
-        Run the iterative reinforcement learning fine-tuning loop.
-
-        This method performs multiple steps of reinforcement learning, where each step:
-        1. Advances to the appropriate model snapshot
-        2. Generates rollouts and computes rewards using scorers
-        3. Trains a new model using reinforcement learning
-        4. Waits for training completion
+        # The returned trainer implements the BaseTrainer interface
+        model_config = await trainer.train(agent_function, scorers, prompts)
 
     Args:
-        agent_function: Function/agent to call for generating responses
-        scorers: List of scorer objects to evaluate responses
-        prompts: List of prompts to use for training
+        config: TrainerConfig instance with training parameters including rft_provider
+        trainable_model: Provider-specific trainable model instance (e.g., TrainableModel for Fireworks)
+        tracer: Tracer for observability
+        project_name: Project name for organizing training runs and evaluations
 
     Returns:
-        ModelConfig: Configuration of the trained model for inference and future training
-        """
-
-        _print_progress("Starting reinforcement learning training")
-
-        training_params = {
-            "num_steps": self.config.num_steps,
-            "num_prompts_per_step": self.config.num_prompts_per_step,
-            "num_generations_per_prompt": self.config.num_generations_per_prompt,
-            "epochs": self.config.epochs,
-            "learning_rate": self.config.learning_rate,
-            "accelerator_count": self.config.accelerator_count,
-            "accelerator_type": self.config.accelerator_type,
-            "temperature": self.config.temperature,
-            "max_tokens": self.config.max_tokens,
-        }
-
-        start_step = self.trainable_model.current_step
-
-        for step in range(start_step, self.config.num_steps):
-            step_num = step + 1
-            _print_progress(
-                f"Starting training step {step_num}", step_num, self.config.num_steps
-            )
-
-            self.trainable_model.advance_to_next_step(step)
-
-            dataset_rows = await self.generate_rollouts_and_rewards(
-                agent_function, scorers, prompts
-            )
-
-            with _spinner_progress(
-                "Preparing training dataset", step_num, self.config.num_steps
-            ):
-                dataset = Dataset.from_list(dataset_rows)
-                dataset.sync()
-
-            _print_progress(
-                "Starting reinforcement training", step_num, self.config.num_steps
-            )
-            job = self.trainable_model.perform_reinforcement_step(dataset, step)
-
-            last_state = None
-            with _spinner_progress(
-                "Training job in progress", step_num, self.config.num_steps
-            ):
-                while not job.is_completed:
-                    job.raise_if_bad_state()
-                    current_state = job.state
-
-                    if current_state != last_state:
-                        if current_state in ["uploading", "validating"]:
-                            _print_progress_update(
-                                f"Training job: {current_state} data"
-                            )
-                        elif current_state == "training":
-                            _print_progress_update(
-                                "Training job: model training in progress"
-                            )
-                        else:
-                            _print_progress_update(f"Training job: {current_state}")
-                        last_state = current_state
-
-                    time.sleep(10)
-                    job = job.get()
-                    if job is None:
-                        raise JudgmentRuntimeError(
-                            "Training job was deleted while waiting for completion"
-                        )
+        Provider-specific trainer instance (FireworksTrainer, etc.) that implements
+        the BaseTrainer interface
 
-            _print_progress(
-                f"Training completed! New model: {job.output_model}",
-                step_num,
-                self.config.num_steps,
-            )
-
-            dataset.delete()
-
-        _print_progress("All training steps completed!")
-
-        with _spinner_progress("Deploying final trained model"):
-            self.trainable_model.advance_to_next_step(self.config.num_steps)
-
-        return self.trainable_model.get_model_config(training_params)
-
-    async def train(
-        self,
-        agent_function: Callable[[Any], Any],
-        scorers: List[Union[ExampleAPIScorerConfig, ExampleScorer]],
-        prompts: List[Any],
-        rft_provider: Optional[str] = None,
-    ) -> ModelConfig:
-        """
-        Start the reinforcement learning fine-tuning process.
-
-        This is the main entry point for running the reinforcement learning training.
-
-        Args:
-            agent_function: Function/agent to call for generating responses.
-            scorers: List of scorer objects to evaluate responses
-            prompts: List of prompts to use for training
-            rft_provider: RFT provider to use for training. Currently only "fireworks" is supported.
-                Support for other providers is planned for future releases.
-
-        Returns:
-            ModelConfig: Configuration of the trained model for future loading
-        """
-        try:
-            if rft_provider is not None:
-                self.config.rft_provider = rft_provider
-
-            return await self.run_reinforcement_learning(
-                agent_function, scorers, prompts
-            )
-        except JudgmentRuntimeError:
-            # Re-raise JudgmentAPIError as-is
-            raise
-        except Exception as e:
-            raise JudgmentRuntimeError(f"Training process failed: {str(e)}") from e
+    Raises:
+        JudgmentRuntimeError: If the specified provider is not supported
+    """
+    provider = config.rft_provider.lower()
+
+    if provider == "fireworks":
+        return FireworksTrainer(config, trainable_model, tracer, project_name)
+    elif provider == "verifiers":
+        # Placeholder for future implementation
+        raise JudgmentRuntimeError(
+            "Verifiers provider is not yet implemented. "
+            "Currently supported providers: 'fireworks'"
+        )
+    else:
+        raise JudgmentRuntimeError(
+            f"Unsupported RFT provider: '{config.rft_provider}'. "
+            f"Currently supported providers: 'fireworks'"
+        )
judgeval/version.py CHANGED
@@ -1,4 +1,4 @@
-__version__ = "0.16.9"
+__version__ = "0.17.0"
 
 
 def get_version() -> str:
judgeval-0.16.9.dist-info/METADATA → judgeval-0.17.0.dist-info/METADATA RENAMED
@@ -1,6 +1,6 @@
 Metadata-Version: 2.4
 Name: judgeval
-Version: 0.16.9
+Version: 0.17.0
 Summary: Judgeval Package
 Project-URL: Homepage, https://github.com/JudgmentLabs/judgeval
 Project-URL: Issues, https://github.com/JudgmentLabs/judgeval/issues
@@ -63,8 +63,7 @@ Judgeval's agent monitoring infra provides a simple harness for integrating GRPO
 await trainer.train(
     agent_function=your_agent_function, # entry point to your agent
     scorers=[RewardScorer()], # Custom scorer you define based on task criteria, acts as reward
-    prompts=training_prompts, # Tasks
-    rft_provider="fireworks"
+    prompts=training_prompts # Tasks
 )
 ```
 
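This README change mirrors the API move above: `rft_provider` is no longer a keyword of `train()` and now lives on `TrainerConfig`. A hedged before/after sketch for upgrading callers (`RewardScorer`, `your_agent_function`, and `training_prompts` are the README's own placeholders):

```python
# 0.16.9: provider chosen per call
await trainer.train(
    agent_function=your_agent_function,
    scorers=[RewardScorer()],
    prompts=training_prompts,
    rft_provider="fireworks",
)

# 0.17.0: provider chosen once on the config; the JudgmentTrainer
# factory then returns the matching BaseTrainer implementation
config = TrainerConfig(
    deployment_id="my-deployment",  # placeholder values
    user_id="my-user",
    model_id="my-model",
    rft_provider="fireworks",
)
trainer = JudgmentTrainer(config, trainable_model, tracer)
await trainer.train(
    agent_function=your_agent_function,
    scorers=[RewardScorer()],
    prompts=training_prompts,
)
```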
judgeval-0.16.9.dist-info/RECORD → judgeval-0.17.0.dist-info/RECORD RENAMED
@@ -4,7 +4,7 @@ judgeval/constants.py,sha256=JZZJ1MqzZZDVk-5PRPRbmLnM8mXI-RDL5vxa1JFuscs,3408
 judgeval/env.py,sha256=37Mn4g0OkpFxXCZGlO_CLqKJnyX-jx_R24tC28XJzig,2112
 judgeval/exceptions.py,sha256=tTbfe4yoOtPXmn22UQz9-6a-5PT9uOko85xaRRwr0Sw,621
 judgeval/logger.py,sha256=VP5blbsJ53mvJbNHfBf5p2KrARUrkrErpPkB-__Hh3U,1562
-judgeval/version.py,sha256=na4SICn1_ldveglTM2Suf3pZLRnw2qbMJMUmIhGkh0Q,74
+judgeval/version.py,sha256=vPcSY2o-MH6v7gn4Fzt6yeb_jPUs2hu117IC_EWy33g,74
 judgeval/warnings.py,sha256=LbGte14ppiFjrkp-JJYueZ40NWFvMkWRvPXr6r-fUWw,73
 judgeval/api/__init__.py,sha256=ho8L4wC9y-STYEpk5zHwc2mZJhC4ezW8jiGgOIERBVY,12058
 judgeval/api/api_types.py,sha256=xOHcgK8NTHMuBr1HBHlCvoSYldVOtG8DQsXeo23-YQk,8874
@@ -74,11 +74,13 @@ judgeval/tracer/llm/llm_together/chat_completions.py,sha256=YxVL1zqG7Tjoss0BH3hm
 judgeval/tracer/llm/llm_together/config.py,sha256=jCJY0KQcHJZZJk2vq038GKIDUMusqgvRjQ0B6OV5uEc,150
 judgeval/tracer/llm/llm_together/wrapper.py,sha256=HFqy_MabQeSq8oj2diZhEuk1SDt_hDfk5MFdPn9MFhg,1733
 judgeval/tracer/processors/__init__.py,sha256=BdOOPOD1RfMI5YHW76DNPKR07EAev-JxoolZ3KaXNNU,7100
-judgeval/trainer/__init__.py,sha256=h_DDVV7HFF7HUPAJFpt2d9wjqgnmEVcHxqZyB1k7pPQ,257
-judgeval/trainer/config.py,sha256=sAAVBgeoFDJWYjGIgOvoQoiO0gtqNAOI6MHncwdN_mk,4292
+judgeval/trainer/__init__.py,sha256=nJo913vFdss3E_PR-M1OUjznS0SYgNZ-MP-Y_6Mj5PA,437
+judgeval/trainer/base_trainer.py,sha256=21adIMmYyn7XKbiI1Dc6N5thPbuH5wK7vVfrtoFX6Ys,3886
+judgeval/trainer/config.py,sha256=7ZSwr6p7vq0MRadh9axm6XB-RAotdWqULZ5yDl0xGbQ,4340
 judgeval/trainer/console.py,sha256=SvokkFEU-K1vLV4Rd1m6YJJ7HyYwTr4Azdzwx_JPZUY,4351
+judgeval/trainer/fireworks_trainer.py,sha256=FqGoS1OzmxzyT0134e_EW3pgzFNO04GpKST4NcjYSyU,15432
 judgeval/trainer/trainable_model.py,sha256=T-Sioi_sXtfYlcu3lE0cd60PHs8DrYaZ-Kxb4h1nU04,8993
-judgeval/trainer/trainer.py,sha256=FBhHq2YPooKADDCC_IEKex81L6a5quCmAMyl9mn3QLk,16675
+judgeval/trainer/trainer.py,sha256=twLEHNaomelTg6ZYG6veI9OpB3wzhPCtPVQMTnDZWx4,2626
 judgeval/utils/async_utils.py,sha256=AF1xdu8Ao5GyhFvfaLOaKJHn1RISyXZ4U70UZe9zfBA,1083
 judgeval/utils/file_utils.py,sha256=vq-n5WZEZjVbZ5S9QTkW8nSH6Pvw-Jx0ttsQ1t0wnPQ,3140
 judgeval/utils/guards.py,sha256=QBb6m6KElxdvt2bskLZCKh_zGHbBcqV-VfGzT63o3hY,807
@@ -100,8 +102,8 @@ judgeval/utils/wrappers/mutable_wrap_async.py,sha256=stHISOUCGFUJXY8seXmxUo4ZpMF
 judgeval/utils/wrappers/mutable_wrap_sync.py,sha256=t5jygAQ1vqhy8s1GfiLeYygYgaLTgfoYASN47U5JiPs,2888
 judgeval/utils/wrappers/py.typed,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
 judgeval/utils/wrappers/utils.py,sha256=j18vaa6JWDw2s3nQy1z5PfV_9Xxio-bVARaHG_0XyL0,1228
-judgeval-0.16.9.dist-info/METADATA,sha256=OiLnf6tEWwnFyLkEjqBbqORUSfcTgjJSyK9nFr6dxHo,11513
-judgeval-0.16.9.dist-info/WHEEL,sha256=qtCwoSJWgHk21S1Kb4ihdzI2rlJ1ZKaIurTj_ngOhyQ,87
-judgeval-0.16.9.dist-info/entry_points.txt,sha256=-eoeD-oDLn4A7MSgeBS9Akwanf3_0r0cgEleBcIOjg0,46
-judgeval-0.16.9.dist-info/licenses/LICENSE.md,sha256=tKmCg7k5QOmxPK19XMfzim04QiQJPmgIm0pAn55IJwk,11352
-judgeval-0.16.9.dist-info/RECORD,,
+judgeval-0.17.0.dist-info/METADATA,sha256=0A2L0alaZoA7KR-b43_IZlD9IolcBSwyVJj8Db-DC20,11483
+judgeval-0.17.0.dist-info/WHEEL,sha256=qtCwoSJWgHk21S1Kb4ihdzI2rlJ1ZKaIurTj_ngOhyQ,87
+judgeval-0.17.0.dist-info/entry_points.txt,sha256=-eoeD-oDLn4A7MSgeBS9Akwanf3_0r0cgEleBcIOjg0,46
+judgeval-0.17.0.dist-info/licenses/LICENSE.md,sha256=tKmCg7k5QOmxPK19XMfzim04QiQJPmgIm0pAn55IJwk,11352
+judgeval-0.17.0.dist-info/RECORD,,