judgeval 0.16.9__py3-none-any.whl → 0.22.2__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release: this version of judgeval might be problematic.

Files changed (37)
  1. judgeval/__init__.py +32 -2
  2. judgeval/api/__init__.py +108 -0
  3. judgeval/api/api_types.py +76 -15
  4. judgeval/cli.py +16 -1
  5. judgeval/data/judgment_types.py +76 -20
  6. judgeval/dataset/__init__.py +11 -2
  7. judgeval/env.py +2 -11
  8. judgeval/evaluation/__init__.py +4 -0
  9. judgeval/prompt/__init__.py +330 -0
  10. judgeval/scorers/judgeval_scorers/api_scorers/prompt_scorer.py +1 -13
  11. judgeval/tracer/__init__.py +371 -257
  12. judgeval/tracer/constants.py +1 -1
  13. judgeval/tracer/exporters/store.py +32 -16
  14. judgeval/tracer/keys.py +11 -9
  15. judgeval/tracer/llm/llm_anthropic/messages.py +38 -26
  16. judgeval/tracer/llm/llm_anthropic/messages_stream.py +14 -14
  17. judgeval/tracer/llm/llm_google/generate_content.py +9 -7
  18. judgeval/tracer/llm/llm_openai/beta_chat_completions.py +38 -14
  19. judgeval/tracer/llm/llm_openai/chat_completions.py +90 -26
  20. judgeval/tracer/llm/llm_openai/responses.py +88 -26
  21. judgeval/tracer/llm/llm_openai/utils.py +42 -0
  22. judgeval/tracer/llm/llm_together/chat_completions.py +26 -18
  23. judgeval/tracer/managers.py +4 -0
  24. judgeval/trainer/__init__.py +10 -1
  25. judgeval/trainer/base_trainer.py +122 -0
  26. judgeval/trainer/config.py +1 -1
  27. judgeval/trainer/fireworks_trainer.py +396 -0
  28. judgeval/trainer/trainer.py +52 -387
  29. judgeval/utils/guards.py +9 -5
  30. judgeval/utils/project.py +15 -0
  31. judgeval/utils/serialize.py +2 -2
  32. judgeval/version.py +1 -1
  33. {judgeval-0.16.9.dist-info → judgeval-0.22.2.dist-info}/METADATA +2 -3
  34. {judgeval-0.16.9.dist-info → judgeval-0.22.2.dist-info}/RECORD +37 -32
  35. {judgeval-0.16.9.dist-info → judgeval-0.22.2.dist-info}/WHEEL +0 -0
  36. {judgeval-0.16.9.dist-info → judgeval-0.22.2.dist-info}/entry_points.txt +0 -0
  37. {judgeval-0.16.9.dist-info → judgeval-0.22.2.dist-info}/licenses/LICENSE.md +0 -0
judgeval/trainer/trainer.py CHANGED
@@ -1,405 +1,70 @@
-import asyncio
-import json
-import time
-from typing import Optional, Callable, Any, List, Union, Dict
-from fireworks import Dataset  # type: ignore[import-not-found]
-from .config import TrainerConfig, ModelConfig
+from typing import Optional
+from .config import TrainerConfig
+from .base_trainer import BaseTrainer
+from .fireworks_trainer import FireworksTrainer
 from .trainable_model import TrainableModel
 from judgeval.tracer import Tracer
-from judgeval.tracer.exporters.store import SpanStore
-from judgeval.tracer.exporters import InMemorySpanExporter
-from judgeval.tracer.keys import AttributeKeys
-from judgeval import JudgmentClient
-from judgeval.scorers import ExampleScorer, ExampleAPIScorerConfig
-from judgeval.data import Example
-from .console import _spinner_progress, _print_progress, _print_progress_update
 from judgeval.exceptions import JudgmentRuntimeError


-class JudgmentTrainer:
+def JudgmentTrainer(
+    config: TrainerConfig,
+    trainable_model: TrainableModel,
+    tracer: Tracer,
+    project_name: Optional[str] = None,
+) -> BaseTrainer:
     """
-    A reinforcement learning trainer for Judgment models using Fine-Tuning.
+    Factory function for creating reinforcement learning trainers.

-    This class handles the iterative training process where models are improved
-    through reinforcement learning fine-tuning based on generated rollouts and rewards.
-    """
-
-    def __init__(
-        self,
-        config: TrainerConfig,
-        trainable_model: TrainableModel,
-        tracer: Tracer,
-        project_name: Optional[str] = None,
-    ):
-        """
-        Initialize the JudgmentTrainer.
-
-        Args:
-            config: TrainerConfig instance with training parameters. If None, uses default config.
-            tracer: Optional tracer for observability
-            trainable_model: Optional trainable model instance
-            project_name: Project name for organizing training runs and evaluations
-        """
-        try:
-            self.config = config
-            self.tracer = tracer
-            self.project_name = project_name or "judgment_training"
-            self.trainable_model = trainable_model
-
-            self.judgment_client = JudgmentClient()
-            self.span_store = SpanStore()
-            self.span_exporter = InMemorySpanExporter(self.span_store)
-        except Exception as e:
-            raise JudgmentRuntimeError(
-                f"Failed to initialize JudgmentTrainer: {str(e)}"
-            ) from e
-
-    def _extract_message_history_from_spans(self) -> List[Dict[str, str]]:
-        """
-        Extract message history from spans in the span store for training purposes.
-
-        This method processes trace spans to reconstruct the conversation flow,
-        extracting messages in chronological order from LLM, user, and tool spans.
-
-        Returns:
-            List of message dictionaries with 'role' and 'content' keys
-        """
-        spans = self.span_store.get_all()
-        if not spans:
-            return []
-
-        messages = []
-        first_found = False
-
-        for span in sorted(spans, key=lambda s: getattr(s, "start_time", 0)):
-            span_attributes = span.attributes or {}
-            span_type = span_attributes.get(AttributeKeys.JUDGMENT_SPAN_KIND, "span")
-
-            if (
-                not span_attributes.get(AttributeKeys.JUDGMENT_OUTPUT)
-                and span_type != "llm"
-            ):
-                continue
-
-            if span_type == "llm":
-                if not first_found and span_attributes.get(
-                    AttributeKeys.JUDGMENT_INPUT
-                ):
-                    input_data: Any = span_attributes.get(
-                        AttributeKeys.JUDGMENT_INPUT, {}
-                    )
-                    if isinstance(input_data, dict) and "messages" in input_data:
-                        input_messages = input_data["messages"]
-                        if input_messages:
-                            first_found = True
-                            for msg in input_messages:
-                                if (
-                                    isinstance(msg, dict)
-                                    and "role" in msg
-                                    and "content" in msg
-                                ):
-                                    messages.append(
-                                        {"role": msg["role"], "content": msg["content"]}
-                                    )
-
-                # Add assistant response from span output
-                output = span_attributes.get(AttributeKeys.JUDGMENT_OUTPUT)
-                if output is not None:
-                    content = str(output)
-                    try:
-                        parsed = json.loads(content)
-                        if isinstance(parsed, dict) and "messages" in parsed:
-                            # Extract the actual assistant message content
-                            for msg in parsed["messages"]:
-                                if (
-                                    isinstance(msg, dict)
-                                    and msg.get("role") == "assistant"
-                                ):
-                                    content = msg.get("content", content)
-                                    break
-                    except (json.JSONDecodeError, KeyError):
-                        pass
-                    messages.append({"role": "assistant", "content": content})
-
-            elif span_type == "user":
-                output = span_attributes.get(AttributeKeys.JUDGMENT_OUTPUT)
-                if output is not None:
-                    content = str(output)
-                    try:
-                        parsed = json.loads(content)
-                        if isinstance(parsed, dict) and "messages" in parsed:
-                            for msg in parsed["messages"]:
-                                if isinstance(msg, dict) and msg.get("role") == "user":
-                                    content = msg.get("content", content)
-                                    break
-                    except (json.JSONDecodeError, KeyError):
-                        pass
-                    messages.append({"role": "user", "content": content})
+    This factory creates and returns provider-specific trainer implementations
+    (FireworksTrainer, VerifiersTrainer, etc.) based on the configured RFT provider.

-            elif span_type == "tool":
-                output = span_attributes.get(AttributeKeys.JUDGMENT_OUTPUT)
-                if output is not None:
-                    content = str(output)
-                    try:
-                        parsed = json.loads(content)
-                        if isinstance(parsed, dict) and "messages" in parsed:
-                            for msg in parsed["messages"]:
-                                if isinstance(msg, dict) and msg.get("role") == "user":
-                                    content = msg.get("content", content)
-                                    break
-                    except (json.JSONDecodeError, KeyError):
-                        pass
-                    messages.append({"role": "user", "content": content})
+    The factory pattern allows for easy extension to support multiple training
+    providers without changing the client-facing API.

-        return messages
-
-    async def generate_rollouts_and_rewards(
-        self,
-        agent_function: Callable[[Any], Any],
-        scorers: List[Union[ExampleAPIScorerConfig, ExampleScorer]],
-        prompts: List[Any],
-        num_prompts_per_step: Optional[int] = None,
-        num_generations_per_prompt: Optional[int] = None,
-        concurrency: Optional[int] = None,
-    ):
-        """
-        Generate rollouts and compute rewards using the current model snapshot.
-        Each sample contains multiple generations for reinforcement learning optimization.
-
-        Args:
-            agent_function: Function/agent to call for generating responses
-            scorers: List of scorer objects to evaluate responses
-            prompts: List of prompts to use for training
-            num_prompts_per_step: Number of prompts to use per step (defaults to config value, limited by prompts list length)
-            num_generations_per_prompt: Generations per prompt (defaults to config value)
-            concurrency: Concurrency limit (defaults to config value)
-
-        Returns:
-            List of dataset rows containing samples with messages and evaluations
-        """
-        num_prompts_per_step = min(
-            num_prompts_per_step or self.config.num_prompts_per_step, len(prompts)
-        )
-        num_generations_per_prompt = (
-            num_generations_per_prompt or self.config.num_generations_per_prompt
+    Example:
+        config = TrainerConfig(
+            deployment_id="my-deployment",
+            user_id="my-user",
+            model_id="my-model",
+            rft_provider="fireworks"  # or "verifiers" in the future
         )
-        concurrency = concurrency or self.config.concurrency
-
-        semaphore = asyncio.Semaphore(concurrency)
-
-        @self.tracer.observe(span_type="function")
-        async def generate_single_response(prompt_id, generation_id):
-            async with semaphore:
-                prompt_input = prompts[prompt_id]
-                response_data = await agent_function(**prompt_input)
-                messages = response_data.get("messages", [])
-
-                try:
-                    traced_messages = self._extract_message_history_from_spans()
-                    if traced_messages:
-                        messages = traced_messages
-                except Exception as e:
-                    print(f"Warning: Failed to get message history from trace: {e}")
-                    pass
-
-                finally:
-                    self.span_store.spans = []
-
-                example = Example(
-                    input=prompt_input,
-                    messages=messages,
-                    actual_output=response_data,
-                )
-
-                scoring_results = self.judgment_client.run_evaluation(
-                    examples=[example],
-                    scorers=scorers,
-                    project_name=self.project_name,
-                    eval_run_name=f"training_step_{self.trainable_model.current_step}_prompt_{prompt_id}_gen_{generation_id}",
-                )
-
-                if scoring_results and scoring_results[0].scorers_data:
-                    scores = [
-                        scorer_data.score
-                        for scorer_data in scoring_results[0].scorers_data
-                        if scorer_data.score is not None
-                    ]
-                    reward = sum(scores) / len(scores) if scores else 0.0
-                else:
-                    reward = 0.0
-
-                return {
-                    "prompt_id": prompt_id,
-                    "generation_id": generation_id,
-                    "messages": messages,
-                    "evals": {"score": reward},
-                }
-
-        coros = []
-        for prompt_id in range(num_prompts_per_step):
-            for generation_id in range(num_generations_per_prompt):
-                coro = generate_single_response(prompt_id, generation_id)
-                coros.append(coro)

-        with _spinner_progress(f"Generating {len(coros)} rollouts..."):
-            num_completed = 0
-            results = []
+    # User creates and configures the trainable model
+    trainable_model = TrainableModel(config)
+    tracer = Tracer()

-            for coro in asyncio.as_completed(coros):
-                result = await coro
-                results.append(result)
-                num_completed += 1
+    # JudgmentTrainer automatically creates the appropriate provider-specific trainer
+    trainer = JudgmentTrainer(config, trainable_model, tracer)

-        _print_progress(f"Generated {len(results)} rollouts successfully")
-
-        dataset_rows = []
-        for prompt_id in range(num_prompts_per_step):
-            prompt_generations = [r for r in results if r["prompt_id"] == prompt_id]
-            sample_generations = [
-                {"messages": gen["messages"], "evals": gen["evals"]}
-                for gen in prompt_generations
-            ]
-            dataset_rows.append({"samples": sample_generations})
-
-        return dataset_rows
-
-    async def run_reinforcement_learning(
-        self,
-        agent_function: Callable[[Any], Any],
-        scorers: List[Union[ExampleAPIScorerConfig, ExampleScorer]],
-        prompts: List[Any],
-    ) -> ModelConfig:
-        """
-        Run the iterative reinforcement learning fine-tuning loop.
-
-        This method performs multiple steps of reinforcement learning, where each step:
-        1. Advances to the appropriate model snapshot
-        2. Generates rollouts and computes rewards using scorers
-        3. Trains a new model using reinforcement learning
-        4. Waits for training completion
+    # The returned trainer implements the BaseTrainer interface
+    model_config = await trainer.train(agent_function, scorers, prompts)

     Args:
-        agent_function: Function/agent to call for generating responses
-        scorers: List of scorer objects to evaluate responses
-        prompts: List of prompts to use for training
+        config: TrainerConfig instance with training parameters including rft_provider
+        trainable_model: Provider-specific trainable model instance (e.g., TrainableModel for Fireworks)
+        tracer: Tracer for observability
+        project_name: Project name for organizing training runs and evaluations

     Returns:
-        ModelConfig: Configuration of the trained model for inference and future training
-        """
-
-        _print_progress("Starting reinforcement learning training")
-
-        training_params = {
-            "num_steps": self.config.num_steps,
-            "num_prompts_per_step": self.config.num_prompts_per_step,
-            "num_generations_per_prompt": self.config.num_generations_per_prompt,
-            "epochs": self.config.epochs,
-            "learning_rate": self.config.learning_rate,
-            "accelerator_count": self.config.accelerator_count,
-            "accelerator_type": self.config.accelerator_type,
-            "temperature": self.config.temperature,
-            "max_tokens": self.config.max_tokens,
-        }
-
-        start_step = self.trainable_model.current_step
-
-        for step in range(start_step, self.config.num_steps):
-            step_num = step + 1
-            _print_progress(
-                f"Starting training step {step_num}", step_num, self.config.num_steps
-            )
-
-            self.trainable_model.advance_to_next_step(step)
-
-            dataset_rows = await self.generate_rollouts_and_rewards(
-                agent_function, scorers, prompts
-            )
-
-            with _spinner_progress(
-                "Preparing training dataset", step_num, self.config.num_steps
-            ):
-                dataset = Dataset.from_list(dataset_rows)
-                dataset.sync()
-
-            _print_progress(
-                "Starting reinforcement training", step_num, self.config.num_steps
-            )
-            job = self.trainable_model.perform_reinforcement_step(dataset, step)
-
-            last_state = None
-            with _spinner_progress(
-                "Training job in progress", step_num, self.config.num_steps
-            ):
-                while not job.is_completed:
-                    job.raise_if_bad_state()
-                    current_state = job.state
-
-                    if current_state != last_state:
-                        if current_state in ["uploading", "validating"]:
-                            _print_progress_update(
-                                f"Training job: {current_state} data"
-                            )
-                        elif current_state == "training":
-                            _print_progress_update(
-                                "Training job: model training in progress"
-                            )
-                        else:
-                            _print_progress_update(f"Training job: {current_state}")
-                        last_state = current_state
-
-                    time.sleep(10)
-                    job = job.get()
-                    if job is None:
-                        raise JudgmentRuntimeError(
-                            "Training job was deleted while waiting for completion"
-                        )
+        Provider-specific trainer instance (FireworksTrainer, etc.) that implements
+        the BaseTrainer interface

-            _print_progress(
-                f"Training completed! New model: {job.output_model}",
-                step_num,
-                self.config.num_steps,
-            )
-
-            dataset.delete()
-
-        _print_progress("All training steps completed!")
-
-        with _spinner_progress("Deploying final trained model"):
-            self.trainable_model.advance_to_next_step(self.config.num_steps)
-
-        return self.trainable_model.get_model_config(training_params)
-
-    async def train(
-        self,
-        agent_function: Callable[[Any], Any],
-        scorers: List[Union[ExampleAPIScorerConfig, ExampleScorer]],
-        prompts: List[Any],
-        rft_provider: Optional[str] = None,
-    ) -> ModelConfig:
-        """
-        Start the reinforcement learning fine-tuning process.
-
-        This is the main entry point for running the reinforcement learning training.
-
-        Args:
-            agent_function: Function/agent to call for generating responses.
-            scorers: List of scorer objects to evaluate responses
-            prompts: List of prompts to use for training
-            rft_provider: RFT provider to use for training. Currently only "fireworks" is supported.
-                Support for other providers is planned for future releases.
-
-        Returns:
-            ModelConfig: Configuration of the trained model for future loading
-        """
-        try:
-            if rft_provider is not None:
-                self.config.rft_provider = rft_provider
-
-            return await self.run_reinforcement_learning(
-                agent_function, scorers, prompts
-            )
-        except JudgmentRuntimeError:
-            # Re-raise JudgmentAPIError as-is
-            raise
-        except Exception as e:
-            raise JudgmentRuntimeError(f"Training process failed: {str(e)}") from e
+    Raises:
+        JudgmentRuntimeError: If the specified provider is not supported
+    """
+    provider = config.rft_provider.lower()
+
+    if provider == "fireworks":
+        return FireworksTrainer(config, trainable_model, tracer, project_name)
+    elif provider == "verifiers":
+        # Placeholder for future implementation
+        raise JudgmentRuntimeError(
+            "Verifiers provider is not yet implemented. "
+            "Currently supported providers: 'fireworks'"
+        )
+    else:
+        raise JudgmentRuntimeError(
+            f"Unsupported RFT provider: '{config.rft_provider}'. "
+            f"Currently supported providers: 'fireworks'"
+        )
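The refactor replaces the old `JudgmentTrainer` class with a factory that dispatches on `TrainerConfig.rft_provider`, and `train()` no longer takes an `rft_provider` argument (see the README change in METADATA below). A minimal sketch of the new dispatch, assuming `JudgmentTrainer`, `TrainableModel`, and `TrainerConfig` are re-exported from `judgeval.trainer` and credentials are configured in the environment:

```python
# Sketch only: the TrainerConfig field values are illustrative placeholders
# taken from the factory's docstring example, not working credentials.
from judgeval.exceptions import JudgmentRuntimeError
from judgeval.tracer import Tracer
from judgeval.trainer import JudgmentTrainer, TrainableModel, TrainerConfig

config = TrainerConfig(
    deployment_id="my-deployment",
    user_id="my-user",
    model_id="my-model",
    rft_provider="verifiers",  # recognized by the factory, but not yet implemented
)

try:
    # Dispatches to FireworksTrainer for "fireworks"; anything else raises.
    trainer = JudgmentTrainer(config, TrainableModel(config), Tracer())
except JudgmentRuntimeError as err:
    print(err)  # "Verifiers provider is not yet implemented. ..."
```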
judgeval/utils/guards.py CHANGED
@@ -1,6 +1,7 @@
 from __future__ import annotations

 from typing import TYPE_CHECKING
+from judgeval.logger import judgeval_logger

 if TYPE_CHECKING:
     from typing import TypeVar
@@ -8,24 +9,27 @@ if TYPE_CHECKING:
     T = TypeVar("T")


-def expect_exists(value: T | None, message: str) -> T:
-    if value is None:
-        raise ValueError(message)
+def expect_exists(value: T | None, message: str, default: T) -> T:
+    if not value:
+        judgeval_logger.error(message)
+        return default

     return value


-def expect_api_key(api_key: str | None) -> str:
+def expect_api_key(api_key: str | None) -> str | None:
     return expect_exists(
         api_key,
         "API Key is not set, please set JUDGMENT_API_KEY in the environment variables or pass it as `api_key`",
+        default=None,
     )


-def expect_organization_id(organization_id: str | None) -> str:
+def expect_organization_id(organization_id: str | None) -> str | None:
     return expect_exists(
         organization_id,
         "Organization ID is not set, please set JUDGMENT_ORG_ID in the environment variables or pass it as `organization_id`",
+        default=None,
     )

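Note the semantic shift here: `expect_exists` used to raise `ValueError`, but now logs through `judgeval_logger` and returns the supplied default, and the `is None` check became a falsiness check, so empty strings count as missing too. A small sketch of the new behavior, assuming judgeval is importable:

```python
from judgeval.utils.guards import expect_api_key, expect_exists

# 0.16.x raised ValueError on a missing key; 0.22.x logs the message
# and returns the default (None for the api-key/org-id helpers).
assert expect_api_key(None) is None

# The guard now tests falsiness rather than `is None`, so "" also
# falls through to the default.
assert expect_exists("", "value missing", default="fallback") == "fallback"
```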
judgeval/utils/project.py ADDED
@@ -0,0 +1,15 @@
+from judgeval.utils.decorators.dont_throw import dont_throw
+import functools
+from judgeval.api import JudgmentSyncClient
+
+
+@dont_throw
+@functools.lru_cache(maxsize=64)
+def _resolve_project_id(project_name: str, api_key: str, organization_id: str) -> str:
+    """Resolve project_id from project_name using the API."""
+    client = JudgmentSyncClient(
+        api_key=api_key,
+        organization_id=organization_id,
+    )
+    response = client.projects_resolve({"project_name": project_name})
+    return response["project_id"]
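This new helper memoizes project-ID lookups: `functools.lru_cache` keys the cache on the `(project_name, api_key, organization_id)` tuple, so repeated calls with identical arguments skip the HTTP round-trip, while `@dont_throw` appears to swallow API failures rather than propagate them (an assumption from the decorator's name; its implementation is not shown in this diff). A hypothetical usage sketch with placeholder credentials:

```python
from judgeval.utils.project import _resolve_project_id  # private helper

# With valid credentials, the first call issues a projects_resolve request;
# the second identical call is served from the lru_cache, not the API.
pid = _resolve_project_id("my-project", "sk-placeholder", "org-placeholder")
cached = _resolve_project_id("my-project", "sk-placeholder", "org-placeholder")
assert pid == cached
```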
judgeval/utils/serialize.py CHANGED
@@ -247,7 +247,7 @@ encoders_by_class_tuples = generate_encoders_by_class_tuples(ENCODERS_BY_TYPE)
 # Seralize arbitrary object to a json string
 def safe_serialize(obj: Any) -> str:
     try:
-        return orjson.dumps(json_encoder(obj)).decode()
+        return orjson.dumps(json_encoder(obj), option=orjson.OPT_NON_STR_KEYS).decode()
     except Exception as e:
         judgeval_logger.warning(f"Error serializing object: {e}")
-        return orjson.dumps(repr(obj)).decode()
+        return repr(obj)
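Two behavior changes land in `safe_serialize`: `orjson.OPT_NON_STR_KEYS` lets payloads with non-string dict keys (ints, UUIDs, etc.) serialize instead of hitting the error path, and the fallback now returns the bare `repr(obj)` rather than a JSON-encoded string of it, so the fallback output is no longer valid JSON. The option itself is standard orjson:

```python
import orjson

# Without the option, orjson rejects non-string dict keys outright.
try:
    orjson.dumps({1: "a"})
except TypeError as err:
    print(err)  # dict keys must be str by default

# With OPT_NON_STR_KEYS the key is coerced to a string, so safe_serialize
# no longer needs its fallback for payloads like {1: "a"}.
print(orjson.dumps({1: "a"}, option=orjson.OPT_NON_STR_KEYS))  # b'{"1":"a"}'
```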
judgeval/version.py CHANGED
@@ -1,4 +1,4 @@
-__version__ = "0.16.9"
+__version__ = "0.22.2"


 def get_version() -> str:
{judgeval-0.16.9.dist-info → judgeval-0.22.2.dist-info}/METADATA CHANGED
@@ -1,6 +1,6 @@
 Metadata-Version: 2.4
 Name: judgeval
-Version: 0.16.9
+Version: 0.22.2
 Summary: Judgeval Package
 Project-URL: Homepage, https://github.com/JudgmentLabs/judgeval
 Project-URL: Issues, https://github.com/JudgmentLabs/judgeval/issues
@@ -63,8 +63,7 @@ Judgeval's agent monitoring infra provides a simple harness for integrating GRPO
 await trainer.train(
     agent_function=your_agent_function, # entry point to your agent
     scorers=[RewardScorer()], # Custom scorer you define based on task criteria, acts as reward
-    prompts=training_prompts, # Tasks
-    rft_provider="fireworks"
+    prompts=training_prompts # Tasks
 )
 ```