judgeval 0.6.0__py3-none-any.whl → 0.7.1__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
judgeval/cli.py CHANGED
@@ -38,7 +38,7 @@ def upload_scorer(
     try:
         client = JudgmentClient()

-        result = client.save_custom_scorer(
+        result = client.upload_custom_scorer(
            scorer_file_path=scorer_file_path,
            requirements_file_path=requirements_file_path,
            unique_name=unique_name,
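
As a quick illustration, the renamed client method is called the same way as before; a minimal sketch, assuming JudgmentClient is importable from the package root and picks up credentials from the environment:

from judgeval import JudgmentClient

client = JudgmentClient()
# Same keyword arguments the CLI passes in the hunk above (paths illustrative)
result = client.upload_custom_scorer(
    scorer_file_path="scorer.py",
    requirements_file_path="requirements.txt",
    unique_name="my-custom-scorer",
)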
@@ -51,7 +51,7 @@ JUDGMENT_ADD_TO_RUN_EVAL_QUEUE_API_URL = f"{ROOT_API}/add_to_run_eval_queue/"
 JUDGMENT_GET_EVAL_STATUS_API_URL = f"{ROOT_API}/get_evaluation_status/"

 # Custom Scorers API
-JUDGMENT_CUSTOM_SCORER_UPLOAD_API_URL = f"{ROOT_API}/build_sandbox_template/"
+JUDGMENT_CUSTOM_SCORER_UPLOAD_API_URL = f"{ROOT_API}/upload_scorer/"


 # Evaluation API Payloads
@@ -815,6 +815,8 @@ class Tracer:
         == "true",
         enable_evaluations: bool = os.getenv("JUDGMENT_EVALUATIONS", "true").lower()
         == "true",
+        show_trace_urls: bool = os.getenv("JUDGMENT_SHOW_TRACE_URLS", "true").lower()
+        == "true",
         # S3 configuration
         use_s3: bool = False,
         s3_bucket_name: Optional[str] = None,
@@ -859,6 +861,7 @@ class Tracer:
         self.traces: List[Trace] = []
         self.enable_monitoring: bool = enable_monitoring
         self.enable_evaluations: bool = enable_evaluations
+        self.show_trace_urls: bool = show_trace_urls
         self.class_identifiers: Dict[
             str, str
         ] = {}  # Dictionary to store class identifiers
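
Taken together, these two hunks add a show_trace_urls switch that defaults to the JUDGMENT_SHOW_TRACE_URLS environment variable ("true" unless overridden) and is stored on the tracer. A minimal sketch of turning the link printout off, assuming the Tracer import path and that other required settings (API key, project) come from the environment:

import os

# Option 1: via the environment, before the Tracer is constructed
os.environ["JUDGMENT_SHOW_TRACE_URLS"] = "false"

# Option 2: explicitly at construction time (import path assumed)
from judgeval.common.tracer import Tracer

tracer = Tracer(show_trace_urls=False)  # suppresses the "View Trace" link on save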
@@ -1731,6 +1734,93 @@ class Tracer:
                 f"Error during background service shutdown: {e}"
             )

+    def trace_to_message_history(
+        self, trace: Union[Trace, TraceClient]
+    ) -> List[Dict[str, str]]:
+        """
+        Extract message history from a trace for training purposes.
+
+        This method processes trace spans to reconstruct the conversation flow,
+        extracting messages in chronological order from LLM, user, and tool spans.
+
+        Args:
+            trace: Trace or TraceClient instance to extract messages from
+
+        Returns:
+            List of message dictionaries with 'role' and 'content' keys
+
+        Raises:
+            ValueError: If no trace is provided
+        """
+        if not trace:
+            raise ValueError("No trace provided")
+
+        # Handle both Trace and TraceClient objects
+        if isinstance(trace, TraceClient):
+            spans = trace.trace_spans
+        else:
+            spans = trace.trace_spans if hasattr(trace, "trace_spans") else []
+
+        messages = []
+        first_found = False
+
+        # Process spans in chronological order
+        for span in sorted(
+            spans, key=lambda s: s.created_at if hasattr(s, "created_at") else 0
+        ):
+            # Skip spans without output (except for first LLM span which may have input messages)
+            if span.output is None and span.span_type != "llm":
+                continue
+
+            if span.span_type == "llm":
+                # For the first LLM span, extract input messages (system + user prompts)
+                if not first_found and hasattr(span, "inputs") and span.inputs:
+                    input_messages = span.inputs.get("messages", [])
+                    if input_messages:
+                        first_found = True
+                        # Add input messages (typically system and user messages)
+                        for msg in input_messages:
+                            if (
+                                isinstance(msg, dict)
+                                and "role" in msg
+                                and "content" in msg
+                            ):
+                                messages.append(
+                                    {"role": msg["role"], "content": msg["content"]}
+                                )
+
+                # Add assistant response from span output
+                if span.output is not None:
+                    messages.append({"role": "assistant", "content": str(span.output)})
+
+            elif span.span_type == "user":
+                # Add user messages
+                if span.output is not None:
+                    messages.append({"role": "user", "content": str(span.output)})
+
+            elif span.span_type == "tool":
+                # Add tool responses as user messages (common pattern in training)
+                if span.output is not None:
+                    messages.append({"role": "user", "content": str(span.output)})
+
+        return messages
+
+    def get_current_message_history(self) -> List[Dict[str, str]]:
+        """
+        Get message history from the current trace.
+
+        Returns:
+            List of message dictionaries from the current trace context
+
+        Raises:
+            ValueError: If no current trace is found
+        """
+        current_trace = self.get_current_trace()
+        if not current_trace:
+            raise ValueError("No current trace found")
+
+        return self.trace_to_message_history(current_trace)
+

 def _get_current_trace(
     trace_across_async_contexts: bool = Tracer.trace_across_async_contexts,
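
The new methods flatten a trace into an OpenAI-style message list (dicts with "role" and "content" keys), mapping tool outputs to user messages. A usage sketch, assuming code running inside an active trace context:

# Inside a traced function (e.g. one decorated with @tracer.observe):
history = tracer.get_current_message_history()
for msg in history:
    print(msg["role"], "->", msg["content"][:80])

# Or from a Trace/TraceClient object you already hold (my_trace is hypothetical):
history = tracer.trace_to_message_history(my_trace)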
@@ -1746,7 +1836,7 @@ def wrap(
 ) -> Any:
     """
     Wraps an API client to add tracing capabilities.
-    Supports OpenAI, Together, Anthropic, and Google GenAI clients.
+    Supports OpenAI, Together, Anthropic, Google GenAI clients, and TrainableModel.
     Patches both '.create' and Anthropic's '.stream' methods using a wrapper class.
     """
     (
@@ -1871,6 +1961,39 @@ def wrap(
         setattr(client.chat.completions, "create", wrapped(original_create))
     elif isinstance(client, (groq_AsyncGroq)):
         setattr(client.chat.completions, "create", wrapped_async(original_create))
+
+    # Check for TrainableModel from judgeval.common.trainer
+    try:
+        from judgeval.common.trainer import TrainableModel
+
+        if isinstance(client, TrainableModel):
+            # Define a wrapper function that can be reapplied to new model instances
+            def wrap_model_instance(model_instance):
+                """Wrap a model instance with tracing functionality"""
+                if hasattr(model_instance, "chat") and hasattr(
+                    model_instance.chat, "completions"
+                ):
+                    if hasattr(model_instance.chat.completions, "create"):
+                        setattr(
+                            model_instance.chat.completions,
+                            "create",
+                            wrapped(model_instance.chat.completions.create),
+                        )
+                    if hasattr(model_instance.chat.completions, "acreate"):
+                        setattr(
+                            model_instance.chat.completions,
+                            "acreate",
+                            wrapped_async(model_instance.chat.completions.acreate),
+                        )
+
+            # Register the wrapper function with the TrainableModel
+            client._register_tracer_wrapper(wrap_model_instance)
+
+            # Apply wrapping to the current model
+            wrap_model_instance(client._current_model)
+    except ImportError:
+        pass  # TrainableModel not available
+
     return client

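wrap() now also recognizes TrainableModel: it registers a re-wrap callback via _register_tracer_wrapper so tracing survives the model swaps that happen between training steps, and it patches the current model's create/acreate. A sketch of the intended call pattern, using the from_model_config constructor documented in the ModelConfig docstring further below (wrap import path assumed):

from judgeval.common.tracer import wrap
from judgeval.common.trainer import TrainableModel, ModelConfig

config = ModelConfig.load_from_file("my_trained_model.json")
model = wrap(TrainableModel.from_model_config(config))  # traced like any other client

response = model.chat.completions.create(
    model="current",  # resolves to the currently loaded checkpoint
    messages=[{"role": "user", "content": "Hello!"}],
)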
@@ -1977,6 +2100,22 @@ def _get_client_config(
         return "GROQ_API_CALL", client.chat.completions.create, None, None, None
     elif isinstance(client, (groq_AsyncGroq)):
         return "GROQ_API_CALL", client.chat.completions.create, None, None, None
+
+    # Check for TrainableModel
+    try:
+        from judgeval.common.trainer import TrainableModel
+
+        if isinstance(client, TrainableModel):
+            return (
+                "FIREWORKS_TRAINABLE_MODEL_CALL",
+                client._current_model.chat.completions.create,
+                None,
+                None,
+                None,
+            )
+    except ImportError:
+        pass  # TrainableModel not available
+
     raise ValueError(f"Unsupported client type: {type(client)}")

@@ -2155,6 +2294,37 @@ def _format_output_data(
             cache_creation_input_tokens,
         )

+    # Check for TrainableModel
+    try:
+        from judgeval.common.trainer import TrainableModel
+
+        if isinstance(client, TrainableModel):
+            # TrainableModel uses Fireworks LLM internally, so response format should be similar to OpenAI
+            if (
+                hasattr(response, "model")
+                and hasattr(response, "usage")
+                and hasattr(response, "choices")
+            ):
+                model_name = response.model
+                prompt_tokens = response.usage.prompt_tokens if response.usage else 0
+                completion_tokens = (
+                    response.usage.completion_tokens if response.usage else 0
+                )
+                message_content = response.choices[0].message.content
+
+                # Use LiteLLM cost calculation with fireworks_ai prefix
+                # LiteLLM supports Fireworks AI models for cost calculation when prefixed with "fireworks_ai/"
+                fireworks_model_name = f"fireworks_ai/{model_name}"
+                return message_content, _create_usage(
+                    fireworks_model_name,
+                    prompt_tokens,
+                    completion_tokens,
+                    cache_read_input_tokens,
+                    cache_creation_input_tokens,
+                )
+    except ImportError:
+        pass  # TrainableModel not available
+
     judgeval_logger.warning(f"Unsupported client type: {type(client)}")
     return None, None

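The cost path leans on LiteLLM's pricing table, which (as the diff's comment notes) keys Fireworks-hosted models under a fireworks_ai/ prefix. A standalone sketch of that lookup, assuming litellm is installed and knows the model's pricing; the full account path is illustrative:

import litellm

prompt_cost, completion_cost = litellm.cost_per_token(
    model="fireworks_ai/accounts/fireworks/models/qwen2p5-7b-instruct",
    prompt_tokens=120,
    completion_tokens=40,
)
print(f"estimated cost: ${prompt_cost + completion_cost:.6f}")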
@@ -71,7 +71,12 @@ class TraceManagerClient:

         server_response = self.api_client.upsert_trace(trace_data)

-        if not offline_mode and show_link and "ui_results_url" in server_response:
+        if (
+            not offline_mode
+            and show_link
+            and "ui_results_url" in server_response
+            and self.tracer.show_trace_urls
+        ):
             pretty_str = f"\n🔍 You can view your trace data here: [rgb(106,0,255)][link={server_response['ui_results_url']}]View Trace[/link]\n"
             rprint(pretty_str)

@@ -0,0 +1,5 @@
+from .trainer import JudgmentTrainer
+from .config import TrainerConfig, ModelConfig
+from .trainable_model import TrainableModel
+
+__all__ = ["JudgmentTrainer", "TrainerConfig", "ModelConfig", "TrainableModel"]
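
The new package __init__ makes the trainer's public surface importable in one line:

from judgeval.common.trainer import JudgmentTrainer, TrainerConfig, ModelConfig, TrainableModel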
@@ -0,0 +1,125 @@
+from dataclasses import dataclass
+from typing import Optional, Dict, Any
+import json
+
+
+@dataclass
+class TrainerConfig:
+    """Configuration class for JudgmentTrainer parameters."""
+
+    deployment_id: str
+    user_id: str
+    model_id: str
+    base_model_name: str = "qwen2p5-7b-instruct"
+    rft_provider: str = "fireworks"
+    num_steps: int = 5
+    num_generations_per_prompt: int = (
+        4  # Number of rollouts/generations per input prompt
+    )
+    num_prompts_per_step: int = 4  # Number of input prompts to sample per training step
+    concurrency: int = 100
+    epochs: int = 1
+    learning_rate: float = 1e-5
+    accelerator_count: int = 1
+    accelerator_type: str = "NVIDIA_A100_80GB"
+    temperature: float = 1.5
+    max_tokens: int = 50
+    enable_addons: bool = True
+
+
+@dataclass
+class ModelConfig:
+    """
+    Configuration class for storing and loading trained model state.
+
+    This class enables persistence of trained models so they can be loaded
+    and used later without retraining.
+
+    Example usage:
+        trainer = JudgmentTrainer(config)
+        model_config = trainer.train(agent_function, scorers, prompts)
+
+        # Save the trained model configuration
+        model_config.save_to_file("my_trained_model.json")
+
+        # Later, load and use the trained model
+        loaded_config = ModelConfig.load_from_file("my_trained_model.json")
+        trained_model = TrainableModel.from_model_config(loaded_config)
+
+        # Use the trained model for inference
+        response = trained_model.chat.completions.create(
+            model="current",  # Uses the loaded trained model
+            messages=[{"role": "user", "content": "Hello!"}]
+        )
+    """
+
+    # Base model configuration
+    base_model_name: str
+    deployment_id: str
+    user_id: str
+    model_id: str
+    enable_addons: bool
+
+    # Training state
+    current_step: int
+    total_steps: int
+
+    # Current model information
+    current_model_name: Optional[str] = None
+    is_trained: bool = False
+
+    # Training parameters used (for reference)
+    training_params: Optional[Dict[str, Any]] = None
+
+    def to_dict(self) -> Dict[str, Any]:
+        """Convert ModelConfig to dictionary for serialization."""
+        return {
+            "base_model_name": self.base_model_name,
+            "deployment_id": self.deployment_id,
+            "user_id": self.user_id,
+            "model_id": self.model_id,
+            "enable_addons": self.enable_addons,
+            "current_step": self.current_step,
+            "total_steps": self.total_steps,
+            "current_model_name": self.current_model_name,
+            "is_trained": self.is_trained,
+            "training_params": self.training_params,
+        }
+
+    @classmethod
+    def from_dict(cls, data: Dict[str, Any]) -> "ModelConfig":
+        """Create ModelConfig from dictionary."""
+        return cls(
+            base_model_name=data.get("base_model_name", "qwen2p5-7b-instruct"),
+            deployment_id=data.get("deployment_id", "my-base-deployment"),
+            user_id=data.get("user_id", ""),
+            model_id=data.get("model_id", ""),
+            enable_addons=data.get("enable_addons", True),
+            current_step=data.get("current_step", 0),
+            total_steps=data.get("total_steps", 0),
+            current_model_name=data.get("current_model_name"),
+            is_trained=data.get("is_trained", False),
+            training_params=data.get("training_params"),
+        )
+
+    def to_json(self) -> str:
+        """Convert ModelConfig to JSON string."""
+        return json.dumps(self.to_dict(), indent=2)
+
+    @classmethod
+    def from_json(cls, json_str: str) -> "ModelConfig":
+        """Create ModelConfig from JSON string."""
+        data = json.loads(json_str)
+        return cls.from_dict(data)
+
+    def save_to_file(self, filepath: str):
+        """Save ModelConfig to a JSON file."""
+        with open(filepath, "w") as f:
+            f.write(self.to_json())
+
+    @classmethod
+    def load_from_file(cls, filepath: str) -> "ModelConfig":
+        """Load ModelConfig from a JSON file."""
+        with open(filepath, "r") as f:
+            json_str = f.read()
+        return cls.from_json(json_str)
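
Because ModelConfig is a plain dataclass with JSON round-tripping, persisting a trained model's state reduces to the sketch below; the field values are illustrative:

from judgeval.common.trainer import ModelConfig

config = ModelConfig(
    base_model_name="qwen2p5-7b-instruct",
    deployment_id="my-base-deployment",
    user_id="user-123",
    model_id="model-abc",
    enable_addons=True,
    current_step=5,
    total_steps=5,
    current_model_name="model-abc-step-5",
    is_trained=True,
)
config.save_to_file("my_trained_model.json")

# Dataclass equality makes the round trip easy to verify
assert ModelConfig.load_from_file("my_trained_model.json") == config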
@@ -0,0 +1,151 @@
+from contextlib import contextmanager
+from typing import Optional
+import sys
+import os
+
+
+# Detect if we're running in a Jupyter environment
+def _is_jupyter_environment():
+    """Check if we're running in a Jupyter notebook or similar environment."""
+    try:
+        # Check for IPython kernel
+        if "ipykernel" in sys.modules or "IPython" in sys.modules:
+            return True
+        # Check for Jupyter environment variables
+        if "JPY_PARENT_PID" in os.environ:
+            return True
+        # Check if we're in Google Colab
+        if "google.colab" in sys.modules:
+            return True
+        return False
+    except Exception:
+        return False
+
+
+# Check environment once at import time
+IS_JUPYTER = _is_jupyter_environment()
+
+if not IS_JUPYTER:
+    # Safe to use Rich in non-Jupyter environments
+    try:
+        from rich.console import Console
+        from rich.spinner import Spinner
+        from rich.live import Live
+        from rich.text import Text
+
+        # Shared console instance for the trainer module to avoid conflicts
+        shared_console = Console()
+        RICH_AVAILABLE = True
+    except ImportError:
+        RICH_AVAILABLE = False
+else:
+    # In Jupyter, avoid Rich to prevent recursion issues
+    RICH_AVAILABLE = False
+
+
+# Fallback implementations for when Rich is not available or safe
+class SimpleSpinner:
+    def __init__(self, name, text):
+        self.text = text
+
+
+class SimpleLive:
+    def __init__(self, spinner, console=None, refresh_per_second=None):
+        self.spinner = spinner
+
+    def __enter__(self):
+        print(f"🔄 {self.spinner.text}")
+        return self
+
+    def __exit__(self, *args):
+        pass
+
+    def update(self, spinner):
+        print(f"🔄 {spinner.text}")
+
+
+def safe_print(message, style=None):
+    """Safe print function that works in all environments."""
+    if RICH_AVAILABLE and not IS_JUPYTER:
+        shared_console.print(message, style=style)
+    else:
+        # Use simple print with emoji indicators for different styles
+        if style == "green":
+            print(f"✅ {message}")
+        elif style == "yellow":
+            print(f"⚠️ {message}")
+        elif style == "blue":
+            print(f"🔵 {message}")
+        elif style == "cyan":
+            print(f"🔷 {message}")
+        else:
+            print(message)
+
+
+@contextmanager
+def _spinner_progress(
+    message: str, step: Optional[int] = None, total_steps: Optional[int] = None
+):
+    """Context manager for spinner-based progress display."""
+    if step is not None and total_steps is not None:
+        full_message = f"[Step {step}/{total_steps}] {message}"
+    else:
+        full_message = f"[Training] {message}"
+
+    if RICH_AVAILABLE and not IS_JUPYTER:
+        spinner = Spinner("dots", text=Text(full_message, style="cyan"))
+        with Live(spinner, console=shared_console, refresh_per_second=10):
+            yield
+    else:
+        # Fallback for Jupyter or when Rich is not available
+        print(f"🔄 {full_message}")
+        try:
+            yield
+        finally:
+            print(f"✅ {full_message} - Complete")
+
+
+@contextmanager
+def _model_spinner_progress(message: str):
+    """Context manager for model operation spinner-based progress display."""
+    if RICH_AVAILABLE and not IS_JUPYTER:
+        spinner = Spinner("dots", text=Text(f"[Model] {message}", style="blue"))
+        with Live(spinner, console=shared_console, refresh_per_second=10) as live:
+
+            def update_progress(progress_message: str):
+                """Update the spinner with a new progress message."""
+                new_text = f"[Model] {message}\n   └─ {progress_message}"
+                spinner.text = Text(new_text, style="blue")
+                live.update(spinner)
+
+            yield update_progress
+    else:
+        # Fallback for Jupyter or when Rich is not available
+        print(f"🔵 [Model] {message}")
+
+        def update_progress(progress_message: str):
+            print(f"   └─ {progress_message}")
+
+        yield update_progress
+
+
+def _print_progress(
+    message: str, step: Optional[int] = None, total_steps: Optional[int] = None
+):
+    """Print progress message with consistent formatting."""
+    if step is not None and total_steps is not None:
+        safe_print(f"[Step {step}/{total_steps}] {message}", style="green")
+    else:
+        safe_print(f"[Training] {message}", style="green")
+
+
+def _print_progress_update(
+    message: str, step: Optional[int] = None, total_steps: Optional[int] = None
+):
+    """Print progress update message (for status changes during long operations)."""
+    safe_print(f" └─ {message}", style="yellow")
+
+
+def _print_model_progress(message: str):
+    """Print model progress message with consistent formatting."""
+    safe_print(f"[Model] {message}", style="blue")
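
These helpers use Rich spinners on a plain terminal and fall back to print() under Jupyter/Colab, where Live displays can trigger recursion issues. In-module usage looks roughly like this (the new file's path is not shown in the diff, so no import line is given; do_work is a placeholder):

_print_progress("Sampling rollouts", step=2, total_steps=5)

with _spinner_progress("Scoring generations", step=2, total_steps=5):
    do_work()

with _model_spinner_progress("Deploying fine-tuned model") as update:
    update("Waiting for deployment to become ready")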