judgeval 0.0.40__py3-none-any.whl → 0.0.41__py3-none-any.whl

This diff shows the changes between two publicly released versions of the package, as they appear in the public registry.
judgeval/common/tracer.py CHANGED
@@ -5,7 +5,6 @@ Tracing system for judgeval that allows for function tracing using decorators.
  import asyncio
  import functools
  import inspect
- import json
  import os
  import site
  import sysconfig
@@ -16,6 +15,7 @@ import uuid
  import warnings
  import contextvars
  import sys
+ import json
  from contextlib import contextmanager, asynccontextmanager, AbstractAsyncContextManager, AbstractContextManager # Import context manager bases
  from dataclasses import dataclass, field
  from datetime import datetime
@@ -29,20 +29,16 @@ from typing import (
  Literal,
  Optional,
  Tuple,
- Type,
- TypeVar,
  Union,
  AsyncGenerator,
  TypeAlias,
- Set
  )
  from rich import print as rprint
- import types # <--- Add this import
+ import types

  # Third-party imports
  import requests
  from litellm import cost_per_token as _original_cost_per_token
- from pydantic import BaseModel
  from rich import print as rprint
  from openai import OpenAI, AsyncOpenAI
  from together import Together, AsyncTogether
@@ -64,8 +60,7 @@ from judgeval.data import Example, Trace, TraceSpan, TraceUsage
  from judgeval.scorers import APIJudgmentScorer, JudgevalScorer
  from judgeval.rules import Rule
  from judgeval.evaluation_run import EvaluationRun
- from judgeval.data.result import ScoringResult
- from judgeval.common.utils import validate_api_key
+ from judgeval.common.utils import ExcInfo, validate_api_key
  from judgeval.common.exceptions import JudgmentAPIError

  # Standard library imports needed for the new class
@@ -307,7 +302,7 @@ class TraceClient:
  tracer: Optional["Tracer"],
  trace_id: Optional[str] = None,
  name: str = "default",
- project_name: str = "default_project",
+ project_name: str = None,
  overwrite: bool = False,
  rules: Optional[List[Rule]] = None,
  enable_monitoring: bool = True,
@@ -317,7 +312,7 @@ class TraceClient:
  ):
  self.name = name
  self.trace_id = trace_id or str(uuid.uuid4())
- self.project_name = project_name
+ self.project_name = project_name or str(uuid.uuid4())
  self.overwrite = overwrite
  self.tracer = tracer
  self.rules = rules or []
@@ -507,6 +502,28 @@ class TraceClient:
  span = self.span_id_to_span[current_span_id]
  span.agent_name = agent_name

+ def record_state_before(self, state: dict):
+ """Records the agent's state before a tool execution on the current span.
+
+ Args:
+ state: A dictionary representing the agent's state.
+ """
+ current_span_id = current_span_var.get()
+ if current_span_id:
+ span = self.span_id_to_span[current_span_id]
+ span.state_before = state
+
+ def record_state_after(self, state: dict):
+ """Records the agent's state after a tool execution on the current span.
+
+ Args:
+ state: A dictionary representing the agent's state.
+ """
+ current_span_id = current_span_var.get()
+ if current_span_id:
+ span = self.span_id_to_span[current_span_id]
+ span.state_after = state
+
  async def _update_coroutine(self, span: TraceSpan, coroutine: Any, field: str):
  """Helper method to update the output of a trace entry once the coroutine completes"""
  try:
@@ -540,7 +557,7 @@ class TraceClient:
  # Removed else block - original didn't have one
  return None # Return None if no span_id found

- def record_error(self, error: Any):
+ def record_error(self, error: Dict[str, Any]):
  current_span_id = current_span_var.get()
  if current_span_id:
  span = self.span_id_to_span[current_span_id]
@@ -579,7 +596,7 @@ class TraceClient:
  "project_name": self.project_name,
  "created_at": datetime.utcfromtimestamp(self.start_time).isoformat(),
  "duration": total_duration,
- "entries": [span.model_dump() for span in self.trace_spans],
+ "trace_spans": [span.model_dump() for span in self.trace_spans],
  "evaluation_runs": [run.model_dump() for run in self.evaluation_runs],
  "overwrite": overwrite,
  "offline_mode": self.tracer.offline_mode,
@@ -599,7 +616,7 @@ class TraceClient:
  def delete(self):
  return self.trace_manager_client.delete_trace(self.trace_id)

- def _capture_exception_for_trace(current_trace: Optional['TraceClient'], exc_info: Tuple[Optional[type], Optional[BaseException], Optional[types.TracebackType]]):
+ def _capture_exception_for_trace(current_trace: Optional['TraceClient'], exc_info: ExcInfo):
  if not current_trace:
  return

@@ -609,6 +626,27 @@ def _capture_exception_for_trace(current_trace: Optional['TraceClient'], exc_inf
  "message": str(exc_value) if exc_value else "No exception message",
  "traceback": traceback.format_tb(exc_traceback_obj) if exc_traceback_obj else []
  }
+
+ # This is where we specially handle exceptions that we might want to collect additional data for.
+ # When we do this, always try checking the module from sys.modules instead of importing. This will
+ # Let us support a wider range of exceptions without needing to import them for all clients.
+
+ # Most clients (requests, httpx, urllib) support the standard format of exposing error.request.url and error.response.status_code
+ # The alternative is to hand select libraries we want from sys.modules and check for them:
+ # As an example: requests_module = sys.modules.get("requests", None) // then do things with requests_module;
+
+ # General HTTP Like errors
+ try:
+ url = getattr(getattr(exc_value, "request", None), "url", None)
+ status_code = getattr(getattr(exc_value, "response", None), "status_code", None)
+ if status_code:
+ formatted_exception["http"] = {
+ "url": url if url else "Unknown URL",
+ "status_code": status_code if status_code else None,
+ }
+ except Exception as e:
+ pass
+
  current_trace.record_error(formatted_exception)
  class _DeepTracer:
  _instance: Optional["_DeepTracer"] = None
@@ -907,7 +945,7 @@ class Tracer:
  def __init__(
  self,
  api_key: str = os.getenv("JUDGMENT_API_KEY"),
- project_name: str = "default_project",
+ project_name: str = None,
  rules: Optional[List[Rule]] = None, # Added rules parameter
  organization_id: str = os.getenv("JUDGMENT_ORG_ID"),
  enable_monitoring: bool = os.getenv("JUDGMENT_MONITORING", "true").lower() == "true",
@@ -935,7 +973,7 @@ class Tracer:
  raise ValueError("S3 bucket name must be provided when use_s3 is True")

  self.api_key: str = api_key
- self.project_name: str = project_name
+ self.project_name: str = project_name or str(uuid.uuid4())
  self.organization_id: str = organization_id
  self._current_trace: Optional[str] = None
  self._active_trace_client: Optional[TraceClient] = None # Add active trace client attribute
@@ -1068,32 +1106,92 @@ class Tracer:

  rprint(f"[bold]{label}:[/bold] {msg}")

- def identify(self, identifier: str):
+ def identify(self, identifier: str, track_state: bool = False, track_attributes: Optional[List[str]] = None, field_mappings: Optional[Dict[str, str]] = None):
  """
- Class decorator that associates a class with a custom identifier.
+ Class decorator that associates a class with a custom identifier and enables state tracking.

  This decorator creates a mapping between the class name and the provided
  identifier, which can be useful for tagging, grouping, or referencing
- classes in a standardized way.
+ classes in a standardized way. It also enables automatic state capture
+ for instances of the decorated class when used with tracing.

  Args:
- identifier: The identifier to associate with the decorated class
-
- Returns:
- A decorator function that registers the class with the given identifier
+ identifier: The identifier to associate with the decorated class.
+ This will be used as the instance name in traces.
+ track_state: Whether to automatically capture the state (attributes)
+ of instances before and after function execution. Defaults to False.
+ track_attributes: Optional list of specific attribute names to track.
+ If None, all non-private attributes (not starting with '_')
+ will be tracked when track_state=True.
+ field_mappings: Optional dictionary mapping internal attribute names to
+ display names in the captured state. For example:
+ {"system_prompt": "instructions"} will capture the
+ 'instructions' attribute as 'system_prompt' in the state.

  Example:
- @tracer.identify(identifier="user_model")
+ @tracer.identify(identifier="user_model", track_state=True, track_attributes=["name", "age"], field_mappings={"system_prompt": "instructions"})
  class User:
  # Class implementation
  """
  def decorator(cls):
  class_name = cls.__name__
- self.class_identifiers[class_name] = identifier
+ self.class_identifiers[class_name] = {
+ "identifier": identifier,
+ "track_state": track_state,
+ "track_attributes": track_attributes,
+ "field_mappings": field_mappings or {}
+ }
  return cls

  return decorator

+ def _capture_instance_state(self, instance: Any, class_config: Dict[str, Any]) -> Dict[str, Any]:
+ """
+ Capture the state of an instance based on class configuration.
+ Args:
+ instance: The instance to capture the state of.
+ class_config: Configuration dictionary for state capture,
+ expected to contain 'track_attributes' and 'field_mappings'.
+ """
+ track_attributes = class_config.get('track_attributes')
+ field_mappings = class_config.get('field_mappings')
+
+ if track_attributes:
+
+ state = {attr: getattr(instance, attr, None) for attr in track_attributes}
+ else:
+
+ state = {k: v for k, v in instance.__dict__.items() if not k.startswith('_')}
+
+ if field_mappings:
+ state['field_mappings'] = field_mappings
+
+ return state
+
+
+ def _get_instance_state_if_tracked(self, args):
+ """
+ Extract instance state if the instance should be tracked.
+
+ Returns the captured state dict if tracking is enabled, None otherwise.
+ """
+ if args and hasattr(args[0], '__class__'):
+ instance = args[0]
+ class_name = instance.__class__.__name__
+ if (class_name in self.class_identifiers and
+ isinstance(self.class_identifiers[class_name], dict) and
+ self.class_identifiers[class_name].get('track_state', False)):
+ return self._capture_instance_state(instance, self.class_identifiers[class_name])
+
+ def _conditionally_capture_and_record_state(self, trace_client_instance: TraceClient, args: tuple, is_before: bool):
+ """Captures instance state if tracked and records it via the trace_client."""
+ state = self._get_instance_state_if_tracked(args)
+ if state:
+ if is_before:
+ trace_client_instance.record_state_before(state)
+ else:
+ trace_client_instance.record_state_after(state)
+
  def observe(self, func=None, *, name=None, span_type: SpanType = "span", project_name: str = None, overwrite: bool = False, deep_tracing: bool = None):
  """
  Decorator to trace function execution with detailed entry/exit information.
@@ -1171,6 +1269,9 @@ class Tracer:
  span.record_input(inputs)
  if agent_name:
  span.record_agent_name(agent_name)
+
+ # Capture state before execution
+ self._conditionally_capture_and_record_state(span, args, is_before=True)

  if use_deep_tracing:
  with _DeepTracer():
@@ -1181,7 +1282,10 @@ class Tracer:
  except Exception as e:
  _capture_exception_for_trace(current_trace, sys.exc_info())
  raise e
-
+
+ # Capture state after execution
+ self._conditionally_capture_and_record_state(span, args, is_before=False)
+
  # Record output
  span.record_output(result)
  return result
@@ -1199,6 +1303,9 @@ class Tracer:
  if agent_name:
  span.record_agent_name(agent_name)

+ # Capture state before execution
+ self._conditionally_capture_and_record_state(span, args, is_before=True)
+
  if use_deep_tracing:
  with _DeepTracer():
  result = await func(*args, **kwargs)
@@ -1208,6 +1315,9 @@ class Tracer:
  except Exception as e:
  _capture_exception_for_trace(current_trace, sys.exc_info())
  raise e
+
+ # Capture state after execution
+ self._conditionally_capture_and_record_state(span, args, is_before=False)

  span.record_output(result)
  return result
@@ -1258,6 +1368,9 @@ class Tracer:
  span.record_input(inputs)
  if agent_name:
  span.record_agent_name(agent_name)
+ # Capture state before execution
+ self._conditionally_capture_and_record_state(span, args, is_before=True)
+
  if use_deep_tracing:
  with _DeepTracer():
  result = func(*args, **kwargs)
@@ -1267,6 +1380,10 @@ class Tracer:
  except Exception as e:
  _capture_exception_for_trace(current_trace, sys.exc_info())
  raise e
+
+ # Capture state after execution
+ self._conditionally_capture_and_record_state(span, args, is_before=False)
+

  # Record output
  span.record_output(result)
@@ -1286,6 +1403,9 @@ class Tracer:
  if agent_name:
  span.record_agent_name(agent_name)

+ # Capture state before execution
+ self._conditionally_capture_and_record_state(span, args, is_before=True)
+
  if use_deep_tracing:
  with _DeepTracer():
  result = func(*args, **kwargs)
@@ -1296,6 +1416,9 @@ class Tracer:
  _capture_exception_for_trace(current_trace, sys.exc_info())
  raise e

+ # Capture state after execution
+ self._conditionally_capture_and_record_state(span, args, is_before=False)
+
  span.record_output(result)
  return result

@@ -1369,13 +1492,6 @@ def wrap(client: Any) -> Any:
  span.record_usage(usage)
  return response

- def _handle_error(span, e, is_async):
- """Handle and record errors"""
- call_type = "async" if is_async else "sync"
- print(f"Error during wrapped {call_type} API call ({span_name}): {e}")
- span.record_output({"error": str(e)})
- raise
-
  # --- Traced Async Functions ---
  async def traced_create_async(*args, **kwargs):
  current_trace = current_trace_var.get()
@@ -1389,7 +1505,8 @@ def wrap(client: Any) -> Any:
  response_or_iterator = await original_create(*args, **kwargs)
  return _format_and_record_output(span, response_or_iterator, is_streaming, True, False)
  except Exception as e:
- return _handle_error(span, e, True)
+ _capture_exception_for_trace(span, sys.exc_info())
+ raise e

  # Async responses for OpenAI clients
  async def traced_response_create_async(*args, **kwargs):
@@ -1404,7 +1521,8 @@ def wrap(client: Any) -> Any:
  response_or_iterator = await original_responses_create(*args, **kwargs)
  return _format_and_record_output(span, response_or_iterator, is_streaming, True, True)
  except Exception as e:
- return _handle_error(span, e, True)
+ _capture_exception_for_trace(span, sys.exc_info())
+ raise e

  # Function replacing .stream() for async clients
  def traced_stream_async(*args, **kwargs):
@@ -1435,7 +1553,8 @@ def wrap(client: Any) -> Any:
  response_or_iterator = original_create(*args, **kwargs)
  return _format_and_record_output(span, response_or_iterator, is_streaming, False, False)
  except Exception as e:
- return _handle_error(span, e, False)
+ _capture_exception_for_trace(span, sys.exc_info())
+ raise e

  def traced_response_create_sync(*args, **kwargs):
  current_trace = current_trace_var.get()
@@ -1449,7 +1568,8 @@ def wrap(client: Any) -> Any:
  response_or_iterator = original_responses_create(*args, **kwargs)
  return _format_and_record_output(span, response_or_iterator, is_streaming, False, True)
  except Exception as e:
- return _handle_error(span, e, False)
+ _capture_exception_for_trace(span, sys.exc_info())
+ raise e

  # Function replacing sync .stream()
  def traced_stream_sync(*args, **kwargs):
@@ -1990,10 +2110,12 @@ def get_instance_prefixed_name(instance, class_name, class_identifiers):
  Otherwise, returns None.
  """
  if class_name in class_identifiers:
- attr = class_identifiers[class_name]
+ class_config = class_identifiers[class_name]
+ attr = class_config['identifier']
+
  if hasattr(instance, attr):
  instance_name = getattr(instance, attr)
  return instance_name
  else:
- raise Exception(f"Attribute {class_identifiers[class_name]} does not exist for {class_name}. Check your identify() decorator.")
+ raise Exception(f"Attribute {attr} does not exist for {class_name}. Check your identify() decorator.")
  return None
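
The identify() and state-tracking changes above add track_state, track_attributes, and field_mappings to the class decorator and record state_before/state_after on the active span around each traced call. A minimal usage sketch based only on the signatures shown in this diff; the agent class, its attributes, and the project name are invented for illustration, and JUDGMENT_API_KEY/JUDGMENT_ORG_ID are assumed to be set in the environment:

    from judgeval.common.tracer import Tracer

    tracer = Tracer(project_name="state-tracking-demo")  # project_name now falls back to a random UUID when omitted

    @tracer.identify(
        identifier="name",                   # instance attribute used as the agent name in traces
        track_state=True,                    # capture instance attributes before/after each traced call
        track_attributes=["name", "notes"],  # restrict capture to these attributes (default: all non-private)
    )
    class ResearchAgent:
        def __init__(self):
            self.name = "researcher"
            self.notes = []

        @tracer.observe
        def take_note(self, note: str) -> int:
            # state_before/state_after for this span are recorded automatically
            self.notes.append(note)
            return len(self.notes)

field_mappings works the same way: it is stored with the captured state so that an internal attribute (e.g. 'instructions') can be surfaced under a different key (e.g. 'system_prompt'), per the docstring above.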
judgeval/common/utils.py CHANGED
@@ -12,9 +12,10 @@ NOTE: any function beginning with 'a', e.g. 'afetch_together_api_response', is a
  import asyncio
  import concurrent.futures
  import os
+ from types import TracebackType
  import requests
  import pprint
- from typing import Any, Dict, List, Literal, Mapping, Optional, Union
+ from typing import Any, Dict, List, Literal, Mapping, Optional, TypeAlias, Union

  # Third-party imports
  import litellm
@@ -782,3 +783,6 @@ if __name__ == "__main__":
  ]
  ]
  ))
+
+ ExcInfo: TypeAlias = tuple[type[BaseException], BaseException, TracebackType]
+ OptExcInfo: TypeAlias = ExcInfo | tuple[None, None, None]
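
utils.py now exposes the ExcInfo/OptExcInfo aliases used by _capture_exception_for_trace above (written with PEP 604/613 syntax, so Python 3.10+). A small illustrative sketch; the summarize helper is hypothetical:

    import sys
    from judgeval.common.utils import ExcInfo

    def summarize(exc_info: ExcInfo) -> str:
        exc_type, exc_value, _tb = exc_info
        return f"{exc_type.__name__}: {exc_value}"

    try:
        1 / 0
    except ZeroDivisionError:
        print(summarize(sys.exc_info()))  # ZeroDivisionError: division by zero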
judgeval/data/datasets/dataset.py CHANGED
@@ -5,14 +5,15 @@ import json
  import os
  import yaml
  from dataclasses import dataclass, field
- from typing import List, Union, Literal
+ from typing import List, Union, Literal, Optional

- from judgeval.data import Example
+ from judgeval.data import Example, Trace
  from judgeval.common.logger import debug, error, warning, info

  @dataclass
  class EvalDataset:
  examples: List[Example]
+ traces: List[Trace]
  _alias: Union[str, None] = field(default=None)
  _id: Union[str, None] = field(default=None)
  judgment_api_key: str = field(default="")
@@ -20,12 +21,13 @@ class EvalDataset:
  def __init__(self,
  judgment_api_key: str = os.getenv("JUDGMENT_API_KEY"),
  organization_id: str = os.getenv("JUDGMENT_ORG_ID"),
- examples: List[Example] = [],
+ examples: Optional[List[Example]] = None,
+ traces: Optional[List[Trace]] = None
  ):
- debug(f"Initializing EvalDataset with {len(examples)} examples")
  if not judgment_api_key:
  warning("No judgment_api_key provided")
- self.examples = examples
+ self.examples = examples or []
+ self.traces = traces or []
  self._alias = None
  self._id = None
  self.judgment_api_key = judgment_api_key
@@ -218,8 +220,11 @@ class EvalDataset:
  self.add_example(e)

  def add_example(self, e: Example) -> None:
- self.examples = self.examples + [e]
+ self.examples.append(e)
  # TODO if we need to add rank, then we need to do it here
+
+ def add_trace(self, t: Trace) -> None:
+ self.traces.append(t)

  def save_as(self, file_type: Literal["json", "csv", "yaml"], dir_path: str, save_name: str = None) -> None:
  """
@@ -307,6 +312,7 @@ class EvalDataset:
  return (
  f"{self.__class__.__name__}("
  f"examples={self.examples}, "
+ f"traces={self.traces}, "
  f"_alias={self._alias}, "
  f"_id={self._id}"
  f")"
judgeval/data/datasets/eval_dataset_client.py CHANGED
@@ -13,7 +13,7 @@ from judgeval.constants import (
  JUDGMENT_DATASETS_INSERT_API_URL,
  JUDGMENT_DATASETS_EXPORT_JSONL_API_URL
  )
- from judgeval.data import Example
+ from judgeval.data import Example, Trace
  from judgeval.data.datasets import EvalDataset


@@ -58,6 +58,7 @@ class EvalDatasetClient:
  "dataset_alias": alias,
  "project_name": project_name,
  "examples": [e.to_dict() for e in dataset.examples],
+ "traces": [t.model_dump() for t in dataset.traces],
  "overwrite": overwrite,
  }
  try:
@@ -202,6 +203,7 @@ class EvalDatasetClient:
  info(f"Successfully pulled dataset with alias '{alias}'")
  payload = response.json()
  dataset.examples = [Example(**e) for e in payload.get("examples", [])]
+ dataset.traces = [Trace(**t) for t in payload.get("traces", [])]
  dataset._alias = payload.get("alias")
  dataset._id = payload.get("id")
  progress.update(
judgeval/data/trace.py CHANGED
@@ -33,6 +33,8 @@ class TraceSpan(BaseModel):
  additional_metadata: Optional[Dict[str, Any]] = None
  has_evaluation: Optional[bool] = False
  agent_name: Optional[str] = None
+ state_before: Optional[Dict[str, Any]] = None
+ state_after: Optional[Dict[str, Any]] = None

  def model_dump(self, **kwargs):
  return {
@@ -50,7 +52,9 @@ class TraceSpan(BaseModel):
  "span_type": self.span_type,
  "usage": self.usage.model_dump() if self.usage else None,
  "has_evaluation": self.has_evaluation,
- "agent_name": self.agent_name
+ "agent_name": self.agent_name,
+ "state_before": self.state_before,
+ "state_after": self.state_after
  }

  def print_span(self):
@@ -113,7 +117,7 @@ class Trace(BaseModel):
  name: str
  created_at: str
  duration: float
- entries: List[TraceSpan]
+ trace_spans: List[TraceSpan]
  overwrite: bool = False
  offline_mode: bool = False
  rules: Optional[Dict[str, Any]] = None
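
Note the rename in the Trace model: the list of spans is now trace_spans rather than entries (run_evaluation.py below is updated accordingly). A one-line migration sketch, assuming an already-fetched Trace object named trace:

    # Before 0.0.41 this field was `trace.entries`
    for span in trace.trace_spans:
        print(span.span_type, span.state_before, span.state_after)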
judgeval/judgment_client.py CHANGED
@@ -63,7 +63,15 @@ class SingletonMeta(type):
  return cls._instances[cls]

  class JudgmentClient(metaclass=SingletonMeta):
- def __init__(self, judgment_api_key: str = os.getenv("JUDGMENT_API_KEY"), organization_id: str = os.getenv("JUDGMENT_ORG_ID")):
+ def __init__(self, judgment_api_key: Optional[str] = os.getenv("JUDGMENT_API_KEY"), organization_id: Optional[str] = os.getenv("JUDGMENT_ORG_ID")):
+ # Check if API key is None
+ if judgment_api_key is None:
+ raise ValueError("JUDGMENT_API_KEY cannot be None. Please provide a valid API key or set the JUDGMENT_API_KEY environment variable.")
+
+ # Check if organization ID is None
+ if organization_id is None:
+ raise ValueError("JUDGMENT_ORG_ID cannot be None. Please provide a valid organization ID or set the JUDGMENT_ORG_ID environment variable.")
+
  self.judgment_api_key = judgment_api_key
  self.organization_id = organization_id
  self.eval_dataset_client = EvalDatasetClient(judgment_api_key, organization_id)
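
JudgmentClient now rejects missing credentials at construction time. A sketch of the new behavior; the import path follows the RECORD listing below and the values are placeholders:

    import os
    from judgeval.judgment_client import JudgmentClient

    client = JudgmentClient(
        judgment_api_key=os.getenv("JUDGMENT_API_KEY"),  # None -> ValueError("JUDGMENT_API_KEY cannot be None. ...")
        organization_id=os.getenv("JUDGMENT_ORG_ID"),    # None -> ValueError("JUDGMENT_ORG_ID cannot be None. ...")
    )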
judgeval/run_evaluation.py CHANGED
@@ -1,6 +1,7 @@
  import asyncio
  import requests
  import time
+ import json
  import sys
  import itertools
  import threading
@@ -362,14 +363,26 @@ def check_examples(examples: List[Example], scorers: List[Union[APIJudgmentScore
  """
  Checks if the example contains the necessary parameters for the scorer.
  """
+ prompt_user = False
  for scorer in scorers:
  for example in examples:
  missing_params = []
  for param in scorer.required_params:
  if getattr(example, param.value) is None:
- missing_params.append(f"'{param.value}'")
+ missing_params.append(f"{param.value}")
  if missing_params:
- print(f"WARNING: Example {example.example_id} is missing the following parameters: {missing_params} for scorer {scorer.score_type.value}")
+ rprint(f"[yellow]⚠️ WARNING:[/yellow] Example is missing required parameters for scorer [bold]{scorer.score_type.value}[/bold]")
+ rprint(f"Missing parameters: {', '.join(missing_params)}")
+ rprint(f"Example: {json.dumps(example.model_dump(), indent=2)}")
+ rprint("-"*40)
+ prompt_user = True
+
+ if prompt_user:
+ user_input = input("Do you want to continue? (y/n)")
+ if user_input.lower() != "y":
+ sys.exit(0)
+ else:
+ rprint("[green]Continuing...[/green]")

  def run_trace_eval(trace_run: TraceRun, override: bool = False, ignore_errors: bool = True, function: Optional[Callable] = None, tracer: Optional[Union[Tracer, BaseCallbackHandler]] = None, examples: Optional[List[Example]] = None) -> List[ScoringResult]:
  # Call endpoint to check to see if eval run name exists (if we DON'T want to override and DO want to log results)
@@ -407,7 +420,7 @@ def run_trace_eval(trace_run: TraceRun, override: bool = False, ignore_errors: b
  for i, trace in enumerate(tracer.traces):
  # We set the root-level trace span with the expected tools of the Trace
  trace = Trace(**trace)
- trace.entries[0].expected_tools = examples[i].expected_tools
+ trace.trace_spans[0].expected_tools = examples[i].expected_tools
  new_traces.append(trace)
  trace_run.traces = new_traces
  tracer.traces = []
@@ -894,6 +907,7 @@ def run_eval(evaluation_run: EvaluationRun, override: bool = False, ignore_error
  f"Processing evaluation '{evaluation_run.eval_name}': "
  )
  else:
+ check_examples(evaluation_run.examples, evaluation_run.scorers)
  if judgment_scorers:
  # Execute evaluation using Judgment API
  info("Starting API evaluation")
judgeval/scorers/judgeval_scorer.py CHANGED
@@ -12,7 +12,7 @@ from judgeval.common.logger import debug, info, warning, error
  from judgeval.judges import JudgevalJudge
  from judgeval.judges.utils import create_judge
  from judgeval.constants import UNBOUNDED_SCORERS
-
+ from judgeval.data.example import ExampleParams
  class JudgevalScorer:
  """
  Base class for scorers in `judgeval`.
@@ -39,6 +39,7 @@ class JudgevalScorer:
  evaluation_cost: Optional[float] = None # The cost of running the scorer
  verbose_logs: Optional[str] = None # The verbose logs of the scorer
  additional_metadata: Optional[Dict] = None # Additional metadata for the scorer
+ required_params: Optional[List[ExampleParams]] = None # The required parameters for the scorer
  error: Optional[str] = None
  success: Optional[bool] = None

@@ -51,6 +52,7 @@ class JudgevalScorer:
  reason: Optional[str] = None,
  success: Optional[bool] = None,
  evaluation_model: Optional[str] = None,
+ required_params: Optional[List[ExampleParams]] = None,
  strict_mode: bool = False,
  async_mode: bool = True,
  verbose_mode: bool = True,
@@ -87,6 +89,7 @@ class JudgevalScorer:
  self.evaluation_cost = evaluation_cost
  self.verbose_logs = verbose_logs
  self.additional_metadata = additional_metadata
+ self.required_params = required_params

  def _add_model(self, model: Optional[Union[str, List[str], JudgevalJudge]] = None):
  """
judgeval/scorers/prompt_scorer.py CHANGED
@@ -30,6 +30,7 @@ from typing import List, Optional, Tuple, Any, Mapping
  from pydantic import BaseModel, model_serializer, Field

  from judgeval.data import Example
+ from judgeval.data.example import ExampleParams
  from judgeval.scorers import JudgevalScorer
  from judgeval.scorers.utils import (
  scorer_progress_meter,
@@ -64,6 +65,7 @@ class PromptScorer(JudgevalScorer, BaseModel):
  async_mode: bool = True,
  strict_mode: bool = False,
  verbose_mode: bool = False,
+ required_params: Optional[List[ExampleParams]] = None,
  ):
  # Initialize BaseModel first
  BaseModel.__init__(
@@ -85,6 +87,7 @@ class PromptScorer(JudgevalScorer, BaseModel):
  async_mode=async_mode,
  strict_mode=strict_mode,
  verbose_mode=verbose_mode,
+ required_params=required_params,
  )

  def score_example(
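
The required_params hook added to JudgevalScorer and threaded through PromptScorer is what check_examples() in run_evaluation.py reads: any example whose attribute named by param.value is None triggers the warning/confirmation prompt. A hedged sketch of that relationship; only the ExampleParams enum and the param.value lookup come from this diff, and the member names used here are assumptions:

    from judgeval.data.example import Example, ExampleParams

    # A scorer advertises what it needs, e.g. by passing this list as
    # required_params to JudgevalScorer.__init__ / PromptScorer.__init__.
    required = [ExampleParams.INPUT, ExampleParams.ACTUAL_OUTPUT]  # member names assumed

    example = Example(input="What is 2 + 2?", actual_output=None)
    missing = [p.value for p in required if getattr(example, p.value) is None]
    print(missing)  # ['actual_output'] -- exactly what check_examples() would flag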
{judgeval-0.0.40.dist-info → judgeval-0.0.41.dist-info}/METADATA RENAMED
@@ -1,6 +1,6 @@
  Metadata-Version: 2.4
  Name: judgeval
- Version: 0.0.40
+ Version: 0.0.41
  Summary: Judgeval Package
  Project-URL: Homepage, https://github.com/JudgmentLabs/judgeval
  Project-URL: Issues, https://github.com/JudgmentLabs/judgeval/issues
@@ -37,11 +37,11 @@ Description-Content-Type: text/markdown

  <br>

- ## [🌐 Landing Page](https://www.judgmentlabs.ai/) • [📚 Docs](https://judgment.mintlify.app/getting_started) • [🚀 Demos](https://www.youtube.com/@AlexShan-j3o)
+ ## [🌐 Landing Page](https://www.judgmentlabs.ai/) • [📚 Docs](https://docs.judgmentlabs.ai/introduction) • [🚀 Demos](https://www.youtube.com/@AlexShan-j3o)

  [![X](https://img.shields.io/badge/-X/Twitter-000?logo=x&logoColor=white)](https://x.com/JudgmentLabs)
  [![LinkedIn](https://custom-icon-badges.demolab.com/badge/LinkedIn%20-0A66C2?logo=linkedin-white&logoColor=fff)](https://www.linkedin.com/company/judgmentlabs)
- [![Discord](https://img.shields.io/badge/-Discord-5865F2?logo=discord&logoColor=white)](https://discord.gg/FMxHkYTtFE)
+ [![Discord](https://img.shields.io/badge/-Discord-5865F2?logo=discord&logoColor=white)](https://discord.gg/ZCnSXYug)

  </div>

@@ -56,19 +56,28 @@ We support tracing agents built with LangGraph, OpenAI SDK, Anthropic, ... and a
  Judgeval is created and maintained by [Judgment Labs](https://judgmentlabs.ai/).

  ## 📋 Table of Contents
- * [ Features](#-features)
- * [🔍 Tracing](#-tracing)
- * [🧪 Evals](#-evals)
- * [📡 Monitoring](#-monitoring)
- * [📊 Datasets](#-datasets)
- * [💡 Insights](#-insights)
- * [🛠️ Installation](#️-installation)
- * [🏁 Get Started](#-get-started)
- * [🏢 Self-Hosting](#-self-hosting)
- * [📚 Cookbooks](#-cookbooks)
- * [💻 Development with Cursor](#-development-with-cursor)
- * [ Star Us on GitHub](#-star-us-on-github)
- * [❤️ Contributors](#️-contributors)
+ - [🌐 Landing Page • 📚 Docs • 🚀 Demos](#-landing-page----docs---demos)
+ - [Judgeval: open-source testing, monitoring, and optimization for AI agents](#judgeval-open-source-testing-monitoring-and-optimization-for-ai-agents)
+ - [📋 Table of Contents](#-table-of-contents)
+ - [ Features](#-features)
+ - [🛠️ Installation](#️-installation)
+ - [🏁 Get Started](#-get-started)
+ - [🛰️ Tracing](#️-tracing)
+ - [📝 Offline Evaluations](#-offline-evaluations)
+ - [📡 Online Evaluations](#-online-evaluations)
+ - [🏢 Self-Hosting](#-self-hosting)
+ - [Key Features](#key-features)
+ - [Getting Started](#getting-started)
+ - [📚 Cookbooks](#-cookbooks)
+ - [Sample Agents](#sample-agents)
+ - [💰 LangGraph Financial QA Agent](#-langgraph-financial-qa-agent)
+ - [✈️ OpenAI Travel Agent](#️-openai-travel-agent)
+ - [Custom Evaluators](#custom-evaluators)
+ - [🔍 PII Detection](#-pii-detection)
+ - [📧 Cold Email Generation](#-cold-email-generation)
+ - [💻 Development with Cursor](#-development-with-cursor)
+ - [⭐ Star Us on GitHub](#-star-us-on-github)
+ - [❤️ Contributors](#️-contributors)

  <!-- Created by https://github.com/ekalinin/github-markdown-toc -->

{judgeval-0.0.40.dist-info → judgeval-0.0.41.dist-info}/RECORD RENAMED
@@ -2,27 +2,27 @@ judgeval/__init__.py,sha256=x9HWt4waJwJMAqTuJSg2MezF9Zg-macEjeU-ajbly-8,330
  judgeval/clients.py,sha256=EiTmvvWksTPyWIuMC9jz06SPY2vFzokIJUIGoScpisA,989
  judgeval/constants.py,sha256=xuO-Und5c0-K3yTRn2fAkwyY2uTf8b7dGd39CPVqkSQ,5661
  judgeval/evaluation_run.py,sha256=KNGtaGAwD18pDNOKF7PCMlLnQe9SpRLTs0XWFMrCiLc,6684
- judgeval/judgment_client.py,sha256=TkYNCzuy5toIqvsgCSGO4WyKfUSgEM_gX2pbQqWCFJo,24481
+ judgeval/judgment_client.py,sha256=JO3AkU-disPHQVK5g1SM-bs_EUSy8QZ3AaAj_Q2ag6s,24968
  judgeval/rules.py,sha256=jkh1cXXcUf8oRY7xJUZfcQBYWn_rjUW4GvrhRt15PeU,20265
- judgeval/run_evaluation.py,sha256=Kg2iFrpVq-rGMfQokM5s_LJ4BSqGNxjQUxnxfaiTOZ4,49135
+ judgeval/run_evaluation.py,sha256=MshtOGvWm_eGj2JamEtiMWvPjdCwrKTp9WcAUrBm2Fs,49673
  judgeval/version_check.py,sha256=bvJEidB7rAeXozoUbN9Yb97QOR_s2hgvpvj74jJ5HlY,943
  judgeval/common/__init__.py,sha256=7d24BRxtncpMj3AAJCj8RS7TqgjXmW777HVZH6-3sBs,289
  judgeval/common/exceptions.py,sha256=U-TxHLn7oVMezsMuoYouNDb2XuS8RCggfntYf5_6u4E,565
  judgeval/common/logger.py,sha256=KO75wWXCxhUHUMvLaTU31ZzOk6tkZBa7heQ7y0f-zFE,6062
  judgeval/common/s3_storage.py,sha256=W8wq9S7qJZdqdBR4sk3aEZ4K3-pz40DOoolOJrWs9Vo,3768
- judgeval/common/tracer.py,sha256=0HwWwbvnT9Z686q9ppqUB54xR-GKqlBnkXf29c76eow,84425
- judgeval/common/utils.py,sha256=w1SjpDtB1DTJapFSAvLzr_a3gGI45iacEoxIUnQXx4Q,34087
+ judgeval/common/tracer.py,sha256=rYNmyB3Z955xfnKmlase6gub8Xf5xz6nQefONs_Td5U,90870
+ judgeval/common/utils.py,sha256=sWdHfqgiF6AnKTQNmeUBfoEsddXgInI5M24t2-QYexk,34271
  judgeval/data/__init__.py,sha256=GX_GloDtBB35mv3INWbSTP2r9cwCU2IeIYjzRT0SAd8,530
  judgeval/data/custom_example.py,sha256=QRBqiRiZS8UgVeTRHY0r1Jzm6yAYsyg6zmHxQGxdiQs,739
  judgeval/data/example.py,sha256=jcK78ff-TKNl9Qtxvbd1g61crpo-s4fWHaqyMIbQNq0,6877
  judgeval/data/result.py,sha256=KfU9lhAKG_Xo2eGDm2uKVVRZpf177IDASg1cIwedJwE,3184
  judgeval/data/scorer_data.py,sha256=JVlaTx1EP2jw2gh3Vgx1CSEsvIFABAN26IquKyxwiJQ,3273
  judgeval/data/tool.py,sha256=eEEvGDNNYWhcQiI6cjDv3rO1VoOJJS5LWGS76Gb_gtY,1813
- judgeval/data/trace.py,sha256=ETZEb_MJfv4vWr2y_uZ7FfIua7GrV6jgSUVjjURAdlQ,4602
+ judgeval/data/trace.py,sha256=S9IQunatke-Kcxi2-qXg3CtbmxBk8VGBDJzWshx7zJg,4798
  judgeval/data/trace_run.py,sha256=fiB5Z5il9U9XqvksdA2DbLNd96U_Wrz8K00RuFJBy38,2324
  judgeval/data/datasets/__init__.py,sha256=IdNKhQv9yYZ_op0rdBacrFaFVmiiYQ3JTzXzxOTsEVQ,176
- judgeval/data/datasets/dataset.py,sha256=oU9hvZTifK2x8em3FhL3oIqgHOByfJWH6C_9rIKnL5g,12773
- judgeval/data/datasets/eval_dataset_client.py,sha256=3RBfkaMrkudjnmY_qFwY4I-2mOPE3XK4WxkfSweLB-Q,15016
+ judgeval/data/datasets/dataset.py,sha256=pq9-A1mg2Brpjg1TufDU_eLo9sQhX0nw-UTGaf3jCXA,12952
+ judgeval/data/datasets/eval_dataset_client.py,sha256=LJ1bf1sZAC4ZBCRTQ1Y4VrJuNSslYBQ1y9YKuhYxwqY,15176
  judgeval/integrations/langgraph.py,sha256=L9zPPWVLGL2HWuwHPqM5Kic4S7EfQ_Y1Y3YKBJNfGCA,23004
  judgeval/judges/__init__.py,sha256=6X7VSwrwsdxGBNxCyapVRWGghhKOy3MVxFNMQ62kCXM,308
  judgeval/judges/base_judge.py,sha256=ch_S7uBB7lyv44Lf1d7mIGFpveOO58zOkkpImKgd9_4,994
@@ -33,8 +33,8 @@ judgeval/judges/utils.py,sha256=vL-15_udU94JHUAiyrAvHAKMj6Fqypg01ek4YH5zVCM,2687
  judgeval/scorers/__init__.py,sha256=VKPveyGCv5Rc0YtuT7iAxSv-M5EuikqAVeaGNnYMuWE,1340
  judgeval/scorers/api_scorer.py,sha256=NQ_CrrUPhSUk1k2Q8rKpCG_TU2FT32sFEqvb-Yi54B0,2688
  judgeval/scorers/exceptions.py,sha256=eGW5CuJgZ5YJBFrE4FHDSF651PO1dKAZ379mJ8gOsfo,178
- judgeval/scorers/judgeval_scorer.py,sha256=id4s72vn4pWKbjZDnWKHGlc4kqyUkrFqdlX1SgyDj6c,7027
- judgeval/scorers/prompt_scorer.py,sha256=9MGSG2OVvX2i2CUZmXg0i3rJHQKMe2dMBdMDcnjp8mg,11845
+ judgeval/scorers/judgeval_scorer.py,sha256=_qtXzl5aa1FH_50kVPnRfiwyCtuXPKyrGU71_3pOrBw,7288
+ judgeval/scorers/prompt_scorer.py,sha256=Uf_QZhytd78cInKZv8wr66Angz5sxLklP5hEEcoabq4,12001
  judgeval/scorers/score.py,sha256=h4eVlbItqG8R0nQgSgeyicYSIraZV9MvV-RRaFu46mg,18762
  judgeval/scorers/utils.py,sha256=iHQVTlIANbmCTXz9kTeSdOytgUZ_T74Re61ajqsk_WQ,6827
  judgeval/scorers/judgeval_scorers/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
@@ -62,7 +62,7 @@ judgeval/scorers/judgeval_scorers/classifiers/text2sql/text2sql_scorer.py,sha256
  judgeval/tracer/__init__.py,sha256=wy3DYpH8U_z0GO_K_gOSkK0tTTD-u5eLDo0T5xIBoAc,147
  judgeval/utils/alerts.py,sha256=O19Xj7DA0YVjl8PWiuH4zfdZeu3yiLVvHfY8ah2wG0g,2759
  judgeval/utils/data_utils.py,sha256=pB4GBWi8XoM2zSR2NlLXH5kqcQ029BVhDxaVKkdmiBY,1860
- judgeval-0.0.40.dist-info/METADATA,sha256=pAFVIDRiMlCOrbfQ0-epidECcUHl_fFuiLPgGnhDJYo,56712
- judgeval-0.0.40.dist-info/WHEEL,sha256=qtCwoSJWgHk21S1Kb4ihdzI2rlJ1ZKaIurTj_ngOhyQ,87
- judgeval-0.0.40.dist-info/licenses/LICENSE.md,sha256=tKmCg7k5QOmxPK19XMfzim04QiQJPmgIm0pAn55IJwk,11352
- judgeval-0.0.40.dist-info/RECORD,,
+ judgeval-0.0.41.dist-info/METADATA,sha256=-sO68MUEmN3s4ji7Vf1gTuPv60R7Ny6bMcuuKlFSSI8,57358
+ judgeval-0.0.41.dist-info/WHEEL,sha256=qtCwoSJWgHk21S1Kb4ihdzI2rlJ1ZKaIurTj_ngOhyQ,87
+ judgeval-0.0.41.dist-info/licenses/LICENSE.md,sha256=tKmCg7k5QOmxPK19XMfzim04QiQJPmgIm0pAn55IJwk,11352
+ judgeval-0.0.41.dist-info/RECORD,,