PyPI - judgeval - Versions diffs - 0.0.39__py3-none-any.whl → 0.0.41__py3-none-any.whl - Mend

judgeval: judgeval-0.0.39-py3-none-any.whl → judgeval-0.0.41-py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (27)

judgeval/clients.py +6 -4
judgeval/common/tracer.py +504 -257
judgeval/common/utils.py +5 -1
judgeval/constants.py +2 -0
judgeval/data/__init__.py +2 -1
judgeval/data/datasets/dataset.py +12 -6
judgeval/data/datasets/eval_dataset_client.py +3 -1
judgeval/data/example.py +7 -7
judgeval/data/tool.py +29 -1
judgeval/data/trace.py +31 -39
judgeval/data/trace_run.py +2 -1
judgeval/evaluation_run.py +4 -7
judgeval/judgment_client.py +34 -7
judgeval/run_evaluation.py +67 -19
judgeval/scorers/__init__.py +4 -1
judgeval/scorers/judgeval_scorer.py +12 -1
judgeval/scorers/judgeval_scorers/api_scorers/__init__.py +4 -0
judgeval/scorers/judgeval_scorers/api_scorers/classifier_scorer.py +124 -0
judgeval/scorers/judgeval_scorers/api_scorers/tool_dependency.py +20 -0
judgeval/scorers/judgeval_scorers/classifiers/text2sql/text2sql_scorer.py +1 -1
judgeval/scorers/prompt_scorer.py +8 -164
judgeval/scorers/score.py +15 -15
judgeval-0.0.41.dist-info/METADATA +1450 -0
{judgeval-0.0.39.dist-info → judgeval-0.0.41.dist-info}/RECORD +26 -24
judgeval-0.0.39.dist-info/METADATA +0 -247
{judgeval-0.0.39.dist-info → judgeval-0.0.41.dist-info}/WHEEL +0 -0
{judgeval-0.0.39.dist-info → judgeval-0.0.41.dist-info}/licenses/LICENSE.md +0 -0

judgeval/common/tracer.py CHANGED

@@ -5,7 +5,6 @@ Tracing system for judgeval that allows for function tracing using decorators.
 import asyncio
 import functools
 import inspect
-import json
 import os
 import site
 import sysconfig
@@ -16,6 +15,7 @@ import uuid
 import warnings
 import contextvars
 import sys
+import json
 from contextlib import contextmanager, asynccontextmanager, AbstractAsyncContextManager, AbstractContextManager # Import context manager bases
 from dataclasses import dataclass, field
 from datetime import datetime
@@ -29,19 +29,16 @@ from typing import (
     Literal,
     Optional,
     Tuple,
-    Type,
-    TypeVar,
     Union,
     AsyncGenerator,
     TypeAlias,
 )
 from rich import print as rprint
-import types # <--- Add this import
+import types
 # Third-party imports
 import requests
-from litellm import cost_per_token
-from pydantic import BaseModel
+from litellm import cost_per_token as _original_cost_per_token
 from rich import print as rprint
 from openai import OpenAI, AsyncOpenAI
 from together import Together, AsyncTogether
@@ -59,12 +56,11 @@ from judgeval.constants import (
     JUDGMENT_TRACES_DELETE_API_URL,
     JUDGMENT_PROJECT_DELETE_API_URL,
 )
-from judgeval.data import Example, Trace, TraceSpan
+from judgeval.data import Example, Trace, TraceSpan, TraceUsage
 from judgeval.scorers import APIJudgmentScorer, JudgevalScorer
 from judgeval.rules import Rule
 from judgeval.evaluation_run import EvaluationRun
-from judgeval.data.result import ScoringResult
-from judgeval.common.utils import validate_api_key
+from judgeval.common.utils import ExcInfo, validate_api_key
 from judgeval.common.exceptions import JudgmentAPIError
 # Standard library imports needed for the new class
@@ -155,9 +151,29 @@ class TraceManagerClient:
             NOTE we save empty traces in order to properly handle async operations; we need something in the DB to associate the async results with
         """
         # Save to Judgment API
+        def fallback_encoder(obj):
+            """
+            Custom JSON encoder fallback.
+            Tries to use obj.__repr__(), then str(obj) if that fails or for a simpler string.
+            You can choose which one you prefer or try them in sequence.
+            """
+            try:
+                # Option 1: Prefer __repr__ for a more detailed representation
+                return repr(obj)
+            except Exception:
+                # Option 2: Fallback to str() if __repr__ fails or if you prefer str()
+                try:
+                    return str(obj)
+                except Exception as e:
+                    # If both fail, you might return a placeholder or re-raise
+                    return f"<Unserializable object of type {type(obj).__name__}: {e}>"
+        serialized_trace_data = json.dumps(trace_data, default=fallback_encoder)
         response = requests.post(
             JUDGMENT_TRACES_SAVE_API_URL,
-            json=trace_data,
+            data=serialized_trace_data,
             headers={
                 "Content-Type": "application/json",
                 "Authorization": f"Bearer {self.judgment_api_key}",
@@ -286,7 +302,7 @@ class TraceClient:
         tracer: Optional["Tracer"],
         trace_id: Optional[str] = None,
         name: str = "default",
-        project_name: str = "default_project",
+        project_name: str = None,
         overwrite: bool = False,
         rules: Optional[List[Rule]] = None,
         enable_monitoring: bool = True,
@@ -296,7 +312,7 @@ class TraceClient:
     ):
         self.name = name
         self.trace_id = trace_id or str(uuid.uuid4())
-        self.project_name = project_name
+        self.project_name = project_name or str(uuid.uuid4())
         self.overwrite = overwrite
         self.tracer = tracer
         self.rules = rules or []
@@ -463,6 +479,7 @@ class TraceClient:
         if current_span_id:
             span = self.span_id_to_span[current_span_id]
             span.evaluation_runs.append(eval_run)
+            span.has_evaluation = True  # Set the has_evaluation flag
         self.evaluation_runs.append(eval_run)
     def add_annotation(self, annotation: TraceAnnotation):
@@ -474,16 +491,47 @@ class TraceClient:
         current_span_id = current_span_var.get()
         if current_span_id:
             span = self.span_id_to_span[current_span_id]
+            # Ignore self parameter
+            if "self" in inputs:
+                del inputs["self"]
             span.inputs = inputs
+    def record_agent_name(self, agent_name: str):
+        current_span_id = current_span_var.get()
+        if current_span_id:
+            span = self.span_id_to_span[current_span_id]
+            span.agent_name = agent_name
+    def record_state_before(self, state: dict):
+        """Records the agent's state before a tool execution on the current span.
+        Args:
+            state: A dictionary representing the agent's state.
+        """
+        current_span_id = current_span_var.get()
+        if current_span_id:
+            span = self.span_id_to_span[current_span_id]
+            span.state_before = state
+    def record_state_after(self, state: dict):
+        """Records the agent's state after a tool execution on the current span.
+        Args:
+            state: A dictionary representing the agent's state.
+        """
+        current_span_id = current_span_var.get()
+        if current_span_id:
+            span = self.span_id_to_span[current_span_id]
+            span.state_after = state
-    async def _update_coroutine_output(self, span: TraceSpan, coroutine: Any):
+    async def _update_coroutine(self, span: TraceSpan, coroutine: Any, field: str):
         """Helper method to update the output of a trace entry once the coroutine completes"""
         try:
             result = await coroutine
-            span.output = result
+            setattr(span, field, result)
             return result
         except Exception as e:
-            span.output = f"Error: {str(e)}"
+            setattr(span, field, f"Error: {str(e)}")
             raise
     def record_output(self, output: Any):
@@ -493,12 +541,30 @@ class TraceClient:
             span.output = "<pending>" if inspect.iscoroutine(output) else output
             if inspect.iscoroutine(output):
-                asyncio.create_task(self._update_coroutine_output(span, output))
+                asyncio.create_task(self._update_coroutine(span, output, "output"))
+            return span # Return the created entry
+        # Removed else block - original didn't have one
+        return None # Return None if no span_id found
+    def record_usage(self, usage: TraceUsage):
+        current_span_id = current_span_var.get()
+        if current_span_id:
+            span = self.span_id_to_span[current_span_id]
+            span.usage = usage
             return span # Return the created entry
         # Removed else block - original didn't have one
         return None # Return None if no span_id found
+    def record_error(self, error: Dict[str, Any]):
+        current_span_id = current_span_var.get()
+        if current_span_id:
+            span = self.span_id_to_span[current_span_id]
+            span.error = error
+            return span
+        return None
     def add_span(self, span: TraceSpan):
         """Add a trace span to this trace context"""
         self.trace_spans.append(span)
@@ -523,133 +589,6 @@ class TraceClient:
         """
         # Calculate total elapsed time
         total_duration = self.get_duration()
-        # Only count tokens for actual LLM API call spans
-        llm_span_names = {"OPENAI_API_CALL", "TOGETHER_API_CALL", "ANTHROPIC_API_CALL", "GOOGLE_API_CALL"}
-        for span in self.trace_spans:
-            span_function_name = span.function # Get function name safely
-            # Check if it's an LLM span AND function name CONTAINS an API call suffix AND output is dict
-            is_llm_span = span.span_type == "llm"
-            has_api_suffix = any(suffix in span_function_name for suffix in llm_span_names)
-            output_is_dict = isinstance(span.output, dict)
-            # --- DEBUG PRINT 1: Check if condition passes ---
-            # if is_llm_entry and has_api_suffix and output_is_dict:
-            # elif is_llm_entry:
-            #      # Print why it failed if it was an LLM entry
-            # # --- END DEBUG ---
-            if is_llm_span and has_api_suffix and output_is_dict:
-                output = span.output
-                usage = output.get("usage", {}) # Gets the 'usage' dict from the 'output' field
-                # --- DEBUG PRINT 2: Check extracted usage ---
-                # --- END DEBUG ---
-                # --- NEW: Extract model_name correctly from nested inputs ---
-                model_name = None
-                span_inputs = span.inputs
-                if span_inputs:
-                    # Try common locations for model name within the inputs structure
-                    invocation_params = span_inputs.get("invocation_params", {})
-                    serialized_data = span_inputs.get("serialized", {})
-                    # Look in invocation_params (often directly contains model)
-                    if isinstance(invocation_params, dict):
-                        model_name = invocation_params.get("model")
-                    # Fallback: Check serialized 'repr' if it contains model info
-                    if not model_name and isinstance(serialized_data, dict):
-                         serialized_repr = serialized_data.get("repr", "")
-                         if "model_name=" in serialized_repr:
-                              try: # Simple parsing attempt
-                                   model_name = serialized_repr.split("model_name='")[1].split("'")[0]
-                              except IndexError: pass # Ignore parsing errors
-                    # Fallback: Check top-level of invocation_params (sometimes passed flat)
-                    if not model_name and isinstance(invocation_params, dict):
-                        model_name = invocation_params.get("model") # Redundant check, but safe
-                    # Fallback: Check top-level of inputs itself (less likely for callbacks)
-                    if not model_name:
-                        model_name = span_inputs.get("model")
-                # --- END NEW ---
-                prompt_tokens = 0
-                completion_tokens = 0
-                # Handle OpenAI/Together format (checks within the 'usage' dict)
-                if "prompt_tokens" in usage:
-                    prompt_tokens = usage.get("prompt_tokens", 0)
-                    completion_tokens = usage.get("completion_tokens", 0)
-                # Handle Anthropic format - MAP values to standard keys
-                elif "input_tokens" in usage:
-                    prompt_tokens = usage.get("input_tokens", 0)       # Get value from input_tokens
-                    completion_tokens = usage.get("output_tokens", 0)    # Get value from output_tokens
-                    # *** Overwrite the usage dict in the entry to use standard keys ***
-                    original_total = usage.get("total_tokens", 0)
-                    original_total_cost = usage.get("total_cost_usd", 0.0) # Preserve if already calculated
-                    # Recalculate cost just in case it wasn't done correctly before
-                    temp_prompt_cost, temp_completion_cost = 0.0, 0.0
-                    if model_name:
-                        try:
-                           temp_prompt_cost, temp_completion_cost = cost_per_token(
-                                model=model_name,
-                                prompt_tokens=prompt_tokens,
-                                completion_tokens=completion_tokens
-                           )
-                        except Exception:
-                           pass # Ignore cost calculation errors here, focus on keys
-                    # Replace the usage dict with one using standard keys but Anthropic values
-                    output["usage"] = {
-                        "prompt_tokens": prompt_tokens,
-                        "completion_tokens": completion_tokens,
-                        "total_tokens": original_total,
-                        "prompt_tokens_cost_usd": temp_prompt_cost, # Use standard cost key
-                        "completion_tokens_cost_usd": temp_completion_cost, # Use standard cost key
-                        "total_cost_usd": original_total_cost if original_total_cost > 0 else (temp_prompt_cost + temp_completion_cost)
-                    }
-                    usage = output["usage"]
-                # Calculate costs if model name is available and ensure they are stored with standard keys
-                prompt_tokens = usage.get("prompt_tokens", 0)
-                completion_tokens = usage.get("completion_tokens", 0)
-                # Calculate costs if model name is available
-                if model_name:
-                    try:
-                        # Recalculate costs based on potentially mapped tokens
-                        prompt_cost, completion_cost = cost_per_token(
-                            model=model_name,
-                            prompt_tokens=prompt_tokens,
-                            completion_tokens=completion_tokens
-                        )
-                        # Add cost information directly to the usage dictionary in the condensed entry
-                        # Ensure 'usage' exists in the output dict before modifying it
-                        # Add/Update cost information using standard keys
-                        if "usage" not in output:
-                            output["usage"] = {} # Initialize if missing
-                        elif not isinstance(output["usage"], dict): # Handle cases where 'usage' might not be a dict (e.g., placeholder string)
-                            print(f"[WARN TraceClient.save] Output 'usage' for span {span.span_id} was not a dict ({type(output['usage'])}). Resetting before adding costs.")
-                            output["usage"] = {} # Reset to dict
-                        output["usage"]["prompt_tokens_cost_usd"] = prompt_cost
-                        output["usage"]["completion_tokens_cost_usd"] = completion_cost
-                        output["usage"]["total_cost_usd"] = prompt_cost + completion_cost
-                    except Exception as e:
-                        # If cost calculation fails, continue without adding costs
-                        print(f"Error calculating cost for model '{model_name}' (span: {span.span_id}): {str(e)}")
-                        pass
-                else:
-                     print(f"[WARN TraceClient.save] Could not determine model name for cost calculation (span: {span.span_id}). Inputs: {span_inputs}")
         # Create trace document - Always use standard keys for top-level counts
         trace_data = {
             "trace_id": self.trace_id,
@@ -657,7 +596,7 @@ class TraceClient:
             "project_name": self.project_name,
             "created_at": datetime.utcfromtimestamp(self.start_time).isoformat(),
             "duration": total_duration,
-            "entries": [span.model_dump() for span in self.trace_spans],
+            "trace_spans": [span.model_dump() for span in self.trace_spans],
             "evaluation_runs": [run.model_dump() for run in self.evaluation_runs],
             "overwrite": overwrite,
             "offline_mode": self.tracer.offline_mode,
@@ -677,13 +616,46 @@ class TraceClient:
     def delete(self):
         return self.trace_manager_client.delete_trace(self.trace_id)
+def _capture_exception_for_trace(current_trace: Optional['TraceClient'], exc_info: ExcInfo):
+    if not current_trace:
+        return
+    exc_type, exc_value, exc_traceback_obj = exc_info
+    formatted_exception = {
+        "type": exc_type.__name__ if exc_type else "UnknownExceptionType",
+        "message": str(exc_value) if exc_value else "No exception message",
+        "traceback": traceback.format_tb(exc_traceback_obj) if exc_traceback_obj else []
+    }
+    # This is where we specially handle exceptions that we might want to collect additional data for.
+    # When we do this, always try checking the module from sys.modules instead of importing. This will
+    # Let us support a wider range of exceptions without needing to import them for all clients.
+    # Most clients (requests, httpx, urllib) support the standard format of exposing error.request.url and error.response.status_code
+    # The alternative is to hand select libraries we want from sys.modules and check for them:
+    # As an example:  requests_module = sys.modules.get("requests", None) // then do things with requests_module;
+     # General HTTP Like errors
+    try:
+        url = getattr(getattr(exc_value, "request", None), "url", None)
+        status_code = getattr(getattr(exc_value, "response", None), "status_code", None)
+        if status_code:
+            formatted_exception["http"] = {
+                "url": url if url else "Unknown URL",
+                "status_code": status_code if status_code else None,
+            }
+    except Exception as e:
+        pass
+    current_trace.record_error(formatted_exception)
 class _DeepTracer:
     _instance: Optional["_DeepTracer"] = None
     _lock: threading.Lock = threading.Lock()
     _refcount: int = 0
     _span_stack: contextvars.ContextVar[List[Dict[str, Any]]] = contextvars.ContextVar("_deep_profiler_span_stack", default=[])
     _skip_stack: contextvars.ContextVar[List[str]] = contextvars.ContextVar("_deep_profiler_skip_stack", default=[])
+    _original_sys_trace: Optional[Callable] = None
+    _original_threading_trace: Optional[Callable] = None
     def _get_qual_name(self, frame) -> str:
         func_name = frame.f_code.co_name
@@ -731,12 +703,53 @@ class _DeepTracer:
     @functools.cache
     def _is_user_code(self, filename: str):
         return bool(filename) and not filename.startswith("<") and not os.path.realpath(filename).startswith(_TRACE_FILEPATH_BLOCKLIST)
+    def _cooperative_sys_trace(self, frame: types.FrameType, event: str, arg: Any):
+        """Cooperative trace function for sys.settrace that chains with existing tracers."""
+        # First, call the original sys trace function if it exists
+        original_result = None
+        if self._original_sys_trace:
+            try:
+                original_result = self._original_sys_trace(frame, event, arg)
+            except Exception:
+                # If the original tracer fails, continue with our tracing
+                pass
+        # Then do our own tracing
+        our_result = self._trace(frame, event, arg, self._cooperative_sys_trace)
+        # Return our tracer to continue tracing, but respect the original's decision
+        # If the original tracer returned None (stop tracing), we should respect that
+        if original_result is None and self._original_sys_trace:
+            return None
+        return our_result or original_result
-    def _trace(self, frame: types.FrameType, event: str, arg: Any):
+    def _cooperative_threading_trace(self, frame: types.FrameType, event: str, arg: Any):
+        """Cooperative trace function for threading.settrace that chains with existing tracers."""
+        # First, call the original threading trace function if it exists
+        original_result = None
+        if self._original_threading_trace:
+            try:
+                original_result = self._original_threading_trace(frame, event, arg)
+            except Exception:
+                # If the original tracer fails, continue with our tracing
+                pass
+        # Then do our own tracing
+        our_result = self._trace(frame, event, arg, self._cooperative_threading_trace)
+        # Return our tracer to continue tracing, but respect the original's decision
+        # If the original tracer returned None (stop tracing), we should respect that
+        if original_result is None and self._original_threading_trace:
+            return None
+        return our_result or original_result
+    def _trace(self, frame: types.FrameType, event: str, arg: Any, continuation_func: Callable):
         frame.f_trace_lines = False
         frame.f_trace_opcodes = False
         if not self._should_trace(frame):
             return
@@ -752,6 +765,12 @@ class _DeepTracer:
             return
         qual_name = self._get_qual_name(frame)
+        instance_name = None
+        if 'self' in frame.f_locals:
+            instance = frame.f_locals['self']
+            class_name = instance.__class__.__name__
+            class_identifiers = getattr(Tracer._instance, 'class_identifiers', {})
+            instance_name = get_instance_prefixed_name(instance, class_name, class_identifiers)
         skip_stack = self._skip_stack.get()
         if event == "call":
@@ -814,7 +833,8 @@ class _DeepTracer:
                 created_at=start_time,
                 span_type="span",
                 parent_span_id=parent_span_id,
-                function=qual_name
+                function=qual_name,
+                agent_name=instance_name
             )
             current_trace.add_span(span)
@@ -869,35 +889,40 @@ class _DeepTracer:
                 current_span_var.reset(frame.f_locals["_judgment_span_token"])
         elif event == "exception":
-            exc_type, exc_value, exc_traceback = arg
-            formatted_exception = {
-                "type": exc_type.__name__,
-                "message": str(exc_value),
-                "traceback": traceback.format_tb(exc_traceback)
-            }
-            current_trace = current_trace_var.get()
-            current_trace.record_output({
-                "error": formatted_exception
-            })
+            exc_type = arg[0]
+            if issubclass(exc_type, (StopIteration, StopAsyncIteration, GeneratorExit)):
+                return
+            _capture_exception_for_trace(current_trace, arg)
-        return self._trace
+        return continuation_func
     def __enter__(self):
         with self._lock:
             self._refcount += 1
             if self._refcount == 1:
+                # Store the existing trace functions before setting ours
+                self._original_sys_trace = sys.gettrace()
+                self._original_threading_trace = threading.gettrace()
                 self._skip_stack.set([])
                 self._span_stack.set([])
-                sys.settrace(self._trace)
-                threading.settrace(self._trace)
+                sys.settrace(self._cooperative_sys_trace)
+                threading.settrace(self._cooperative_threading_trace)
         return self
     def __exit__(self, exc_type, exc_val, exc_tb):
         with self._lock:
             self._refcount -= 1
             if self._refcount == 0:
-                sys.settrace(None)
-                threading.settrace(None)
+                # Restore the original trace functions instead of setting to None
+                sys.settrace(self._original_sys_trace)
+                threading.settrace(self._original_threading_trace)
+                # Clean up the references
+                self._original_sys_trace = None
+                self._original_threading_trace = None
 def log(self, message: str, level: str = "info"):
@@ -920,7 +945,7 @@ class Tracer:
     def __init__(
         self,
         api_key: str = os.getenv("JUDGMENT_API_KEY"),
-        project_name: str = "default_project",
+        project_name: str = None,
         rules: Optional[List[Rule]] = None,  # Added rules parameter
         organization_id: str = os.getenv("JUDGMENT_ORG_ID"),
         enable_monitoring: bool = os.getenv("JUDGMENT_MONITORING", "true").lower() == "true",
@@ -946,13 +971,9 @@ class Tracer:
                 raise ValueError("Tracer must be configured with an Organization ID")
             if use_s3 and not s3_bucket_name:
                 raise ValueError("S3 bucket name must be provided when use_s3 is True")
-            if use_s3 and not (s3_aws_access_key_id or os.getenv("AWS_ACCESS_KEY_ID")):
-                raise ValueError("AWS Access Key ID must be provided when use_s3 is True")
-            if use_s3 and not (s3_aws_secret_access_key or os.getenv("AWS_SECRET_ACCESS_KEY")):
-                raise ValueError("AWS Secret Access Key must be provided when use_s3 is True")
             self.api_key: str = api_key
-            self.project_name: str = project_name
+            self.project_name: str = project_name or str(uuid.uuid4())
             self.organization_id: str = organization_id
             self._current_trace: Optional[str] = None
             self._active_trace_client: Optional[TraceClient] = None # Add active trace client attribute
@@ -961,6 +982,7 @@ class Tracer:
             self.initialized: bool = True
             self.enable_monitoring: bool = enable_monitoring
             self.enable_evaluations: bool = enable_evaluations
+            self.class_identifiers: Dict[str, str] = {}  # Dictionary to store class identifiers
             # Initialize S3 storage if enabled
             self.use_s3 = use_s3
@@ -1084,6 +1106,92 @@ class Tracer:
         rprint(f"[bold]{label}:[/bold] {msg}")
+    def identify(self, identifier: str, track_state: bool = False, track_attributes: Optional[List[str]] = None, field_mappings: Optional[Dict[str, str]] = None):
+        """
+        Class decorator that associates a class with a custom identifier and enables state tracking.
+        This decorator creates a mapping between the class name and the provided
+        identifier, which can be useful for tagging, grouping, or referencing
+        classes in a standardized way. It also enables automatic state capture
+        for instances of the decorated class when used with tracing.
+        Args:
+            identifier: The identifier to associate with the decorated class.
+                    This will be used as the instance name in traces.
+            track_state: Whether to automatically capture the state (attributes)
+                        of instances before and after function execution. Defaults to False.
+            track_attributes: Optional list of specific attribute names to track.
+                            If None, all non-private attributes (not starting with '_')
+                            will be tracked when track_state=True.
+            field_mappings: Optional dictionary mapping internal attribute names to
+                        display names in the captured state. For example:
+                        {"system_prompt": "instructions"} will capture the
+                        'instructions' attribute as 'system_prompt' in the state.
+        Example:
+            @tracer.identify(identifier="user_model", track_state=True, track_attributes=["name", "age"], field_mappings={"system_prompt": "instructions"})
+            class User:
+                # Class implementation
+        """
+        def decorator(cls):
+            class_name = cls.__name__
+            self.class_identifiers[class_name] = {
+                "identifier": identifier,
+                "track_state": track_state,
+                "track_attributes": track_attributes,
+                "field_mappings": field_mappings or {}
+            }
+            return cls
+        return decorator
+    def _capture_instance_state(self, instance: Any, class_config: Dict[str, Any]) -> Dict[str, Any]:
+        """
+        Capture the state of an instance based on class configuration.
+        Args:
+            instance: The instance to capture the state of.
+            class_config: Configuration dictionary for state capture,
+                          expected to contain 'track_attributes' and 'field_mappings'.
+        """
+        track_attributes = class_config.get('track_attributes')
+        field_mappings = class_config.get('field_mappings')
+        if track_attributes:
+            state = {attr: getattr(instance, attr, None) for attr in track_attributes}
+        else:
+            state = {k: v for k, v in instance.__dict__.items() if not k.startswith('_')}
+        if field_mappings:
+            state['field_mappings'] = field_mappings
+        return state
+    def _get_instance_state_if_tracked(self, args):
+        """
+        Extract instance state if the instance should be tracked.
+        Returns the captured state dict if tracking is enabled, None otherwise.
+        """
+        if args and hasattr(args[0], '__class__'):
+            instance = args[0]
+            class_name = instance.__class__.__name__
+            if (class_name in self.class_identifiers and
+                isinstance(self.class_identifiers[class_name], dict) and
+                self.class_identifiers[class_name].get('track_state', False)):
+                return self._capture_instance_state(instance, self.class_identifiers[class_name])
+    def _conditionally_capture_and_record_state(self, trace_client_instance: TraceClient, args: tuple, is_before: bool):
+        """Captures instance state if tracked and records it via the trace_client."""
+        state = self._get_instance_state_if_tracked(args)
+        if state:
+            if is_before:
+                trace_client_instance.record_state_before(state)
+            else:
+                trace_client_instance.record_state_after(state)
     def observe(self, func=None, *, name=None, span_type: SpanType = "span", project_name: str = None, overwrite: bool = False, deep_tracing: bool = None):
         """
         Decorator to trace function execution with detailed entry/exit information.
@@ -1106,10 +1214,10 @@ class Tracer:
                                          overwrite=overwrite, deep_tracing=deep_tracing)
         # Use provided name or fall back to function name
-        span_name = name or func.__name__
+        original_span_name = name or func.__name__
         # Store custom attributes on the function object
-        func._judgment_span_name = span_name
+        func._judgment_span_name = original_span_name
         func._judgment_span_type = span_type
         # Use the provided deep_tracing value or fall back to the tracer's default
@@ -1118,6 +1226,16 @@ class Tracer:
         if asyncio.iscoroutinefunction(func):
             @functools.wraps(func)
             async def async_wrapper(*args, **kwargs):
+                nonlocal original_span_name
+                class_name = None
+                instance_name = None
+                span_name = original_span_name
+                agent_name = None
+                if args and hasattr(args[0], '__class__'):
+                    class_name = args[0].__class__.__name__
+                    agent_name = get_instance_prefixed_name(args[0], class_name, self.class_identifiers)
                 # Get current trace from context
                 current_trace = current_trace_var.get()
@@ -1141,7 +1259,7 @@ class Tracer:
                     # Save empty trace and set trace context
                     # current_trace.save(empty_save=True, overwrite=overwrite)
                     trace_token = current_trace_var.set(current_trace)
                     try:
                         # Use span for the function execution within the root trace
                         # This sets the current_span_var
@@ -1149,12 +1267,24 @@ class Tracer:
                             # Record inputs
                             inputs = combine_args_kwargs(func, args, kwargs)
                             span.record_input(inputs)
+                            if agent_name:
+                                span.record_agent_name(agent_name)
+                            # Capture state before execution
+                            self._conditionally_capture_and_record_state(span, args, is_before=True)
                             if use_deep_tracing:
                                 with _DeepTracer():
                                     result = await func(*args, **kwargs)
                             else:
-                                result = await func(*args, **kwargs)
+                                try:
+                                    result = await func(*args, **kwargs)
+                                except Exception as e:
+                                    _capture_exception_for_trace(current_trace, sys.exc_info())
+                                    raise e
+                            # Capture state after execution
+                            self._conditionally_capture_and_record_state(span, args, is_before=False)
                             # Record output
                             span.record_output(result)
@@ -1170,12 +1300,24 @@ class Tracer:
                     with current_trace.span(span_name, span_type=span_type) as span:
                         inputs = combine_args_kwargs(func, args, kwargs)
                         span.record_input(inputs)
+                        if agent_name:
+                            span.record_agent_name(agent_name)
+                        # Capture state before execution
+                        self._conditionally_capture_and_record_state(span, args, is_before=True)
                         if use_deep_tracing:
                             with _DeepTracer():
                                 result = await func(*args, **kwargs)
                         else:
-                            result = await func(*args, **kwargs)
+                            try:
+                                result = await func(*args, **kwargs)
+                            except Exception as e:
+                                _capture_exception_for_trace(current_trace, sys.exc_info())
+                                raise e
+                        # Capture state after execution
+                        self._conditionally_capture_and_record_state(span, args, is_before=False)
                         span.record_output(result)
                     return result
@@ -1184,7 +1326,15 @@ class Tracer:
         else:
             # Non-async function implementation with deep tracing
             @functools.wraps(func)
-            def wrapper(*args, **kwargs):
+            def wrapper(*args, **kwargs):
+                nonlocal original_span_name
+                class_name = None
+                instance_name = None
+                span_name = original_span_name
+                agent_name = None
+                if args and hasattr(args[0], '__class__'):
+                    class_name = args[0].__class__.__name__
+                    agent_name = get_instance_prefixed_name(args[0], class_name, self.class_identifiers)
                 # Get current trace from context
                 current_trace = current_trace_var.get()
@@ -1216,12 +1366,24 @@ class Tracer:
                             # Record inputs
                             inputs = combine_args_kwargs(func, args, kwargs)
                             span.record_input(inputs)
+                            if agent_name:
+                                span.record_agent_name(agent_name)
+                            # Capture state before execution
+                            self._conditionally_capture_and_record_state(span, args, is_before=True)
                             if use_deep_tracing:
                                 with _DeepTracer():
                                     result = func(*args, **kwargs)
                             else:
-                                result = func(*args, **kwargs)
+                                try:
+                                    result = func(*args, **kwargs)
+                                except Exception as e:
+                                    _capture_exception_for_trace(current_trace, sys.exc_info())
+                                    raise e
+                            # Capture state after execution
+                            self._conditionally_capture_and_record_state(span, args, is_before=False)
                             # Record output
                             span.record_output(result)
@@ -1238,12 +1400,24 @@ class Tracer:
                         inputs = combine_args_kwargs(func, args, kwargs)
                         span.record_input(inputs)
+                        if agent_name:
+                            span.record_agent_name(agent_name)
+                        # Capture state before execution
+                        self._conditionally_capture_and_record_state(span, args, is_before=True)
                         if use_deep_tracing:
                             with _DeepTracer():
                                 result = func(*args, **kwargs)
                         else:
-                            result = func(*args, **kwargs)
+                            try:
+                                result = func(*args, **kwargs)
+                            except Exception as e:
+                                _capture_exception_for_trace(current_trace, sys.exc_info())
+                                raise e
+                        # Capture state after execution
+                        self._conditionally_capture_and_record_state(span, args, is_before=False)
                         span.record_output(result)
                     return result
@@ -1313,17 +1487,11 @@ def wrap(client: Any) -> Any:
             return wrapper_func(response, client, output_entry)
         else:
             format_func = _format_response_output_data if is_responses else _format_output_data
-            output_data = format_func(client, response)
-            span.record_output(output_data)
+            output, usage = format_func(client, response)
+            span.record_output(output)
+            span.record_usage(usage)
             return response
-    def _handle_error(span, e, is_async):
-        """Handle and record errors"""
-        call_type = "async" if is_async else "sync"
-        print(f"Error during wrapped {call_type} API call ({span_name}): {e}")
-        span.record_output({"error": str(e)})
-        raise
     # --- Traced Async Functions ---
     async def traced_create_async(*args, **kwargs):
         current_trace = current_trace_var.get()
@@ -1337,7 +1505,8 @@ def wrap(client: Any) -> Any:
                 response_or_iterator = await original_create(*args, **kwargs)
                 return _format_and_record_output(span, response_or_iterator, is_streaming, True, False)
             except Exception as e:
-                return _handle_error(span, e, True)
+                _capture_exception_for_trace(span, sys.exc_info())
+                raise e
     # Async responses for OpenAI clients
     async def traced_response_create_async(*args, **kwargs):
@@ -1352,7 +1521,8 @@ def wrap(client: Any) -> Any:
                 response_or_iterator = await original_responses_create(*args, **kwargs)
                 return _format_and_record_output(span, response_or_iterator, is_streaming, True, True)
             except Exception as e:
-                return _handle_error(span, e, True)
+                _capture_exception_for_trace(span, sys.exc_info())
+                raise e
     # Function replacing .stream() for async clients
     def traced_stream_async(*args, **kwargs):
@@ -1383,7 +1553,8 @@ def wrap(client: Any) -> Any:
                 response_or_iterator = original_create(*args, **kwargs)
                 return _format_and_record_output(span, response_or_iterator, is_streaming, False, False)
             except Exception as e:
-                return _handle_error(span, e, False)
+                _capture_exception_for_trace(span, sys.exc_info())
+                raise e
     def traced_response_create_sync(*args, **kwargs):
         current_trace = current_trace_var.get()
@@ -1397,7 +1568,8 @@ def wrap(client: Any) -> Any:
                 response_or_iterator = original_responses_create(*args, **kwargs)
                 return _format_and_record_output(span, response_or_iterator, is_streaming, False, True)
             except Exception as e:
-                return _handle_error(span, e, False)
+                _capture_exception_for_trace(span, sys.exc_info())
+                raise e
     # Function replacing sync .stream()
     def traced_stream_sync(*args, **kwargs):
@@ -1496,18 +1668,35 @@ def _format_response_output_data(client: ApiClient, response: Any) -> dict:
     Normalizes different response formats into a consistent structure
     for tracing purposes.
     """
+    message_content = None
+    prompt_tokens = 0
+    completion_tokens = 0
+    model_name = None
     if isinstance(client, (OpenAI, Together, AsyncOpenAI, AsyncTogether)):
-        return {
-            "content": response.output,
-            "usage": {
-                "prompt_tokens": response.usage.input_tokens,
-                "completion_tokens": response.usage.output_tokens,
-                "total_tokens": response.usage.total_tokens
-            }
-        }
+        model_name = response.model
+        prompt_tokens = response.usage.input_tokens
+        completion_tokens = response.usage.output_tokens
+        message_content = response.output
     else:
         warnings.warn(f"Unsupported client type: {type(client)}")
         return {}
+    prompt_cost, completion_cost = cost_per_token(
+        model=model_name,
+        prompt_tokens=prompt_tokens,
+        completion_tokens=completion_tokens,
+    )
+    total_cost_usd = (prompt_cost + completion_cost) if prompt_cost and completion_cost else None
+    usage = TraceUsage(
+        prompt_tokens=prompt_tokens,
+        completion_tokens=completion_tokens,
+        total_tokens=prompt_tokens + completion_tokens,
+        prompt_tokens_cost_usd=prompt_cost,
+        completion_tokens_cost_usd=completion_cost,
+        total_cost_usd=total_cost_usd,
+        model_name=model_name
+    )
+    return message_content, usage
 def _format_output_data(client: ApiClient, response: Any) -> dict:
@@ -1521,33 +1710,46 @@ def _format_output_data(client: ApiClient, response: Any) -> dict:
             - content: The generated text
             - usage: Token usage statistics
     """
+    prompt_tokens = 0
+    completion_tokens = 0
+    model_name = None
+    message_content = None
     if isinstance(client, (OpenAI, Together, AsyncOpenAI, AsyncTogether)):
-        return {
-            "content": response.choices[0].message.content,
-            "usage": {
-                "prompt_tokens": response.usage.prompt_tokens,
-                "completion_tokens": response.usage.completion_tokens,
-                "total_tokens": response.usage.total_tokens
-            }
-        }
+        model_name = response.model
+        prompt_tokens = response.usage.prompt_tokens
+        completion_tokens = response.usage.completion_tokens
+        message_content = response.choices[0].message.content
     elif isinstance(client, (genai.Client, genai.client.AsyncClient)):
-        return {
-            "content": response.candidates[0].content.parts[0].text,
-            "usage": {
-                "prompt_tokens": response.usage_metadata.prompt_token_count,
-                "completion_tokens": response.usage_metadata.candidates_token_count,
-                "total_tokens": response.usage_metadata.total_token_count
-            }
-        }
-    # Anthropic has a different response structure
-    return {
-        "content": response.content[0].text,
-        "usage": {
-            "prompt_tokens": response.usage.input_tokens,
-            "completion_tokens": response.usage.output_tokens,
-            "total_tokens": response.usage.input_tokens + response.usage.output_tokens
-        }
-    }
+        model_name = response.model_version
+        prompt_tokens = response.usage_metadata.prompt_token_count
+        completion_tokens = response.usage_metadata.candidates_token_count
+        message_content = response.candidates[0].content.parts[0].text
+    elif isinstance(client, (Anthropic, AsyncAnthropic)):
+        model_name = response.model
+        prompt_tokens = response.usage.input_tokens
+        completion_tokens = response.usage.output_tokens
+        message_content = response.content[0].text
+    else:
+        warnings.warn(f"Unsupported client type: {type(client)}")
+        return None, None
+    prompt_cost, completion_cost = cost_per_token(
+        model=model_name,
+        prompt_tokens=prompt_tokens,
+        completion_tokens=completion_tokens,
+    )
+    total_cost_usd = (prompt_cost + completion_cost) if prompt_cost and completion_cost else None
+    usage = TraceUsage(
+        prompt_tokens=prompt_tokens,
+        completion_tokens=completion_tokens,
+        total_tokens=prompt_tokens + completion_tokens,
+        prompt_tokens_cost_usd=prompt_cost,
+        completion_tokens_cost_usd=completion_cost,
+        total_cost_usd=total_cost_usd,
+        model_name=model_name
+    )
+    return message_content, usage
 def combine_args_kwargs(func, args, kwargs):
     """
@@ -1653,21 +1855,30 @@ def _extract_usage_from_final_chunk(client: ApiClient, chunk: Any) -> Optional[D
         # OpenAI/Together include usage in the *last* chunk's `usage` attribute if available
         # This typically requires specific API versions or settings. Often usage is *not* streamed.
         if isinstance(client, (OpenAI, Together, AsyncOpenAI, AsyncTogether)):
-             # Check if usage is directly on the chunk (some models might do this)
-             if hasattr(chunk, 'usage') and chunk.usage:
-                 return {
-                     "prompt_tokens": chunk.usage.prompt_tokens,
-                     "completion_tokens": chunk.usage.completion_tokens,
-                     "total_tokens": chunk.usage.total_tokens
-                 }
-             # Check if usage is nested within choices (less common for final chunk, but check)
-             elif chunk.choices and hasattr(chunk.choices[0], 'usage') and chunk.choices[0].usage:
-                 usage = chunk.choices[0].usage
-                 return {
-                      "prompt_tokens": usage.prompt_tokens,
-                      "completion_tokens": usage.completion_tokens,
-                      "total_tokens": usage.total_tokens
-                  }
+            # Check if usage is directly on the chunk (some models might do this)
+            if hasattr(chunk, 'usage') and chunk.usage:
+                prompt_tokens = chunk.usage.prompt_tokens
+                completion_tokens = chunk.usage.completion_tokens
+            # Check if usage is nested within choices (less common for final chunk, but check)
+            elif chunk.choices and hasattr(chunk.choices[0], 'usage') and chunk.choices[0].usage:
+                prompt_tokens = chunk.choices[0].usage.prompt_tokens
+                completion_tokens = chunk.choices[0].usage.completion_tokens
+            prompt_cost, completion_cost = cost_per_token(
+                    model=chunk.model,
+                    prompt_tokens=prompt_tokens,
+                    completion_tokens=completion_tokens,
+                )
+            total_cost_usd = (prompt_cost + completion_cost) if prompt_cost and completion_cost else None
+            return TraceUsage(
+                prompt_tokens=chunk.usage.prompt_tokens,
+                completion_tokens=chunk.usage.completion_tokens,
+                total_tokens=chunk.usage.total_tokens,
+                prompt_tokens_cost_usd=prompt_cost,
+                completion_tokens_cost_usd=completion_cost,
+                total_cost_usd=total_cost_usd,
+                model_name=chunk.model
+            )
              # Anthropic includes usage in the 'message_stop' event type
         elif isinstance(client, (Anthropic, AsyncAnthropic)):
             if chunk.type == "message_stop":
@@ -1715,11 +1926,8 @@ def _sync_stream_wrapper(
             final_usage = _extract_usage_from_final_chunk(client, last_chunk)
         # Update the trace entry with the accumulated content and usage
-        span.output = {
-            "content": "".join(content_parts),  # Join list at the end
-            "usage": final_usage if final_usage else {"info": "Usage data not available in stream."}, # Provide placeholder if None
-            "streamed": True
-        }
+        span.output = "".join(content_parts)
+        span.usage = final_usage
         # Note: We might need to adjust _serialize_output if this dict causes issues,
         # but Pydantic's model_dump should handle dicts.
@@ -1739,6 +1947,7 @@ async def _async_stream_wrapper(
     target_span_id = span.span_id
     try:
+        model_name = ""
         async for chunk in original_stream:
             # Check for OpenAI's final usage chunk
             if isinstance(client, (AsyncOpenAI, OpenAI)) and hasattr(chunk, 'usage') and chunk.usage is not None:
@@ -1747,16 +1956,18 @@ async def _async_stream_wrapper(
                     "completion_tokens": chunk.usage.completion_tokens,
                     "total_tokens": chunk.usage.total_tokens
                 }
+                model_name = chunk.model
                 yield chunk
                 continue
             if isinstance(client, (AsyncAnthropic, Anthropic)) and hasattr(chunk, 'type'):
-                 if chunk.type == "message_start":
-                     if hasattr(chunk, 'message') and hasattr(chunk.message, 'usage') and hasattr(chunk.message.usage, 'input_tokens'):
+                if chunk.type == "message_start":
+                    if hasattr(chunk, 'message') and hasattr(chunk.message, 'usage') and hasattr(chunk.message.usage, 'input_tokens'):
                          anthropic_input_tokens = chunk.message.usage.input_tokens
-                 elif chunk.type == "message_delta":
-                     if hasattr(chunk, 'usage') and hasattr(chunk.usage, 'output_tokens'):
-                         anthropic_output_tokens += chunk.usage.output_tokens
+                         model_name = chunk.message.model
+                elif chunk.type == "message_delta":
+                    if hasattr(chunk, 'usage') and hasattr(chunk.usage, 'output_tokens'):
+                        anthropic_output_tokens = chunk.usage.output_tokens
             content_part = _extract_content_from_chunk(client, chunk)
             if content_part:
@@ -1779,18 +1990,37 @@ async def _async_stream_wrapper(
         elif anthropic_final_usage:
              usage_info = anthropic_final_usage
         elif last_content_chunk:
-             usage_info = _extract_usage_from_final_chunk(client, last_content_chunk)
+            usage_info = _extract_usage_from_final_chunk(client, last_content_chunk)
+        if usage_info and not isinstance(usage_info, TraceUsage):
+            prompt_cost, completion_cost = cost_per_token(
+                model=model_name,
+                prompt_tokens=usage_info["prompt_tokens"],
+                completion_tokens=usage_info["completion_tokens"],
+            )
+            usage_info = TraceUsage(
+                prompt_tokens=usage_info["prompt_tokens"],
+                completion_tokens=usage_info["completion_tokens"],
+                total_tokens=usage_info["total_tokens"],
+                prompt_tokens_cost_usd=prompt_cost,
+                completion_tokens_cost_usd=completion_cost,
+                total_cost_usd=prompt_cost + completion_cost,
+                model_name=model_name
+            )
         if span and hasattr(span, 'output'):
-            span.output = {
-                "content": "".join(content_parts),  # Join list at the end
-                "usage": usage_info if usage_info else {"info": "Usage data not available in stream."},
-                "streamed": True
-            }
+            span.output = ''.join(content_parts)
+            span.usage = usage_info
             start_ts = getattr(span, 'created_at', time.time())
             span.duration = time.time() - start_ts
         # else: # Handle error case if necessary, but remove debug print
+def cost_per_token(*args, **kwargs):
+    """
+    Best-effort wrapper around ``_original_cost_per_token``.
+    All positional and keyword arguments are forwarded unchanged, and the
+    wrapped function's result is returned as-is. Call sites in this file
+    unpack that result as ``prompt_cost, completion_cost``.
+    Returns:
+        The wrapped function's return value, or ``(None, None)`` if it raised
+        any ``Exception`` (a warning is emitted instead of propagating).
+    """
+    try:
+        return _original_cost_per_token(*args, **kwargs)
+    except Exception as e:
+        # Cost calculation must never break tracing: warn and fall back.
+        warnings.warn(f"Error calculating cost per token: {e}")
+        # NOTE(review): callers that add the two costs without a None check
+        # (e.g. the async stream wrapper) would raise TypeError on this fallback.
+        return None, None
 class _BaseStreamManagerWrapper:
     def __init__(self, original_manager, client, span_name, trace_client, stream_wrapper_func, input_kwargs):
         self._original_manager = original_manager
@@ -1872,3 +2102,20 @@ class _TracedSyncStreamManagerWrapper(_BaseStreamManagerWrapper, AbstractContext
             current_span_var.reset(self._span_context_token)
             delattr(self, '_span_context_token')
         return self._original_manager.__exit__(exc_type, exc_val, exc_tb)
+# --- Helper function for instance-prefixed qual_name ---
+def get_instance_prefixed_name(instance, class_name, class_identifiers):
+    """
+    Returns the agent name (prefix) if the class and attribute are found in class_identifiers.
+    Args:
+        instance: Object whose identifying attribute is read.
+        class_name: Key used to look up the class entry in class_identifiers.
+        class_identifiers: Mapping of class name to a config that is indexed
+            with ``'identifier'`` (assumes every entry has that key -- a
+            missing key would surface as the lookup's own error; TODO confirm).
+    Returns:
+        The value of the attribute named by the entry's ``'identifier'``, or
+        None when class_name is not present in class_identifiers.
+    Raises:
+        Exception: If the class is registered but the instance has no
+            attribute with the configured identifier name.
+    """
+    if class_name in class_identifiers:
+        class_config = class_identifiers[class_name]
+        # Name of the instance attribute that holds the agent's name.
+        attr = class_config['identifier']
+        if hasattr(instance, attr):
+            instance_name = getattr(instance, attr)
+            return instance_name
+        else:
+            # A registered class without the attribute is treated as a
+            # configuration error rather than silently returning None.
+            raise Exception(f"Attribute {attr} does not exist for {class_name}. Check your identify() decorator.")
+    return None

judgeval 0.0.39__py3-none-any.whl → 0.0.41__py3-none-any.whl

judgeval 0.0.39py3-none-any.whl → 0.0.41py3-none-any.whl