judgeval 0.0.38__py3-none-any.whl → 0.0.40__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- judgeval/clients.py +6 -4
- judgeval/common/tracer.py +361 -236
- judgeval/constants.py +3 -0
- judgeval/data/__init__.py +2 -1
- judgeval/data/example.py +14 -13
- judgeval/data/tool.py +47 -0
- judgeval/data/trace.py +28 -39
- judgeval/data/trace_run.py +2 -1
- judgeval/evaluation_run.py +4 -7
- judgeval/judgment_client.py +27 -6
- judgeval/run_evaluation.py +395 -37
- judgeval/scorers/__init__.py +4 -1
- judgeval/scorers/judgeval_scorer.py +8 -0
- judgeval/scorers/judgeval_scorers/api_scorers/__init__.py +4 -0
- judgeval/scorers/judgeval_scorers/api_scorers/classifier_scorer.py +124 -0
- judgeval/scorers/judgeval_scorers/api_scorers/tool_dependency.py +20 -0
- judgeval/scorers/judgeval_scorers/classifiers/text2sql/text2sql_scorer.py +1 -1
- judgeval/scorers/prompt_scorer.py +5 -164
- judgeval/scorers/score.py +15 -15
- judgeval-0.0.40.dist-info/METADATA +1441 -0
- {judgeval-0.0.38.dist-info → judgeval-0.0.40.dist-info}/RECORD +23 -20
- judgeval-0.0.38.dist-info/METADATA +0 -247
- {judgeval-0.0.38.dist-info → judgeval-0.0.40.dist-info}/WHEEL +0 -0
- {judgeval-0.0.38.dist-info → judgeval-0.0.40.dist-info}/licenses/LICENSE.md +0 -0
judgeval/common/tracer.py
CHANGED
@@ -34,13 +34,14 @@ from typing import (
     Union,
     AsyncGenerator,
     TypeAlias,
+    Set
 )
 from rich import print as rprint
 import types # <--- Add this import

 # Third-party imports
 import requests
-from litellm import cost_per_token
+from litellm import cost_per_token as _original_cost_per_token
 from pydantic import BaseModel
 from rich import print as rprint
 from openai import OpenAI, AsyncOpenAI
@@ -59,7 +60,7 @@ from judgeval.constants import (
     JUDGMENT_TRACES_DELETE_API_URL,
     JUDGMENT_PROJECT_DELETE_API_URL,
 )
-from judgeval.data import Example, Trace, TraceSpan
+from judgeval.data import Example, Trace, TraceSpan, TraceUsage
 from judgeval.scorers import APIJudgmentScorer, JudgevalScorer
 from judgeval.rules import Rule
 from judgeval.evaluation_run import EvaluationRun
@@ -155,9 +156,29 @@ class TraceManagerClient:
         NOTE we save empty traces in order to properly handle async operations; we need something in the DB to associate the async results with
         """
         # Save to Judgment API
+
+        def fallback_encoder(obj):
+            """
+            Custom JSON encoder fallback.
+            Tries to use obj.__repr__(), then str(obj) if that fails or for a simpler string.
+            You can choose which one you prefer or try them in sequence.
+            """
+            try:
+                # Option 1: Prefer __repr__ for a more detailed representation
+                return repr(obj)
+            except Exception:
+                # Option 2: Fallback to str() if __repr__ fails or if you prefer str()
+                try:
+                    return str(obj)
+                except Exception as e:
+                    # If both fail, you might return a placeholder or re-raise
+                    return f"<Unserializable object of type {type(obj).__name__}: {e}>"
+
+        serialized_trace_data = json.dumps(trace_data, default=fallback_encoder)
+
         response = requests.post(
             JUDGMENT_TRACES_SAVE_API_URL,
-            json=trace_data,
+            data=serialized_trace_data,
             headers={
                 "Content-Type": "application/json",
                 "Authorization": f"Bearer {self.judgment_api_key}",
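Note: the `fallback_encoder` above works through the `default=` hook of `json.dumps`, which the encoder invokes only for values it cannot serialize natively. A minimal standalone sketch of the same pattern (the `Opaque` class is illustrative, not part of judgeval):

```python
import json

class Opaque:
    """A value json.dumps cannot serialize natively."""
    def __repr__(self):
        return "Opaque()"

def fallback_encoder(obj):
    # Called by json.dumps only for values it cannot serialize itself
    try:
        return repr(obj)
    except Exception:
        try:
            return str(obj)
        except Exception as e:
            return f"<Unserializable object of type {type(obj).__name__}: {e}>"

print(json.dumps({"ok": 1, "weird": Opaque()}, default=fallback_encoder))
# {"ok": 1, "weird": "Opaque()"}
```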
@@ -463,6 +484,7 @@ class TraceClient:
         if current_span_id:
             span = self.span_id_to_span[current_span_id]
             span.evaluation_runs.append(eval_run)
+            span.has_evaluation = True # Set the has_evaluation flag
         self.evaluation_runs.append(eval_run)

     def add_annotation(self, annotation: TraceAnnotation):
@@ -474,16 +496,25 @@ class TraceClient:
         current_span_id = current_span_var.get()
         if current_span_id:
             span = self.span_id_to_span[current_span_id]
+            # Ignore self parameter
+            if "self" in inputs:
+                del inputs["self"]
             span.inputs = inputs
+
+    def record_agent_name(self, agent_name: str):
+        current_span_id = current_span_var.get()
+        if current_span_id:
+            span = self.span_id_to_span[current_span_id]
+            span.agent_name = agent_name

-    async def _update_coroutine_output(self, span: TraceSpan, coroutine: Any):
+    async def _update_coroutine(self, span: TraceSpan, coroutine: Any, field: str):
         """Helper method to update the output of a trace entry once the coroutine completes"""
         try:
             result = await coroutine
-            span.output = result
+            setattr(span, field, result)
             return result
         except Exception as e:
-            span.output = f"Error: {str(e)}"
+            setattr(span, field, f"Error: {str(e)}")
             raise

     def record_output(self, output: Any):
@@ -493,12 +524,30 @@ class TraceClient:
             span.output = "<pending>" if inspect.iscoroutine(output) else output

             if inspect.iscoroutine(output):
-                asyncio.create_task(self._update_coroutine_output(span, output))
+                asyncio.create_task(self._update_coroutine(span, output, "output"))
+
+            return span # Return the created entry
+        # Removed else block - original didn't have one
+        return None # Return None if no span_id found
+
+    def record_usage(self, usage: TraceUsage):
+        current_span_id = current_span_var.get()
+        if current_span_id:
+            span = self.span_id_to_span[current_span_id]
+            span.usage = usage

             return span # Return the created entry
         # Removed else block - original didn't have one
         return None # Return None if no span_id found
-
+
+    def record_error(self, error: Any):
+        current_span_id = current_span_var.get()
+        if current_span_id:
+            span = self.span_id_to_span[current_span_id]
+            span.error = error
+            return span
+        return None
+
     def add_span(self, span: TraceSpan):
         """Add a trace span to this trace context"""
         self.trace_spans.append(span)
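Note: `record_usage` and `record_error` follow the same lookup pattern as the existing `record_*` methods: the active span is resolved through a `ContextVar` rather than being passed explicitly. A simplified, self-contained sketch of that mechanism (names are stand-ins for judgeval's internals):

```python
import contextvars
from dataclasses import dataclass
from typing import Any, Dict, Optional

current_span_id: contextvars.ContextVar = contextvars.ContextVar("current_span_id", default=None)

@dataclass
class Span:
    span_id: str
    usage: Any = None
    error: Any = None

spans: Dict[str, Span] = {}

def record_error(error: Any) -> Optional[Span]:
    # Mirrors the shape of TraceClient.record_error: resolve the active
    # span through the ContextVar instead of threading it through calls.
    span_id = current_span_id.get()
    if span_id:
        span = spans[span_id]
        span.error = error
        return span
    return None

spans["s1"] = Span("s1")
token = current_span_id.set("s1")
record_error({"type": "ValueError", "message": "boom"})
current_span_id.reset(token)
print(spans["s1"].error)  # {'type': 'ValueError', 'message': 'boom'}
```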
@@ -523,133 +572,6 @@ class TraceClient:
         """
         # Calculate total elapsed time
         total_duration = self.get_duration()
-
-        # Only count tokens for actual LLM API call spans
-        llm_span_names = {"OPENAI_API_CALL", "TOGETHER_API_CALL", "ANTHROPIC_API_CALL", "GOOGLE_API_CALL"}
-        for span in self.trace_spans:
-            span_function_name = span.function # Get function name safely
-            # Check if it's an LLM span AND function name CONTAINS an API call suffix AND output is dict
-            is_llm_span = span.span_type == "llm"
-            has_api_suffix = any(suffix in span_function_name for suffix in llm_span_names)
-            output_is_dict = isinstance(span.output, dict)
-
-            # --- DEBUG PRINT 1: Check if condition passes ---
-            # if is_llm_entry and has_api_suffix and output_is_dict:
-            # elif is_llm_entry:
-            #     # Print why it failed if it was an LLM entry
-            # # --- END DEBUG ---
-
-            if is_llm_span and has_api_suffix and output_is_dict:
-                output = span.output
-                usage = output.get("usage", {}) # Gets the 'usage' dict from the 'output' field
-
-                # --- DEBUG PRINT 2: Check extracted usage ---
-                # --- END DEBUG ---
-
-                # --- NEW: Extract model_name correctly from nested inputs ---
-                model_name = None
-                span_inputs = span.inputs
-                if span_inputs:
-                    # Try common locations for model name within the inputs structure
-                    invocation_params = span_inputs.get("invocation_params", {})
-                    serialized_data = span_inputs.get("serialized", {})
-
-                    # Look in invocation_params (often directly contains model)
-                    if isinstance(invocation_params, dict):
-                        model_name = invocation_params.get("model")
-
-                    # Fallback: Check serialized 'repr' if it contains model info
-                    if not model_name and isinstance(serialized_data, dict):
-                        serialized_repr = serialized_data.get("repr", "")
-                        if "model_name=" in serialized_repr:
-                            try: # Simple parsing attempt
-                                model_name = serialized_repr.split("model_name='")[1].split("'")[0]
-                            except IndexError: pass # Ignore parsing errors
-
-                    # Fallback: Check top-level of invocation_params (sometimes passed flat)
-                    if not model_name and isinstance(invocation_params, dict):
-                        model_name = invocation_params.get("model") # Redundant check, but safe
-
-                    # Fallback: Check top-level of inputs itself (less likely for callbacks)
-                    if not model_name:
-                        model_name = span_inputs.get("model")
-
-
-                # --- END NEW ---
-
-                prompt_tokens = 0
-                completion_tokens = 0
-
-                # Handle OpenAI/Together format (checks within the 'usage' dict)
-                if "prompt_tokens" in usage:
-                    prompt_tokens = usage.get("prompt_tokens", 0)
-                    completion_tokens = usage.get("completion_tokens", 0)
-
-                # Handle Anthropic format - MAP values to standard keys
-                elif "input_tokens" in usage:
-                    prompt_tokens = usage.get("input_tokens", 0) # Get value from input_tokens
-                    completion_tokens = usage.get("output_tokens", 0) # Get value from output_tokens
-
-                    # *** Overwrite the usage dict in the entry to use standard keys ***
-                    original_total = usage.get("total_tokens", 0)
-                    original_total_cost = usage.get("total_cost_usd", 0.0) # Preserve if already calculated
-                    # Recalculate cost just in case it wasn't done correctly before
-                    temp_prompt_cost, temp_completion_cost = 0.0, 0.0
-                    if model_name:
-                        try:
-                            temp_prompt_cost, temp_completion_cost = cost_per_token(
-                                model=model_name,
-                                prompt_tokens=prompt_tokens,
-                                completion_tokens=completion_tokens
-                            )
-                        except Exception:
-                            pass # Ignore cost calculation errors here, focus on keys
-                    # Replace the usage dict with one using standard keys but Anthropic values
-                    output["usage"] = {
-                        "prompt_tokens": prompt_tokens,
-                        "completion_tokens": completion_tokens,
-                        "total_tokens": original_total,
-                        "prompt_tokens_cost_usd": temp_prompt_cost, # Use standard cost key
-                        "completion_tokens_cost_usd": temp_completion_cost, # Use standard cost key
-                        "total_cost_usd": original_total_cost if original_total_cost > 0 else (temp_prompt_cost + temp_completion_cost)
-                    }
-                    usage = output["usage"]
-
-                # Calculate costs if model name is available and ensure they are stored with standard keys
-                prompt_tokens = usage.get("prompt_tokens", 0)
-                completion_tokens = usage.get("completion_tokens", 0)
-
-                # Calculate costs if model name is available
-                if model_name:
-                    try:
-                        # Recalculate costs based on potentially mapped tokens
-                        prompt_cost, completion_cost = cost_per_token(
-                            model=model_name,
-                            prompt_tokens=prompt_tokens,
-                            completion_tokens=completion_tokens
-                        )
-
-                        # Add cost information directly to the usage dictionary in the condensed entry
-                        # Ensure 'usage' exists in the output dict before modifying it
-                        # Add/Update cost information using standard keys
-
-                        if "usage" not in output:
-                            output["usage"] = {} # Initialize if missing
-                        elif not isinstance(output["usage"], dict): # Handle cases where 'usage' might not be a dict (e.g., placeholder string)
-                            print(f"[WARN TraceClient.save] Output 'usage' for span {span.span_id} was not a dict ({type(output['usage'])}). Resetting before adding costs.")
-                            output["usage"] = {} # Reset to dict
-
-                        output["usage"]["prompt_tokens_cost_usd"] = prompt_cost
-                        output["usage"]["completion_tokens_cost_usd"] = completion_cost
-                        output["usage"]["total_cost_usd"] = prompt_cost + completion_cost
-                    except Exception as e:
-                        # If cost calculation fails, continue without adding costs
-                        print(f"Error calculating cost for model '{model_name}' (span: {span.span_id}): {str(e)}")
-                        pass
-                else:
-                    print(f"[WARN TraceClient.save] Could not determine model name for cost calculation (span: {span.span_id}). Inputs: {span_inputs}")
-
-
         # Create trace document - Always use standard keys for top-level counts
         trace_data = {
             "trace_id": self.trace_id,
@@ -677,13 +599,25 @@ class TraceClient:
     def delete(self):
         return self.trace_manager_client.delete_trace(self.trace_id)

-
+def _capture_exception_for_trace(current_trace: Optional['TraceClient'], exc_info: Tuple[Optional[type], Optional[BaseException], Optional[types.TracebackType]]):
+    if not current_trace:
+        return
+
+    exc_type, exc_value, exc_traceback_obj = exc_info
+    formatted_exception = {
+        "type": exc_type.__name__ if exc_type else "UnknownExceptionType",
+        "message": str(exc_value) if exc_value else "No exception message",
+        "traceback": traceback.format_tb(exc_traceback_obj) if exc_traceback_obj else []
+    }
+    current_trace.record_error(formatted_exception)
 class _DeepTracer:
     _instance: Optional["_DeepTracer"] = None
     _lock: threading.Lock = threading.Lock()
     _refcount: int = 0
     _span_stack: contextvars.ContextVar[List[Dict[str, Any]]] = contextvars.ContextVar("_deep_profiler_span_stack", default=[])
     _skip_stack: contextvars.ContextVar[List[str]] = contextvars.ContextVar("_deep_profiler_skip_stack", default=[])
+    _original_sys_trace: Optional[Callable] = None
+    _original_threading_trace: Optional[Callable] = None

     def _get_qual_name(self, frame) -> str:
         func_name = frame.f_code.co_name
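Note: `_capture_exception_for_trace` consumes a standard `sys.exc_info()` triple and stores the result via `record_error`. The formatting step in isolation, as a self-contained sketch:

```python
import sys
import traceback

def format_exc_info(exc_info):
    # Same dict shape as the formatted_exception built above
    exc_type, exc_value, exc_tb = exc_info
    return {
        "type": exc_type.__name__ if exc_type else "UnknownExceptionType",
        "message": str(exc_value) if exc_value else "No exception message",
        "traceback": traceback.format_tb(exc_tb) if exc_tb else [],
    }

try:
    1 / 0
except ZeroDivisionError:
    info = format_exc_info(sys.exc_info())
    print(info["type"], "-", info["message"])  # ZeroDivisionError - division by zero
```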
@@ -731,12 +665,53 @@ class _DeepTracer:
     @functools.cache
     def _is_user_code(self, filename: str):
         return bool(filename) and not filename.startswith("<") and not os.path.realpath(filename).startswith(_TRACE_FILEPATH_BLOCKLIST)
+
+    def _cooperative_sys_trace(self, frame: types.FrameType, event: str, arg: Any):
+        """Cooperative trace function for sys.settrace that chains with existing tracers."""
+        # First, call the original sys trace function if it exists
+        original_result = None
+        if self._original_sys_trace:
+            try:
+                original_result = self._original_sys_trace(frame, event, arg)
+            except Exception:
+                # If the original tracer fails, continue with our tracing
+                pass
+
+        # Then do our own tracing
+        our_result = self._trace(frame, event, arg, self._cooperative_sys_trace)
+
+        # Return our tracer to continue tracing, but respect the original's decision
+        # If the original tracer returned None (stop tracing), we should respect that
+        if original_result is None and self._original_sys_trace:
+            return None
+
+        return our_result or original_result
+
+    def _cooperative_threading_trace(self, frame: types.FrameType, event: str, arg: Any):
+        """Cooperative trace function for threading.settrace that chains with existing tracers."""
+        # First, call the original threading trace function if it exists
+        original_result = None
+        if self._original_threading_trace:
+            try:
+                original_result = self._original_threading_trace(frame, event, arg)
+            except Exception:
+                # If the original tracer fails, continue with our tracing
+                pass
+
+        # Then do our own tracing
+        our_result = self._trace(frame, event, arg, self._cooperative_threading_trace)
+
+        # Return our tracer to continue tracing, but respect the original's decision
+        # If the original tracer returned None (stop tracing), we should respect that
+        if original_result is None and self._original_threading_trace:
+            return None
+
+        return our_result or original_result

-    def _trace(self, frame: types.FrameType, event: str, arg: Any):
+    def _trace(self, frame: types.FrameType, event: str, arg: Any, continuation_func: Callable):
         frame.f_trace_lines = False
         frame.f_trace_opcodes = False

-
         if not self._should_trace(frame):
             return

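Note: the cooperative tracers exist so `_DeepTracer` no longer clobbers a debugger, profiler, or coverage tool that installed a trace function first. The chaining idea, sketched outside the class (a minimal sketch, not the library's API):

```python
import sys
from typing import Any, Callable, Optional

_original_tracer: Optional[Callable] = None

def cooperative_tracer(frame, event: str, arg: Any):
    """Chain to whatever tracer was installed before ours."""
    original_result = None
    if _original_tracer is not None:
        try:
            original_result = _original_tracer(frame, event, arg)
        except Exception:
            pass  # a broken foreign tracer should not break ours
    # ... our own per-frame bookkeeping would happen here ...
    if original_result is None and _original_tracer is not None:
        return None  # the pre-existing tracer opted out of this frame
    return cooperative_tracer

def install() -> None:
    global _original_tracer
    _original_tracer = sys.gettrace()  # capture the incumbent, if any
    sys.settrace(cooperative_tracer)

def uninstall() -> None:
    sys.settrace(_original_tracer)  # restore rather than clearing to None
```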
@@ -752,6 +727,12 @@ class _DeepTracer:
             return

         qual_name = self._get_qual_name(frame)
+        instance_name = None
+        if 'self' in frame.f_locals:
+            instance = frame.f_locals['self']
+            class_name = instance.__class__.__name__
+            class_identifiers = getattr(Tracer._instance, 'class_identifiers', {})
+            instance_name = get_instance_prefixed_name(instance, class_name, class_identifiers)
         skip_stack = self._skip_stack.get()

         if event == "call":
@@ -814,7 +795,8 @@ class _DeepTracer:
                 created_at=start_time,
                 span_type="span",
                 parent_span_id=parent_span_id,
-                function=qual_name
+                function=qual_name,
+                agent_name=instance_name
             )
             current_trace.add_span(span)

@@ -869,35 +851,40 @@ class _DeepTracer:
             current_span_var.reset(frame.f_locals["_judgment_span_token"])

         elif event == "exception":
-            exc_type, exc_value, exc_traceback = arg
-            formatted_exception = {
-                "type": exc_type.__name__ if exc_type else "UnknownExceptionType",
-                "message": str(exc_value) if exc_value else "No exception message",
-                "traceback": traceback.format_tb(exc_traceback) if exc_traceback else []
-            }
-            current_trace = current_trace_var.get()
-            current_trace.record_output({
-                "error": formatted_exception
-            })
+            exc_type = arg[0]
+            if issubclass(exc_type, (StopIteration, StopAsyncIteration, GeneratorExit)):
+                return
+            _capture_exception_for_trace(current_trace, arg)
+

-        return
+        return continuation_func

     def __enter__(self):
         with self._lock:
             self._refcount += 1
             if self._refcount == 1:
+                # Store the existing trace functions before setting ours
+                self._original_sys_trace = sys.gettrace()
+                self._original_threading_trace = threading.gettrace()
+
                 self._skip_stack.set([])
                 self._span_stack.set([])
-                sys.settrace(self._trace)
-                threading.settrace(self._trace)
+
+                sys.settrace(self._cooperative_sys_trace)
+                threading.settrace(self._cooperative_threading_trace)
         return self

     def __exit__(self, exc_type, exc_val, exc_tb):
         with self._lock:
             self._refcount -= 1
             if self._refcount == 0:
-                sys.settrace(None)
-                threading.settrace(None)
+                # Restore the original trace functions instead of setting to None
+                sys.settrace(self._original_sys_trace)
+                threading.settrace(self._original_threading_trace)
+
+                # Clean up the references
+                self._original_sys_trace = None
+                self._original_threading_trace = None


     def log(self, message: str, level: str = "info"):
@@ -946,10 +933,6 @@ class Tracer:
             raise ValueError("Tracer must be configured with an Organization ID")
         if use_s3 and not s3_bucket_name:
             raise ValueError("S3 bucket name must be provided when use_s3 is True")
-        if use_s3 and not (s3_aws_access_key_id or os.getenv("AWS_ACCESS_KEY_ID")):
-            raise ValueError("AWS Access Key ID must be provided when use_s3 is True")
-        if use_s3 and not (s3_aws_secret_access_key or os.getenv("AWS_SECRET_ACCESS_KEY")):
-            raise ValueError("AWS Secret Access Key must be provided when use_s3 is True")

         self.api_key: str = api_key
         self.project_name: str = project_name
@@ -961,6 +944,7 @@ class Tracer:
         self.initialized: bool = True
         self.enable_monitoring: bool = enable_monitoring
         self.enable_evaluations: bool = enable_evaluations
+        self.class_identifiers: Dict[str, str] = {} # Dictionary to store class identifiers

         # Initialize S3 storage if enabled
         self.use_s3 = use_s3
@@ -1084,6 +1068,32 @@ class Tracer:

         rprint(f"[bold]{label}:[/bold] {msg}")

+    def identify(self, identifier: str):
+        """
+        Class decorator that associates a class with a custom identifier.
+
+        This decorator creates a mapping between the class name and the provided
+        identifier, which can be useful for tagging, grouping, or referencing
+        classes in a standardized way.
+
+        Args:
+            identifier: The identifier to associate with the decorated class
+
+        Returns:
+            A decorator function that registers the class with the given identifier
+
+        Example:
+            @tracer.identify(identifier="user_model")
+            class User:
+                # Class implementation
+        """
+        def decorator(cls):
+            class_name = cls.__name__
+            self.class_identifiers[class_name] = identifier
+            return cls
+
+        return decorator
+
     def observe(self, func=None, *, name=None, span_type: SpanType = "span", project_name: str = None, overwrite: bool = False, deep_tracing: bool = None):
         """
         Decorator to trace function execution with detailed entry/exit information.
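Note: a usage sketch for `identify` combined with `observe`; constructor arguments here are illustrative. The identifier names an instance attribute, and `get_instance_prefixed_name` (defined at the end of this file) resolves that attribute's value into the span's `agent_name`:

```python
from judgeval.common.tracer import Tracer

tracer = Tracer(api_key="...", organization_id="...", project_name="demo")  # illustrative args

@tracer.identify(identifier="name")
class Assistant:
    def __init__(self, name: str):
        self.name = name  # resolved via class_identifiers["Assistant"] == "name"

    @tracer.observe(name="answer")
    def answer(self, question: str) -> str:
        return f"answer to {question}"

Assistant("support-bot").answer("hi")  # the span records agent_name="support-bot"
```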
@@ -1106,10 +1116,10 @@ class Tracer:
                                 overwrite=overwrite, deep_tracing=deep_tracing)

         # Use provided name or fall back to function name
-        span_name = name or func.__name__
+        original_span_name = name or func.__name__

         # Store custom attributes on the function object
-        func._judgment_span_name = span_name
+        func._judgment_span_name = original_span_name
         func._judgment_span_type = span_type

         # Use the provided deep_tracing value or fall back to the tracer's default
@@ -1118,6 +1128,16 @@ class Tracer:
         if asyncio.iscoroutinefunction(func):
             @functools.wraps(func)
             async def async_wrapper(*args, **kwargs):
+                nonlocal original_span_name
+                class_name = None
+                instance_name = None
+                span_name = original_span_name
+                agent_name = None
+
+                if args and hasattr(args[0], '__class__'):
+                    class_name = args[0].__class__.__name__
+                    agent_name = get_instance_prefixed_name(args[0], class_name, self.class_identifiers)
+
                 # Get current trace from context
                 current_trace = current_trace_var.get()

@@ -1141,7 +1161,7 @@ class Tracer:
                     # Save empty trace and set trace context
                     # current_trace.save(empty_save=True, overwrite=overwrite)
                     trace_token = current_trace_var.set(current_trace)
-
+
                     try:
                         # Use span for the function execution within the root trace
                         # This sets the current_span_var
@@ -1149,13 +1169,19 @@ class Tracer:
                         # Record inputs
                         inputs = combine_args_kwargs(func, args, kwargs)
                         span.record_input(inputs)
+                        if agent_name:
+                            span.record_agent_name(agent_name)

                         if use_deep_tracing:
                             with _DeepTracer():
                                 result = await func(*args, **kwargs)
                         else:
-                            result = await func(*args, **kwargs)
-
+                            try:
+                                result = await func(*args, **kwargs)
+                            except Exception as e:
+                                _capture_exception_for_trace(current_trace, sys.exc_info())
+                                raise e
+

                         # Record output
                         span.record_output(result)
                         return result
@@ -1170,12 +1196,18 @@ class Tracer:
                     with current_trace.span(span_name, span_type=span_type) as span:
                         inputs = combine_args_kwargs(func, args, kwargs)
                         span.record_input(inputs)
-
+                        if agent_name:
+                            span.record_agent_name(agent_name)
+
                         if use_deep_tracing:
                             with _DeepTracer():
                                 result = await func(*args, **kwargs)
                         else:
-                            result = await func(*args, **kwargs)
+                            try:
+                                result = await func(*args, **kwargs)
+                            except Exception as e:
+                                _capture_exception_for_trace(current_trace, sys.exc_info())
+                                raise e

                         span.record_output(result)
                         return result
@@ -1184,7 +1216,15 @@ class Tracer:
         else:
             # Non-async function implementation with deep tracing
             @functools.wraps(func)
-            def wrapper(*args, **kwargs):
+            def wrapper(*args, **kwargs):
+                nonlocal original_span_name
+                class_name = None
+                instance_name = None
+                span_name = original_span_name
+                agent_name = None
+                if args and hasattr(args[0], '__class__'):
+                    class_name = args[0].__class__.__name__
+                    agent_name = get_instance_prefixed_name(args[0], class_name, self.class_identifiers)
                 # Get current trace from context
                 current_trace = current_trace_var.get()

@@ -1216,12 +1256,17 @@ class Tracer:
                         # Record inputs
                         inputs = combine_args_kwargs(func, args, kwargs)
                         span.record_input(inputs)
-
+                        if agent_name:
+                            span.record_agent_name(agent_name)
                         if use_deep_tracing:
                             with _DeepTracer():
                                 result = func(*args, **kwargs)
                         else:
-                            result = func(*args, **kwargs)
+                            try:
+                                result = func(*args, **kwargs)
+                            except Exception as e:
+                                _capture_exception_for_trace(current_trace, sys.exc_info())
+                                raise e

                         # Record output
                         span.record_output(result)
@@ -1238,12 +1283,18 @@ class Tracer:

                     inputs = combine_args_kwargs(func, args, kwargs)
                     span.record_input(inputs)
-
+                    if agent_name:
+                        span.record_agent_name(agent_name)
+
                     if use_deep_tracing:
                         with _DeepTracer():
                             result = func(*args, **kwargs)
                     else:
-                        result = func(*args, **kwargs)
+                        try:
+                            result = func(*args, **kwargs)
+                        except Exception as e:
+                            _capture_exception_for_trace(current_trace, sys.exc_info())
+                            raise e

                     span.record_output(result)
                     return result
@@ -1313,8 +1364,9 @@ def wrap(client: Any) -> Any:
             return wrapper_func(response, client, output_entry)
         else:
             format_func = _format_response_output_data if is_responses else _format_output_data
-
-            span.record_output(format_func(client, response))
+            output, usage = format_func(client, response)
+            span.record_output(output)
+            span.record_usage(usage)
         return response

     def _handle_error(span, e, is_async):
@@ -1496,18 +1548,35 @@ def _format_response_output_data(client: ApiClient, response: Any) -> dict:
    Normalizes different response formats into a consistent structure
    for tracing purposes.
    """
+    message_content = None
+    prompt_tokens = 0
+    completion_tokens = 0
+    model_name = None
    if isinstance(client, (OpenAI, Together, AsyncOpenAI, AsyncTogether)):
-        return {
-            "content": response.output,
-            "usage": {
-                "prompt_tokens": response.usage.input_tokens,
-                "completion_tokens": response.usage.output_tokens,
-                "total_tokens": response.usage.total_tokens
-            }
-        }
+        model_name = response.model
+        prompt_tokens = response.usage.input_tokens
+        completion_tokens = response.usage.output_tokens
+        message_content = response.output
    else:
        warnings.warn(f"Unsupported client type: {type(client)}")
        return {}
+
+    prompt_cost, completion_cost = cost_per_token(
+        model=model_name,
+        prompt_tokens=prompt_tokens,
+        completion_tokens=completion_tokens,
+    )
+    total_cost_usd = (prompt_cost + completion_cost) if prompt_cost and completion_cost else None
+    usage = TraceUsage(
+        prompt_tokens=prompt_tokens,
+        completion_tokens=completion_tokens,
+        total_tokens=prompt_tokens + completion_tokens,
+        prompt_tokens_cost_usd=prompt_cost,
+        completion_tokens_cost_usd=completion_cost,
+        total_cost_usd=total_cost_usd,
+        model_name=model_name
+    )
+    return message_content, usage


 def _format_output_data(client: ApiClient, response: Any) -> dict:
@@ -1521,33 +1590,46 @@ def _format_output_data(client: ApiClient, response: Any) -> dict:
     - content: The generated text
     - usage: Token usage statistics
     """
+    prompt_tokens = 0
+    completion_tokens = 0
+    model_name = None
+    message_content = None
+
     if isinstance(client, (OpenAI, Together, AsyncOpenAI, AsyncTogether)):
-        return {
-            "content": response.choices[0].message.content,
-            "usage": {
-                "prompt_tokens": response.usage.prompt_tokens,
-                "completion_tokens": response.usage.completion_tokens,
-                "total_tokens": response.usage.total_tokens
-            }
-        }
+        model_name = response.model
+        prompt_tokens = response.usage.prompt_tokens
+        completion_tokens = response.usage.completion_tokens
+        message_content = response.choices[0].message.content
     elif isinstance(client, (genai.Client, genai.client.AsyncClient)):
-        return {
-            "content": response.candidates[0].content.parts[0].text,
-            "usage": {
-                "prompt_tokens": response.usage_metadata.prompt_token_count,
-                "completion_tokens": response.usage_metadata.candidates_token_count,
-                "total_tokens": response.usage_metadata.total_token_count
-            }
-        }
-    elif isinstance(client, (Anthropic, AsyncAnthropic)):
-        return {
-            "content": response.content[0].text,
-            "usage": {
-                "input_tokens": response.usage.input_tokens,
-                "output_tokens": response.usage.output_tokens,
-                "total_tokens": response.usage.input_tokens + response.usage.output_tokens
-            }
-        }
+        model_name = response.model_version
+        prompt_tokens = response.usage_metadata.prompt_token_count
+        completion_tokens = response.usage_metadata.candidates_token_count
+        message_content = response.candidates[0].content.parts[0].text
+    elif isinstance(client, (Anthropic, AsyncAnthropic)):
+        model_name = response.model
+        prompt_tokens = response.usage.input_tokens
+        completion_tokens = response.usage.output_tokens
+        message_content = response.content[0].text
+    else:
+        warnings.warn(f"Unsupported client type: {type(client)}")
+        return None, None
+
+    prompt_cost, completion_cost = cost_per_token(
+        model=model_name,
+        prompt_tokens=prompt_tokens,
+        completion_tokens=completion_tokens,
+    )
+    total_cost_usd = (prompt_cost + completion_cost) if prompt_cost and completion_cost else None
+    usage = TraceUsage(
+        prompt_tokens=prompt_tokens,
+        completion_tokens=completion_tokens,
+        total_tokens=prompt_tokens + completion_tokens,
+        prompt_tokens_cost_usd=prompt_cost,
+        completion_tokens_cost_usd=completion_cost,
+        total_cost_usd=total_cost_usd,
+        model_name=model_name
+    )
+    return message_content, usage

 def combine_args_kwargs(func, args, kwargs):
     """
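Note: the rewritten `_format_output_data` returns a `(message_content, usage)` pair and normalizes the three providers' field names into one structure. The mapping it encodes, sketched with a plain dataclass standing in for judgeval's `TraceUsage` model:

```python
from dataclasses import dataclass
from types import SimpleNamespace as NS
from typing import Optional

@dataclass
class Usage:  # stand-in for judgeval.data.TraceUsage
    prompt_tokens: int
    completion_tokens: int
    total_tokens: int
    model_name: Optional[str] = None

def normalize(provider: str, response) -> Usage:
    # Field names per provider, as read by _format_output_data above
    if provider in ("openai", "together"):
        u = response.usage
        return Usage(u.prompt_tokens, u.completion_tokens,
                     u.prompt_tokens + u.completion_tokens, response.model)
    if provider == "google":
        m = response.usage_metadata
        return Usage(m.prompt_token_count, m.candidates_token_count,
                     m.prompt_token_count + m.candidates_token_count,
                     response.model_version)
    if provider == "anthropic":
        u = response.usage
        return Usage(u.input_tokens, u.output_tokens,
                     u.input_tokens + u.output_tokens, response.model)
    raise ValueError(f"unsupported provider: {provider}")

resp = NS(model="gpt-4o", usage=NS(prompt_tokens=10, completion_tokens=3))
print(normalize("openai", resp))
# Usage(prompt_tokens=10, completion_tokens=3, total_tokens=13, model_name='gpt-4o')
```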
@@ -1653,21 +1735,30 @@ def _extract_usage_from_final_chunk(client: ApiClient, chunk: Any) -> Optional[D
     # OpenAI/Together include usage in the *last* chunk's `usage` attribute if available
     # This typically requires specific API versions or settings. Often usage is *not* streamed.
     if isinstance(client, (OpenAI, Together, AsyncOpenAI, AsyncTogether)):
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
+        # Check if usage is directly on the chunk (some models might do this)
+        if hasattr(chunk, 'usage') and chunk.usage:
+            prompt_tokens = chunk.usage.prompt_tokens
+            completion_tokens = chunk.usage.completion_tokens
+        # Check if usage is nested within choices (less common for final chunk, but check)
+        elif chunk.choices and hasattr(chunk.choices[0], 'usage') and chunk.choices[0].usage:
+            prompt_tokens = chunk.choices[0].usage.prompt_tokens
+            completion_tokens = chunk.choices[0].usage.completion_tokens
+
+        prompt_cost, completion_cost = cost_per_token(
+            model=chunk.model,
+            prompt_tokens=prompt_tokens,
+            completion_tokens=completion_tokens,
+        )
+        total_cost_usd = (prompt_cost + completion_cost) if prompt_cost and completion_cost else None
+        return TraceUsage(
+            prompt_tokens=chunk.usage.prompt_tokens,
+            completion_tokens=chunk.usage.completion_tokens,
+            total_tokens=chunk.usage.total_tokens,
+            prompt_tokens_cost_usd=prompt_cost,
+            completion_tokens_cost_usd=completion_cost,
+            total_cost_usd=total_cost_usd,
+            model_name=chunk.model
+        )
     # Anthropic includes usage in the 'message_stop' event type
     elif isinstance(client, (Anthropic, AsyncAnthropic)):
         if chunk.type == "message_stop":
@@ -1715,11 +1806,8 @@ def _sync_stream_wrapper(
     final_usage = _extract_usage_from_final_chunk(client, last_chunk)

     # Update the trace entry with the accumulated content and usage
-    span.output = {
-        "content": "".join(content_parts),
-        "usage": final_usage if final_usage else {"info": "Usage data not available in stream."}, # Provide placeholder if None
-        "streamed": True
-    }
+    span.output = "".join(content_parts)
+    span.usage = final_usage
    # Note: We might need to adjust _serialize_output if this dict causes issues,
    # but Pydantic's model_dump should handle dicts.

@@ -1739,6 +1827,7 @@ async def _async_stream_wrapper(
     target_span_id = span.span_id

     try:
+        model_name = ""
         async for chunk in original_stream:
             # Check for OpenAI's final usage chunk
             if isinstance(client, (AsyncOpenAI, OpenAI)) and hasattr(chunk, 'usage') and chunk.usage is not None:
@@ -1747,16 +1836,18 @@ async def _async_stream_wrapper(
                     "completion_tokens": chunk.usage.completion_tokens,
                     "total_tokens": chunk.usage.total_tokens
                 }
+                model_name = chunk.model
                 yield chunk
                 continue

             if isinstance(client, (AsyncAnthropic, Anthropic)) and hasattr(chunk, 'type'):
-
-
+                if chunk.type == "message_start":
+                    if hasattr(chunk, 'message') and hasattr(chunk.message, 'usage') and hasattr(chunk.message.usage, 'input_tokens'):
                         anthropic_input_tokens = chunk.message.usage.input_tokens
-
-
-
+                        model_name = chunk.message.model
+                elif chunk.type == "message_delta":
+                    if hasattr(chunk, 'usage') and hasattr(chunk.usage, 'output_tokens'):
+                        anthropic_output_tokens = chunk.usage.output_tokens

             content_part = _extract_content_from_chunk(client, chunk)
             if content_part:
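Note: Anthropic streams report input tokens once in the `message_start` event and output tokens in `message_delta` events, which is why the wrapper tracks them separately before merging. A schematic consumer, with `SimpleNamespace` objects standing in for SDK events:

```python
from types import SimpleNamespace as NS

def accumulate_anthropic_usage(events):
    input_tokens = output_tokens = 0
    model_name = None
    for ev in events:
        if ev.type == "message_start":
            input_tokens = ev.message.usage.input_tokens
            model_name = ev.message.model
        elif ev.type == "message_delta":
            output_tokens = ev.usage.output_tokens
    return model_name, input_tokens, output_tokens

events = [
    NS(type="message_start",
       message=NS(model="claude-sonnet", usage=NS(input_tokens=12))),
    NS(type="message_delta", usage=NS(output_tokens=7)),
]
print(accumulate_anthropic_usage(events))  # ('claude-sonnet', 12, 7)
```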
@@ -1779,18 +1870,37 @@ async def _async_stream_wrapper(
        elif anthropic_final_usage:
            usage_info = anthropic_final_usage
        elif last_content_chunk:
-
+            usage_info = _extract_usage_from_final_chunk(client, last_content_chunk)

+        if usage_info and not isinstance(usage_info, TraceUsage):
+            prompt_cost, completion_cost = cost_per_token(
+                model=model_name,
+                prompt_tokens=usage_info["prompt_tokens"],
+                completion_tokens=usage_info["completion_tokens"],
+            )
+            usage_info = TraceUsage(
+                prompt_tokens=usage_info["prompt_tokens"],
+                completion_tokens=usage_info["completion_tokens"],
+                total_tokens=usage_info["total_tokens"],
+                prompt_tokens_cost_usd=prompt_cost,
+                completion_tokens_cost_usd=completion_cost,
+                total_cost_usd=prompt_cost + completion_cost,
+                model_name=model_name
+            )
        if span and hasattr(span, 'output'):
-            span.output = {
-                "content": ''.join(content_parts),
-                "usage": usage_info if usage_info else {"info": "Usage data not available in stream."},
-                "streamed": True
-            }
+            span.output = ''.join(content_parts)
+            span.usage = usage_info
            start_ts = getattr(span, 'created_at', time.time())
            span.duration = time.time() - start_ts
        # else: # Handle error case if necessary, but remove debug print

+def cost_per_token(*args, **kwargs):
+    try:
+        return _original_cost_per_token(*args, **kwargs)
+    except Exception as e:
+        warnings.warn(f"Error calculating cost per token: {e}")
+        return None, None
+
 class _BaseStreamManagerWrapper:
     def __init__(self, original_manager, client, span_name, trace_client, stream_wrapper_func, input_kwargs):
         self._original_manager = original_manager
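Note: the module imports litellm's `cost_per_token` under an alias and shadows it with the guarded wrapper above, so every existing call site transparently gains the error handling. The pattern in isolation:

```python
import warnings
from litellm import cost_per_token as _original_cost_per_token

def cost_per_token(*args, **kwargs):
    # litellm raises for models it has no pricing data for;
    # degrade to (None, None) instead of failing the trace save
    try:
        return _original_cost_per_token(*args, **kwargs)
    except Exception as e:
        warnings.warn(f"Error calculating cost per token: {e}")
        return None, None

prompt_cost, completion_cost = cost_per_token(
    model="gpt-4o", prompt_tokens=100, completion_tokens=20
)
```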
@@ -1872,3 +1982,18 @@ class _TracedSyncStreamManagerWrapper(_BaseStreamManagerWrapper, AbstractContext
         current_span_var.reset(self._span_context_token)
         delattr(self, '_span_context_token')
         return self._original_manager.__exit__(exc_type, exc_val, exc_tb)
+
+# --- Helper function for instance-prefixed qual_name ---
+def get_instance_prefixed_name(instance, class_name, class_identifiers):
+    """
+    Returns the agent name (prefix) if the class and attribute are found in class_identifiers.
+    Otherwise, returns None.
+    """
+    if class_name in class_identifiers:
+        attr = class_identifiers[class_name]
+        if hasattr(instance, attr):
+            instance_name = getattr(instance, attr)
+            return instance_name
+        else:
+            raise Exception(f"Attribute {class_identifiers[class_name]} does not exist for {class_name}. Check your identify() decorator.")
+    return None