PyPI - judgeval - Versions diffs - 0.0.36__py3-none-any.whl → 0.0.38__py3-none-any.whl - Mend

judgeval 0.0.36-py3-none-any.whl → 0.0.38-py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (28)

judgeval/common/tracer.py +663 -1105
judgeval/common/utils.py +19 -1
judgeval/constants.py +3 -3
judgeval/data/__init__.py +4 -2
judgeval/data/datasets/dataset.py +2 -11
judgeval/data/datasets/eval_dataset_client.py +1 -62
judgeval/data/example.py +29 -8
judgeval/data/result.py +3 -3
judgeval/data/trace.py +132 -0
judgeval/data/{sequence_run.py → trace_run.py} +7 -6
judgeval/evaluation_run.py +2 -2
judgeval/integrations/langgraph.py +189 -1769
judgeval/judges/litellm_judge.py +1 -1
judgeval/judges/mixture_of_judges.py +1 -1
judgeval/judges/utils.py +1 -1
judgeval/judgment_client.py +85 -78
judgeval/run_evaluation.py +98 -51
judgeval/scorers/__init__.py +2 -0
judgeval/scorers/judgeval_scorers/api_scorers/__init__.py +2 -0
judgeval/scorers/judgeval_scorers/api_scorers/tool_order.py +20 -0
judgeval/scorers/score.py +1 -1
judgeval/utils/data_utils.py +57 -0
judgeval-0.0.38.dist-info/METADATA +247 -0
{judgeval-0.0.36.dist-info → judgeval-0.0.38.dist-info}/RECORD +26 -24
judgeval/data/sequence.py +0 -49
judgeval-0.0.36.dist-info/METADATA +0 -169
{judgeval-0.0.36.dist-info → judgeval-0.0.38.dist-info}/WHEEL +0 -0
{judgeval-0.0.36.dist-info → judgeval-0.0.38.dist-info}/licenses/LICENSE.md +0 -0

judgeval/common/tracer.py CHANGED Viewed

@@ -7,7 +7,11 @@ import functools
 import inspect
 import json
 import os
+import site
+import sysconfig
+import threading
 import time
+import traceback
 import uuid
 import warnings
 import contextvars
@@ -35,7 +39,6 @@ from rich import print as rprint
 import types # <--- Add this import
 # Third-party imports
-import pika
 import requests
 from litellm import cost_per_token
 from pydantic import BaseModel
@@ -44,10 +47,10 @@ from openai import OpenAI, AsyncOpenAI
 from together import Together, AsyncTogether
 from anthropic import Anthropic, AsyncAnthropic
 from google import genai
-from judgeval.run_evaluation import check_examples
 # Local application/library-specific imports
 from judgeval.constants import (
+    JUDGMENT_TRACES_ADD_ANNOTATION_API_URL,
     JUDGMENT_TRACES_SAVE_API_URL,
     JUDGMENT_TRACES_FETCH_API_URL,
     RABBITMQ_HOST,
@@ -56,25 +59,24 @@ from judgeval.constants import (
     JUDGMENT_TRACES_DELETE_API_URL,
     JUDGMENT_PROJECT_DELETE_API_URL,
 )
-from judgeval.judgment_client import JudgmentClient
-from judgeval.data import Example
+from judgeval.data import Example, Trace, TraceSpan
 from judgeval.scorers import APIJudgmentScorer, JudgevalScorer
 from judgeval.rules import Rule
 from judgeval.evaluation_run import EvaluationRun
 from judgeval.data.result import ScoringResult
+from judgeval.common.utils import validate_api_key
+from judgeval.common.exceptions import JudgmentAPIError
 # Standard library imports needed for the new class
 import concurrent.futures
 from collections.abc import Iterator, AsyncIterator # Add Iterator and AsyncIterator
 # Define context variables for tracking the current trace and the current span within a trace
-current_trace_var = contextvars.ContextVar('current_trace', default=None)
+current_trace_var = contextvars.ContextVar[Optional['TraceClient']]('current_trace', default=None)
 current_span_var = contextvars.ContextVar('current_span', default=None) # ContextVar for the active span name
-in_traced_function_var = contextvars.ContextVar('in_traced_function', default=False) # Track if we're in a traced function
 # Define type aliases for better code readability and maintainability
 ApiClient: TypeAlias = Union[OpenAI, Together, Anthropic, AsyncOpenAI, AsyncAnthropic, AsyncTogether, genai.Client, genai.client.AsyncClient]  # Supported API clients
-TraceEntryType = Literal['enter', 'exit', 'output', 'input', 'evaluation']  # Valid trace entry types
 SpanType = Literal['span', 'tool', 'llm', 'evaluation', 'chain']
 # --- Evaluation Config Dataclass (Moved from langgraph.py) ---
@@ -87,154 +89,26 @@ class EvaluationConfig:
     log_results: Optional[bool] = True
 # --- End Evaluation Config Dataclass ---
+# Temporary as a POC to have log use the existing annotations feature until log endpoints are ready
 @dataclass
-class TraceEntry:
-    """Represents a single trace entry with its visual representation.
-    Visual representations:
-    - enter: → (function entry)
-    - exit: ← (function exit)
-    - output: Output: (function return value)
-    - input: Input: (function parameters)
-    - evaluation: Evaluation: (evaluation results)
-    """
-    type: TraceEntryType
-    span_id: str # Unique ID for this specific span instance
-    depth: int    # Indentation level for nested calls
-    created_at: float # Unix timestamp when entry was created, replacing the deprecated 'timestamp' field
-    function: Optional[str] = None  # Name of the function being traced
-    message: Optional[str] = None  # Human-readable description
-    duration: Optional[float] = None  # Time taken (for exit/evaluation entries)
-    trace_id: str = None # ID of the trace this entry belongs to
-    output: Any = None  # Function output value
-    # Use field() for mutable defaults to avoid shared state issues
-    inputs: dict = field(default_factory=dict)
-    span_type: SpanType = "span"
-    evaluation_runs: List[Optional[EvaluationRun]] = field(default=None)
-    parent_span_id: Optional[str] = None # ID of the parent span instance
-    def print_entry(self):
-        """Print a trace entry with proper formatting and parent relationship information."""
-        indent = "  " * self.depth
-        if self.type == "enter":
-            # Format parent info if present
-            parent_info = f" (parent_id: {self.parent_span_id})" if self.parent_span_id else ""
-            print(f"{indent}→ {self.function} (id: {self.span_id}){parent_info} (trace: {self.message})")
-        elif self.type == "exit":
-            print(f"{indent}← {self.function} (id: {self.span_id}) ({self.duration:.3f}s)")
-        elif self.type == "output":
-            # Format output to align properly
-            output_str = str(self.output)
-            print(f"{indent}Output (for id: {self.span_id}): {output_str}")
-        elif self.type == "input":
-            # Format inputs to align properly
-            print(f"{indent}Input (for id: {self.span_id}): {self.inputs}")
-        elif self.type == "evaluation":
-            for evaluation_run in self.evaluation_runs:
-                print(f"{indent}Evaluation (for id: {self.span_id}): {evaluation_run.model_dump()}")
-    def _serialize_inputs(self) -> dict:
-        """Helper method to serialize input data safely.
-        Returns a dict with serializable versions of inputs, converting non-serializable
-        objects to None with a warning.
-        """
-        serialized_inputs = {}
-        for key, value in self.inputs.items():
-            if isinstance(value, BaseModel):
-                serialized_inputs[key] = value.model_dump()
-            elif isinstance(value, (list, tuple)):
-                # Handle lists/tuples of arguments
-                serialized_inputs[key] = [
-                    item.model_dump() if isinstance(item, BaseModel)
-                    else None if not self._is_json_serializable(item)
-                    else item
-                    for item in value
-                ]
-            else:
-                if self._is_json_serializable(value):
-                    serialized_inputs[key] = value
-                else:
-                    serialized_inputs[key] = self.safe_stringify(value, self.function)
-        return serialized_inputs
-    def _is_json_serializable(self, obj: Any) -> bool:
-        """Helper method to check if an object is JSON serializable."""
-        try:
-            json.dumps(obj)
-            return True
-        except (TypeError, OverflowError, ValueError):
-            return False
-    def safe_stringify(self, output, function_name):
-        """
-        Safely converts an object to a string or repr, handling serialization issues gracefully.
-        """
-        try:
-            return str(output)
-        except (TypeError, OverflowError, ValueError):
-            pass
-        try:
-            return repr(output)
-        except (TypeError, OverflowError, ValueError):
-            pass
-        warnings.warn(
-            f"Output for function {function_name} is not JSON serializable and could not be converted to string. Setting to None."
-        )
-        return None
+class TraceAnnotation:
+    """Represents a single annotation for a trace span."""
+    span_id: str
+    text: str
+    label: str
+    score: int
     def to_dict(self) -> dict:
-        """Convert the trace entry to a dictionary format for storage/transmission."""
+        """Convert the annotation to a dictionary format for storage/transmission."""
         return {
-            "type": self.type,
-            "function": self.function,
             "span_id": self.span_id,
-            "trace_id": self.trace_id,
-            "depth": self.depth,
-            "message": self.message,
-            "created_at": datetime.fromtimestamp(self.created_at).isoformat(),
-            "duration": self.duration,
-            "output": self._serialize_output(),
-            "inputs": self._serialize_inputs(),
-            "evaluation_runs": [evaluation_run.model_dump() for evaluation_run in self.evaluation_runs] if self.evaluation_runs else [],
-            "span_type": self.span_type,
-            "parent_span_id": self.parent_span_id,
+            "annotation": {
+                "text": self.text,
+                "label": self.label,
+                "score": self.score
+            }
         }
-    def _serialize_output(self) -> Any:
-        """Helper method to serialize output data safely.
-        Handles special cases:
-        - Pydantic models are converted using model_dump()
-        - Dictionaries are processed recursively to handle non-serializable values.
-        - We try to serialize into JSON, then string, then the base representation (__repr__)
-        - Non-serializable objects return None with a warning
-        """
-        def serialize_value(value):
-            if isinstance(value, BaseModel):
-                return value.model_dump()
-            elif isinstance(value, dict):
-                # Recursively serialize dictionary values
-                return {k: serialize_value(v) for k, v in value.items()}
-            elif isinstance(value, (list, tuple)):
-                # Recursively serialize list/tuple items
-                return [serialize_value(item) for item in value]
-            else:
-                # Try direct JSON serialization first
-                try:
-                    json.dumps(value)
-                    return value
-                except (TypeError, OverflowError, ValueError):
-                    # Fallback to safe stringification
-                    return self.safe_stringify(value, self.function)
-        # Start serialization with the top-level output
-        return serialize_value(self.output)
 class TraceManagerClient:
     """
     Client for handling trace endpoints with the Judgment API
@@ -271,10 +145,8 @@ class TraceManagerClient:
             raise ValueError(f"Failed to fetch traces: {response.text}")
         return response.json()
-    def save_trace(self, trace_data: dict):
+    def save_trace(self, trace_data: dict, offline_mode: bool = False):
         """
         Saves a trace to the Judgment Supabase and optionally to S3 if configured.
@@ -311,10 +183,37 @@ class TraceManagerClient:
             except Exception as e:
                 warnings.warn(f"Failed to save trace to S3: {str(e)}")
-        if "ui_results_url" in response.json():
+        if not offline_mode and "ui_results_url" in response.json():
             pretty_str = f"\n🔍 You can view your trace data here: [rgb(106,0,255)][link={response.json()['ui_results_url']}]View Trace[/link]\n"
             rprint(pretty_str)
+    ## TODO: Should have a log endpoint, endpoint should also support batched payloads
+    def save_annotation(self, annotation: TraceAnnotation):
+        json_data = {
+            "span_id": annotation.span_id,
+            "annotation": {
+                "text": annotation.text,
+                "label": annotation.label,
+                "score": annotation.score
+            }
+        }
+        response = requests.post(
+            JUDGMENT_TRACES_ADD_ANNOTATION_API_URL,
+            json=json_data,
+            headers={
+                'Content-Type': 'application/json',
+                'Authorization': f'Bearer {self.judgment_api_key}',
+                'X-Organization-Id': self.organization_id
+            },
+            verify=True
+        )
+        if response.status_code != HTTPStatus.OK:
+            raise ValueError(f"Failed to save annotation: {response.text}")
+        return response.json()
     def delete_trace(self, trace_id: str):
         """
         Delete a trace from the database.
@@ -405,15 +304,17 @@ class TraceClient:
         self.enable_evaluations = enable_evaluations
         self.parent_trace_id = parent_trace_id
         self.parent_name = parent_name
-        self.client: JudgmentClient = tracer.client
-        self.entries: List[TraceEntry] = []
+        self.trace_spans: List[TraceSpan] = []
+        self.span_id_to_span: Dict[str, TraceSpan] = {}
+        self.evaluation_runs: List[EvaluationRun] = []
+        self.annotations: List[TraceAnnotation] = []
         self.start_time = time.time()
         self.trace_manager_client = TraceManagerClient(tracer.api_key, tracer.organization_id, tracer)
         self.visited_nodes = []
         self.executed_tools = []
         self.executed_node_tools = []
         self._span_depths: Dict[str, int] = {} # NEW: To track depth of active spans
     def get_current_span(self):
         """Get the current span from the context var"""
         return current_span_var.get()
@@ -443,9 +344,7 @@ class TraceClient:
         self._span_depths[span_id] = current_depth # Store depth by span_id
-        entry = TraceEntry(
-            type="enter",
-            function=name,
+        span = TraceSpan(
             span_id=span_id,
             trace_id=self.trace_id,
             depth=current_depth,
@@ -453,25 +352,15 @@ class TraceClient:
             created_at=start_time,
             span_type=span_type,
             parent_span_id=parent_span_id,
+            function=name,
         )
-        self.add_entry(entry)
+        self.add_span(span)
         try:
             yield self
         finally:
             duration = time.time() - start_time
-            exit_depth = self._span_depths.get(span_id, 0) # Get depth using this span's ID
-            self.add_entry(TraceEntry(
-                type="exit",
-                function=name,
-                span_id=span_id, # Use the same span_id for exit
-                trace_id=self.trace_id, # Use the trace_id from the trace client
-                depth=exit_depth,
-                message=f"← {name}",
-                created_at=time.time(),
-                duration=duration,
-                span_type=span_type,
-            ))
+            span.duration = duration
             # Clean up depth tracking for this span_id
             if span_id in self._span_depths:
                 del self._span_depths[span_id]
@@ -528,19 +417,20 @@ class TraceClient:
                     tools_called=tools_called,
                     expected_tools=expected_tools,
                     additional_metadata=additional_metadata,
-                    trace_id=self.trace_id
                 )
             else:
                 raise ValueError("Either 'example' or at least one of the individual parameters (input, actual_output, etc.) must be provided")
         # Check examples before creating evaluation run
-        check_examples([example], scorers)
+        # check_examples([example], scorers)
         # --- Modification: Capture span_id immediately ---
         # span_id_at_eval_call = current_span_var.get()
         # print(f"[TraceClient.async_evaluate] Captured span ID at eval call: {span_id_at_eval_call}")
         # Prioritize explicitly passed span_id, fallback to context var
-        span_id_to_use = span_id if span_id is not None else current_span_var.get()
+        current_span_ctx_var = current_span_var.get()
+        span_id_to_use = span_id if span_id is not None else current_span_ctx_var if current_span_ctx_var is not None else self.tracer.get_current_span()
         # print(f"[TraceClient.async_evaluate] Using span_id: {span_id_to_use}")
         # --- End Modification ---
@@ -550,7 +440,7 @@ class TraceClient:
             log_results=log_results,
             project_name=self.project_name,
             eval_name=f"{self.name.capitalize()}-"
-                f"{current_span_var.get()}-" # Keep original eval name format using context var if available
+                f"{span_id_to_use}-" # Keep original eval name format using context var if available
                 f"[{','.join(scorer.score_type.capitalize() for scorer in scorers)}]",
             examples=[example],
             scorers=scorers,
@@ -571,290 +461,60 @@ class TraceClient:
         # --- End Modification ---
         if current_span_id:
-            duration = time.time() - start_time
-            prev_entry = self.entries[-1] if self.entries else None
-            # Determine function name based on previous entry or context var (less ideal)
-            function_name = "unknown_function" # Default
-            if prev_entry and prev_entry.span_type == "llm":
-                 function_name = prev_entry.function
-            else:
-                 # Try to find the function name associated with the current span_id
-                 for entry in reversed(self.entries):
-                     if entry.span_id == current_span_id and entry.type == 'enter':
-                         function_name = entry.function
-                         break
-            # Get depth for the current span
-            current_depth = self._span_depths.get(current_span_id, 0)
-            self.add_entry(TraceEntry(
-                type="evaluation",
-                function=function_name,
-                span_id=current_span_id, # Associate with current span
-                trace_id=self.trace_id, # Use the trace_id from the trace client
-                depth=current_depth,
-                message=f"Evaluation results for {function_name}",
-                created_at=time.time(),
-                evaluation_runs=[eval_run],
-                duration=duration,
-                span_type="evaluation"
-            ))
+            span = self.span_id_to_span[current_span_id]
+            span.evaluation_runs.append(eval_run)
+        self.evaluation_runs.append(eval_run)
+    def add_annotation(self, annotation: TraceAnnotation):
+       """Add an annotation to this trace context"""
+       self.annotations.append(annotation)
+       return self
     def record_input(self, inputs: dict):
         current_span_id = current_span_var.get()
         if current_span_id:
-            entry_span_type = "span"
-            current_depth = self._span_depths.get(current_span_id, 0)
-            function_name = "unknown_function" # Default
-            for entry in reversed(self.entries):
-                 if entry.span_id == current_span_id and entry.type == 'enter':
-                      entry_span_type = entry.span_type
-                      function_name = entry.function
-                      break
-            self.add_entry(TraceEntry(
-                type="input",
-                function=function_name,
-                span_id=current_span_id, # Use current span_id from context
-                trace_id=self.trace_id, # Use the trace_id from the trace client
-                depth=current_depth,
-                message=f"Inputs to {function_name}",
-                created_at=time.time(),
-                inputs=inputs,
-                span_type=entry_span_type,
-            ))
-        # Removed else block - original didn't have one
+            span = self.span_id_to_span[current_span_id]
+            span.inputs = inputs
-    async def _update_coroutine_output(self, entry: TraceEntry, coroutine: Any):
+    async def _update_coroutine_output(self, span: TraceSpan, coroutine: Any):
         """Helper method to update the output of a trace entry once the coroutine completes"""
         try:
             result = await coroutine
-            entry.output = result
+            span.output = result
             return result
         except Exception as e:
-            entry.output = f"Error: {str(e)}"
+            span.output = f"Error: {str(e)}"
             raise
     def record_output(self, output: Any):
         current_span_id = current_span_var.get()
         if current_span_id:
-            entry_span_type = "span"
-            current_depth = self._span_depths.get(current_span_id, 0)
-            function_name = "unknown_function" # Default
-            for entry in reversed(self.entries):
-                 if entry.span_id == current_span_id and entry.type == 'enter':
-                      entry_span_type = entry.span_type
-                      function_name = entry.function
-                      break
-            entry = TraceEntry(
-                type="output",
-                function=function_name,
-                span_id=current_span_id, # Use current span_id from context
-                depth=current_depth,
-                message=f"Output from {function_name}",
-                created_at=time.time(),
-                output="<pending>" if inspect.iscoroutine(output) else output,
-                span_type=entry_span_type,
-                trace_id=self.trace_id # Added trace_id for consistency
-            )
-            self.add_entry(entry)
+            span = self.span_id_to_span[current_span_id]
+            span.output = "<pending>" if inspect.iscoroutine(output) else output
             if inspect.iscoroutine(output):
-                asyncio.create_task(self._update_coroutine_output(entry, output))
+                asyncio.create_task(self._update_coroutine_output(span, output))
-            return entry # Return the created entry
+            return span # Return the created entry
         # Removed else block - original didn't have one
         return None # Return None if no span_id found
-    def add_entry(self, entry: TraceEntry):
-        """Add a trace entry to this trace context"""
-        self.entries.append(entry)
+    def add_span(self, span: TraceSpan):
+        """Add a trace span to this trace context"""
+        self.trace_spans.append(span)
+        self.span_id_to_span[span.span_id] = span
         return self
     def print(self):
         """Print the complete trace with proper visual structure"""
-        for entry in self.entries:
-            entry.print_entry()
-    def print_hierarchical(self):
-        """Print the trace in a hierarchical structure based on parent-child relationships"""
-        # First, build a map of spans
-        spans = {}
-        root_spans = []
-        # Collect all enter events first
-        for entry in self.entries:
-            if entry.type == "enter":
-                spans[entry.function] = {
-                    "name": entry.function,
-                    "depth": entry.depth,
-                    "parent_id": entry.parent_span_id,
-                    "children": []
-                }
-                # If no parent, it's a root span
-                if not entry.parent_span_id:
-                    root_spans.append(entry.function)
-                elif entry.parent_span_id not in spans:
-                    # If parent doesn't exist yet, temporarily treat as root
-                    # (we'll fix this later)
-                    root_spans.append(entry.function)
-        # Build parent-child relationships
-        for span_name, span in spans.items():
-            parent = span["parent_id"]
-            if parent and parent in spans:
-                spans[parent]["children"].append(span_name)
-                # Remove from root spans if it was temporarily there
-                if span_name in root_spans:
-                    root_spans.remove(span_name)
-        # Now print the hierarchy
-        def print_span(span_name, level=0):
-            if span_name not in spans:
-                return
-            span = spans[span_name]
-            indent = "  " * level
-            parent_info = f" (parent_id: {span['parent_id']})" if span["parent_id"] else ""
-            print(f"{indent}→ {span_name}{parent_info}")
-            # Print children
-            for child in span["children"]:
-                print_span(child, level + 1)
-        # Print starting with root spans
-        print("\nHierarchical Trace Structure:")
-        for root in root_spans:
-            print_span(root)
+        for span in self.trace_spans:
+            span.print_span()
     def get_duration(self) -> float:
         """
         Get the total duration of this trace
         """
         return time.time() - self.start_time
-    def condense_trace(self, entries: List[dict]) -> List[dict]:
-        """
-        Condenses trace entries into a single entry for each span instance,
-        preserving parent-child span relationships using span_id and parent_span_id.
-        """
-        spans_by_id: Dict[str, dict] = {}
-        evaluation_runs: List[EvaluationRun] = []
-        # First pass: Group entries by span_id and gather data
-        for entry in entries:
-            span_id = entry.get("span_id")
-            if not span_id:
-                continue # Skip entries without a span_id (should not happen)
-            if entry["type"] == "enter":
-                if span_id not in spans_by_id:
-                    spans_by_id[span_id] = {
-                        "span_id": span_id,
-                        "function": entry["function"],
-                        "depth": entry["depth"], # Use the depth recorded at entry time
-                        "created_at": entry["created_at"],
-                        "trace_id": entry["trace_id"],
-                        "parent_span_id": entry.get("parent_span_id"),
-                        "span_type": entry.get("span_type", "span"),
-                        "inputs": None,
-                        "output": None,
-                        "evaluation_runs": [],
-                        "duration": None
-                    }
-                # Handle potential duplicate enter events if necessary (e.g., log warning)
-            elif span_id in spans_by_id:
-                current_span_data = spans_by_id[span_id]
-                if entry["type"] == "input" and entry["inputs"]:
-                    # Merge inputs if multiple are recorded, or just assign
-                    if current_span_data["inputs"] is None:
-                        current_span_data["inputs"] = entry["inputs"]
-                    elif isinstance(current_span_data["inputs"], dict) and isinstance(entry["inputs"], dict):
-                        current_span_data["inputs"].update(entry["inputs"])
-                    # Add more sophisticated merging if needed
-                elif entry["type"] == "output" and "output" in entry:
-                    current_span_data["output"] = entry["output"]
-                elif entry["type"] == "evaluation" and entry.get("evaluation_runs"):
-                    if current_span_data.get("evaluation_runs") is not None:
-                        evaluation_runs.extend(entry["evaluation_runs"])
-                elif entry["type"] == "exit":
-                    if current_span_data["duration"] is None: # Calculate duration only once
-                        start_time = datetime.fromisoformat(current_span_data.get("created_at", entry["created_at"]))
-                        end_time = datetime.fromisoformat(entry["created_at"])
-                        current_span_data["duration"] = (end_time - start_time).total_seconds()
-                    # Update depth if exit depth is different (though current span() implementation keeps it same)
-                    # current_span_data["depth"] = entry["depth"]
-        # Convert dictionary to a list initially for easier access
-        spans_list = list(spans_by_id.values())
-        # Build tree structure (adjacency list) and find roots
-        children_map: Dict[Optional[str], List[dict]] = {}
-        roots = []
-        span_map = {span['span_id']: span for span in spans_list} # Map for quick lookup
-        for span in spans_list:
-            parent_id = span.get("parent_span_id")
-            if parent_id is None:
-                roots.append(span)
-            else:
-                if parent_id not in children_map:
-                    children_map[parent_id] = []
-                children_map[parent_id].append(span)
-        # Sort roots by timestamp
-        roots.sort(key=lambda x: datetime.fromisoformat(x.get("created_at", "1970-01-01T00:00:00")))
-        # Perform depth-first traversal to get the final sorted list
-        sorted_condensed_list = []
-        visited = set() # To handle potential cycles, though unlikely with UUIDs
-        def dfs(span_data):
-            span_id = span_data['span_id']
-            if span_id in visited:
-                return # Avoid infinite loops in case of cycles
-            visited.add(span_id)
-            sorted_condensed_list.append(span_data) # Add parent before children
-            # Get children, sort them by created_at, and visit them
-            span_children = children_map.get(span_id, [])
-            span_children.sort(key=lambda x: datetime.fromisoformat(x.get("created_at", "1970-01-01T00:00:00")))
-            for child in span_children:
-                # Ensure the child exists in our map before recursing
-                if child['span_id'] in span_map:
-                    dfs(child)
-                else:
-                    # This case might indicate an issue, but we'll add the child directly
-                    # if its parent was processed but the child itself wasn't in the initial list?
-                    # Or if the child's 'enter' event was missing. For robustness, add it.
-                    if child['span_id'] not in visited:
-                         visited.add(child['span_id'])
-                         sorted_condensed_list.append(child)
-        # Start DFS from each root
-        for root_span in roots:
-            if root_span['span_id'] not in visited:
-                dfs(root_span)
-        # Handle spans that might not have been reachable from roots (orphans)
-        # Though ideally, all spans should descend from a root.
-        for span_data in spans_list:
-             if span_data['span_id'] not in visited:
-                  # Decide how to handle orphans, maybe append them at the end sorted by time?
-                  # For now, let's just add them to ensure they aren't lost.
-                  sorted_condensed_list.append(span_data)
-        return sorted_condensed_list, evaluation_runs
     def save(self, overwrite: bool = False) -> Tuple[str, dict]:
         """
@@ -863,44 +523,36 @@ class TraceClient:
         """
         # Calculate total elapsed time
         total_duration = self.get_duration()
-        raw_entries = [entry.to_dict() for entry in self.entries]
-        condensed_entries, evaluation_runs = self.condense_trace(raw_entries)
         # Only count tokens for actual LLM API call spans
         llm_span_names = {"OPENAI_API_CALL", "TOGETHER_API_CALL", "ANTHROPIC_API_CALL", "GOOGLE_API_CALL"}
-        for entry in condensed_entries:
-            entry_function_name = entry.get("function", "") # Get function name safely
+        for span in self.trace_spans:
+            span_function_name = span.function # Get function name safely
             # Check if it's an LLM span AND function name CONTAINS an API call suffix AND output is dict
-            is_llm_entry = entry.get("span_type") == "llm"
-            has_api_suffix = any(suffix in entry_function_name for suffix in llm_span_names)
-            output_is_dict = isinstance(entry.get("output"), dict)
+            is_llm_span = span.span_type == "llm"
+            has_api_suffix = any(suffix in span_function_name for suffix in llm_span_names)
+            output_is_dict = isinstance(span.output, dict)
             # --- DEBUG PRINT 1: Check if condition passes ---
             # if is_llm_entry and has_api_suffix and output_is_dict:
-            #   #  print(f"[DEBUG TraceClient.save] Processing entry: {entry.get('span_id')} ({entry_function_name}) - Condition PASSED")
             # elif is_llm_entry:
             #      # Print why it failed if it was an LLM entry
-            #      print(f"[DEBUG TraceClient.save] Skipping LLM entry: {entry.get('span_id')} ({entry_function_name}) - Suffix Match: {has_api_suffix}, Output is Dict: {output_is_dict}")
             # # --- END DEBUG ---
-            if is_llm_entry and has_api_suffix and output_is_dict:
-                output = entry["output"]
+            if is_llm_span and has_api_suffix and output_is_dict:
+                output = span.output
                 usage = output.get("usage", {}) # Gets the 'usage' dict from the 'output' field
                 # --- DEBUG PRINT 2: Check extracted usage ---
-                # print(f"[DEBUG TraceClient.save]   Extracted usage dict: {usage}")
                 # --- END DEBUG ---
                 # --- NEW: Extract model_name correctly from nested inputs ---
                 model_name = None
-                entry_inputs = entry.get("inputs", {})
-                # print(f"[DEBUG TraceClient.save]   Inspecting inputs for span {entry.get('span_id')}: {entry_inputs}") # DEBUG Inputs
-                if entry_inputs:
+                span_inputs = span.inputs
+                if span_inputs:
                     # Try common locations for model name within the inputs structure
-                    invocation_params = entry_inputs.get("invocation_params", {})
-                    serialized_data = entry_inputs.get("serialized", {})
+                    invocation_params = span_inputs.get("invocation_params", {})
+                    serialized_data = span_inputs.get("serialized", {})
                     # Look in invocation_params (often directly contains model)
                     if isinstance(invocation_params, dict):
@@ -920,10 +572,9 @@ class TraceClient:
                     # Fallback: Check top-level of inputs itself (less likely for callbacks)
                     if not model_name:
-                        model_name = entry_inputs.get("model")
+                        model_name = span_inputs.get("model")
-                # print(f"[DEBUG TraceClient.save]     Determined model_name: {model_name}") # DEBUG Model Name
                 # --- END NEW ---
                 prompt_tokens = 0
@@ -985,7 +636,7 @@ class TraceClient:
                         if "usage" not in output:
                             output["usage"] = {} # Initialize if missing
                         elif not isinstance(output["usage"], dict): # Handle cases where 'usage' might not be a dict (e.g., placeholder string)
-                            print(f"[WARN TraceClient.save] Output 'usage' for span {entry.get('span_id')} was not a dict ({type(output['usage'])}). Resetting before adding costs.")
+                            print(f"[WARN TraceClient.save] Output 'usage' for span {span.span_id} was not a dict ({type(output['usage'])}). Resetting before adding costs.")
                             output["usage"] = {} # Reset to dict
                         output["usage"]["prompt_tokens_cost_usd"] = prompt_cost
@@ -993,10 +644,10 @@ class TraceClient:
                         output["usage"]["total_cost_usd"] = prompt_cost + completion_cost
                     except Exception as e:
                         # If cost calculation fails, continue without adding costs
-                        print(f"Error calculating cost for model '{model_name}' (span: {entry.get('span_id')}): {str(e)}")
+                        print(f"Error calculating cost for model '{model_name}' (span: {span.span_id}): {str(e)}")
                         pass
                 else:
-                     print(f"[WARN TraceClient.save] Could not determine model name for cost calculation (span: {entry.get('span_id')}). Inputs: {entry_inputs}")
+                     print(f"[WARN TraceClient.save] Could not determine model name for cost calculation (span: {span.span_id}). Inputs: {span_inputs}")
         # Create trace document - Always use standard keys for top-level counts
@@ -1006,20 +657,258 @@ class TraceClient:
             "project_name": self.project_name,
             "created_at": datetime.utcfromtimestamp(self.start_time).isoformat(),
             "duration": total_duration,
-            "entries": condensed_entries,
-            "evaluation_runs": evaluation_runs,
+            "entries": [span.model_dump() for span in self.trace_spans],
+            "evaluation_runs": [run.model_dump() for run in self.evaluation_runs],
             "overwrite": overwrite,
+            "offline_mode": self.tracer.offline_mode,
             "parent_trace_id": self.parent_trace_id,
             "parent_name": self.parent_name
         }
         # --- Log trace data before saving ---
-        self.trace_manager_client.save_trace(trace_data)
+        self.trace_manager_client.save_trace(trace_data, offline_mode=self.tracer.offline_mode)
+        # upload annotations
+        # TODO: batch to the log endpoint
+        for annotation in self.annotations:
+            self.trace_manager_client.save_annotation(annotation)
         return self.trace_id, trace_data
     def delete(self):
         """Ask the trace manager client to delete this trace and return its result."""
         manager = self.trace_manager_client
         return manager.delete_trace(self.trace_id)
+class _DeepTracer:
+    """
+    Singleton trace hook that records a span for each traced call made while it is active.
+    Used as a context manager: entering installs `_trace` through `sys.settrace` and
+    `threading.settrace`, exiting removes it. Entries are reference-counted, so nested
+    `with _DeepTracer():` blocks share one installation.
+    """
+    _instance: Optional["_DeepTracer"] = None
+    _lock: threading.Lock = threading.Lock()
+    # Number of active `with` blocks; the hook is installed on 0 -> 1 and removed on 1 -> 0.
+    _refcount: int = 0
+    # NOTE(review): both ContextVars use a mutable list as `default` and the lists are
+    # mutated in place below, so a context that never called `set()` would share that
+    # single list object - confirm this is intended.
+    _span_stack: contextvars.ContextVar[List[Dict[str, Any]]] = contextvars.ContextVar("_deep_profiler_span_stack", default=[])
+    _skip_stack: contextvars.ContextVar[List[str]] = contextvars.ContextVar("_deep_profiler_skip_stack", default=[])
+    def _get_qual_name(self, frame) -> str:
+        """
+        Build a "<module>.<name>" label for the code executing in `frame`.
+        The function object is looked up by its code name in the frame's globals so that
+        `__qualname__` can be used when it is available. Every path returns a string: if
+        the lookup finds nothing, finds an object without `__qualname__`, or raises, the
+        bare code name is used instead.
+        """
+        func_name = frame.f_code.co_name
+        module_name = frame.f_globals.get("__name__", "unknown_module")
+        try:
+            func = frame.f_globals.get(func_name)
+            if func is not None and hasattr(func, "__qualname__"):
+                return f"{module_name}.{func.__qualname__}"
+        except Exception:
+            # Best effort only: naming a span must never break the traced program.
+            pass
+        # Previously this path fell off the end and returned None when a global with the
+        # same name existed but had no __qualname__.
+        return f"{module_name}.{func_name}"
+    def __new__(cls):
+        """Return the single shared instance, creating it under the class lock on first use."""
+        with cls._lock:
+            if cls._instance is None:
+                cls._instance = super().__new__(cls)
+        return cls._instance
+    def _should_trace(self, frame):
+        """
+        Decide whether the call running in `frame` should get its own span.
+        Returns False while the skip stack is non-empty, for functions carrying the
+        `_judgment_span_name` / `_judgment_span_type` markers, for frames without a module
+        name, for `<...>` code objects, for dunder functions other than `__call__`, and
+        for files rejected by `_is_user_code`.
+        """
+        # Skip stack is maintained by the tracer as an optimization to skip earlier
+        # frames in the call stack that we've already determined should be skipped
+        skip_stack = self._skip_stack.get()
+        if len(skip_stack) > 0:
+            return False
+        func_name = frame.f_code.co_name
+        module_name = frame.f_globals.get("__name__", None)
+        func = frame.f_globals.get(func_name)
+        if func and (hasattr(func, '_judgment_span_name') or hasattr(func, '_judgment_span_type')):
+            return False
+        # `and` binds tighter than `or`: the dunder test is (startswith("__") and not "__call__").
+        if (
+            not module_name
+            or func_name.startswith("<") # ex: <listcomp>
+            or func_name.startswith("__") and func_name != "__call__" # dunders
+            or not self._is_user_code(frame.f_code.co_filename)
+        ):
+            return False
+        return True
+    # The cache key includes `self`; since this class is a singleton that pins one instance only.
+    @functools.cache
+    def _is_user_code(self, filename: str):
+        """
+        Return True when `filename` is non-empty, is not a synthetic `<...>` name, and its
+        real path does not start with `_TRACE_FILEPATH_BLOCKLIST` (defined elsewhere in
+        this file; presumably a tuple of path prefixes - verify).
+        """
+        return bool(filename) and not filename.startswith("<") and not os.path.realpath(filename).startswith(_TRACE_FILEPATH_BLOCKLIST)
+    def _trace(self, frame: types.FrameType, event: str, arg: Any):
+        """
+        Trace callback installed by `__enter__`.
+        On "call" a new span is opened under the current span, on "return" the matching
+        span is closed and its duration/output recorded, and on "exception" the formatted
+        exception is recorded as output. Returns `self._trace` to keep receiving events for
+        the frame, or None when the frame/event is ignored.
+        """
+        # Only call/return/exception events are handled, so per-line and per-opcode events are turned off.
+        frame.f_trace_lines = False
+        frame.f_trace_opcodes = False
+        if not self._should_trace(frame):
+            return
+        if event not in ("call", "return", "exception"):
+            return
+        # Nothing can be recorded without an active trace and a parent span.
+        current_trace = current_trace_var.get()
+        if not current_trace:
+            return
+        parent_span_id = current_span_var.get()
+        if not parent_span_id:
+            return
+        qual_name = self._get_qual_name(frame)
+        skip_stack = self._skip_stack.get()
+        # NOTE(review): `_should_trace` above already returns False whenever the skip stack
+        # is non-empty, so the skip-stack branches below look unreachable - confirm.
+        if event == "call":
+            # If we have entries in the skip stack and the current qual_name matches the top entry,
+            # push it again to track nesting depth and skip
+            # As an optimization, we only care about duplicate qual_names.
+            if skip_stack:
+                if qual_name == skip_stack[-1]:
+                    skip_stack.append(qual_name)
+                    self._skip_stack.set(skip_stack)
+                return
+            should_trace = self._should_trace(frame)
+            if not should_trace:
+                if not skip_stack:
+                    self._skip_stack.set([qual_name])
+                return
+        elif event == "return":
+            # If we have entries in skip stack and current qual_name matches the top entry,
+            # pop it to track exiting from the skipped section
+            if skip_stack and qual_name == skip_stack[-1]:
+                skip_stack.pop()
+                self._skip_stack.set(skip_stack)
+                return
+            if skip_stack:
+                return
+        span_stack = self._span_stack.get()
+        if event == "call":
+            if not self._should_trace(frame):
+                return
+            span_id = str(uuid.uuid4())
+            # The new span sits one level below its parent.
+            parent_depth = current_trace._span_depths.get(parent_span_id, 0)
+            depth = parent_depth + 1
+            current_trace._span_depths[span_id] = depth
+            start_time = time.time()
+            span_stack.append({
+                "span_id": span_id,
+                "parent_span_id": parent_span_id,
+                "function": qual_name,
+                "start_time": start_time
+            })
+            self._span_stack.set(span_stack)
+            token = current_span_var.set(span_id)
+            # NOTE(review): writes to a function frame's f_locals may not persist on
+            # CPython before 3.13 - confirm the token is retrievable at "return".
+            frame.f_locals["_judgment_span_token"] = token
+            span = TraceSpan(
+                span_id=span_id,
+                trace_id=current_trace.trace_id,
+                depth=depth,
+                message=qual_name,
+                created_at=start_time,
+                span_type="span",
+                parent_span_id=parent_span_id,
+                function=qual_name
+            )
+            current_trace.add_span(span)
+            # Capture the call's declared arguments from the frame as the span inputs.
+            inputs = {}
+            try:
+                args_info = inspect.getargvalues(frame)
+                for arg in args_info.args:
+                    try:
+                        inputs[arg] = args_info.locals.get(arg)
+                    except Exception:
+                        # Was a bare `except:`; narrowed so KeyboardInterrupt/SystemExit propagate.
+                        inputs[arg] = "<<Unserializable>>"
+                current_trace.record_input(inputs)
+            except Exception as e:
+                current_trace.record_input({
+                    "error": str(e)
+                })
+        elif event == "return":
+            if not span_stack:
+                return
+            current_id = current_span_var.get()
+            span_data = None
+            # Search from the top of the stack for the entry that matches the current span.
+            for i, entry in enumerate(reversed(span_stack)):
+                if entry["span_id"] == current_id:
+                    span_data = span_stack.pop(-(i+1))
+                    self._span_stack.set(span_stack)
+                    break
+            if not span_data:
+                return
+            start_time = span_data["start_time"]
+            duration = time.time() - start_time
+            current_trace.span_id_to_span[span_data["span_id"]].duration = duration
+            if arg is not None:
+                # exception handling will take priority.
+                current_trace.record_output(arg)
+            if span_data["span_id"] in current_trace._span_depths:
+                del current_trace._span_depths[span_data["span_id"]]
+            # Restore the current span: the next stack entry if any, otherwise the closed span's parent.
+            if span_stack:
+                current_span_var.set(span_stack[-1]["span_id"])
+            else:
+                current_span_var.set(span_data["parent_span_id"])
+            if "_judgment_span_token" in frame.f_locals:
+                current_span_var.reset(frame.f_locals["_judgment_span_token"])
+        elif event == "exception":
+            exc_type, exc_value, exc_traceback = arg
+            formatted_exception = {
+                "type": exc_type.__name__,
+                "message": str(exc_value),
+                "traceback": traceback.format_tb(exc_traceback)
+            }
+            current_trace = current_trace_var.get()
+            current_trace.record_output({
+                "error": formatted_exception
+            })
+        return self._trace
+    def __enter__(self):
+        """Increment the reference count; the first entry resets both stacks and installs the hook."""
+        with self._lock:
+            self._refcount += 1
+            if self._refcount == 1:
+                self._skip_stack.set([])
+                self._span_stack.set([])
+                sys.settrace(self._trace)
+                threading.settrace(self._trace)
+        return self
+    def __exit__(self, exc_type, exc_val, exc_tb):
+        """Decrement the reference count; the last exit removes the hook. Exceptions are not suppressed."""
+        with self._lock:
+            self._refcount -= 1
+            if self._refcount == 0:
+                sys.settrace(None)
+                threading.settrace(None)
+def log(self, message: str, level: str = "info"):
+    """
+    Log a message with the span context.
+    When a trace is active the message is forwarded to it and also recorded as the
+    current output; otherwise it is only printed to stdout as "[level] message".
+    Args:
+        message: Text to log.
+        level: Level passed to the trace logger and used as the printed prefix.
+    """
+    current_trace = current_trace_var.get()
+    if current_trace:
+        current_trace.log(message, level)
+        # Recording output needs an active trace; doing this unconditionally raised
+        # AttributeError on None right after the fallback print.
+        current_trace.record_output({"log": message})
+    else:
+        print(f"[{level}] {message}")
 class Tracer:
     _instance = None
@@ -1042,12 +931,17 @@ class Tracer:
         s3_aws_access_key_id: Optional[str] = None,
         s3_aws_secret_access_key: Optional[str] = None,
         s3_region_name: Optional[str] = None,
-        deep_tracing: bool = True  # NEW: Enable deep tracing by default
+        offline_mode: bool = False,
+        deep_tracing: bool = True  # Deep tracing is enabled by default
         ):
         if not hasattr(self, 'initialized'):
             if not api_key:
                 raise ValueError("Tracer must be configured with a Judgment API key")
+            result, response = validate_api_key(api_key)
+            if not result:
+                raise JudgmentAPIError(f"Issue with passed in Judgment API key: {response}")
             if not organization_id:
                 raise ValueError("Tracer must be configured with an Organization ID")
             if use_s3 and not s3_bucket_name:
@@ -1059,11 +953,11 @@ class Tracer:
             self.api_key: str = api_key
             self.project_name: str = project_name
-            self.client: JudgmentClient = JudgmentClient(judgment_api_key=api_key)
             self.organization_id: str = organization_id
             self._current_trace: Optional[str] = None
             self._active_trace_client: Optional[TraceClient] = None # Add active trace client attribute
             self.rules: List[Rule] = rules or []  # Store rules at tracer level
+            self.traces: List[Trace] = []
             self.initialized: bool = True
             self.enable_monitoring: bool = enable_monitoring
             self.enable_evaluations: bool = enable_evaluations
@@ -1078,6 +972,7 @@ class Tracer:
                     aws_secret_access_key=s3_aws_secret_access_key,
                     region_name=s3_region_name
                 )
+            self.offline_mode: bool = offline_mode
             self.deep_tracing: bool = deep_tracing  # NEW: Store deep tracing setting
         elif hasattr(self, 'project_name') and self.project_name != project_name:
@@ -1087,6 +982,12 @@ class Tracer:
                 "To use a different project name, ensure the first Tracer initialization uses the desired project name.",
                 RuntimeWarning
             )
+    def set_current_span(self, span_id: str):
+        """Store `span_id` on this tracer as the current span id."""
+        setattr(self, "current_span_id", span_id)
+    def get_current_span(self) -> Optional[str]:
+        """Return the stored current span id, or None if none has been set on this tracer."""
+        try:
+            return self.current_span_id
+        except AttributeError:
+            return None
     def set_current_trace(self, trace: TraceClient):
         """
@@ -1119,45 +1020,6 @@ class Tracer:
         """Returns the TraceClient instance currently marked as active by the handler."""
         return self._active_trace_client
-    def _apply_deep_tracing(self, func, span_type="span"):
-        """
-        Apply deep tracing to all functions in the same module as the given function.
-        Args:
-            func: The function being traced
-            span_type: Type of span to use for traced functions
-        Returns:
-            A tuple of (module, original_functions_dict) where original_functions_dict
-            contains the original functions that were replaced with traced versions.
-        """
-        module = inspect.getmodule(func)
-        if not module:
-            return None, {}
-        # Save original functions
-        original_functions = {}
-        # Find all functions in the module
-        for name, obj in inspect.getmembers(module, inspect.isfunction):
-            # Skip already wrapped functions
-            if hasattr(obj, '_judgment_traced'):
-                continue
-            # Create a traced version of the function
-            # Always use default span type "span" for child functions
-            traced_func = _create_deep_tracing_wrapper(obj, self, "span")
-            # Mark the function as traced to avoid double wrapping
-            traced_func._judgment_traced = True
-            # Save the original function
-            original_functions[name] = obj
-            # Replace with traced version
-            setattr(module, name, traced_func)
-        return module, original_functions
     @contextmanager
     def trace(
@@ -1204,6 +1066,23 @@ class Tracer:
             finally:
                 # Reset the context variable
                 current_trace_var.reset(token)
+    def log(self, msg: str, label: str = "log", score: int = 1):
+        """
+        Print a message and, when a span is active, attach it to that span as an annotation.
+        Args:
+            msg: Text to log.
+            label: Annotation label; also used as the bold prefix of the printed line.
+            score: Score stored on the annotation.
+        """
+        current_span_id = current_span_var.get()
+        current_trace = current_trace_var.get()
+        # An annotation needs both a span to attach to and a trace to hold it; if either
+        # is missing only the console line is emitted instead of failing on None.
+        if current_span_id and current_trace:
+            annotation = TraceAnnotation(
+                span_id=current_span_id,
+                text=msg,
+                label=label,
+                score=score
+            )
+            current_trace.add_annotation(annotation)
+        rprint(f"[bold]{label}:[/bold] {msg}")
     def observe(self, func=None, *, name=None, span_type: SpanType = "span", project_name: str = None, overwrite: bool = False, deep_tracing: bool = None):
         """
@@ -1239,13 +1118,6 @@ class Tracer:
         if asyncio.iscoroutinefunction(func):
             @functools.wraps(func)
             async def async_wrapper(*args, **kwargs):
-                # Check if we're already in a traced function
-                if in_traced_function_var.get():
-                    return await func(*args, **kwargs)
-                # Set in_traced_function_var to True
-                token = in_traced_function_var.set(True)
                 # Get current trace from context
                 current_trace = current_trace_var.get()
@@ -1275,81 +1147,47 @@ class Tracer:
                         # This sets the current_span_var
                         with current_trace.span(span_name, span_type=span_type) as span: # MODIFIED: Use span_name directly
                             # Record inputs
-                            span.record_input({
-                                'args': str(args),
-                                'kwargs': kwargs
-                            })
+                            inputs = combine_args_kwargs(func, args, kwargs)
+                            span.record_input(inputs)
-                            # If deep tracing is enabled, apply monkey patching
                             if use_deep_tracing:
-                                module, original_functions = self._apply_deep_tracing(func, span_type)
-                            # Execute function
-                            result = await func(*args, **kwargs)
-                            # Restore original functions if deep tracing was enabled
-                            if use_deep_tracing and module and 'original_functions' in locals():
-                                for name, obj in original_functions.items():
-                                    setattr(module, name, obj)
+                                with _DeepTracer():
+                                    result = await func(*args, **kwargs)
+                            else:
+                                result = await func(*args, **kwargs)
                             # Record output
                             span.record_output(result)
-                        # Save the completed trace
-                        current_trace.save(overwrite=overwrite)
                         return result
                     finally:
+                        # Save the completed trace
+                        trace_id, trace = current_trace.save(overwrite=overwrite)
+                        self.traces.append(trace)
                         # Reset trace context (span context resets automatically)
                         current_trace_var.reset(trace_token)
-                        # Reset in_traced_function_var
-                        in_traced_function_var.reset(token)
                 else:
-                    # Already have a trace context, just create a span in it
-                    # The span method handles current_span_var
-                    try:
-                        with current_trace.span(span_name, span_type=span_type) as span: # MODIFIED: Use span_name directly
-                            # Record inputs
-                            span.record_input({
-                                'args': str(args),
-                                'kwargs': kwargs
-                            })
-                            # If deep tracing is enabled, apply monkey patching
-                            if use_deep_tracing:
-                                module, original_functions = self._apply_deep_tracing(func, span_type)
-                            # Execute function
+                    with current_trace.span(span_name, span_type=span_type) as span:
+                        inputs = combine_args_kwargs(func, args, kwargs)
+                        span.record_input(inputs)
+                        if use_deep_tracing:
+                            with _DeepTracer():
+                                result = await func(*args, **kwargs)
+                        else:
                             result = await func(*args, **kwargs)
-                            # Restore original functions if deep tracing was enabled
-                            if use_deep_tracing and module and 'original_functions' in locals():
-                                for name, obj in original_functions.items():
-                                    setattr(module, name, obj)
-                            # Record output
-                            span.record_output(result)
-                        return result
-                    finally:
-                        # Reset in_traced_function_var
-                        in_traced_function_var.reset(token)
+                        span.record_output(result)
+                    return result
             return async_wrapper
         else:
             # Non-async function implementation with deep tracing
             @functools.wraps(func)
-            def wrapper(*args, **kwargs):
-                # Check if we're already in a traced function
-                if in_traced_function_var.get():
-                    return func(*args, **kwargs)
-                # Set in_traced_function_var to True
-                token = in_traced_function_var.set(True)
+            def wrapper(*args, **kwargs):
                 # Get current trace from context
                 current_trace = current_trace_var.get()
                 # If there's no current trace, create a root trace
                 if not current_trace:
                     trace_id = str(uuid.uuid4())
@@ -1376,66 +1214,40 @@ class Tracer:
                         # This sets the current_span_var
                         with current_trace.span(span_name, span_type=span_type) as span: # MODIFIED: Use span_name directly
                             # Record inputs
-                            span.record_input({
-                                'args': str(args),
-                                'kwargs': kwargs
-                            })
+                            inputs = combine_args_kwargs(func, args, kwargs)
+                            span.record_input(inputs)
-                            # If deep tracing is enabled, apply monkey patching
                             if use_deep_tracing:
-                                module, original_functions = self._apply_deep_tracing(func, span_type)
-                            # Execute function
-                            result = func(*args, **kwargs)
-                            # Restore original functions if deep tracing was enabled
-                            if use_deep_tracing and module and 'original_functions' in locals():
-                                for name, obj in original_functions.items():
-                                    setattr(module, name, obj)
+                                with _DeepTracer():
+                                    result = func(*args, **kwargs)
+                            else:
+                                result = func(*args, **kwargs)
                             # Record output
                             span.record_output(result)
-                        # Save the completed trace
-                        current_trace.save(overwrite=overwrite)
                         return result
                     finally:
+                        # Save the completed trace
+                        trace_id, trace = current_trace.save(overwrite=overwrite)
+                        self.traces.append(trace)
                         # Reset trace context (span context resets automatically)
                         current_trace_var.reset(trace_token)
-                        # Reset in_traced_function_var
-                        in_traced_function_var.reset(token)
                 else:
-                    # Already have a trace context, just create a span in it
-                    # The span method handles current_span_var
-                    try:
-                        with current_trace.span(span_name, span_type=span_type) as span: # MODIFIED: Use span_name directly
-                            # Record inputs
-                            span.record_input({
-                                'args': str(args),
-                                'kwargs': kwargs
-                            })
-                            # If deep tracing is enabled, apply monkey patching
-                            if use_deep_tracing:
-                                module, original_functions = self._apply_deep_tracing(func, span_type)
-                            # Execute function
+                    with current_trace.span(span_name, span_type=span_type) as span:
+                        inputs = combine_args_kwargs(func, args, kwargs)
+                        span.record_input(inputs)
+                        if use_deep_tracing:
+                            with _DeepTracer():
+                                result = func(*args, **kwargs)
+                        else:
                             result = func(*args, **kwargs)
-                            # Restore original functions if deep tracing was enabled
-                            if use_deep_tracing and module and 'original_functions' in locals():
-                                for name, obj in original_functions.items():
-                                    setattr(module, name, obj)
-                            # Record output
-                            span.record_output(result)
-                        return result
-                    finally:
-                        # Reset in_traced_function_var
-                        in_traced_function_var.reset(token)
+                        span.record_output(result)
+                    return result
             return wrapper
     def async_evaluate(self, *args, **kwargs):
@@ -1462,64 +1274,94 @@ class Tracer:
         else:
             warnings.warn("No trace found (context var or fallback), skipping evaluation") # Modified warning
 def wrap(client: Any) -> Any:
     """
     Wraps an API client to add tracing capabilities.
     Supports OpenAI, Together, Anthropic, and Google GenAI clients.
     Patches both '.create' and Anthropic's '.stream' methods using a wrapper class.
     """
-    span_name, original_create, original_stream = _get_client_config(client)
+    span_name, original_create, original_responses_create, original_stream = _get_client_config(client)
+    def _record_input_and_check_streaming(span, kwargs, is_responses=False):
+        """Record input and check for streaming"""
+        is_streaming = kwargs.get("stream", False)
-    # --- Define Traced Async Functions ---
+            # Record input based on whether this is a responses endpoint
+        if is_responses:
+            span.record_input(kwargs)
+        else:
+            input_data = _format_input_data(client, **kwargs)
+            span.record_input(input_data)
+        # Warn about token counting limitations with streaming
+        if isinstance(client, (AsyncOpenAI, OpenAI)) and is_streaming:
+            if not kwargs.get("stream_options", {}).get("include_usage"):
+                warnings.warn(
+                    "OpenAI streaming calls don't include token counts by default. "
+                    "To enable token counting with streams, set stream_options={'include_usage': True} "
+                    "in your API call arguments.",
+                    UserWarning
+                )
+        return is_streaming
+    def _format_and_record_output(span, response, is_streaming, is_async, is_responses):
+        """Record the call's output on the span and return the value to hand back to the caller.
+        Args:
+            span: Object yielded by ``current_trace.span(...)``; only its
+                ``record_output`` method is used here.
+            response: Value returned by the underlying client call (a response
+                object, or a stream when ``is_streaming`` is true).
+            is_streaming: Whether the call was made with ``stream`` set.
+            is_async: Selects ``_async_stream_wrapper`` over ``_sync_stream_wrapper``.
+            is_responses: Selects ``_format_response_output_data`` over
+                ``_format_output_data`` for non-streaming calls.
+        Returns:
+            The wrapped stream for streaming calls, otherwise ``response`` unchanged.
+        """
+        if is_streaming:
+            # The final content is not known yet: record a placeholder and let the
+            # stream wrapper replace it once the stream has been read.
+            # NOTE(review): the stream wrappers' third parameter is named `span`; this
+            # passes whatever record_output() returned -- confirm that is the span object.
+            output_entry = span.record_output("<pending stream>")
+            wrapper_func = _async_stream_wrapper if is_async else _sync_stream_wrapper
+            return wrapper_func(response, client, output_entry)
+        else:
+            # Non-streaming: normalise the response for the trace, but return the
+            # original response object untouched.
+            format_func = _format_response_output_data if is_responses else _format_output_data
+            output_data = format_func(client, response)
+            span.record_output(output_data)
+            return response
+    def _handle_error(span, e, is_async):
+        """Print a failed API call, record the error on the span, and re-raise it.
+        Must be called from inside an ``except`` block: the bare ``raise`` at the
+        end re-raises the exception currently being handled, so this function
+        never returns normally.
+        Args:
+            span: Object whose ``record_output`` receives ``{"error": str(e)}``.
+            e: The exception that was caught by the caller.
+            is_async: Only affects the "async"/"sync" wording of the printed message.
+        """
+        call_type = "async" if is_async else "sync"
+        print(f"Error during wrapped {call_type} API call ({span_name}): {e}")
+        span.record_output({"error": str(e)})
+        # Bare raise: propagates the caller's active exception with its original traceback.
+        raise
+    # --- Traced Async Functions ---
     async def traced_create_async(*args, **kwargs):
-        # [Existing logic - unchanged]
         current_trace = current_trace_var.get()
         if not current_trace:
-            if asyncio.iscoroutinefunction(original_create):
-                 return await original_create(*args, **kwargs)
-            else:
-                 return original_create(*args, **kwargs)
-        is_streaming = kwargs.get("stream", False)
+            return await original_create(*args, **kwargs)
         with current_trace.span(span_name, span_type="llm") as span:
-            input_data = _format_input_data(client, **kwargs)
-            span.record_input(input_data)
-            # Warn about token counting limitations with streaming
-            if isinstance(client, (AsyncOpenAI, OpenAI)) and is_streaming:
-                if not kwargs.get("stream_options", {}).get("include_usage"):
-                    warnings.warn(
-                        "OpenAI streaming calls don't include token counts by default. "
-                        "To enable token counting with streams, set stream_options={'include_usage': True} "
-                        "in your API call arguments.",
-                        UserWarning
-                    )
+            is_streaming = _record_input_and_check_streaming(span, kwargs)
             try:
-                if is_streaming:
-                    stream_iterator = await original_create(*args, **kwargs)
-                    output_entry = span.record_output("<pending stream>")
-                    return _async_stream_wrapper(stream_iterator, client, output_entry)
-                else:
-                    awaited_response = await original_create(*args, **kwargs)
-                    output_data = _format_output_data(client, awaited_response)
-                    span.record_output(output_data)
-                    return awaited_response
+                response_or_iterator = await original_create(*args, **kwargs)
+                return _format_and_record_output(span, response_or_iterator, is_streaming, True, False)
             except Exception as e:
-                print(f"Error during wrapped async API call ({span_name}): {e}")
-                span.record_output({"error": str(e)})
-                raise
-    # Function replacing .stream() - NOW returns the wrapper class instance
+                return _handle_error(span, e, True)
+    # Async responses for OpenAI clients
+    async def traced_response_create_async(*args, **kwargs):
+        """Traced replacement for an async client's ``responses.create``.
+        With no active trace the call is forwarded untouched. Otherwise the call
+        runs inside an "llm" span: the raw kwargs are recorded as input and the
+        result (or the error) is recorded as output.
+        """
+        current_trace = current_trace_var.get()
+        if not current_trace:
+            # No active trace: behave exactly like the original method.
+            return await original_responses_create(*args, **kwargs)
+        with current_trace.span(span_name, span_type="llm") as span:
+            is_streaming = _record_input_and_check_streaming(span, kwargs, is_responses=True)
+            try:
+                response_or_iterator = await original_responses_create(*args, **kwargs)
+                return _format_and_record_output(span, response_or_iterator, is_streaming, True, True)
+            except Exception as e:
+                # _handle_error always re-raises; the `return` never yields a value.
+                return _handle_error(span, e, True)
+    # Function replacing .stream() for async clients
     def traced_stream_async(*args, **kwargs):
         current_trace = current_trace_var.get()
         if not current_trace or not original_stream:
             return original_stream(*args, **kwargs)
         original_manager = original_stream(*args, **kwargs)
-        wrapper_manager = _TracedAsyncStreamManagerWrapper(
+        return _TracedAsyncStreamManagerWrapper(
             original_manager=original_manager,
             client=client,
             span_name=span_name,
@@ -1527,104 +1369,74 @@ def wrap(client: Any) -> Any:
             stream_wrapper_func=_async_stream_wrapper,
             input_kwargs=kwargs
         )
-        return wrapper_manager
-    # --- Define Traced Sync Functions ---
+    # --- Traced Sync Functions ---
     def traced_create_sync(*args, **kwargs):
-         # [Existing logic - unchanged]
         current_trace = current_trace_var.get()
         if not current_trace:
-             return original_create(*args, **kwargs)
-        is_streaming = kwargs.get("stream", False)
+            return original_create(*args, **kwargs)
         with current_trace.span(span_name, span_type="llm") as span:
-             input_data = _format_input_data(client, **kwargs)
-             span.record_input(input_data)
-             # Warn about token counting limitations with streaming
-             if isinstance(client, (AsyncOpenAI, OpenAI)) and is_streaming:
-                 if not kwargs.get("stream_options", {}).get("include_usage"):
-                     warnings.warn(
-                         "OpenAI streaming calls don't include token counts by default. "
-                         "To enable token counting with streams, set stream_options={'include_usage': True} "
-                         "in your API call arguments.",
-                         UserWarning
-                     )
-             try:
-                 response_or_iterator = original_create(*args, **kwargs)
-             except Exception as e:
-                 print(f"Error during wrapped sync API call ({span_name}): {e}")
-                 span.record_output({"error": str(e)})
-                 raise
-             if is_streaming:
-                 output_entry = span.record_output("<pending stream>")
-                 return _sync_stream_wrapper(response_or_iterator, client, output_entry)
-             else:
-                 output_data = _format_output_data(client, response_or_iterator)
-                 span.record_output(output_data)
-                 return response_or_iterator
+            is_streaming = _record_input_and_check_streaming(span, kwargs)
+            try:
+                response_or_iterator = original_create(*args, **kwargs)
+                return _format_and_record_output(span, response_or_iterator, is_streaming, False, False)
+            except Exception as e:
+                return _handle_error(span, e, False)
+    def traced_response_create_sync(*args, **kwargs):
+        """Traced replacement for a sync client's ``responses.create``.
+        With no active trace the call is forwarded untouched. Otherwise the call
+        runs inside an "llm" span: the raw kwargs are recorded as input and the
+        result (or the error) is recorded as output.
+        """
+        current_trace = current_trace_var.get()
+        if not current_trace:
+            # No active trace: behave exactly like the original method.
+            return original_responses_create(*args, **kwargs)
+        with current_trace.span(span_name, span_type="llm") as span:
+            is_streaming = _record_input_and_check_streaming(span, kwargs, is_responses=True)
+            try:
+                response_or_iterator = original_responses_create(*args, **kwargs)
+                return _format_and_record_output(span, response_or_iterator, is_streaming, False, True)
+            except Exception as e:
+                # _handle_error always re-raises; the `return` never yields a value.
+                return _handle_error(span, e, False)
     # Function replacing sync .stream()
     def traced_stream_sync(*args, **kwargs):
-         current_trace = current_trace_var.get()
-         if not current_trace or not original_stream:
-             return original_stream(*args, **kwargs)
-         original_manager = original_stream(*args, **kwargs)
-         wrapper_manager = _TracedSyncStreamManagerWrapper(
-             original_manager=original_manager,
-             client=client,
-             span_name=span_name,
-             trace_client=current_trace,
-             stream_wrapper_func=_sync_stream_wrapper,
-             input_kwargs=kwargs
-         )
-         return wrapper_manager
+        current_trace = current_trace_var.get()
+        if not current_trace or not original_stream:
+            return original_stream(*args, **kwargs)
+        original_manager = original_stream(*args, **kwargs)
+        return _TracedSyncStreamManagerWrapper(
+            original_manager=original_manager,
+            client=client,
+            span_name=span_name,
+            trace_client=current_trace,
+            stream_wrapper_func=_sync_stream_wrapper,
+            input_kwargs=kwargs
+        )
     # --- Assign Traced Methods to Client Instance ---
-    # [Assignment logic remains the same]
     if isinstance(client, (AsyncOpenAI, AsyncTogether)):
         client.chat.completions.create = traced_create_async
-        # Wrap the Responses API endpoint for OpenAI clients
         if hasattr(client, "responses") and hasattr(client.responses, "create"):
-            # Capture the original responses.create
-            original_responses_create = client.responses.create
-            def traced_responses(*args, **kwargs):
-                # Get the current trace from contextvars
-                current_trace = current_trace_var.get()
-                # If no active trace, call the original
-                if not current_trace:
-                    return original_responses_create(*args, **kwargs)
-                # Trace this responses.create call
-                with current_trace.span(span_name, span_type="llm") as span:
-                    # Record raw input kwargs
-                    span.record_input(kwargs)
-                    # Make the actual API call
-                    response = original_responses_create(*args, **kwargs)
-                    # Record the output object
-                    span.record_output(response)
-                    return response
-            # Assign the traced wrapper
-            client.responses.create = traced_responses
+            client.responses.create = traced_response_create_async
     elif isinstance(client, AsyncAnthropic):
         client.messages.create = traced_create_async
         if original_stream:
-             client.messages.stream = traced_stream_async
+            client.messages.stream = traced_stream_async
     elif isinstance(client, genai.client.AsyncClient):
-        client.generate_content = traced_create_async
+        client.models.generate_content = traced_create_async
     elif isinstance(client, (OpenAI, Together)):
-         client.chat.completions.create = traced_create_sync
+        client.chat.completions.create = traced_create_sync
+        if hasattr(client, "responses") and hasattr(client.responses, "create"):
+            client.responses.create = traced_response_create_sync
     elif isinstance(client, Anthropic):
-         client.messages.create = traced_create_sync
-         if original_stream:
-             client.messages.stream = traced_stream_sync
+        client.messages.create = traced_create_sync
+        if original_stream:
+            client.messages.stream = traced_stream_sync
     elif isinstance(client, genai.Client):
-         client.generate_content = traced_create_sync
+        client.models.generate_content = traced_create_sync
     return client
 # Helper functions for client-specific operations
@@ -1639,19 +1451,20 @@ def _get_client_config(client: ApiClient) -> tuple[str, callable, Optional[calla
         tuple: (span_name, create_method, responses_method, stream_method)
             - span_name: String identifier for tracing
             - create_method: Reference to the client's creation method
+            - responses_method: Reference to the client's responses method (if applicable)
             - stream_method: Reference to the client's stream method (if applicable)
     Raises:
         ValueError: If client type is not supported
     """
     if isinstance(client, (OpenAI, AsyncOpenAI)):
-        return "OPENAI_API_CALL", client.chat.completions.create, None
+        return "OPENAI_API_CALL", client.chat.completions.create, client.responses.create, None
     elif isinstance(client, (Together, AsyncTogether)):
-        return "TOGETHER_API_CALL", client.chat.completions.create, None
+        return "TOGETHER_API_CALL", client.chat.completions.create, None, None
     elif isinstance(client, (Anthropic, AsyncAnthropic)):
-        return "ANTHROPIC_API_CALL", client.messages.create, client.messages.stream
+        return "ANTHROPIC_API_CALL", client.messages.create, None, client.messages.stream
     elif isinstance(client, (genai.Client, genai.client.AsyncClient)):
-        return "GOOGLE_API_CALL", client.models.generate_content, None
+        return "GOOGLE_API_CALL", client.models.generate_content, None, None
     raise ValueError(f"Unsupported client type: {type(client)}")
 def _format_input_data(client: ApiClient, **kwargs) -> dict:
@@ -1677,6 +1490,26 @@ def _format_input_data(client: ApiClient, **kwargs) -> dict:
         "max_tokens": kwargs.get("max_tokens")
     }
+def _format_response_output_data(client: ApiClient, response: Any) -> dict:
+    """Format a ``responses.create`` result into the dict recorded on a trace span.
+    Args:
+        client: The wrapped API client; only OpenAI / Together client types
+            (sync or async) are handled.
+        response: The object returned by the client's ``responses.create`` call.
+    Returns:
+        ``{"content": ..., "usage": {...}}`` for a supported client, otherwise
+        an empty dict after emitting a warning.
+    """
+    if isinstance(client, (OpenAI, Together, AsyncOpenAI, AsyncTogether)):
+        # Map the response's input/output token counts onto the
+        # prompt/completion key names used in the recorded usage dict.
+        # NOTE(review): response.usage is dereferenced unconditionally -- confirm it
+        # can never be None here, otherwise this raises AttributeError.
+        return {
+            "content": response.output,
+            "usage": {
+                "prompt_tokens": response.usage.input_tokens,
+                "completion_tokens": response.usage.output_tokens,
+                "total_tokens": response.usage.total_tokens
+            }
+        }
+    else:
+        # Unsupported client: warn instead of raising.
+        warnings.warn(f"Unsupported client type: {type(client)}")
+        return {}
 def _format_output_data(client: ApiClient, response: Any) -> dict:
     """Format API response data based on client type.
@@ -1716,117 +1549,51 @@ def _format_output_data(client: ApiClient, response: Any) -> dict:
         }
     }
-# Define a blocklist of functions that should not be traced
-# These are typically utility functions, print statements, logging, etc.
-_TRACE_BLOCKLIST = {
-    # Built-in functions
-    'print', 'str', 'int', 'float', 'bool', 'list', 'dict', 'set', 'tuple',
-    'len', 'range', 'enumerate', 'zip', 'map', 'filter', 'sorted', 'reversed',
-    'min', 'max', 'sum', 'any', 'all', 'abs', 'round', 'format',
-    # Logging functions
-    'debug', 'info', 'warning', 'error', 'critical', 'exception', 'log',
-    # Common utility functions
-    'sleep', 'time', 'datetime', 'json', 'dumps', 'loads',
-    # String operations
-    'join', 'split', 'strip', 'lstrip', 'rstrip', 'replace', 'lower', 'upper',
-    # Dict operations
-    'get', 'items', 'keys', 'values', 'update',
-    # List operations
-    'append', 'extend', 'insert', 'remove', 'pop', 'clear', 'index', 'count', 'sort',
-}
-# Add a new function for deep tracing at the module level
-def _create_deep_tracing_wrapper(func, tracer, span_type="span"):
+def combine_args_kwargs(func, args, kwargs):
     """
-    Creates a wrapper for a function that automatically traces it when called within a traced function.
-    This enables deep tracing without requiring explicit @observe decorators on every function.
+    Combine positional arguments and keyword arguments into a single dictionary.
     Args:
-        func: The function to wrap
-        tracer: The Tracer instance
-        span_type: Type of span (default "span")
+        func: The function being called
+        args: Tuple of positional arguments
+        kwargs: Dictionary of keyword arguments
     Returns:
-        A wrapped function that will be traced when called
+        A dictionary combining both args and kwargs
     """
-    # Skip wrapping if the function is not callable or is a built-in
-    if not callable(func) or isinstance(func, type) or func.__module__ == 'builtins':
-        return func
-    # Skip functions in the blocklist
-    if func.__name__ in _TRACE_BLOCKLIST:
-        return func
-    # Skip functions from certain modules (logging, sys, etc.)
-    if func.__module__ and any(func.__module__.startswith(m) for m in ['logging', 'sys', 'os', 'json', 'time', 'datetime']):
-        return func
-    # Get function name for the span - check for custom name set by @observe
-    func_name = getattr(func, '_judgment_span_name', func.__name__)
-    # Check for custom span_type set by @observe
-    func_span_type = getattr(func, '_judgment_span_type', "span")
-    # Store original function to prevent losing reference
-    original_func = func
-    # Create appropriate wrapper based on whether the function is async or not
-    if asyncio.iscoroutinefunction(func):
-        @functools.wraps(func)
-        async def async_deep_wrapper(*args, **kwargs):
-            # Get current trace from context
-            current_trace = current_trace_var.get()
-            # If no trace context, just call the function
-            if not current_trace:
-                return await original_func(*args, **kwargs)
-            # Create a span for this function call - use custom span_type if available
-            with current_trace.span(func_name, span_type=func_span_type) as span:
-                # Record inputs
-                span.record_input({
-                    'args': str(args),
-                    'kwargs': kwargs
-                })
-                # Execute function
-                result = await original_func(*args, **kwargs)
-                # Record output
-                span.record_output(result)
-                return result
-        return async_deep_wrapper
-    else:
-        @functools.wraps(func)
-        def deep_wrapper(*args, **kwargs):
-            # Get current trace from context
-            current_trace = current_trace_var.get()
-            # If no trace context, just call the function
-            if not current_trace:
-                return original_func(*args, **kwargs)
-            # Create a span for this function call - use custom span_type if available
-            with current_trace.span(func_name, span_type=func_span_type) as span:
-                # Record inputs
-                span.record_input({
-                    'args': str(args),
-                    'kwargs': kwargs
-                })
-                # Execute function
-                result = original_func(*args, **kwargs)
-                # Record output
-                span.record_output(result)
-                return result
-        return deep_wrapper
+    try:
+        import inspect
+        sig = inspect.signature(func)
+        param_names = list(sig.parameters.keys())
+        args_dict = {}
+        for i, arg in enumerate(args):
+            if i < len(param_names):
+                args_dict[param_names[i]] = arg
+            else:
+                args_dict[f"arg{i}"] = arg
+        return {**args_dict, **kwargs}
+    except Exception as e:
+        # Fallback if signature inspection fails
+        return {**{f"arg{i}": arg for i, arg in enumerate(args)}, **kwargs}
+# NOTE: This is built once, when the module is imported; tweak the entries if we are
+# missing modules or capturing other unnecessary ones.
+# Each entry is a symlink-resolved directory path with a trailing path separator.
+# Candidates are collected in a set literal (so duplicates collapse) and empty
+# strings are dropped by the `if p` filter.
+# NOTE(review): presumably matched as path prefixes to skip files while tracing --
+# confirm at the use site.
+# @link https://docs.python.org/3.13/library/sysconfig.html
+_TRACE_FILEPATH_BLOCKLIST = tuple(
+    os.path.realpath(p) + os.sep
+    for p in {
+        sysconfig.get_paths()['stdlib'],  # standard-library directory
+        sysconfig.get_paths().get('platstdlib', ''),  # platform-specific stdlib; '' when absent
+        *site.getsitepackages(),  # global site-packages directories
+        site.getusersitepackages(),  # per-user site-packages directory
+        *(
+            # The judgeval package directory is added only when JUDGMENT_DEV is set.
+            [os.path.join(os.path.dirname(__file__), '../../judgeval/')]
+            if os.environ.get('JUDGMENT_DEV')
+            else []
+        ),
+    } if p
+)
 # Add the new TraceThreadPoolExecutor class
 class TraceThreadPoolExecutor(concurrent.futures.ThreadPoolExecutor):
@@ -1929,7 +1696,7 @@ def _extract_usage_from_final_chunk(client: ApiClient, chunk: Any) -> Optional[D
 def _sync_stream_wrapper(
     original_stream: Iterator,
     client: ApiClient,
-    output_entry: TraceEntry
+    span: TraceSpan
 ) -> Generator[Any, None, None]:
     """Wraps a synchronous stream iterator to capture content and update the trace."""
     content_parts = []  # Use a list instead of string concatenation
@@ -1948,7 +1715,7 @@ def _sync_stream_wrapper(
             final_usage = _extract_usage_from_final_chunk(client, last_chunk)
         # Update the trace entry with the accumulated content and usage
-        output_entry.output = {
+        span.output = {
             "content": "".join(content_parts),  # Join list at the end
             "usage": final_usage if final_usage else {"info": "Usage data not available in stream."}, # Provide placeholder if None
             "streamed": True
@@ -1960,7 +1727,7 @@ def _sync_stream_wrapper(
 async def _async_stream_wrapper(
     original_stream: AsyncIterator,
     client: ApiClient,
-    output_entry: TraceEntry
+    span: TraceSpan
 ) -> AsyncGenerator[Any, None]:
     # [Existing logic - unchanged]
     content_parts = []  # Use a list instead of string concatenation
@@ -1969,7 +1736,7 @@ async def _async_stream_wrapper(
     anthropic_input_tokens = 0
     anthropic_output_tokens = 0
-    target_span_id = getattr(output_entry, 'span_id', 'UNKNOWN')
+    target_span_id = span.span_id
     try:
         async for chunk in original_stream:
@@ -2014,19 +1781,17 @@ async def _async_stream_wrapper(
         elif last_content_chunk:
              usage_info = _extract_usage_from_final_chunk(client, last_content_chunk)
-        if output_entry and hasattr(output_entry, 'output'):
-            output_entry.output = {
+        if span and hasattr(span, 'output'):
+            span.output = {
                 "content": "".join(content_parts),  # Join list at the end
                 "usage": usage_info if usage_info else {"info": "Usage data not available in stream."},
                 "streamed": True
             }
-            start_ts = getattr(output_entry, 'created_at', time.time())
-            output_entry.duration = time.time() - start_ts
+            start_ts = getattr(span, 'created_at', time.time())
+            span.duration = time.time() - start_ts
         # else: # Handle error case if necessary, but remove debug print
-# --- Define Context Manager Wrapper Classes ---
-class _TracedAsyncStreamManagerWrapper(AbstractAsyncContextManager):
-    """Wraps an original async stream manager to add tracing."""
+class _BaseStreamManagerWrapper:
     def __init__(self, original_manager, client, span_name, trace_client, stream_wrapper_func, input_kwargs):
         self._original_manager = original_manager
         self._client = client
@@ -2036,281 +1801,74 @@ class _TracedAsyncStreamManagerWrapper(AbstractAsyncContextManager):
         self._input_kwargs = input_kwargs
         self._parent_span_id_at_entry = None
-    async def __aenter__(self):
-        self._parent_span_id_at_entry = current_span_var.get()
-        if not self._trace_client:
-             # If no trace, just delegate to the original manager
-             return await self._original_manager.__aenter__()
-        # --- Manually create the 'enter' entry ---
+    def _create_span(self):
         start_time = time.time()
         span_id = str(uuid.uuid4())
         current_depth = 0
         if self._parent_span_id_at_entry and self._parent_span_id_at_entry in self._trace_client._span_depths:
             current_depth = self._trace_client._span_depths[self._parent_span_id_at_entry] + 1
         self._trace_client._span_depths[span_id] = current_depth
-        enter_entry = TraceEntry(
-             type="enter", function=self._span_name, span_id=span_id,
-             trace_id=self._trace_client.trace_id, depth=current_depth, message=self._span_name,
-             created_at=start_time, span_type="llm", parent_span_id=self._parent_span_id_at_entry
+        span = TraceSpan(
+            function=self._span_name,
+            span_id=span_id,
+            trace_id=self._trace_client.trace_id,
+            depth=current_depth,
+            message=self._span_name,
+            created_at=start_time,
+            span_type="llm",
+            parent_span_id=self._parent_span_id_at_entry
         )
-        self._trace_client.add_entry(enter_entry)
-        # --- End manual 'enter' entry ---
+        self._trace_client.add_span(span)
+        return span_id, span
-        # Set the current span ID in contextvars
-        self._span_context_token = current_span_var.set(span_id)
+    def _finalize_span(self, span_id):
+        """Close out the span with ``span_id`` on the trace client.
+        Sets the span's ``duration`` to the time elapsed since its ``created_at``
+        (when the span is still registered) and removes its depth bookkeeping entry.
+        Args:
+            span_id: Identifier of the span to finalize.
+        """
+        span = self._trace_client.span_id_to_span.get(span_id)
+        if span:
+            span.duration = time.time() - span.created_at
+        # Drop the depth entry that was stored for this span, if any.
+        if span_id in self._trace_client._span_depths:
+            del self._trace_client._span_depths[span_id]
-        # Manually create 'input' entry
-        input_data = _format_input_data(self._client, **self._input_kwargs)
-        input_entry = TraceEntry(
-             type="input", function=self._span_name, span_id=span_id,
-             trace_id=self._trace_client.trace_id, depth=current_depth, message=f"Inputs to {self._span_name}",
-             created_at=time.time(), inputs=input_data, span_type="llm"
-        )
-        self._trace_client.add_entry(input_entry)
-        # Call the original __aenter__
-        raw_iterator = await self._original_manager.__aenter__()
+class _TracedAsyncStreamManagerWrapper(_BaseStreamManagerWrapper, AbstractAsyncContextManager):
+    async def __aenter__(self):
+        self._parent_span_id_at_entry = current_span_var.get()
+        if not self._trace_client:
+            return await self._original_manager.__aenter__()
-        # Manually create pending 'output' entry
-        output_entry = TraceEntry(
-            type="output", function=self._span_name, span_id=span_id,
-            trace_id=self._trace_client.trace_id, depth=current_depth, message=f"Output from {self._span_name}",
-            created_at=time.time(), output="<pending stream>", span_type="llm"
-        )
-        self._trace_client.add_entry(output_entry)
+        span_id, span = self._create_span()
+        self._span_context_token = current_span_var.set(span_id)
+        span.inputs = _format_input_data(self._client, **self._input_kwargs)
-        # Wrap the raw iterator
-        wrapped_iterator = self._stream_wrapper_func(raw_iterator, self._client, output_entry)
-        return wrapped_iterator
+        # Call the original __aenter__ and expect it to be an async generator
+        raw_iterator = await self._original_manager.__aenter__()
+        span.output = "<pending stream>"
+        return self._stream_wrapper_func(raw_iterator, self._client, span)
     async def __aexit__(self, exc_type, exc_val, exc_tb):
-        # Manually create the 'exit' entry
         if hasattr(self, '_span_context_token'):
-             span_id = current_span_var.get()
-             start_time_for_duration = 0
-             for entry in reversed(self._trace_client.entries):
-                  if entry.span_id == span_id and entry.type == 'enter':
-                       start_time_for_duration = entry.created_at
-                       break
-             duration = time.time() - start_time_for_duration if start_time_for_duration else None
-             exit_depth = self._trace_client._span_depths.get(span_id, 0)
-             exit_entry = TraceEntry(
-                  type="exit", function=self._span_name, span_id=span_id,
-                  trace_id=self._trace_client.trace_id, depth=exit_depth, message=f"← {self._span_name}",
-                  created_at=time.time(), duration=duration, span_type="llm"
-             )
-             self._trace_client.add_entry(exit_entry)
-             if span_id in self._trace_client._span_depths: del self._trace_client._span_depths[span_id]
-             current_span_var.reset(self._span_context_token)
-             delattr(self, '_span_context_token')
-        # Delegate __aexit__
-        if hasattr(self._original_manager, "__aexit__"):
-             return await self._original_manager.__aexit__(exc_type, exc_val, exc_tb)
-        return None
-class _TracedSyncStreamManagerWrapper(AbstractContextManager):
-    """Wraps an original sync stream manager to add tracing."""
-    def __init__(self, original_manager, client, span_name, trace_client, stream_wrapper_func, input_kwargs):
-        self._original_manager = original_manager
-        self._client = client
-        self._span_name = span_name
-        self._trace_client = trace_client
-        self._stream_wrapper_func = stream_wrapper_func
-        self._input_kwargs = input_kwargs
-        self._parent_span_id_at_entry = None
+            span_id = current_span_var.get()
+            self._finalize_span(span_id)
+            current_span_var.reset(self._span_context_token)
+            delattr(self, '_span_context_token')
+        return await self._original_manager.__aexit__(exc_type, exc_val, exc_tb)
+class _TracedSyncStreamManagerWrapper(_BaseStreamManagerWrapper, AbstractContextManager):
     def __enter__(self):
         self._parent_span_id_at_entry = current_span_var.get()
         if not self._trace_client:
-             return self._original_manager.__enter__()
+            return self._original_manager.__enter__()
-        # Manually create 'enter' entry
-        start_time = time.time()
-        span_id = str(uuid.uuid4())
-        current_depth = 0
-        if self._parent_span_id_at_entry and self._parent_span_id_at_entry in self._trace_client._span_depths:
-            current_depth = self._trace_client._span_depths[self._parent_span_id_at_entry] + 1
-        self._trace_client._span_depths[span_id] = current_depth
-        enter_entry = TraceEntry(
-             type="enter", function=self._span_name, span_id=span_id,
-             trace_id=self._trace_client.trace_id, depth=current_depth, message=self._span_name,
-             created_at=start_time, span_type="llm", parent_span_id=self._parent_span_id_at_entry
-        )
-        self._trace_client.add_entry(enter_entry)
+        span_id, span = self._create_span()
         self._span_context_token = current_span_var.set(span_id)
+        span.inputs = _format_input_data(self._client, **self._input_kwargs)
-        # Manually create 'input' entry
-        input_data = _format_input_data(self._client, **self._input_kwargs)
-        input_entry = TraceEntry(
-             type="input", function=self._span_name, span_id=span_id,
-             trace_id=self._trace_client.trace_id, depth=current_depth, message=f"Inputs to {self._span_name}",
-             created_at=time.time(), inputs=input_data, span_type="llm"
-        )
-        self._trace_client.add_entry(input_entry)
-        # Call original __enter__
         raw_iterator = self._original_manager.__enter__()
-        # Manually create 'output' entry (pending)
-        output_entry = TraceEntry(
-            type="output", function=self._span_name, span_id=span_id,
-            trace_id=self._trace_client.trace_id, depth=current_depth, message=f"Output from {self._span_name}",
-            created_at=time.time(), output="<pending stream>", span_type="llm"
-        )
-        self._trace_client.add_entry(output_entry)
-        # Wrap the raw iterator
-        wrapped_iterator = self._stream_wrapper_func(raw_iterator, self._client, output_entry)
-        return wrapped_iterator
+        span.output = "<pending stream>"
+        return self._stream_wrapper_func(raw_iterator, self._client, span)
     def __exit__(self, exc_type, exc_val, exc_tb):
-        # Manually create 'exit' entry
         if hasattr(self, '_span_context_token'):
-             span_id = current_span_var.get()
-             start_time_for_duration = 0
-             for entry in reversed(self._trace_client.entries):
-                  if entry.span_id == span_id and entry.type == 'enter':
-                       start_time_for_duration = entry.created_at
-                       break
-             duration = time.time() - start_time_for_duration if start_time_for_duration else None
-             exit_depth = self._trace_client._span_depths.get(span_id, 0)
-             exit_entry = TraceEntry(
-                  type="exit", function=self._span_name, span_id=span_id,
-                  trace_id=self._trace_client.trace_id, depth=exit_depth, message=f"← {self._span_name}",
-                  created_at=time.time(), duration=duration, span_type="llm"
-             )
-             self._trace_client.add_entry(exit_entry)
-             if span_id in self._trace_client._span_depths: del self._trace_client._span_depths[span_id]
-             current_span_var.reset(self._span_context_token)
-             delattr(self, '_span_context_token')
-        # Delegate __exit__
-        if hasattr(self._original_manager, "__exit__"):
-             return self._original_manager.__exit__(exc_type, exc_val, exc_tb)
-        return None
-# --- NEW Generalized Helper Function (Moved from demo) ---
-def prepare_evaluation_for_state(
-    scorers: List[Union[APIJudgmentScorer, JudgevalScorer]],
-    example: Optional[Example] = None,
-    # --- Individual components (alternative to 'example') ---
-    input: Optional[str] = None,
-    actual_output: Optional[Union[str, List[str]]] = None,
-    expected_output: Optional[Union[str, List[str]]] = None,
-    context: Optional[List[str]] = None,
-    retrieval_context: Optional[List[str]] = None,
-    tools_called: Optional[List[str]] = None,
-    expected_tools: Optional[List[str]] = None,
-    additional_metadata: Optional[Dict[str, Any]] = None,
-    # --- Other eval parameters ---
-    model: Optional[str] = None,
-    log_results: Optional[bool] = True
-) -> Optional[EvaluationConfig]:
-    """
-    Prepares an EvaluationConfig object, similar to TraceClient.async_evaluate.
-    Accepts either a pre-made Example object or individual components to construct one.
-    Returns the EvaluationConfig object ready to be placed in the state, or None.
-    """
-    final_example = example
-    # If example is not provided, try to construct one from individual parts
-    if final_example is None:
-        # Basic validation: Ensure at least actual_output is present for most scorers
-        if actual_output is None:
-      #      print("[prepare_evaluation_for_state] Warning: 'actual_output' is required when 'example' is not provided. Skipping evaluation setup.")
-            return None
-        try:
-            final_example = Example(
-                input=input,
-                actual_output=actual_output,
-                expected_output=expected_output,
-                context=context,
-                retrieval_context=retrieval_context,
-                tools_called=tools_called,
-                expected_tools=expected_tools,
-                additional_metadata=additional_metadata,
-                # trace_id will be set by the handler later if needed
-            )
-       #     print("[prepare_evaluation_for_state] Constructed Example from individual components.")
-        except Exception as e:
-      #      print(f"[prepare_evaluation_for_state] Error constructing Example: {e}. Skipping evaluation setup.")
-            return None
-    # If we have a valid example (provided or constructed) and scorers
-    if final_example and scorers:
-        # TODO: Add validation like check_examples if needed here,
-        # although the handler might implicitly handle some checks via TraceClient.
-        return EvaluationConfig(
-            scorers=scorers,
-            example=final_example,
-            model=model,
-            log_results=log_results
-        )
-    elif not scorers:
-    #    print("[prepare_evaluation_for_state] No scorers provided. Skipping evaluation setup.")
-        return None
-    else: # No valid example
-    #   print("[prepare_evaluation_for_state] No valid Example available. Skipping evaluation setup.")
-        return None
-# --- End NEW Helper Function ---
-# --- NEW: Helper function to simplify adding eval config to state ---
-def add_evaluation_to_state(
-    state: Dict[str, Any], # The LangGraph state dictionary
-    scorers: List[Union[APIJudgmentScorer, JudgevalScorer]],
-    # --- Evaluation components (same as prepare_evaluation_for_state) ---
-    input: Optional[str] = None,
-    actual_output: Optional[Union[str, List[str]]] = None,
-    expected_output: Optional[Union[str, List[str]]] = None,
-    context: Optional[List[str]] = None,
-    retrieval_context: Optional[List[str]] = None,
-    tools_called: Optional[List[str]] = None,
-    expected_tools: Optional[List[str]] = None,
-    additional_metadata: Optional[Dict[str, Any]] = None,
-    # --- Other eval parameters ---
-    model: Optional[str] = None,
-    log_results: Optional[bool] = True
-) -> None:
-    """
-    Prepares an EvaluationConfig and adds it to the state dictionary
-    under the '_judgeval_eval' key if successful.
-    This simplifies the process of setting up evaluations within LangGraph nodes.
-    Args:
-        state: The LangGraph state dictionary to modify.
-        scorers: List of scorer instances.
-        input: Input for the evaluation example.
-        actual_output: Actual output for the evaluation example.
-        expected_output: Expected output for the evaluation example.
-        context: Context for the evaluation example.
-        retrieval_context: Retrieval context for the evaluation example.
-        tools_called: Tools called for the evaluation example.
-        expected_tools: Expected tools for the evaluation example.
-        additional_metadata: Additional metadata for the evaluation example.
-        model: Model name used for generation (optional).
-        log_results: Whether to log evaluation results (optional, defaults to True).
-    """
-    eval_config = prepare_evaluation_for_state(
-        scorers=scorers,
-        input=input,
-        actual_output=actual_output,
-        expected_output=expected_output,
-        context=context,
-        retrieval_context=retrieval_context,
-        tools_called=tools_called,
-        expected_tools=expected_tools,
-        additional_metadata=additional_metadata,
-        model=model,
-        log_results=log_results
-    )
-    if eval_config:
-        state["_judgeval_eval"] = eval_config
-   #     print(f"[_judgeval_eval added to state for node]") # Optional: Log confirmation
-     #   print("[Skipped adding _judgeval_eval to state: prepare_evaluation_for_state failed]")
-# --- End NEW Helper ---
+            span_id = current_span_var.get()
+            self._finalize_span(span_id)
+            current_span_var.reset(self._span_context_token)
+            delattr(self, '_span_context_token')
+        return self._original_manager.__exit__(exc_type, exc_val, exc_tb)

judgeval 0.0.36__py3-none-any.whl → 0.0.38__py3-none-any.whl

judgeval 0.0.36py3-none-any.whl → 0.0.38py3-none-any.whl