judgeval 0.0.31__py3-none-any.whl → 0.0.33__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- judgeval/__init__.py +3 -1
- judgeval/common/s3_storage.py +93 -0
- judgeval/common/tracer.py +869 -183
- judgeval/constants.py +1 -1
- judgeval/data/datasets/dataset.py +5 -1
- judgeval/data/datasets/eval_dataset_client.py +2 -2
- judgeval/data/sequence.py +16 -26
- judgeval/data/sequence_run.py +2 -0
- judgeval/judgment_client.py +44 -166
- judgeval/rules.py +4 -7
- judgeval/run_evaluation.py +2 -2
- judgeval/scorers/__init__.py +4 -4
- judgeval/scorers/judgeval_scorers/__init__.py +0 -176
- judgeval/version_check.py +22 -0
- {judgeval-0.0.31.dist-info → judgeval-0.0.33.dist-info}/METADATA +15 -2
- judgeval-0.0.33.dist-info/RECORD +63 -0
- judgeval/scorers/base_scorer.py +0 -58
- judgeval/scorers/judgeval_scorers/local_implementations/__init__.py +0 -27
- judgeval/scorers/judgeval_scorers/local_implementations/answer_correctness/__init__.py +0 -4
- judgeval/scorers/judgeval_scorers/local_implementations/answer_correctness/answer_correctness_scorer.py +0 -276
- judgeval/scorers/judgeval_scorers/local_implementations/answer_correctness/prompts.py +0 -169
- judgeval/scorers/judgeval_scorers/local_implementations/answer_relevancy/__init__.py +0 -4
- judgeval/scorers/judgeval_scorers/local_implementations/answer_relevancy/answer_relevancy_scorer.py +0 -298
- judgeval/scorers/judgeval_scorers/local_implementations/answer_relevancy/prompts.py +0 -174
- judgeval/scorers/judgeval_scorers/local_implementations/comparison/__init__.py +0 -0
- judgeval/scorers/judgeval_scorers/local_implementations/comparison/comparison_scorer.py +0 -161
- judgeval/scorers/judgeval_scorers/local_implementations/comparison/prompts.py +0 -222
- judgeval/scorers/judgeval_scorers/local_implementations/contextual_precision/__init__.py +0 -3
- judgeval/scorers/judgeval_scorers/local_implementations/contextual_precision/contextual_precision_scorer.py +0 -264
- judgeval/scorers/judgeval_scorers/local_implementations/contextual_precision/prompts.py +0 -106
- judgeval/scorers/judgeval_scorers/local_implementations/contextual_recall/__init__.py +0 -3
- judgeval/scorers/judgeval_scorers/local_implementations/contextual_recall/contextual_recall_scorer.py +0 -254
- judgeval/scorers/judgeval_scorers/local_implementations/contextual_recall/prompts.py +0 -142
- judgeval/scorers/judgeval_scorers/local_implementations/contextual_relevancy/__init__.py +0 -3
- judgeval/scorers/judgeval_scorers/local_implementations/contextual_relevancy/contextual_relevancy_scorer.py +0 -245
- judgeval/scorers/judgeval_scorers/local_implementations/contextual_relevancy/prompts.py +0 -121
- judgeval/scorers/judgeval_scorers/local_implementations/execution_order/__init__.py +0 -3
- judgeval/scorers/judgeval_scorers/local_implementations/execution_order/execution_order.py +0 -156
- judgeval/scorers/judgeval_scorers/local_implementations/faithfulness/__init__.py +0 -3
- judgeval/scorers/judgeval_scorers/local_implementations/faithfulness/faithfulness_scorer.py +0 -318
- judgeval/scorers/judgeval_scorers/local_implementations/faithfulness/prompts.py +0 -268
- judgeval/scorers/judgeval_scorers/local_implementations/hallucination/__init__.py +0 -3
- judgeval/scorers/judgeval_scorers/local_implementations/hallucination/hallucination_scorer.py +0 -264
- judgeval/scorers/judgeval_scorers/local_implementations/hallucination/prompts.py +0 -104
- judgeval/scorers/judgeval_scorers/local_implementations/instruction_adherence/instruction_adherence.py +0 -232
- judgeval/scorers/judgeval_scorers/local_implementations/instruction_adherence/prompt.py +0 -102
- judgeval/scorers/judgeval_scorers/local_implementations/json_correctness/__init__.py +0 -5
- judgeval/scorers/judgeval_scorers/local_implementations/json_correctness/json_correctness_scorer.py +0 -134
- judgeval/scorers/judgeval_scorers/local_implementations/summarization/__init__.py +0 -3
- judgeval/scorers/judgeval_scorers/local_implementations/summarization/prompts.py +0 -247
- judgeval/scorers/judgeval_scorers/local_implementations/summarization/summarization_scorer.py +0 -551
- judgeval-0.0.31.dist-info/RECORD +0 -96
- {judgeval-0.0.31.dist-info → judgeval-0.0.33.dist-info}/WHEEL +0 -0
- {judgeval-0.0.31.dist-info → judgeval-0.0.33.dist-info}/licenses/LICENSE.md +0 -0
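The most substantial changes are the new `judgeval/common/s3_storage.py` module (S3 trace export) and the reworked `judgeval/common/tracer.py` shown below. `s3_storage.py` itself is not included in this excerpt; the sketch below is a hypothetical reading of its surface, inferred only from the call sites visible in the tracer hunks (`S3Storage(bucket_name=..., aws_access_key_id=..., aws_secret_access_key=..., region_name=...)` and `s3_storage.save_trace(trace_data=..., trace_id=..., project_name=...)`). The boto3 usage and key layout are assumptions, not the packaged implementation.

```python
# Hypothetical sketch of the S3Storage surface implied by the tracer's call
# sites. The real judgeval/common/s3_storage.py (+93 lines) is not shown here.
import json
from datetime import datetime, timezone
from typing import Optional

import boto3  # assumed dependency; the actual module may differ


class S3Storage:
    def __init__(self, bucket_name: str,
                 aws_access_key_id: Optional[str] = None,
                 aws_secret_access_key: Optional[str] = None,
                 region_name: Optional[str] = None):
        self.bucket_name = bucket_name
        self.s3_client = boto3.client(
            "s3",
            aws_access_key_id=aws_access_key_id,
            aws_secret_access_key=aws_secret_access_key,
            region_name=region_name,
        )

    def save_trace(self, trace_data: dict, trace_id: str, project_name: str) -> str:
        # Key layout is a guess: one JSON object per trace, grouped by project.
        timestamp = datetime.now(timezone.utc).strftime("%Y%m%d_%H%M%S")
        s3_key = f"traces/{project_name}/{trace_id}_{timestamp}.json"
        self.s3_client.put_object(
            Bucket=self.bucket_name,
            Key=s3_key,
            Body=json.dumps(trace_data).encode("utf-8"),
        )
        return s3_key
```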
judgeval/common/tracer.py
CHANGED
@@ -11,12 +11,28 @@ import time
 import uuid
 import warnings
 import contextvars
-
+import sys
+from contextlib import contextmanager, asynccontextmanager, AbstractAsyncContextManager, AbstractContextManager # Import context manager bases
 from dataclasses import dataclass, field
 from datetime import datetime
 from http import HTTPStatus
-from typing import
+from typing import (
+    Any,
+    Callable,
+    Dict,
+    Generator,
+    List,
+    Literal,
+    Optional,
+    Tuple,
+    Type,
+    TypeVar,
+    Union,
+    AsyncGenerator,
+    TypeAlias,
+)
 from rich import print as rprint
+import types # <--- Add this import
 
 # Third-party imports
 import pika
@@ -27,6 +43,7 @@ from rich import print as rprint
 from openai import OpenAI, AsyncOpenAI
 from together import Together, AsyncTogether
 from anthropic import Anthropic, AsyncAnthropic
+from google import genai
 
 # Local application/library-specific imports
 from judgeval.constants import (
@@ -40,20 +57,22 @@ from judgeval.constants import (
 )
 from judgeval.judgment_client import JudgmentClient
 from judgeval.data import Example
-from judgeval.scorers import APIJudgmentScorer, JudgevalScorer
+from judgeval.scorers import APIJudgmentScorer, JudgevalScorer
 from judgeval.rules import Rule
 from judgeval.evaluation_run import EvaluationRun
 from judgeval.data.result import ScoringResult
 
 # Standard library imports needed for the new class
 import concurrent.futures
+from collections.abc import Iterator, AsyncIterator # Add Iterator and AsyncIterator
 
 # Define context variables for tracking the current trace and the current span within a trace
 current_trace_var = contextvars.ContextVar('current_trace', default=None)
-current_span_var = contextvars.ContextVar('current_span', default=None) #
+current_span_var = contextvars.ContextVar('current_span', default=None) # ContextVar for the active span name
+in_traced_function_var = contextvars.ContextVar('in_traced_function', default=False) # Track if we're in a traced function
 
 # Define type aliases for better code readability and maintainability
-ApiClient: TypeAlias = Union[OpenAI, Together, Anthropic, AsyncOpenAI, AsyncAnthropic, AsyncTogether] # Supported API clients
+ApiClient: TypeAlias = Union[OpenAI, Together, Anthropic, AsyncOpenAI, AsyncAnthropic, AsyncTogether, genai.Client, genai.client.AsyncClient] # Supported API clients
 TraceEntryType = Literal['enter', 'exit', 'output', 'input', 'evaluation'] # Valid trace entry types
 SpanType = Literal['span', 'tool', 'llm', 'evaluation', 'chain']
 @dataclass
@@ -170,7 +189,7 @@ class TraceEntry:
             "inputs": self._serialize_inputs(),
             "evaluation_runs": [evaluation_run.model_dump() for evaluation_run in self.evaluation_runs] if self.evaluation_runs else [],
             "span_type": self.span_type,
-            "parent_span_id": self.parent_span_id
+            "parent_span_id": self.parent_span_id,
         }
 
     def _serialize_output(self) -> Any:
@@ -185,6 +204,15 @@ class TraceEntry:
         if isinstance(self.output, BaseModel):
             return self.output.model_dump()
 
+        # NEW check: If output is the dict structure from our stream wrapper
+        if isinstance(self.output, dict) and 'streamed' in self.output:
+            # Assume it's already JSON-serializable (content is string, usage is dict or None)
+            return self.output
+        # NEW check: If output is the placeholder string before stream completes
+        elif self.output == "<pending stream>":
+            # Represent this state clearly in the serialized data
+            return {"status": "pending stream"}
+
         try:
             # Try to serialize the output to verify it's JSON compatible
             json.dumps(self.output)
@@ -203,9 +231,10 @@ class TraceManagerClient:
     - Saving a trace
     - Deleting a trace
     """
-    def __init__(self, judgment_api_key: str, organization_id: str):
+    def __init__(self, judgment_api_key: str, organization_id: str, tracer: Optional["Tracer"] = None):
         self.judgment_api_key = judgment_api_key
         self.organization_id = organization_id
+        self.tracer = tracer
 
     def fetch_trace(self, trace_id: str):
         """
@@ -233,12 +262,13 @@ class TraceManagerClient:
 
     def save_trace(self, trace_data: dict):
         """
-        Saves a trace to the
+        Saves a trace to the Judgment Supabase and optionally to S3 if configured.
 
         Args:
             trace_data: The trace data to save
        NOTE we save empty traces in order to properly handle async operations; we need something in the DB to associate the async results with
         """
+        # Save to Judgment API
         response = requests.post(
             JUDGMENT_TRACES_SAVE_API_URL,
             json=trace_data,
@@ -255,6 +285,18 @@ class TraceManagerClient:
         elif response.status_code != HTTPStatus.OK:
             raise ValueError(f"Failed to save trace data: {response.text}")
 
+        # If S3 storage is enabled, save to S3 as well
+        if self.tracer and self.tracer.use_s3:
+            try:
+                s3_key = self.tracer.s3_storage.save_trace(
+                    trace_data=trace_data,
+                    trace_id=trace_data["trace_id"],
+                    project_name=trace_data["project_name"]
+                )
+                print(f"Trace also saved to S3 at key: {s3_key}")
+            except Exception as e:
+                warnings.warn(f"Failed to save trace to S3: {str(e)}")
+
         if "ui_results_url" in response.json():
             pretty_str = f"\n🔍 You can view your trace data here: [rgb(106,0,255)][link={response.json()['ui_results_url']}]View Trace[/link]\n"
             rprint(pretty_str)
@@ -352,7 +394,7 @@ class TraceClient:
         self.client: JudgmentClient = tracer.client
         self.entries: List[TraceEntry] = []
         self.start_time = time.time()
-        self.trace_manager_client = TraceManagerClient(tracer.api_key, tracer.organization_id)
+        self.trace_manager_client = TraceManagerClient(tracer.api_key, tracer.organization_id, tracer)
         self.visited_nodes = []
         self.executed_tools = []
         self.executed_node_tools = []
@@ -390,13 +432,13 @@ class TraceClient:
         entry = TraceEntry(
             type="enter",
             function=name,
-            span_id=span_id,
-            trace_id=self.trace_id,
+            span_id=span_id,
+            trace_id=self.trace_id,
             depth=current_depth,
             message=name,
             created_at=start_time,
             span_type=span_type,
-            parent_span_id=parent_span_id
+            parent_span_id=parent_span_id,
         )
         self.add_entry(entry)
 
@@ -414,7 +456,7 @@ class TraceClient:
             message=f"← {name}",
             created_at=time.time(),
             duration=duration,
-            span_type=span_type
+            span_type=span_type,
         ))
         # Clean up depth tracking for this span_id
         if span_id in self._span_depths:
@@ -451,47 +493,14 @@ class TraceClient:
             additional_metadata=additional_metadata,
             trace_id=self.trace_id
         )
-        loaded_rules = None
-        if self.rules:
-            loaded_rules = []
-            for rule in self.rules:
-                processed_conditions = []
-                for condition in rule.conditions:
-                    # Convert metric if it's a ScorerWrapper
-                    try:
-                        if isinstance(condition.metric, ScorerWrapper):
-                            condition_copy = condition.model_copy()
-                            condition_copy.metric = condition.metric.load_implementation(use_judgment=True)
-                            processed_conditions.append(condition_copy)
-                        else:
-                            processed_conditions.append(condition)
-                    except Exception as e:
-                        warnings.warn(f"Failed to convert ScorerWrapper in rule '{rule.name}', condition metric '{condition.metric_name}': {str(e)}")
-                        processed_conditions.append(condition) # Keep original condition as fallback
-
-                # Create new rule with processed conditions
-                new_rule = rule.model_copy()
-                new_rule.conditions = processed_conditions
-                loaded_rules.append(new_rule)
         try:
             # Load appropriate implementations for all scorers
-
-            for scorer in scorers:
-                try:
-                    if isinstance(scorer, ScorerWrapper):
-                        loaded_scorers.append(scorer.load_implementation(use_judgment=True))
-                    else:
-                        loaded_scorers.append(scorer)
-                except Exception as e:
-                    warnings.warn(f"Failed to load implementation for scorer {scorer}: {str(e)}")
-                    # Skip this scorer
-
-            if not loaded_scorers:
+            if not scorers:
                 warnings.warn("No valid scorers available for evaluation")
                 return
 
             # Prevent using JudgevalScorer with rules - only APIJudgmentScorer allowed with rules
-            if
+            if self.rules and any(isinstance(scorer, JudgevalScorer) for scorer in scorers):
                 raise ValueError("Cannot use Judgeval scorers, you can only use API scorers when using rules. Please either remove rules or use only APIJudgmentScorer types.")
 
         except Exception as e:
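With the `ScorerWrapper` conversion machinery removed, `async_evaluate` now expects ready-to-use scorer instances and merely validates them against the rules. A minimal sketch of calling the simplified path from inside a traced function — the scorer class and keyword arguments mirror judgeval's public examples, but treat the exact signature as an assumption against your installed version:

```python
# Hedged sketch: span-level evaluation through the simplified scorer path.
# Assumes JUDGMENT_API_KEY / JUDGMENT_ORG_ID are set in the environment;
# verify FaithfulnessScorer and the async_evaluate kwargs against your version.
from judgeval.common.tracer import Tracer
from judgeval.scorers import FaithfulnessScorer

judgment = Tracer(project_name="my_project")  # hypothetical project name

@judgment.observe(span_type="tool")
def answer(question: str) -> str:
    response = "Paris is the capital of France."
    # Scorers are passed through as-is now; no ScorerWrapper conversion step.
    judgment.get_current_trace().async_evaluate(
        scorers=[FaithfulnessScorer(threshold=0.8)],
        input=question,
        actual_output=response,
        retrieval_context=["France's capital is Paris."],
        model="gpt-4o",
    )
    return response
```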
@@ -505,15 +514,15 @@ class TraceClient:
                 project_name=self.project_name,
                 eval_name=f"{self.name.capitalize()}-"
                           f"{current_span_var.get()}-"
-                          f"[{','.join(scorer.score_type.capitalize() for scorer in
+                          f"[{','.join(scorer.score_type.capitalize() for scorer in scorers)}]",
                 examples=[example],
-                scorers=
+                scorers=scorers,
                 model=model,
                 metadata={},
                 judgment_api_key=self.tracer.api_key,
                 override=self.overwrite,
                 trace_span_id=current_span_var.get(),
-                rules=
+                rules=self.rules # Use the combined rules
             )
 
             self.add_eval_run(eval_run, start_time) # Pass start_time to record_evaluation
@@ -571,7 +580,7 @@ class TraceClient:
             message=f"Inputs to {function_name}",
             created_at=time.time(),
             inputs=inputs,
-            span_type=entry_span_type
+            span_type=entry_span_type,
         ))
 
     async def _update_coroutine_output(self, entry: TraceEntry, coroutine: Any):
@@ -604,12 +613,15 @@ class TraceClient:
             message=f"Output from {function_name}",
             created_at=time.time(),
             output="<pending>" if inspect.iscoroutine(output) else output,
-            span_type=entry_span_type
+            span_type=entry_span_type,
         )
         self.add_entry(entry)
 
         if inspect.iscoroutine(output):
             asyncio.create_task(self._update_coroutine_output(entry, output))
+
+        # Return the created entry
+        return entry
 
     def add_entry(self, entry: TraceEntry):
         """Add a trace entry to this trace context"""
@@ -821,8 +833,10 @@ class TraceClient:
         total_completion_tokens_cost = 0.0
         total_cost = 0.0
 
+        # Only count tokens for actual LLM API call spans
+        llm_span_names = {"OPENAI_API_CALL", "TOGETHER_API_CALL", "ANTHROPIC_API_CALL", "GOOGLE_API_CALL"}
         for entry in condensed_entries:
-            if entry.get("span_type") == "llm" and isinstance(entry.get("output"), dict):
+            if entry.get("span_type") == "llm" and entry.get("function") in llm_span_names and isinstance(entry.get("output"), dict):
                 output = entry["output"]
                 usage = output.get("usage", {})
                 model_name = entry.get("inputs", {}).get("model", "")
@@ -888,6 +902,13 @@ class TraceClient:
             "parent_trace_id": self.parent_trace_id,
             "parent_name": self.parent_name
         }
+        # --- Log trace data before saving ---
+        try:
+            rprint(f"[TraceClient.save] Saving trace data for trace_id {self.trace_id}:")
+            rprint(json.dumps(trace_data, indent=2))
+        except Exception as log_e:
+            rprint(f"[TraceClient.save] Error logging trace data: {log_e}")
+        # --- End logging ---
         self.trace_manager_client.save_trace(trace_data)
 
         return self.trace_id, trace_data
@@ -910,7 +931,14 @@ class Tracer:
         rules: Optional[List[Rule]] = None,  # Added rules parameter
         organization_id: str = os.getenv("JUDGMENT_ORG_ID"),
         enable_monitoring: bool = os.getenv("JUDGMENT_MONITORING", "true").lower() == "true",
-        enable_evaluations: bool = os.getenv("JUDGMENT_EVALUATIONS", "true").lower() == "true"
+        enable_evaluations: bool = os.getenv("JUDGMENT_EVALUATIONS", "true").lower() == "true",
+        # S3 configuration
+        use_s3: bool = False,
+        s3_bucket_name: Optional[str] = None,
+        s3_aws_access_key_id: Optional[str] = None,
+        s3_aws_secret_access_key: Optional[str] = None,
+        s3_region_name: Optional[str] = None,
+        deep_tracing: bool = True  # NEW: Enable deep tracing by default
     ):
         if not hasattr(self, 'initialized'):
             if not api_key:
@@ -918,6 +946,13 @@ class Tracer:
 
             if not organization_id:
                 raise ValueError("Tracer must be configured with an Organization ID")
+            if use_s3 and not s3_bucket_name:
+                raise ValueError("S3 bucket name must be provided when use_s3 is True")
+            if use_s3 and not (s3_aws_access_key_id or os.getenv("AWS_ACCESS_KEY_ID")):
+                raise ValueError("AWS Access Key ID must be provided when use_s3 is True")
+            if use_s3 and not (s3_aws_secret_access_key or os.getenv("AWS_SECRET_ACCESS_KEY")):
+                raise ValueError("AWS Secret Access Key must be provided when use_s3 is True")
+
             self.api_key: str = api_key
             self.project_name: str = project_name
             self.client: JudgmentClient = JudgmentClient(judgment_api_key=api_key)
@@ -927,6 +962,19 @@ class Tracer:
             self.initialized: bool = True
             self.enable_monitoring: bool = enable_monitoring
             self.enable_evaluations: bool = enable_evaluations
+
+            # Initialize S3 storage if enabled
+            self.use_s3 = use_s3
+            if use_s3:
+                from judgeval.common.s3_storage import S3Storage
+                self.s3_storage = S3Storage(
+                    bucket_name=s3_bucket_name,
+                    aws_access_key_id=s3_aws_access_key_id,
+                    aws_secret_access_key=s3_aws_secret_access_key,
+                    region_name=s3_region_name
+                )
+            self.deep_tracing: bool = deep_tracing  # NEW: Store deep tracing setting
+
         elif hasattr(self, 'project_name') and self.project_name != project_name:
             warnings.warn(
                 f"Attempting to initialize Tracer with project_name='{project_name}' but it was already initialized with "
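The three Tracer hunks above wire the S3 option end to end: constructor parameters, fail-fast credential validation, and lazy `S3Storage` construction. A minimal usage sketch, assuming credentials come from the standard AWS environment variables (the bucket and project names are hypothetical):

```python
# Hedged sketch: enabling S3 export on the Tracer singleton. With use_s3=True,
# every TraceClient.save() also writes the trace JSON to the bucket via
# TraceManagerClient (see the save_trace hunk above).
import os
from judgeval.common.tracer import Tracer

judgment = Tracer(
    api_key=os.getenv("JUDGMENT_API_KEY"),
    project_name="my_project",          # hypothetical
    use_s3=True,
    s3_bucket_name="my-trace-archive",  # hypothetical bucket
    s3_region_name="us-east-1",
    # s3_aws_access_key_id / s3_aws_secret_access_key may be omitted when
    # AWS_ACCESS_KEY_ID / AWS_SECRET_ACCESS_KEY are set (see validation above)
)
```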
@@ -941,12 +989,52 @@ class Tracer:
         """
         current_trace_var.set(trace)
 
-    def get_current_trace(self):
+    def get_current_trace(self) -> Optional[TraceClient]:
         """
         Get the current trace context from contextvars
         """
         return current_trace_var.get()
 
+    def _apply_deep_tracing(self, func, span_type="span"):
+        """
+        Apply deep tracing to all functions in the same module as the given function.
+
+        Args:
+            func: The function being traced
+            span_type: Type of span to use for traced functions
+
+        Returns:
+            A tuple of (module, original_functions_dict) where original_functions_dict
+            contains the original functions that were replaced with traced versions.
+        """
+        module = inspect.getmodule(func)
+        if not module:
+            return None, {}
+
+        # Save original functions
+        original_functions = {}
+
+        # Find all functions in the module
+        for name, obj in inspect.getmembers(module, inspect.isfunction):
+            # Skip already wrapped functions
+            if hasattr(obj, '_judgment_traced'):
+                continue
+
+            # Create a traced version of the function
+            # Always use default span type "span" for child functions
+            traced_func = _create_deep_tracing_wrapper(obj, self, "span")
+
+            # Mark the function as traced to avoid double wrapping
+            traced_func._judgment_traced = True
+
+            # Save the original function
+            original_functions[name] = obj
+
+            # Replace with traced version
+            setattr(module, name, traced_func)
+
+        return module, original_functions
+
     @contextmanager
     def trace(
         self,
@@ -992,14 +1080,8 @@ class Tracer:
         finally:
             # Reset the context variable
             current_trace_var.reset(token)
-
-    def
-        """
-        Get the current trace context from contextvars
-        """
-        return current_trace_var.get()
-
-    def observe(self, func=None, *, name=None, span_type: SpanType = "span", project_name: str = None, overwrite: bool = False):
+
+    def observe(self, func=None, *, name=None, span_type: SpanType = "span", project_name: str = None, overwrite: bool = False, deep_tracing: bool = None):
         """
         Decorator to trace function execution with detailed entry/exit information.
 
@@ -1009,20 +1091,37 @@ class Tracer:
             span_type: Type of span (default "span")
             project_name: Optional project name override
             overwrite: Whether to overwrite existing traces
+            deep_tracing: Whether to enable deep tracing for this function and all nested calls.
+                          If None, uses the tracer's default setting.
         """
         # If monitoring is disabled, return the function as is
         if not self.enable_monitoring:
             return func if func else lambda f: f
 
         if func is None:
-            return lambda f: self.observe(f, name=name, span_type=span_type, project_name=project_name,
+            return lambda f: self.observe(f, name=name, span_type=span_type, project_name=project_name,
+                                          overwrite=overwrite, deep_tracing=deep_tracing)
 
         # Use provided name or fall back to function name
         span_name = name or func.__name__
 
+        # Store custom attributes on the function object
+        func._judgment_span_name = span_name
+        func._judgment_span_type = span_type
+
+        # Use the provided deep_tracing value or fall back to the tracer's default
+        use_deep_tracing = deep_tracing if deep_tracing is not None else self.deep_tracing
+
         if asyncio.iscoroutinefunction(func):
             @functools.wraps(func)
             async def async_wrapper(*args, **kwargs):
+                # Check if we're already in a traced function
+                if in_traced_function_var.get():
+                    return await func(*args, **kwargs)
+
+                # Set in_traced_function_var to True
+                token = in_traced_function_var.set(True)
+
                 # Get current trace from context
                 current_trace = current_trace_var.get()
 
@@ -1057,9 +1156,18 @@ class Tracer:
                             'kwargs': kwargs
                         })
 
+                        # If deep tracing is enabled, apply monkey patching
+                        if use_deep_tracing:
+                            module, original_functions = self._apply_deep_tracing(func, span_type)
+
                         # Execute function
                         result = await func(*args, **kwargs)
 
+                        # Restore original functions if deep tracing was enabled
+                        if use_deep_tracing and module and 'original_functions' in locals():
+                            for name, obj in original_functions.items():
+                                setattr(module, name, obj)
+
                         # Record output
                         span.record_output(result)
 
@@ -1069,29 +1177,52 @@ class Tracer:
                 finally:
                     # Reset trace context (span context resets automatically)
                     current_trace_var.reset(trace_token)
+                    # Reset in_traced_function_var
+                    in_traced_function_var.reset(token)
             else:
                 # Already have a trace context, just create a span in it
                 # The span method handles current_span_var
-
-
-                    span
-
-
-
-
-
-
-
-
-
+
+                try:
+                    with current_trace.span(span_name, span_type=span_type) as span:  # MODIFIED: Use span_name directly
+                        # Record inputs
+                        span.record_input({
+                            'args': str(args),
+                            'kwargs': kwargs
+                        })
+
+                        # If deep tracing is enabled, apply monkey patching
+                        if use_deep_tracing:
+                            module, original_functions = self._apply_deep_tracing(func, span_type)
+
+                        # Execute function
+                        result = await func(*args, **kwargs)
+
+                        # Restore original functions if deep tracing was enabled
+                        if use_deep_tracing and module and 'original_functions' in locals():
+                            for name, obj in original_functions.items():
+                                setattr(module, name, obj)
+
+                        # Record output
+                        span.record_output(result)
 
                     return result
-
+                finally:
+                    # Reset in_traced_function_var
+                    in_traced_function_var.reset(token)
+
             return async_wrapper
         else:
-            # Non-async function implementation
+            # Non-async function implementation with deep tracing
             @functools.wraps(func)
             def wrapper(*args, **kwargs):
+                # Check if we're already in a traced function
+                if in_traced_function_var.get():
+                    return func(*args, **kwargs)
+
+                # Set in_traced_function_var to True
+                token = in_traced_function_var.set(True)
+
                 # Get current trace from context
                 current_trace = current_trace_var.get()
 
@@ -1126,9 +1257,18 @@ class Tracer:
                             'kwargs': kwargs
                         })
 
+                        # If deep tracing is enabled, apply monkey patching
+                        if use_deep_tracing:
+                            module, original_functions = self._apply_deep_tracing(func, span_type)
+
                         # Execute function
                         result = func(*args, **kwargs)
 
+                        # Restore original functions if deep tracing was enabled
+                        if use_deep_tracing and module and 'original_functions' in locals():
+                            for name, obj in original_functions.items():
+                                setattr(module, name, obj)
+
                         # Record output
                         span.record_output(result)
 
@@ -1138,24 +1278,40 @@ class Tracer:
                 finally:
                     # Reset trace context (span context resets automatically)
                     current_trace_var.reset(trace_token)
+                    # Reset in_traced_function_var
+                    in_traced_function_var.reset(token)
             else:
                 # Already have a trace context, just create a span in it
                 # The span method handles current_span_var
-
-
-                    span
-
-
-
-
-
-
-
-
-
+
+                try:
+                    with current_trace.span(span_name, span_type=span_type) as span:  # MODIFIED: Use span_name directly
+                        # Record inputs
+                        span.record_input({
+                            'args': str(args),
+                            'kwargs': kwargs
+                        })
+
+                        # If deep tracing is enabled, apply monkey patching
+                        if use_deep_tracing:
+                            module, original_functions = self._apply_deep_tracing(func, span_type)
+
+                        # Execute function
+                        result = func(*args, **kwargs)
+
+                        # Restore original functions if deep tracing was enabled
+                        if use_deep_tracing and module and 'original_functions' in locals():
+                            for name, obj in original_functions.items():
+                                setattr(module, name, obj)
+
+                        # Record output
+                        span.record_output(result)
 
                     return result
-
+                finally:
+                    # Reset in_traced_function_var
+                    in_traced_function_var.reset(token)
+
             return wrapper
 
     def score(self, func=None, scorers: List[Union[APIJudgmentScorer, JudgevalScorer]] = None, model: str = None, log_results: bool = True, *, name: str = None, span_type: SpanType = "span"):
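The net effect of the `observe` rewrite: decorate only the entry point, and `_apply_deep_tracing` temporarily monkey-patches the sibling functions in the same module so nested calls appear as child spans, while `in_traced_function_var` keeps nested decorated calls from re-entering the wrapper. A sketch of the intended usage (function and project names are illustrative):

```python
# Hedged sketch: deep tracing. Only main() is decorated; helper() is picked up
# by _apply_deep_tracing's module-level monkey patching while main() runs, so
# it shows up as a child "span" in the same trace without its own decorator.
from judgeval.common.tracer import Tracer

judgment = Tracer(project_name="my_project")  # hypothetical

def helper(x: int) -> int:   # NOT decorated
    return x * 2

@judgment.observe(deep_tracing=True)
def main() -> int:
    return helper(21)        # traced automatically as a child span

main()
```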
@@ -1200,96 +1356,192 @@ class Tracer:
 def wrap(client: Any) -> Any:
     """
     Wraps an API client to add tracing capabilities.
-    Supports OpenAI, Together, and
+    Supports OpenAI, Together, Anthropic, and Google GenAI clients.
+    Patches both '.create' and Anthropic's '.stream' methods using a wrapper class.
     """
-
-    span_name, original_create = _get_client_config(client)
+    span_name, original_create, original_stream = _get_client_config(client)
 
-    #
-
-
-
-
-
-
-
-
+    # --- Define Traced Async Functions ---
+    async def traced_create_async(*args, **kwargs):
+        # [Existing logic - unchanged]
+        current_trace = current_trace_var.get()
+        if not current_trace:
+            if asyncio.iscoroutinefunction(original_create):
+                return await original_create(*args, **kwargs)
+            else:
+                return original_create(*args, **kwargs)
+
+        is_streaming = kwargs.get("stream", False)
+
+        with current_trace.span(span_name, span_type="llm") as span:
+            input_data = _format_input_data(client, **kwargs)
+            span.record_input(input_data)
+
+            # Warn about token counting limitations with streaming
+            if isinstance(client, (AsyncOpenAI, OpenAI)) and is_streaming:
+                if not kwargs.get("stream_options", {}).get("include_usage"):
+                    warnings.warn(
+                        "OpenAI streaming calls don't include token counts by default. "
+                        "To enable token counting with streams, set stream_options={'include_usage': True} "
+                        "in your API call arguments.",
+                        UserWarning
+                    )
 
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
+            try:
+                if is_streaming:
+                    stream_iterator = await original_create(*args, **kwargs)
+                    output_entry = span.record_output("<pending stream>")
+                    return _async_stream_wrapper(stream_iterator, client, output_entry)
+                else:
+                    awaited_response = await original_create(*args, **kwargs)
+                    output_data = _format_output_data(client, awaited_response)
+                    span.record_output(output_data)
+                    return awaited_response
+            except Exception as e:
+                print(f"Error during wrapped async API call ({span_name}): {e}")
+                span.record_output({"error": str(e)})
+                raise
+
+
+    # Function replacing .stream() - NOW returns the wrapper class instance
+    def traced_stream_async(*args, **kwargs):
+        current_trace = current_trace_var.get()
+        if not current_trace or not original_stream:
+            return original_stream(*args, **kwargs)
+        original_manager = original_stream(*args, **kwargs)
+        wrapper_manager = _TracedAsyncStreamManagerWrapper(
+            original_manager=original_manager,
+            client=client,
+            span_name=span_name,
+            trace_client=current_trace,
+            stream_wrapper_func=_async_stream_wrapper,
+            input_kwargs=kwargs
+        )
+        return wrapper_manager
+
+    # --- Define Traced Sync Functions ---
+    def traced_create_sync(*args, **kwargs):
+        # [Existing logic - unchanged]
+        current_trace = current_trace_var.get()
+        if not current_trace:
+            return original_create(*args, **kwargs)
+
+        is_streaming = kwargs.get("stream", False)
+
+        with current_trace.span(span_name, span_type="llm") as span:
+            input_data = _format_input_data(client, **kwargs)
+            span.record_input(input_data)
+
+            # Warn about token counting limitations with streaming
+            if isinstance(client, (AsyncOpenAI, OpenAI)) and is_streaming:
+                if not kwargs.get("stream_options", {}).get("include_usage"):
+                    warnings.warn(
+                        "OpenAI streaming calls don't include token counts by default. "
+                        "To enable token counting with streams, set stream_options={'include_usage': True} "
+                        "in your API call arguments.",
+                        UserWarning
+                    )
+
+            try:
+                response_or_iterator = original_create(*args, **kwargs)
+            except Exception as e:
+                print(f"Error during wrapped sync API call ({span_name}): {e}")
+                span.record_output({"error": str(e)})
+                raise
+
+            if is_streaming:
+                output_entry = span.record_output("<pending stream>")
+                return _sync_stream_wrapper(response_or_iterator, client, output_entry)
+            else:
+                output_data = _format_output_data(client, response_or_iterator)
+                span.record_output(output_data)
+                return response_or_iterator
+
+
+    # Function replacing sync .stream()
+    def traced_stream_sync(*args, **kwargs):
+        current_trace = current_trace_var.get()
+        if not current_trace or not original_stream:
+            return original_stream(*args, **kwargs)
+        original_manager = original_stream(*args, **kwargs)
+        wrapper_manager = _TracedSyncStreamManagerWrapper(
+            original_manager=original_manager,
+            client=client,
+            span_name=span_name,
+            trace_client=current_trace,
+            stream_wrapper_func=_sync_stream_wrapper,
+            input_kwargs=kwargs
+        )
+        return wrapper_manager
+
+
+    # --- Assign Traced Methods to Client Instance ---
+    # [Assignment logic remains the same]
+    if isinstance(client, (AsyncOpenAI, AsyncTogether)):
+        client.chat.completions.create = traced_create_async
+        # Wrap the Responses API endpoint for OpenAI clients
+        if hasattr(client, "responses") and hasattr(client.responses, "create"):
+            # Capture the original responses.create
+            original_responses_create = client.responses.create
+            def traced_responses(*args, **kwargs):
+                # Get the current trace from contextvars
+                current_trace = current_trace_var.get()
+                # If no active trace, call the original
+                if not current_trace:
+                    return original_responses_create(*args, **kwargs)
+                # Trace this responses.create call
+                with current_trace.span(span_name, span_type="llm") as span:
+                    # Record raw input kwargs
+                    span.record_input(kwargs)
+                    # Make the actual API call
+                    response = original_responses_create(*args, **kwargs)
+                    # Record the output object
+                    span.record_output(response)
+                    return response
+            # Assign the traced wrapper
+            client.responses.create = traced_responses
+    elif isinstance(client, AsyncAnthropic):
+        client.messages.create = traced_create_async
+        if original_stream:
+            client.messages.stream = traced_stream_async
+    elif isinstance(client, genai.client.AsyncClient):
+        client.generate_content = traced_create_async
+    elif isinstance(client, (OpenAI, Together)):
+        client.chat.completions.create = traced_create_sync
+    elif isinstance(client, Anthropic):
+        client.messages.create = traced_create_sync
+        if original_stream:
+            client.messages.stream = traced_stream_sync
+    elif isinstance(client, genai.Client):
+        client.generate_content = traced_create_sync
 
-    with current_trace.span(span_name, span_type="llm") as span:
-        # Format and record the input parameters
-        input_data = _format_input_data(client, **kwargs)
-        span.record_input(input_data)
-
-        # Make the actual API call
-        try:
-            response = original_create(*args, **kwargs)
-        except Exception as e:
-            print(f"Error during API call: {e}")
-            raise
-
-        # Format and record the output
-        output_data = _format_output_data(client, response)
-        span.record_output(output_data)
-
-        return response
-
-
-    # Replace the original method with our traced version
-    if isinstance(client, (OpenAI, Together, AsyncOpenAI, AsyncTogether)):
-        client.chat.completions.create = traced_create
-    elif isinstance(client, (Anthropic, AsyncAnthropic)):
-        client.messages.create = traced_create
-
     return client
 
 # Helper functions for client-specific operations
 
-def _get_client_config(client: ApiClient) -> tuple[str, callable]:
+def _get_client_config(client: ApiClient) -> tuple[str, callable, Optional[callable]]:
     """Returns configuration tuple for the given API client.
 
     Args:
         client: An instance of OpenAI, Together, or Anthropic client
 
     Returns:
-        tuple: (span_name, create_method)
+        tuple: (span_name, create_method, stream_method)
         - span_name: String identifier for tracing
         - create_method: Reference to the client's creation method
+        - stream_method: Reference to the client's stream method (if applicable)
 
     Raises:
         ValueError: If client type is not supported
     """
     if isinstance(client, (OpenAI, AsyncOpenAI)):
-        return "OPENAI_API_CALL", client.chat.completions.create
+        return "OPENAI_API_CALL", client.chat.completions.create, None
     elif isinstance(client, (Together, AsyncTogether)):
-        return "TOGETHER_API_CALL", client.chat.completions.create
+        return "TOGETHER_API_CALL", client.chat.completions.create, None
     elif isinstance(client, (Anthropic, AsyncAnthropic)):
-        return "ANTHROPIC_API_CALL", client.messages.create
+        return "ANTHROPIC_API_CALL", client.messages.create, client.messages.stream
+    elif isinstance(client, (genai.Client, genai.client.AsyncClient)):
+        return "GOOGLE_API_CALL", client.models.generate_content, None
     raise ValueError(f"Unsupported client type: {type(client)}")
 
 def _format_input_data(client: ApiClient, **kwargs) -> dict:
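From the caller's side, the rewritten `wrap()` keeps the SDK call shape unchanged; the only cooperation a streaming OpenAI call needs is `stream_options={'include_usage': True}`, which is exactly what the new warning asks for so the stream wrapper can record token counts. A hedged sketch (project name is hypothetical; assumes the usual API keys in the environment):

```python
# Hedged sketch: tracing a streamed OpenAI chat completion through wrap().
from openai import OpenAI
from judgeval.common.tracer import Tracer, wrap

judgment = Tracer(project_name="my_project")  # hypothetical
client = wrap(OpenAI())  # patches chat.completions.create (and responses.create)

@judgment.observe()
def ask(question: str) -> str:
    stream = client.chat.completions.create(
        model="gpt-4o",
        messages=[{"role": "user", "content": question}],
        stream=True,
        stream_options={"include_usage": True},  # lets the wrapper record usage
    )
    # _sync_stream_wrapper passes chunks through unchanged while accumulating
    # content and the final usage chunk into the span's pending output entry.
    return "".join(chunk.choices[0].delta.content or ""
                   for chunk in stream if chunk.choices)
```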
@@ -1303,6 +1555,11 @@ def _format_input_data(client: ApiClient, **kwargs) -> dict:
         "model": kwargs.get("model"),
         "messages": kwargs.get("messages"),
     }
+    elif isinstance(client, (genai.Client, genai.client.AsyncClient)):
+        return {
+            "model": kwargs.get("model"),
+            "contents": kwargs.get("contents")
+        }
     # Anthropic requires additional max_tokens parameter
     return {
         "model": kwargs.get("model"),
|
|
1330
1587
|
"total_tokens": response.usage.total_tokens
|
1331
1588
|
}
|
1332
1589
|
}
|
1590
|
+
elif isinstance(client, (genai.Client, genai.client.AsyncClient)):
|
1591
|
+
return {
|
1592
|
+
"content": response.candidates[0].content.parts[0].text,
|
1593
|
+
"usage": {
|
1594
|
+
"prompt_tokens": response.usage_metadata.prompt_token_count,
|
1595
|
+
"completion_tokens": response.usage_metadata.candidates_token_count,
|
1596
|
+
"total_tokens": response.usage_metadata.total_token_count
|
1597
|
+
}
|
1598
|
+
}
|
1333
1599
|
# Anthropic has a different response structure
|
1334
1600
|
return {
|
1335
1601
|
"content": response.content[0].text,
|
@@ -1340,29 +1606,117 @@ def _format_output_data(client: ApiClient, response: Any) -> dict:
         }
     }
 
-#
-#
-
-#
-
-
-
-#
-
-#
-
-#
-
-#
-
-#
-
-
-
-
-#
-
-
+# Define a blocklist of functions that should not be traced
+# These are typically utility functions, print statements, logging, etc.
+_TRACE_BLOCKLIST = {
+    # Built-in functions
+    'print', 'str', 'int', 'float', 'bool', 'list', 'dict', 'set', 'tuple',
+    'len', 'range', 'enumerate', 'zip', 'map', 'filter', 'sorted', 'reversed',
+    'min', 'max', 'sum', 'any', 'all', 'abs', 'round', 'format',
+    # Logging functions
+    'debug', 'info', 'warning', 'error', 'critical', 'exception', 'log',
+    # Common utility functions
+    'sleep', 'time', 'datetime', 'json', 'dumps', 'loads',
+    # String operations
+    'join', 'split', 'strip', 'lstrip', 'rstrip', 'replace', 'lower', 'upper',
+    # Dict operations
+    'get', 'items', 'keys', 'values', 'update',
+    # List operations
+    'append', 'extend', 'insert', 'remove', 'pop', 'clear', 'index', 'count', 'sort',
+}
+
+
+# Add a new function for deep tracing at the module level
+def _create_deep_tracing_wrapper(func, tracer, span_type="span"):
+    """
+    Creates a wrapper for a function that automatically traces it when called within a traced function.
+    This enables deep tracing without requiring explicit @observe decorators on every function.
+
+    Args:
+        func: The function to wrap
+        tracer: The Tracer instance
+        span_type: Type of span (default "span")
+
+    Returns:
+        A wrapped function that will be traced when called
+    """
+    # Skip wrapping if the function is not callable or is a built-in
+    if not callable(func) or isinstance(func, type) or func.__module__ == 'builtins':
+        return func
+
+    # Skip functions in the blocklist
+    if func.__name__ in _TRACE_BLOCKLIST:
+        return func
+
+    # Skip functions from certain modules (logging, sys, etc.)
+    if func.__module__ and any(func.__module__.startswith(m) for m in ['logging', 'sys', 'os', 'json', 'time', 'datetime']):
+        return func
+
+    # Get function name for the span - check for custom name set by @observe
+    func_name = getattr(func, '_judgment_span_name', func.__name__)
+
+    # Check for custom span_type set by @observe
+    func_span_type = getattr(func, '_judgment_span_type', "span")
+
+    # Store original function to prevent losing reference
+    original_func = func
+
+    # Create appropriate wrapper based on whether the function is async or not
+    if asyncio.iscoroutinefunction(func):
+        @functools.wraps(func)
+        async def async_deep_wrapper(*args, **kwargs):
+            # Get current trace from context
+            current_trace = current_trace_var.get()
+
+            # If no trace context, just call the function
+            if not current_trace:
+                return await original_func(*args, **kwargs)
+
+            # Create a span for this function call - use custom span_type if available
+            with current_trace.span(func_name, span_type=func_span_type) as span:
+                # Record inputs
+                span.record_input({
+                    'args': str(args),
+                    'kwargs': kwargs
+                })
+
+                # Execute function
+                result = await original_func(*args, **kwargs)
+
+                # Record output
+                span.record_output(result)
+
+            return result
+
+        return async_deep_wrapper
+    else:
+        @functools.wraps(func)
+        def deep_wrapper(*args, **kwargs):
+            # Get current trace from context
+            current_trace = current_trace_var.get()
+
+            # If no trace context, just call the function
+            if not current_trace:
+                return original_func(*args, **kwargs)
+
+            # Create a span for this function call - use custom span_type if available
+            with current_trace.span(func_name, span_type=func_span_type) as span:
+                # Record inputs
+                span.record_input({
+                    'args': str(args),
+                    'kwargs': kwargs
+                })
+
+                # Execute function
+                result = original_func(*args, **kwargs)
+
+                # Record output
+                span.record_output(result)
+
+            return result
+
+        return deep_wrapper
 
 # Add the new TraceThreadPoolExecutor class
 class TraceThreadPoolExecutor(concurrent.futures.ThreadPoolExecutor):
@@ -1393,4 +1747,336 @@ class TraceThreadPoolExecutor(concurrent.futures.ThreadPoolExecutor):
|
|
1393
1747
|
return super().submit(ctx.run, func_with_bound_args)
|
1394
1748
|
|
1395
1749
|
# Note: The `map` method would also need to be overridden for full context
|
1396
|
-
# propagation if users rely on it, but `submit` is the most common use case.
|
1750
|
+
# propagation if users rely on it, but `submit` is the most common use case.
|
1751
|
+
|
1752
|
+
# Helper functions for stream processing
|
1753
|
+
# ---------------------------------------
|
1754
|
+
|
1755
|
+
def _extract_content_from_chunk(client: ApiClient, chunk: Any) -> Optional[str]:
|
1756
|
+
"""Extracts the text content from a stream chunk based on the client type."""
|
1757
|
+
try:
|
1758
|
+
if isinstance(client, (OpenAI, Together, AsyncOpenAI, AsyncTogether)):
|
1759
|
+
return chunk.choices[0].delta.content
|
1760
|
+
elif isinstance(client, (Anthropic, AsyncAnthropic)):
|
1761
|
+
# Anthropic streams various event types, we only care for content blocks
|
1762
|
+
if chunk.type == "content_block_delta":
|
1763
|
+
return chunk.delta.text
|
1764
|
+
elif isinstance(client, (genai.Client, genai.client.AsyncClient)):
|
1765
|
+
# Google streams Candidate objects
|
1766
|
+
if chunk.candidates and chunk.candidates[0].content and chunk.candidates[0].content.parts:
|
1767
|
+
return chunk.candidates[0].content.parts[0].text
|
1768
|
+
except (AttributeError, IndexError, KeyError):
|
1769
|
+
# Handle cases where chunk structure is unexpected or doesn't contain content
|
1770
|
+
pass # Return None
|
1771
|
+
return None
|
1772
|
+
|
1773
|
+
def _extract_usage_from_final_chunk(client: ApiClient, chunk: Any) -> Optional[Dict[str, int]]:
|
1774
|
+
"""Extracts usage data if present in the *final* chunk (client-specific)."""
|
1775
|
+
try:
|
1776
|
+
# OpenAI/Together include usage in the *last* chunk's `usage` attribute if available
|
1777
|
+
# This typically requires specific API versions or settings. Often usage is *not* streamed.
|
1778
|
+
if isinstance(client, (OpenAI, Together, AsyncOpenAI, AsyncTogether)):
|
1779
|
+
# Check if usage is directly on the chunk (some models might do this)
|
1780
|
+
if hasattr(chunk, 'usage') and chunk.usage:
|
1781
|
+
return {
|
1782
|
+
"prompt_tokens": chunk.usage.prompt_tokens,
|
1783
|
+
"completion_tokens": chunk.usage.completion_tokens,
|
1784
|
+
"total_tokens": chunk.usage.total_tokens
|
1785
|
+
}
|
1786
|
+
# Check if usage is nested within choices (less common for final chunk, but check)
|
1787
|
+
elif chunk.choices and hasattr(chunk.choices[0], 'usage') and chunk.choices[0].usage:
|
1788
|
+
usage = chunk.choices[0].usage
|
1789
|
+
return {
|
1790
|
+
"prompt_tokens": usage.prompt_tokens,
|
1791
|
+
"completion_tokens": usage.completion_tokens,
|
1792
|
+
"total_tokens": usage.total_tokens
|
1793
|
+
}
|
1794
|
+
# Anthropic includes usage in the 'message_stop' event type
|
1795
|
+
elif isinstance(client, (Anthropic, AsyncAnthropic)):
|
1796
|
+
if chunk.type == "message_stop":
|
1797
|
+
# Anthropic final usage is often attached to the *message* object, not the chunk directly
|
1798
|
+
# The API might provide a way to get the final message object, but typically not in the stream itself.
|
1799
|
+
# Let's assume for now usage might appear in the final *chunk* metadata if supported.
|
1800
|
+
# This is a placeholder - Anthropic usage typically needs a separate call or context.
|
1801
|
+
pass
|
1802
|
+
elif isinstance(client, (genai.Client, genai.client.AsyncClient)):
|
1803
|
+
# Google provides usage metadata on the full response object, not typically streamed per chunk.
|
1804
|
+
# It might be in the *last* chunk's usage_metadata if the stream implementation supports it.
|
1805
|
+
if hasattr(chunk, 'usage_metadata') and chunk.usage_metadata:
|
1806
|
+
return {
|
1807
|
+
"prompt_tokens": chunk.usage_metadata.prompt_token_count,
|
1808
|
+
"completion_tokens": chunk.usage_metadata.candidates_token_count,
|
1809
|
+
"total_tokens": chunk.usage_metadata.total_token_count
|
1810
|
+
}
|
1811
|
+
|
1812
|
+
except (AttributeError, IndexError, KeyError, TypeError):
|
1813
|
+
# Handle cases where usage data is missing or malformed
|
1814
|
+
pass # Return None
|
1815
|
+
return None
|
1816
|
+
|
1817
|
+
|
1818
|
+
# --- Sync Stream Wrapper ---
|
1819
|
+
def _sync_stream_wrapper(
|
1820
|
+
original_stream: Iterator,
|
1821
|
+
client: ApiClient,
|
1822
|
+
output_entry: TraceEntry
|
1823
|
+
) -> Generator[Any, None, None]:
|
1824
|
+
"""Wraps a synchronous stream iterator to capture content and update the trace."""
|
1825
|
+
content_parts = [] # Use a list instead of string concatenation
|
1826
|
+
final_usage = None
|
1827
|
+
last_chunk = None
|
1828
|
+
try:
|
1829
|
+
for chunk in original_stream:
|
1830
|
+
content_part = _extract_content_from_chunk(client, chunk)
|
1831
|
+
if content_part:
|
1832
|
+
content_parts.append(content_part) # Append to list instead of concatenating
|
1833
|
+
last_chunk = chunk # Keep track of the last chunk for potential usage data
|
1834
|
+
yield chunk # Pass the chunk to the caller
|
1835
|
+
finally:
|
1836
|
+
# Attempt to extract usage from the last chunk received
|
1837
|
+
if last_chunk:
|
1838
|
+
final_usage = _extract_usage_from_final_chunk(client, last_chunk)
|
1839
|
+
|
1840
|
+
# Update the trace entry with the accumulated content and usage
|
1841
|
+
output_entry.output = {
|
1842
|
+
"content": "".join(content_parts), # Join list at the end
|
1843
|
+
"usage": final_usage if final_usage else {"info": "Usage data not available in stream."}, # Provide placeholder if None
|
1844
|
+
"streamed": True
|
1845
|
+
}
|
1846
|
+
# Note: We might need to adjust _serialize_output if this dict causes issues,
|
1847
|
+
# but Pydantic's model_dump should handle dicts.
|
1848
|
+
|
1849
|
+
# --- Async Stream Wrapper ---
|
1850
|
+
async def _async_stream_wrapper(
|
1851
|
+
original_stream: AsyncIterator,
|
1852
|
+
client: ApiClient,
|
1853
|
+
output_entry: TraceEntry
|
1854
|
+
) -> AsyncGenerator[Any, None]:
|
1855
|
+
# [Existing logic - unchanged]
|
1856
|
+
content_parts = [] # Use a list instead of string concatenation
|
1857
|
+
final_usage_data = None
|
1858
|
+
last_content_chunk = None
|
1859
|
+
anthropic_input_tokens = 0
|
1860
|
+
anthropic_output_tokens = 0
|
1861
|
+
|
1862
|
+
target_span_id = getattr(output_entry, 'span_id', 'UNKNOWN')
|
1863
|
+
|
1864
|
+
try:
|
1865
|
+
async for chunk in original_stream:
|
1866
|
+
# Check for OpenAI's final usage chunk
|
1867
|
+
if isinstance(client, (AsyncOpenAI, OpenAI)) and hasattr(chunk, 'usage') and chunk.usage is not None:
|
1868
|
+
final_usage_data = {
|
1869
|
+
"prompt_tokens": chunk.usage.prompt_tokens,
|
1870
|
+
"completion_tokens": chunk.usage.completion_tokens,
|
1871
|
+
"total_tokens": chunk.usage.total_tokens
|
1872
|
+
}
|
1873
|
+
yield chunk
|
1874
|
+
continue
|
1875
|
+
|
1876
|
+
if isinstance(client, (AsyncAnthropic, Anthropic)) and hasattr(chunk, 'type'):
|
1877
|
+
if chunk.type == "message_start":
|
1878
|
+
if hasattr(chunk, 'message') and hasattr(chunk.message, 'usage') and hasattr(chunk.message.usage, 'input_tokens'):
|
1879
|
+
anthropic_input_tokens = chunk.message.usage.input_tokens
|
1880
|
+
elif chunk.type == "message_delta":
|
1881
|
+
if hasattr(chunk, 'usage') and hasattr(chunk.usage, 'output_tokens'):
|
1882
|
+
anthropic_output_tokens += chunk.usage.output_tokens
|
1883
|
+
|
1884
|
+
content_part = _extract_content_from_chunk(client, chunk)
|
1885
|
+
if content_part:
|
1886
|
+
content_parts.append(content_part) # Append to list instead of concatenating
|
1887
|
+
last_content_chunk = chunk
|
1888
|
+
|
1889
|
+
yield chunk
|
1890
|
+
finally:
|
1891
|
+
anthropic_final_usage = None
|
1892
|
+
if isinstance(client, (AsyncAnthropic, Anthropic)) and (anthropic_input_tokens > 0 or anthropic_output_tokens > 0):
|
1893
|
+
anthropic_final_usage = {
|
1894
|
+
"input_tokens": anthropic_input_tokens,
|
1895
|
+
"output_tokens": anthropic_output_tokens,
|
1896
|
+
"total_tokens": anthropic_input_tokens + anthropic_output_tokens
|
1897
|
+
}
|
1898
|
+
|
1899
|
+
usage_info = None
|
1900
|
+
if final_usage_data:
|
1901
|
+
usage_info = final_usage_data
|
1902
|
+
elif anthropic_final_usage:
|
1903
|
+
usage_info = anthropic_final_usage
|
1904
|
+
elif last_content_chunk:
|
1905
|
+
usage_info = _extract_usage_from_final_chunk(client, last_content_chunk)
|
1906
|
+
|
1907
|
+
if output_entry and hasattr(output_entry, 'output'):
|
1908
|
+
output_entry.output = {
|
1909
|
+
"content": "".join(content_parts), # Join list at the end
|
1910
|
+
"usage": usage_info if usage_info else {"info": "Usage data not available in stream."},
|
1911
|
+
"streamed": True
|
1912
|
+
}
|
1913
|
+
start_ts = getattr(output_entry, 'created_at', time.time())
|
1914
|
+
output_entry.duration = time.time() - start_ts
|
1915
|
+
# else: # Handle error case if necessary, but remove debug print
|
1916
|
+
|
1917
|
+
+# --- Define Context Manager Wrapper Classes ---
+class _TracedAsyncStreamManagerWrapper(AbstractAsyncContextManager):
+    """Wraps an original async stream manager to add tracing."""
+    def __init__(self, original_manager, client, span_name, trace_client, stream_wrapper_func, input_kwargs):
+        self._original_manager = original_manager
+        self._client = client
+        self._span_name = span_name
+        self._trace_client = trace_client
+        self._stream_wrapper_func = stream_wrapper_func
+        self._input_kwargs = input_kwargs
+        self._parent_span_id_at_entry = None
+
+    async def __aenter__(self):
+        self._parent_span_id_at_entry = current_span_var.get()
+        if not self._trace_client:
+            # If no trace, just delegate to the original manager
+            return await self._original_manager.__aenter__()
+
+        # --- Manually create the 'enter' entry ---
+        start_time = time.time()
+        span_id = str(uuid.uuid4())
+        current_depth = 0
+        if self._parent_span_id_at_entry and self._parent_span_id_at_entry in self._trace_client._span_depths:
+            current_depth = self._trace_client._span_depths[self._parent_span_id_at_entry] + 1
+        self._trace_client._span_depths[span_id] = current_depth
+        enter_entry = TraceEntry(
+            type="enter", function=self._span_name, span_id=span_id,
+            trace_id=self._trace_client.trace_id, depth=current_depth, message=self._span_name,
+            created_at=start_time, span_type="llm", parent_span_id=self._parent_span_id_at_entry
+        )
+        self._trace_client.add_entry(enter_entry)
+        # --- End manual 'enter' entry ---
+
+        # Set the current span ID in contextvars
+        self._span_context_token = current_span_var.set(span_id)
+
+        # Manually create 'input' entry
+        input_data = _format_input_data(self._client, **self._input_kwargs)
+        input_entry = TraceEntry(
+            type="input", function=self._span_name, span_id=span_id,
+            trace_id=self._trace_client.trace_id, depth=current_depth, message=f"Inputs to {self._span_name}",
+            created_at=time.time(), inputs=input_data, span_type="llm"
+        )
+        self._trace_client.add_entry(input_entry)
+
+        # Call the original __aenter__
+        raw_iterator = await self._original_manager.__aenter__()
+
+        # Manually create pending 'output' entry
+        output_entry = TraceEntry(
+            type="output", function=self._span_name, span_id=span_id,
+            trace_id=self._trace_client.trace_id, depth=current_depth, message=f"Output from {self._span_name}",
+            created_at=time.time(), output="<pending stream>", span_type="llm"
+        )
+        self._trace_client.add_entry(output_entry)
+
+        # Wrap the raw iterator so the stream wrapper can fill in the pending output
+        wrapped_iterator = self._stream_wrapper_func(raw_iterator, self._client, output_entry)
+        return wrapped_iterator
+
+    async def __aexit__(self, exc_type, exc_val, exc_tb):
+        # Manually create the 'exit' entry
+        if hasattr(self, '_span_context_token'):
+            span_id = current_span_var.get()
+            start_time_for_duration = 0
+            for entry in reversed(self._trace_client.entries):
+                if entry.span_id == span_id and entry.type == 'enter':
+                    start_time_for_duration = entry.created_at
+                    break
+            duration = time.time() - start_time_for_duration if start_time_for_duration else None
+            exit_depth = self._trace_client._span_depths.get(span_id, 0)
+            exit_entry = TraceEntry(
+                type="exit", function=self._span_name, span_id=span_id,
+                trace_id=self._trace_client.trace_id, depth=exit_depth, message=f"← {self._span_name}",
+                created_at=time.time(), duration=duration, span_type="llm"
+            )
+            self._trace_client.add_entry(exit_entry)
+            if span_id in self._trace_client._span_depths:
+                del self._trace_client._span_depths[span_id]
+            current_span_var.reset(self._span_context_token)
+            delattr(self, '_span_context_token')
+
+        # Delegate __aexit__
+        if hasattr(self._original_manager, "__aexit__"):
+            return await self._original_manager.__aexit__(exc_type, exc_val, exc_tb)
+        return None
+
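Before the sync twin below, it may help to see the delegation skeleton in isolation: the wrapper is itself an async context manager that forwards __aenter__/__aexit__ to the manager it wraps, inserting trace entries around those calls. A stripped-down sketch with tracing elided; all names here are hypothetical stand-ins:

    import asyncio
    from contextlib import AbstractAsyncContextManager

    class _Inner:
        async def __aenter__(self):
            return ["chunk-1", "chunk-2"]     # stand-in for the raw stream iterator
        async def __aexit__(self, exc_type, exc_val, exc_tb):
            return None

    class _Passthrough(AbstractAsyncContextManager):
        """Delegation shape of _TracedAsyncStreamManagerWrapper, minus tracing."""
        def __init__(self, inner):
            self._inner = inner
        async def __aenter__(self):
            # tracing would record 'enter'/'input' entries here
            return await self._inner.__aenter__()
        async def __aexit__(self, exc_type, exc_val, exc_tb):
            # tracing would record the 'exit' entry here
            return await self._inner.__aexit__(exc_type, exc_val, exc_tb)

    async def main():
        async with _Passthrough(_Inner()) as stream:
            print(stream)

    asyncio.run(main())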
+class _TracedSyncStreamManagerWrapper(AbstractContextManager):
+    """Wraps an original sync stream manager to add tracing."""
+    def __init__(self, original_manager, client, span_name, trace_client, stream_wrapper_func, input_kwargs):
+        self._original_manager = original_manager
+        self._client = client
+        self._span_name = span_name
+        self._trace_client = trace_client
+        self._stream_wrapper_func = stream_wrapper_func
+        self._input_kwargs = input_kwargs
+        self._parent_span_id_at_entry = None
+
+    def __enter__(self):
+        self._parent_span_id_at_entry = current_span_var.get()
+        if not self._trace_client:
+            return self._original_manager.__enter__()
+
+        # Manually create 'enter' entry
+        start_time = time.time()
+        span_id = str(uuid.uuid4())
+        current_depth = 0
+        if self._parent_span_id_at_entry and self._parent_span_id_at_entry in self._trace_client._span_depths:
+            current_depth = self._trace_client._span_depths[self._parent_span_id_at_entry] + 1
+        self._trace_client._span_depths[span_id] = current_depth
+        enter_entry = TraceEntry(
+            type="enter", function=self._span_name, span_id=span_id,
+            trace_id=self._trace_client.trace_id, depth=current_depth, message=self._span_name,
+            created_at=start_time, span_type="llm", parent_span_id=self._parent_span_id_at_entry
+        )
+        self._trace_client.add_entry(enter_entry)
+        self._span_context_token = current_span_var.set(span_id)
+
+        # Manually create 'input' entry
+        input_data = _format_input_data(self._client, **self._input_kwargs)
+        input_entry = TraceEntry(
+            type="input", function=self._span_name, span_id=span_id,
+            trace_id=self._trace_client.trace_id, depth=current_depth, message=f"Inputs to {self._span_name}",
+            created_at=time.time(), inputs=input_data, span_type="llm"
+        )
+        self._trace_client.add_entry(input_entry)
+
+        # Call original __enter__
+        raw_iterator = self._original_manager.__enter__()
+
+        # Manually create 'output' entry (pending)
+        output_entry = TraceEntry(
+            type="output", function=self._span_name, span_id=span_id,
+            trace_id=self._trace_client.trace_id, depth=current_depth, message=f"Output from {self._span_name}",
+            created_at=time.time(), output="<pending stream>", span_type="llm"
+        )
+        self._trace_client.add_entry(output_entry)
+
+        # Wrap the raw iterator
+        wrapped_iterator = self._stream_wrapper_func(raw_iterator, self._client, output_entry)
+        return wrapped_iterator
+
+    def __exit__(self, exc_type, exc_val, exc_tb):
+        # Manually create 'exit' entry
+        if hasattr(self, '_span_context_token'):
+            span_id = current_span_var.get()
+            start_time_for_duration = 0
+            for entry in reversed(self._trace_client.entries):
+                if entry.span_id == span_id and entry.type == 'enter':
+                    start_time_for_duration = entry.created_at
+                    break
+            duration = time.time() - start_time_for_duration if start_time_for_duration else None
+            exit_depth = self._trace_client._span_depths.get(span_id, 0)
+            exit_entry = TraceEntry(
+                type="exit", function=self._span_name, span_id=span_id,
+                trace_id=self._trace_client.trace_id, depth=exit_depth, message=f"← {self._span_name}",
+                created_at=time.time(), duration=duration, span_type="llm"
+            )
+            self._trace_client.add_entry(exit_entry)
+            if span_id in self._trace_client._span_depths:
+                del self._trace_client._span_depths[span_id]
+            current_span_var.reset(self._span_context_token)
+            delattr(self, '_span_context_token')
+
+        # Delegate __exit__
+        if hasattr(self._original_manager, "__exit__"):
+            return self._original_manager.__exit__(exc_type, exc_val, exc_tb)
+        return None
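One detail worth flagging in both exit paths: the start time is recovered by scanning entries backwards for the matching 'enter' record, and a falsy start time (0, i.e. no match found) yields duration=None rather than a nonsense wall-clock value. A self-contained sketch of that lookup; the Entry dataclass is a stand-in for TraceEntry:

    import time
    from dataclasses import dataclass

    @dataclass
    class Entry:                      # stand-in for TraceEntry
        span_id: str
        type: str
        created_at: float

    entries = [
        Entry("s1", "enter", time.time() - 1.5),
        Entry("s1", "output", time.time()),
    ]

    start = 0
    for e in reversed(entries):       # most recent matching 'enter' wins
        if e.span_id == "s1" and e.type == "enter":
            start = e.created_at
            break
    duration = time.time() - start if start else None
    print(duration)                   # ~1.5 seconds; None if no 'enter' was found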