judgeval 0.0.31__py3-none-any.whl → 0.0.32__py3-none-any.whl

This diff compares the contents of two publicly released versions of the package, as published to a supported registry. It is provided for informational purposes only and reflects the changes between those package versions.
judgeval/__init__.py CHANGED
@@ -1,10 +1,12 @@
  # Import key components that should be publicly accessible
  from judgeval.clients import client, together_client
  from judgeval.judgment_client import JudgmentClient
+ from judgeval.version_check import check_latest_version
+ check_latest_version()
 
  __all__ = [
      # Clients
      'client',
      'together_client',
      'JudgmentClient',
- ]
+ ]
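
With the two added lines, simply importing the package now starts the update check in the background. A minimal sketch of the observable behavior (see judgeval/version_check.py later in this diff for the implementation):

    import judgeval  # also spawns a daemon thread that asks PyPI for the latest version
    # The import returns immediately; a notice prints later only if a newer release exists.
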
judgeval/common/tracer.py CHANGED
@@ -11,11 +11,12 @@ import time
  import uuid
  import warnings
  import contextvars
+ import sys
  from contextlib import contextmanager
  from dataclasses import dataclass, field
  from datetime import datetime
  from http import HTTPStatus
- from typing import Any, Dict, Generator, List, Literal, Optional, Tuple, TypeAlias, Union, Callable, Awaitable
+ from typing import Any, Dict, Generator, List, Literal, Optional, Tuple, TypeAlias, Union, Callable, Awaitable, Set
  from rich import print as rprint
 
  # Third-party imports
@@ -27,6 +28,7 @@ from rich import print as rprint
  from openai import OpenAI, AsyncOpenAI
  from together import Together, AsyncTogether
  from anthropic import Anthropic, AsyncAnthropic
+ from google import genai
 
  # Local application/library-specific imports
  from judgeval.constants import (
@@ -50,10 +52,11 @@ import concurrent.futures
 
  # Define context variables for tracking the current trace and the current span within a trace
  current_trace_var = contextvars.ContextVar('current_trace', default=None)
- current_span_var = contextvars.ContextVar('current_span', default=None) # NEW: ContextVar for the active span name
+ current_span_var = contextvars.ContextVar('current_span', default=None) # ContextVar for the active span name
+ in_traced_function_var = contextvars.ContextVar('in_traced_function', default=False) # Track if we're in a traced function
 
  # Define type aliases for better code readability and maintainability
- ApiClient: TypeAlias = Union[OpenAI, Together, Anthropic, AsyncOpenAI, AsyncAnthropic, AsyncTogether] # Supported API clients
+ ApiClient: TypeAlias = Union[OpenAI, Together, Anthropic, AsyncOpenAI, AsyncAnthropic, AsyncTogether, genai.Client, genai.client.AsyncClient] # Supported API clients
  TraceEntryType = Literal['enter', 'exit', 'output', 'input', 'evaluation'] # Valid trace entry types
  SpanType = Literal['span', 'tool', 'llm', 'evaluation', 'chain']
  @dataclass
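
The new in_traced_function_var follows the standard contextvars re-entrancy-guard pattern: the flag is isolated per thread and per asyncio task, so concurrent traces cannot clobber each other's state. A minimal standalone sketch of the pattern (names here are illustrative, not judgeval APIs):

    import contextvars
    import functools

    _in_traced = contextvars.ContextVar('in_traced', default=False)

    def guard(func):
        """Run func's instrumentation at most once per call stack, per context."""
        @functools.wraps(func)
        def wrapper(*args, **kwargs):
            if _in_traced.get():              # nested call: instrumentation already active
                return func(*args, **kwargs)
            token = _in_traced.set(True)      # mark this context as being traced
            try:
                return func(*args, **kwargs)  # real code would open a span here
            finally:
                _in_traced.reset(token)       # restore the previous flag value
        return wrapper
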
@@ -888,6 +891,13 @@ class TraceClient:
      "parent_trace_id": self.parent_trace_id,
      "parent_name": self.parent_name
  }
+ # --- Log trace data before saving ---
+ try:
+     rprint(f"[TraceClient.save] Saving trace data for trace_id {self.trace_id}:")
+     rprint(json.dumps(trace_data, indent=2))
+ except Exception as log_e:
+     rprint(f"[TraceClient.save] Error logging trace data: {log_e}")
+ # --- End logging ---
  self.trace_manager_client.save_trace(trace_data)
 
  return self.trace_id, trace_data
@@ -910,7 +920,8 @@ class Tracer:
  rules: Optional[List[Rule]] = None, # Added rules parameter
  organization_id: str = os.getenv("JUDGMENT_ORG_ID"),
  enable_monitoring: bool = os.getenv("JUDGMENT_MONITORING", "true").lower() == "true",
- enable_evaluations: bool = os.getenv("JUDGMENT_EVALUATIONS", "true").lower() == "true"
+ enable_evaluations: bool = os.getenv("JUDGMENT_EVALUATIONS", "true").lower() == "true",
+ deep_tracing: bool = True # NEW: Enable deep tracing by default
  ):
  if not hasattr(self, 'initialized'):
      if not api_key:
@@ -927,6 +938,7 @@ class Tracer:
  self.initialized: bool = True
  self.enable_monitoring: bool = enable_monitoring
  self.enable_evaluations: bool = enable_evaluations
+ self.deep_tracing: bool = deep_tracing # NEW: Store deep tracing setting
  elif hasattr(self, 'project_name') and self.project_name != project_name:
      warnings.warn(
          f"Attempting to initialize Tracer with project_name='{project_name}' but it was already initialized with "
@@ -941,12 +953,52 @@ class Tracer:
  """
  current_trace_var.set(trace)
 
- def get_current_trace(self):
+ def get_current_trace(self) -> Optional[TraceClient]:
      """
      Get the current trace context from contextvars
      """
      return current_trace_var.get()
 
+ def _apply_deep_tracing(self, func, span_type="span"):
+     """
+     Apply deep tracing to all functions in the same module as the given function.
+
+     Args:
+         func: The function being traced
+         span_type: Type of span to use for traced functions
+
+     Returns:
+         A tuple of (module, original_functions_dict) where original_functions_dict
+         contains the original functions that were replaced with traced versions.
+     """
+     module = inspect.getmodule(func)
+     if not module:
+         return None, {}
+
+     # Save original functions
+     original_functions = {}
+
+     # Find all functions in the module
+     for name, obj in inspect.getmembers(module, inspect.isfunction):
+         # Skip already wrapped functions
+         if hasattr(obj, '_judgment_traced'):
+             continue
+
+         # Create a traced version of the function
+         # Always use default span type "span" for child functions
+         traced_func = _create_deep_tracing_wrapper(obj, self, "span")
+
+         # Mark the function as traced to avoid double wrapping
+         traced_func._judgment_traced = True
+
+         # Save the original function
+         original_functions[name] = obj
+
+         # Replace with traced version
+         setattr(module, name, traced_func)
+
+     return module, original_functions
+
  @contextmanager
  def trace(
      self,
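
_apply_deep_tracing is plain module-level monkey patching: save each function object, swap in a wrapped version, and (in the observe wrappers further below) put the originals back after the call returns. The save/patch/restore cycle, reduced to a standalone sketch with a generic wrap factory standing in for _create_deep_tracing_wrapper:

    import inspect
    import types
    from typing import Callable, Dict

    def patch_module(module: types.ModuleType,
                     wrap: Callable[[Callable], Callable]) -> Dict[str, Callable]:
        """Replace every module-level function with wrap(func); return the originals."""
        originals: Dict[str, Callable] = {}
        for name, obj in inspect.getmembers(module, inspect.isfunction):
            if getattr(obj, '_wrapped', False):   # skip functions already patched
                continue
            traced = wrap(obj)
            traced._wrapped = True                # guard against double wrapping
            originals[name] = obj
            setattr(module, name, traced)
        return originals

    def restore_module(module: types.ModuleType, originals: Dict[str, Callable]) -> None:
        """Undo patch_module by reinstating the saved function objects."""
        for name, obj in originals.items():
            setattr(module, name, obj)

One limitation worth noting: only names resolved through the module's globals at call time are intercepted; references captured before patching (aliases, callbacks passed elsewhere) still point at the originals.
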
@@ -992,14 +1044,8 @@ class Tracer:
  finally:
      # Reset the context variable
      current_trace_var.reset(token)
-
- def get_current_trace(self) -> Optional[TraceClient]:
-     """
-     Get the current trace context from contextvars
-     """
-     return current_trace_var.get()
-
- def observe(self, func=None, *, name=None, span_type: SpanType = "span", project_name: str = None, overwrite: bool = False):
+
+ def observe(self, func=None, *, name=None, span_type: SpanType = "span", project_name: str = None, overwrite: bool = False, deep_tracing: bool = None):
  """
  Decorator to trace function execution with detailed entry/exit information.
 
@@ -1009,20 +1055,37 @@ class Tracer:
  span_type: Type of span (default "span")
  project_name: Optional project name override
  overwrite: Whether to overwrite existing traces
+ deep_tracing: Whether to enable deep tracing for this function and all nested calls.
+     If None, uses the tracer's default setting.
  """
  # If monitoring is disabled, return the function as is
  if not self.enable_monitoring:
      return func if func else lambda f: f
 
  if func is None:
-     return lambda f: self.observe(f, name=name, span_type=span_type, project_name=project_name, overwrite=overwrite)
+     return lambda f: self.observe(f, name=name, span_type=span_type, project_name=project_name,
+                                   overwrite=overwrite, deep_tracing=deep_tracing)
 
  # Use provided name or fall back to function name
  span_name = name or func.__name__
 
+ # Store custom attributes on the function object
+ func._judgment_span_name = span_name
+ func._judgment_span_type = span_type
+
+ # Use the provided deep_tracing value or fall back to the tracer's default
+ use_deep_tracing = deep_tracing if deep_tracing is not None else self.deep_tracing
+
  if asyncio.iscoroutinefunction(func):
      @functools.wraps(func)
      async def async_wrapper(*args, **kwargs):
+         # Check if we're already in a traced function
+         if in_traced_function_var.get():
+             return await func(*args, **kwargs)
+
+         # Set in_traced_function_var to True
+         token = in_traced_function_var.set(True)
+
          # Get current trace from context
          current_trace = current_trace_var.get()
 
@@ -1057,9 +1120,18 @@ class Tracer:
      'kwargs': kwargs
  })
 
+ # If deep tracing is enabled, apply monkey patching
+ if use_deep_tracing:
+     module, original_functions = self._apply_deep_tracing(func, span_type)
+
  # Execute function
  result = await func(*args, **kwargs)
 
+ # Restore original functions if deep tracing was enabled
+ if use_deep_tracing and module and 'original_functions' in locals():
+     for name, obj in original_functions.items():
+         setattr(module, name, obj)
+
  # Record output
  span.record_output(result)
 
@@ -1069,29 +1141,52 @@ class Tracer:
  finally:
      # Reset trace context (span context resets automatically)
      current_trace_var.reset(trace_token)
+     # Reset in_traced_function_var
+     in_traced_function_var.reset(token)
  else:
      # Already have a trace context, just create a span in it
      # The span method handles current_span_var
-     with current_trace.span(span_name, span_type=span_type) as span: # MODIFIED: Use span_name directly
-         # Record inputs
-         span.record_input({
-             'args': str(args),
-             'kwargs': kwargs
-         })
-
-         # Execute function
-         result = await func(*args, **kwargs)
-
-         # Record output
-         span.record_output(result)
+
+     try:
+         with current_trace.span(span_name, span_type=span_type) as span: # MODIFIED: Use span_name directly
+             # Record inputs
+             span.record_input({
+                 'args': str(args),
+                 'kwargs': kwargs
+             })
+
+             # If deep tracing is enabled, apply monkey patching
+             if use_deep_tracing:
+                 module, original_functions = self._apply_deep_tracing(func, span_type)
+
+             # Execute function
+             result = await func(*args, **kwargs)
+
+             # Restore original functions if deep tracing was enabled
+             if use_deep_tracing and module and 'original_functions' in locals():
+                 for name, obj in original_functions.items():
+                     setattr(module, name, obj)
+
+             # Record output
+             span.record_output(result)
 
          return result
-
+     finally:
+         # Reset in_traced_function_var
+         in_traced_function_var.reset(token)
+
  return async_wrapper
  else:
-     # Non-async function implementation remains unchanged
+     # Non-async function implementation with deep tracing
      @functools.wraps(func)
      def wrapper(*args, **kwargs):
+         # Check if we're already in a traced function
+         if in_traced_function_var.get():
+             return func(*args, **kwargs)
+
+         # Set in_traced_function_var to True
+         token = in_traced_function_var.set(True)
+
          # Get current trace from context
          current_trace = current_trace_var.get()
 
@@ -1126,9 +1221,18 @@ class Tracer:
      'kwargs': kwargs
  })
 
+ # If deep tracing is enabled, apply monkey patching
+ if use_deep_tracing:
+     module, original_functions = self._apply_deep_tracing(func, span_type)
+
  # Execute function
  result = func(*args, **kwargs)
 
+ # Restore original functions if deep tracing was enabled
+ if use_deep_tracing and module and 'original_functions' in locals():
+     for name, obj in original_functions.items():
+         setattr(module, name, obj)
+
  # Record output
  span.record_output(result)
 
@@ -1138,24 +1242,40 @@ class Tracer:
  finally:
      # Reset trace context (span context resets automatically)
      current_trace_var.reset(trace_token)
+     # Reset in_traced_function_var
+     in_traced_function_var.reset(token)
  else:
      # Already have a trace context, just create a span in it
      # The span method handles current_span_var
-     with current_trace.span(span_name, span_type=span_type) as span: # MODIFIED: Use span_name directly
-         # Record inputs
-         span.record_input({
-             'args': str(args),
-             'kwargs': kwargs
-         })
-
-         # Execute function
-         result = func(*args, **kwargs)
-
-         # Record output
-         span.record_output(result)
+
+     try:
+         with current_trace.span(span_name, span_type=span_type) as span: # MODIFIED: Use span_name directly
+             # Record inputs
+             span.record_input({
+                 'args': str(args),
+                 'kwargs': kwargs
+             })
+
+             # If deep tracing is enabled, apply monkey patching
+             if use_deep_tracing:
+                 module, original_functions = self._apply_deep_tracing(func, span_type)
+
+             # Execute function
+             result = func(*args, **kwargs)
+
+             # Restore original functions if deep tracing was enabled
+             if use_deep_tracing and module and 'original_functions' in locals():
+                 for name, obj in original_functions.items():
+                     setattr(module, name, obj)
+
+             # Record output
+             span.record_output(result)
 
          return result
-
+     finally:
+         # Reset in_traced_function_var
+         in_traced_function_var.reset(token)
+
  return wrapper
 
  def score(self, func=None, scorers: List[Union[APIJudgmentScorer, JudgevalScorer]] = None, model: str = None, log_results: bool = True, *, name: str = None, span_type: SpanType = "span"):
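
Taken together, these decorator changes make deep tracing opt-out per function. A hedged usage sketch (project name and functions are illustrative; assumes JUDGMENT_API_KEY and JUDGMENT_ORG_ID are set in the environment):

    from judgeval.common.tracer import Tracer

    judgment = Tracer(project_name="my_project")   # deep_tracing defaults to True

    def lookup(city: str) -> str:
        # Undecorated module-level helper: with deep tracing on, it is
        # temporarily patched and recorded as a child span automatically.
        return f"weather in {city}"

    @judgment.observe(span_type="tool")
    def get_weather(city: str) -> str:
        return lookup(city)                        # lookup appears in the trace

    @judgment.observe(span_type="tool", deep_tracing=False)
    def get_weather_shallow(city: str) -> str:
        return lookup(city)                        # lookup is not traced here
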
@@ -1206,7 +1326,7 @@ def wrap(client: Any) -> Any:
  span_name, original_create = _get_client_config(client)
 
  # Handle async clients differently than synchronous clients (need an async function for async clients)
- if (isinstance(client, (AsyncOpenAI, AsyncAnthropic, AsyncTogether))):
+ if (isinstance(client, (AsyncOpenAI, AsyncAnthropic, AsyncTogether, genai.client.AsyncClient))):
      async def traced_create(*args, **kwargs):
          # Get the current trace from contextvars
          current_trace = current_trace_var.get()
@@ -1265,6 +1385,8 @@ def wrap(client: Any) -> Any:
      client.chat.completions.create = traced_create
  elif isinstance(client, (Anthropic, AsyncAnthropic)):
      client.messages.create = traced_create
+ elif isinstance(client, (genai.Client, genai.client.AsyncClient)):
+     client.models.generate_content = traced_create
 
  return client
 
@@ -1290,6 +1412,8 @@ def _get_client_config(client: ApiClient) -> tuple[str, callable]:
      return "TOGETHER_API_CALL", client.chat.completions.create
  elif isinstance(client, (Anthropic, AsyncAnthropic)):
      return "ANTHROPIC_API_CALL", client.messages.create
+ elif isinstance(client, (genai.Client, genai.client.AsyncClient)):
+     return "GOOGLE_API_CALL", client.models.generate_content
  raise ValueError(f"Unsupported client type: {type(client)}")
 
  def _format_input_data(client: ApiClient, **kwargs) -> dict:
@@ -1303,6 +1427,11 @@ def _format_input_data(client: ApiClient, **kwargs) -> dict:
      "model": kwargs.get("model"),
      "messages": kwargs.get("messages"),
  }
+ elif isinstance(client, (genai.Client, genai.client.AsyncClient)):
+     return {
+         "model": kwargs.get("model"),
+         "contents": kwargs.get("contents")
+     }
  # Anthropic requires additional max_tokens parameter
  return {
      "model": kwargs.get("model"),
@@ -1330,6 +1459,15 @@ def _format_output_data(client: ApiClient, response: Any) -> dict:
          "total_tokens": response.usage.total_tokens
      }
  }
+ elif isinstance(client, (genai.Client, genai.client.AsyncClient)):
+     return {
+         "content": response.candidates[0].content.parts[0].text,
+         "usage": {
+             "prompt_tokens": response.usage_metadata.prompt_token_count,
+             "completion_tokens": response.usage_metadata.candidates_token_count,
+             "total_tokens": response.usage_metadata.total_token_count
+         }
+     }
  # Anthropic has a different response structure
  return {
      "content": response.content[0].text,
@@ -1340,29 +1478,88 @@ def _format_output_data(client: ApiClient, response: Any) -> dict:
      }
  }
 
- # Add a global context-preserving gather function
- # async def trace_gather(*coroutines, return_exceptions=False): # REMOVED
- #     """ # REMOVED
- #     A wrapper around asyncio.gather that ensures the trace context # REMOVED
- #     is available within the gathered coroutines using contextvars.copy_context. # REMOVED
- #     """ # REMOVED
- #     # Get the original asyncio.gather (if we patched it) # REMOVED
- #     original_gather = getattr(asyncio, "_original_gather", asyncio.gather) # REMOVED
- #     # REMOVED
- #     # Use contextvars.copy_context() to ensure context propagation # REMOVED
- #     ctx = contextvars.copy_context() # REMOVED
- #     # REMOVED
- #     # Wrap the gather call within the copied context # REMOVED
- #     return await ctx.run(original_gather, *coroutines, return_exceptions=return_exceptions) # REMOVED
-
- # Store the original gather and apply the patch *once*
- # global _original_gather_stored # REMOVED
- # if not globals().get('_original_gather_stored'): # REMOVED
- #     # Check if asyncio.gather is already our wrapper to prevent double patching # REMOVED
- #     if asyncio.gather.__name__ != 'trace_gather': # REMOVED
- #         asyncio._original_gather = asyncio.gather # REMOVED
- #         asyncio.gather = trace_gather # REMOVED
- #         _original_gather_stored = True # REMOVED
+ # Add a new function for deep tracing at the module level
+ def _create_deep_tracing_wrapper(func, tracer, span_type="span"):
+     """
+     Creates a wrapper for a function that automatically traces it when called within a traced function.
+     This enables deep tracing without requiring explicit @observe decorators on every function.
+
+     Args:
+         func: The function to wrap
+         tracer: The Tracer instance
+         span_type: Type of span (default "span")
+
+     Returns:
+         A wrapped function that will be traced when called
+     """
+     # Skip wrapping if the function is not callable or is a built-in
+     if not callable(func) or isinstance(func, type) or func.__module__ == 'builtins':
+         return func
+
+     # Get function name for the span - check for custom name set by @observe
+     func_name = getattr(func, '_judgment_span_name', func.__name__)
+
+     # Check for custom span_type set by @observe
+     func_span_type = getattr(func, '_judgment_span_type', "span")
+
+     # Store original function to prevent losing reference
+     original_func = func
+
+     # Create appropriate wrapper based on whether the function is async or not
+     if asyncio.iscoroutinefunction(func):
+         @functools.wraps(func)
+         async def async_deep_wrapper(*args, **kwargs):
+             # Get current trace from context
+             current_trace = current_trace_var.get()
+
+             # If no trace context, just call the function
+             if not current_trace:
+                 return await original_func(*args, **kwargs)
+
+             # Create a span for this function call - use custom span_type if available
+             with current_trace.span(func_name, span_type=func_span_type) as span:
+                 # Record inputs
+                 span.record_input({
+                     'args': str(args),
+                     'kwargs': kwargs
+                 })
+
+                 # Execute function
+                 result = await original_func(*args, **kwargs)
+
+                 # Record output
+                 span.record_output(result)
+
+             return result
+
+         return async_deep_wrapper
+     else:
+         @functools.wraps(func)
+         def deep_wrapper(*args, **kwargs):
+             # Get current trace from context
+             current_trace = current_trace_var.get()
+
+             # If no trace context, just call the function
+             if not current_trace:
+                 return original_func(*args, **kwargs)
+
+             # Create a span for this function call - use custom span_type if available
+             with current_trace.span(func_name, span_type=func_span_type) as span:
+                 # Record inputs
+                 span.record_input({
+                     'args': str(args),
+                     'kwargs': kwargs
+                 })
+
+                 # Execute function
+                 result = original_func(*args, **kwargs)
+
+                 # Record output
+                 span.record_output(result)
+
+             return result
+
+         return deep_wrapper
 
  # Add the new TraceThreadPoolExecutor class
  class TraceThreadPoolExecutor(concurrent.futures.ThreadPoolExecutor):
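
_create_deep_tracing_wrapper cooperates with observe through ordinary function attributes: observe stamps _judgment_span_name and _judgment_span_type onto the decorated function, and the deep-tracing wrapper reads them back with getattr defaults. The pattern in isolation (attribute names here are illustrative):

    import functools

    def tag(name=None, span_type="span"):
        """Decorator factory that stamps metadata, as observe() does."""
        def deco(func):
            func._span_name = name or func.__name__
            func._span_type = span_type
            return func
        return deco

    def wrap_with_metadata(func):
        """A later wrapper recovers the metadata without importing the decorator."""
        span_name = getattr(func, '_span_name', func.__name__)
        span_type = getattr(func, '_span_type', "span")

        @functools.wraps(func)
        def wrapper(*args, **kwargs):
            print(f"entering {span_name} ({span_type})")   # stand-in for span creation
            return func(*args, **kwargs)
        return wrapper
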
judgeval/constants.py CHANGED
@@ -43,7 +43,7 @@ JUDGMENT_EVAL_API_URL = f"{ROOT_API}/evaluate/"
  JUDGMENT_SEQUENCE_EVAL_API_URL = f"{ROOT_API}/evaluate_sequence/"
  JUDGMENT_DATASETS_PUSH_API_URL = f"{ROOT_API}/datasets/push/"
  JUDGMENT_DATASETS_APPEND_API_URL = f"{ROOT_API}/datasets/insert_examples/"
- JUDGMENT_DATASETS_PULL_API_URL = f"{ROOT_API}/datasets/pull/"
+ JUDGMENT_DATASETS_PULL_API_URL = f"{ROOT_API}/datasets/pull_for_judgeval/"
  JUDGMENT_DATASETS_DELETE_API_URL = f"{ROOT_API}/datasets/delete/"
  JUDGMENT_DATASETS_EXPORT_JSONL_API_URL = f"{ROOT_API}/datasets/export_jsonl/"
  JUDGMENT_DATASETS_PROJECT_STATS_API_URL = f"{ROOT_API}/datasets/fetch_stats_by_project/"
judgeval/data/datasets/dataset.py CHANGED
@@ -7,12 +7,13 @@ import yaml
  from dataclasses import dataclass, field
  from typing import List, Union, Literal
 
- from judgeval.data import Example
+ from judgeval.data import Example, Sequence
  from judgeval.common.logger import debug, error, warning, info
 
  @dataclass
  class EvalDataset:
      examples: List[Example]
+     sequences: List[Sequence]
      _alias: Union[str, None] = field(default=None)
      _id: Union[str, None] = field(default=None)
      judgment_api_key: str = field(default="")
@@ -21,11 +22,13 @@ class EvalDataset:
  judgment_api_key: str = os.getenv("JUDGMENT_API_KEY"),
  organization_id: str = os.getenv("JUDGMENT_ORG_ID"),
  examples: List[Example] = [],
+ sequences: List[Sequence] = []
  ):
  debug(f"Initializing EvalDataset with {len(examples)} examples")
  if not judgment_api_key:
      warning("No judgment_api_key provided")
  self.examples = examples
+ self.sequences = sequences
  self._alias = None
  self._id = None
  self.judgment_api_key = judgment_api_key
@@ -309,6 +312,7 @@ class EvalDataset:
  return (
      f"{self.__class__.__name__}("
      f"examples={self.examples}, "
+     f"sequences={self.sequences}, "
      f"_alias={self._alias}, "
      f"_id={self._id}"
      f")"
judgeval/data/datasets/eval_dataset_client.py CHANGED
@@ -13,7 +13,7 @@ from judgeval.constants import (
      JUDGMENT_DATASETS_INSERT_API_URL,
      JUDGMENT_DATASETS_EXPORT_JSONL_API_URL
  )
- from judgeval.data import Example
+ from judgeval.data import Example, Sequence
  from judgeval.data.datasets import EvalDataset
 
 
@@ -201,8 +201,8 @@ class EvalDatasetClient:
 
  info(f"Successfully pulled dataset with alias '{alias}'")
  payload = response.json()
-
  dataset.examples = [Example(**e) for e in payload.get("examples", [])]
+ dataset.sequences = [Sequence(**s) for s in payload.get("sequences", [])]
  dataset._alias = payload.get("alias")
  dataset._id = payload.get("id")
  progress.update(
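
With sequences now a first-class dataset field, a pulled payload round-trips both lists. Constructing a dataset locally might look like this (Example fields are illustrative; the exact required fields are defined elsewhere in judgeval.data):

    from judgeval.data import Example, Sequence
    from judgeval.data.datasets import EvalDataset

    example = Example(input="What is 2 + 2?", actual_output="4")   # illustrative fields
    sequence = Sequence(items=[example])

    dataset = EvalDataset(examples=[example], sequences=[sequence])
    print(dataset)   # repr now reports sequences=... alongside examples=...

Note that both examples and sequences use mutable list defaults in the __init__ signature, so passing explicit lists is safer than relying on the shared defaults.
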
judgeval/data/sequence.py CHANGED
@@ -16,6 +16,9 @@ class Sequence(BaseModel):
  scorers: Optional[Any] = None
  parent_sequence_id: Optional[str] = None
  sequence_order: Optional[int] = 0
+ root_sequence_id: Optional[str] = None
+ inputs: Optional[str] = None
+ output: Optional[str] = None
 
  @field_validator("scorers")
  def validate_scorer(cls, v):
@@ -30,28 +33,21 @@ class Sequence(BaseModel):
          raise ValueError(f"Failed to load implementation for scorer {scorer}: {str(e)}")
      return loaded_scorers
 
- @model_validator(mode='after')
- def set_parent_sequence_ids(self) -> "Sequence":
-     """Recursively set the parent_sequence_id for all nested Sequences."""
-     for item in self.items:
-         if isinstance(item, Sequence):
-             item.parent_sequence_id = self.sequence_id
-             # Recurse into deeper nested sequences
-             item.set_parent_sequence_ids()
-     return self
+ @model_validator(mode="after")
+ def populate_sequence_metadata(self) -> "Sequence":
+     """Recursively set parent_sequence_id, root_sequence_id, and sequence_order."""
+     # If root_sequence_id isn't already set, assign it to self
+     if self.root_sequence_id is None:
+         self.root_sequence_id = self.sequence_id
 
- @model_validator(mode='after')
- def set_parent_and_order(self) -> "Sequence":
-     """Set parent_sequence_id and sequence_order for all items."""
      for idx, item in enumerate(self.items):
-         # Set sequence_order for both Example and Sequence objects
          item.sequence_order = idx
-
          if isinstance(item, Sequence):
              item.parent_sequence_id = self.sequence_id
-             item.set_parent_and_order() # Recurse for nested sequences
+             item.root_sequence_id = self.root_sequence_id
+             item.populate_sequence_metadata()
      return self
-
+
  class Config:
      arbitrary_types_allowed = True
 
judgeval/data/sequence_run.py CHANGED
@@ -21,6 +21,7 @@ class SequenceRun(BaseModel):
  metadata (Optional[Dict[str, Any]]): Additional metadata to include for this evaluation run, e.g. comments, dataset name, purpose, etc.
  judgment_api_key (Optional[str]): The API key for running evaluations on the Judgment API
  rules (Optional[List[Rule]]): Rules to evaluate against scoring results
+ append (Optional[bool]): Whether to append to existing evaluation results
  """
 
  # The user will specify whether they want log_results when they call run_eval
@@ -33,6 +34,7 @@ class SequenceRun(BaseModel):
  aggregator: Optional[str] = None
  metadata: Optional[Dict[str, Any]] = None
  trace_span_id: Optional[str] = None
+ append: Optional[bool] = False
  # API Key will be "" until user calls client.run_eval(), then API Key will be set
  judgment_api_key: Optional[str] = ""
  override: Optional[bool] = False
judgeval/judgment_client.py CHANGED
@@ -93,16 +93,47 @@ class JudgmentClient(metaclass=SingletonMeta):
      self,
      sequences: List[Sequence],
      model: Union[str, List[str], JudgevalJudge],
+     scorers: List[Union[ScorerWrapper, JudgevalScorer]],
      aggregator: Optional[str] = None,
      project_name: str = "default_project",
      eval_run_name: str = "default_eval_sequence",
      use_judgment: bool = True,
      log_results: bool = True,
+     append: bool = False,
      override: bool = False,
      ignore_errors: bool = True,
      rules: Optional[List[Rule]] = None
  ) -> List[ScoringResult]:
  try:
+     loaded_scorers = []
+     for scorer in scorers:
+         try:
+             if isinstance(scorer, ScorerWrapper):
+                 loaded_scorers.append(scorer.load_implementation())
+             else:
+                 loaded_scorers.append(scorer)
+         except Exception as e:
+             raise ValueError(f"Failed to load implementation for scorer {scorer}: {str(e)}")
+
+     def get_all_sequences(root: Sequence) -> List[Sequence]:
+         all_sequences = [root]
+
+         for item in root.items:
+             if isinstance(item, Sequence):
+                 all_sequences.extend(get_all_sequences(item))
+
+         return all_sequences
+
+     def flatten_sequence_list(sequences: List[Sequence]) -> List[Sequence]:
+         flattened = []
+         for seq in sequences:
+             flattened.extend(get_all_sequences(seq))
+         return flattened
+
+     flattened_sequences = flatten_sequence_list(sequences)
+     for sequence in flattened_sequences:
+         sequence.scorers = loaded_scorers
+
      if rules:
          loaded_rules = []
          for rule in rules:
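
get_all_sequences walks each sequence depth-first with the parent emitted before its children, and flatten_sequence_list concatenates those walks, so every nested sequence receives the loaded scorers exactly once. Schematically (assuming minimal Sequence constructions as above):

    leaf = Sequence(items=[])
    mid = Sequence(items=[leaf])
    top = Sequence(items=[mid])

    # Internally, flatten_sequence_list([top]) yields [top, mid, leaf]; the loop
    # that follows assigns .scorers = loaded_scorers to all three, not just top.
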
@@ -134,10 +165,10 @@ class JudgmentClient(metaclass=SingletonMeta):
          model=model,
          aggregator=aggregator,
          log_results=log_results,
+         append=append,
          judgment_api_key=self.judgment_api_key,
          organization_id=self.organization_id
      )
-
      return run_sequence_eval(sequence_run, override, ignore_errors, use_judgment)
  except ValueError as e:
      raise ValueError(f"Please check your SequenceRun object, one or more fields are invalid: \n{str(e)}")
@@ -244,98 +275,6 @@ class JudgmentClient(metaclass=SingletonMeta):
          raise ValueError(f"Please check your EvaluationRun object, one or more fields are invalid: \n{str(e)}")
      except Exception as e:
          raise Exception(f"An unexpected error occurred during evaluation: {str(e)}")
-
- def evaluate_dataset(
-     self,
-     dataset: EvalDataset,
-     scorers: List[Union[ScorerWrapper, JudgevalScorer]],
-     model: Union[str, List[str], JudgevalJudge],
-     aggregator: Optional[str] = None,
-     metadata: Optional[Dict[str, Any]] = None,
-     project_name: str = "",
-     eval_run_name: str = "",
-     log_results: bool = True,
-     use_judgment: bool = True,
-     rules: Optional[List[Rule]] = None
- ) -> List[ScoringResult]:
-     """
-     Executes an evaluation of a `EvalDataset` using one or more `Scorer`s
-
-     Args:
-         dataset (EvalDataset): The dataset containing examples to evaluate
-         scorers (List[Union[ScorerWrapper, JudgevalScorer]]): A list of scorers to use for evaluation
-         model (Union[str, List[str], JudgevalJudge]): The model used as a judge when using LLM as a Judge
-         aggregator (Optional[str]): The aggregator to use for evaluation if using Mixture of Judges
-         metadata (Optional[Dict[str, Any]]): Additional metadata to include for this evaluation run
-         project_name (str): The name of the project the evaluation results belong to
-         eval_run_name (str): A name for this evaluation run
-         log_results (bool): Whether to log the results to the Judgment API
-         use_judgment (bool): Whether to use Judgment API for evaluation
-         rules (Optional[List[Rule]]): Rules to evaluate against scoring results
-
-     Returns:
-         List[ScoringResult]: The results of the evaluation
-     """
-     try:
-         # Load appropriate implementations for all scorers
-         loaded_scorers: List[Union[JudgevalScorer, APIJudgmentScorer]] = []
-         for scorer in scorers:
-             try:
-                 if isinstance(scorer, ScorerWrapper):
-                     loaded_scorers.append(scorer.load_implementation(use_judgment=use_judgment))
-                 else:
-                     loaded_scorers.append(scorer)
-             except Exception as e:
-                 raise ValueError(f"Failed to load implementation for scorer {scorer}: {str(e)}")
-
-         # Prevent using JudgevalScorer with rules - only APIJudgmentScorer allowed with rules
-         if rules and any(isinstance(scorer, JudgevalScorer) for scorer in loaded_scorers):
-             raise ValueError("Cannot use Judgeval scorers (only API scorers) when using rules. Please either remove rules or use only APIJudgmentScorer types.")
-
-         # Convert ScorerWrapper in rules to their implementations
-         loaded_rules = None
-         if rules:
-             loaded_rules = []
-             for rule in rules:
-                 try:
-                     processed_conditions = []
-                     for condition in rule.conditions:
-                         # Convert metric if it's a ScorerWrapper
-                         if isinstance(condition.metric, ScorerWrapper):
-                             try:
-                                 condition_copy = condition.model_copy()
-                                 condition_copy.metric = condition.metric.load_implementation(use_judgment=use_judgment)
-                                 processed_conditions.append(condition_copy)
-                             except Exception as e:
-                                 raise ValueError(f"Failed to convert ScorerWrapper to implementation in rule '{rule.name}', condition metric '{condition.metric}': {str(e)}")
-                         else:
-                             processed_conditions.append(condition)
-
-                     # Create new rule with processed conditions
-                     new_rule = rule.model_copy()
-                     new_rule.conditions = processed_conditions
-                     loaded_rules.append(new_rule)
-                 except Exception as e:
-                     raise ValueError(f"Failed to process rule '{rule.name}': {str(e)}")
-
-         evaluation_run = EvaluationRun(
-             log_results=log_results,
-             project_name=project_name,
-             eval_name=eval_run_name,
-             examples=dataset.examples,
-             scorers=loaded_scorers,
-             model=model,
-             aggregator=aggregator,
-             metadata=metadata,
-             judgment_api_key=self.judgment_api_key,
-             rules=loaded_rules,
-             organization_id=self.organization_id
-         )
-         return run_eval(evaluation_run)
-     except ValueError as e:
-         raise ValueError(f"Please check your EvaluationRun object, one or more fields are invalid: \n{str(e)}")
-     except Exception as e:
-         raise Exception(f"An unexpected error occurred during evaluation: {str(e)}")
 
  def create_dataset(self) -> EvalDataset:
      return self.eval_dataset_client.create_dataset()
judgeval/run_evaluation.py CHANGED
@@ -336,7 +336,7 @@ def check_examples(examples: List[Example], scorers: List[APIJudgmentScorer]) ->
 
  def run_sequence_eval(sequence_run: SequenceRun, override: bool = False, ignore_errors: bool = True, async_execution: bool = False) -> List[ScoringResult]:
      # Call endpoint to check to see if eval run name exists (if we DON'T want to override and DO want to log results)
-     if not override and sequence_run.log_results:
+     if not override and sequence_run.log_results and not sequence_run.append:
          check_eval_run_name_exists(
              sequence_run.eval_name,
              sequence_run.project_name,
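
The new append flag flows from JudgmentClient.run_sequence_eval through SequenceRun down to this guard, so appending to an existing run skips the duplicate-name check instead of raising. A hedged call sketch (scorer class and model are illustrative; assumes JUDGMENT_API_KEY and JUDGMENT_ORG_ID are set):

    from judgeval import JudgmentClient
    from judgeval.scorers import FaithfulnessScorer   # assumed available API scorer

    client = JudgmentClient()
    results = client.run_sequence_eval(
        sequences=[sequence],                # e.g. the Sequence built earlier
        model="gpt-4o-mini",                 # judge model; value illustrative
        scorers=[FaithfulnessScorer(threshold=0.5)],
        eval_run_name="nightly_sequences",
        append=True,                         # add to the existing run of this name
    )
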
judgeval/version_check.py ADDED
@@ -0,0 +1,22 @@
+ import importlib.metadata
+ import requests
+ import threading
+
+ def check_latest_version(package_name: str = "judgeval"):
+     def _check():
+         try:
+             current_version = importlib.metadata.version(package_name)
+             response = requests.get(f"https://pypi.org/pypi/{package_name}/json", timeout=2)
+             latest_version = response.json()["info"]["version"]
+
+             if current_version != latest_version:
+                 print(
+                     f"\033[93mUPDATE AVAILABLE:\033[0m You are using '{package_name}=={current_version}', "
+                     f"but the latest version is '{latest_version}'. While this version is still supported, "
+                     f"we recommend upgrading to avoid potential issues or missing features: "
+                     f"`pip install --upgrade {package_name}`"
+                 )
+         except Exception:
+             pass
+
+     threading.Thread(target=_check, daemon=True).start()
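
The check is deliberately fail-soft: it runs on a daemon thread so it can neither delay nor outlive the process, caps the PyPI request at two seconds, and swallows every exception, so an offline machine imports judgeval without warnings or errors. The helper also works for any installed distribution:

    from judgeval.version_check import check_latest_version

    check_latest_version()             # background check for judgeval itself
    check_latest_version("requests")   # package_name may be any installed package
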
judgeval-0.0.32.dist-info/METADATA CHANGED
@@ -1,6 +1,6 @@
  Metadata-Version: 2.4
  Name: judgeval
- Version: 0.0.31
+ Version: 0.0.32
  Summary: Judgeval Package
  Project-URL: Homepage, https://github.com/JudgmentLabs/judgeval
  Project-URL: Issues, https://github.com/JudgmentLabs/judgeval/issues
judgeval-0.0.32.dist-info/RECORD CHANGED
@@ -1,25 +1,26 @@
- judgeval/__init__.py,sha256=dtXxsCmI4eEsZdGSUMy8P_pA0bc2-OSGAgb2C__yJoA,252
+ judgeval/__init__.py,sha256=x9HWt4waJwJMAqTuJSg2MezF9Zg-macEjeU-ajbly-8,330
  judgeval/clients.py,sha256=6VQmEqmfCngUdS2MuPBIpHvtDFqOENm8-_BmMvjLyRQ,944
- judgeval/constants.py,sha256=XTqijsuuLEhUBXTjzNJVsee5U_Gl14ULLO5uQVW_nEE,5398
+ judgeval/constants.py,sha256=_XmVAkebMyGrDvvanAVlMgVd4p6MLHdEVsTQFI0kz1k,5411
  judgeval/evaluation_run.py,sha256=WGzx-Ug2qhSmunFo8NrmSstBRsOUc5KpKq0Lc51rqsM,6739
- judgeval/judgment_client.py,sha256=FncHkjyFx2vfXv4cu4DzbOO0ideHNOWtHVbc8pSXNxk,29754
+ judgeval/judgment_client.py,sha256=k0q2s5A0RkhF9ElD9o-KWN10H36t3Of2PrvNF-silf8,26141
  judgeval/rules.py,sha256=B0ZL0pn72D4Jnlr0zMQ6CPHi7D8AQQRariXCVsiCMiI,20542
- judgeval/run_evaluation.py,sha256=2Mv1iLthJeFQZSVhjLOcJKRZ52Sy6OxLb2KyQ_yVwnA,28484
+ judgeval/run_evaluation.py,sha256=hnEY8QckEviXYNJutf-6tLFq2DWCzqWV1EVyPvrVXyA,28512
+ judgeval/version_check.py,sha256=bvJEidB7rAeXozoUbN9Yb97QOR_s2hgvpvj74jJ5HlY,943
  judgeval/common/__init__.py,sha256=7d24BRxtncpMj3AAJCj8RS7TqgjXmW777HVZH6-3sBs,289
  judgeval/common/exceptions.py,sha256=U-TxHLn7oVMezsMuoYouNDb2XuS8RCggfntYf5_6u4E,565
  judgeval/common/logger.py,sha256=KO75wWXCxhUHUMvLaTU31ZzOk6tkZBa7heQ7y0f-zFE,6062
- judgeval/common/tracer.py,sha256=9Qga-7rLFlQK-oM5eK1O_8Mn1SewIrPtFwWbSZFtSII,59651
+ judgeval/common/tracer.py,sha256=owRRfIZXPUOVCCn0macygnf18mcp8am1eULGnZXD0Kk,68876
  judgeval/common/utils.py,sha256=LUQV5JfDr6wj7xHAJoNq-gofNZ6mjXbeKrGKzBME1KM,33533
  judgeval/data/__init__.py,sha256=xuKx_KCVHGp6CXvQuVmKl3v7pJp-qDaz0NccKxwjtO0,481
  judgeval/data/custom_example.py,sha256=QRBqiRiZS8UgVeTRHY0r1Jzm6yAYsyg6zmHxQGxdiQs,739
  judgeval/data/example.py,sha256=cJrmPGLel_P2sy1UaRvuVSAi35EnA9XMR11Lhp4aDLo,5930
  judgeval/data/result.py,sha256=Gb9tiSDsk1amXgh0cFG6JmlW_BMKxS2kuTwNA0rrHjA,3184
  judgeval/data/scorer_data.py,sha256=JVlaTx1EP2jw2gh3Vgx1CSEsvIFABAN26IquKyxwiJQ,3273
- judgeval/data/sequence.py,sha256=DlQUjyWQJB6iNmiftDZ9N6C-nPtrOC1e0JZ57U00zZk,2387
- judgeval/data/sequence_run.py,sha256=GrnYSZBcZmt4tKQYA_1v09MFB8n3ccrkOJd4qyweHMg,1987
+ judgeval/data/sequence.py,sha256=Fkk2HJGnPboH-Fvwgxub_ryG0eUXa3cbsj7ZD0qkeBo,2204
+ judgeval/data/sequence_run.py,sha256=RmYjfWKMWg-pcF5PLeiWfrhuDkjDZi5VEmAIEXN3Ib0,2104
  judgeval/data/datasets/__init__.py,sha256=IdNKhQv9yYZ_op0rdBacrFaFVmiiYQ3JTzXzxOTsEVQ,176
- judgeval/data/datasets/dataset.py,sha256=AFYjksV_wXx5CqFYJsl3aN8yZ6hC50O1myRuOJ8s8_E,12867
- judgeval/data/datasets/eval_dataset_client.py,sha256=xzXlBJRBEEmwsB79_eepm0Da-Bz8yRodX7ttk-u-BxU,14986
+ judgeval/data/datasets/dataset.py,sha256=dhLo30hvpmmOK2R6O5wDs_neawUJ4lS8bb4S42SufNQ,13034
+ judgeval/data/datasets/eval_dataset_client.py,sha256=xjj66BO9Es9IxXqzQe1RT_e0kpeKlt7OrhRoSuj4KHM,15085
  judgeval/integrations/langgraph.py,sha256=J-cQfFP52TjJewdSTe-fcsUC4HDvjNbXoxmbmF0SgiE,11743
  judgeval/judges/__init__.py,sha256=6X7VSwrwsdxGBNxCyapVRWGghhKOy3MVxFNMQ62kCXM,308
  judgeval/judges/base_judge.py,sha256=ch_S7uBB7lyv44Lf1d7mIGFpveOO58zOkkpImKgd9_4,994
@@ -90,7 +91,7 @@ judgeval/scorers/judgeval_scorers/local_implementations/summarization/prompts.py
  judgeval/scorers/judgeval_scorers/local_implementations/summarization/summarization_scorer.py,sha256=Qk7lwHgRPYeGoxTOyclAh1VfGItfvHJ6l1t7Nk3SWFM,20927
  judgeval/tracer/__init__.py,sha256=wy3DYpH8U_z0GO_K_gOSkK0tTTD-u5eLDo0T5xIBoAc,147
  judgeval/utils/alerts.py,sha256=O19Xj7DA0YVjl8PWiuH4zfdZeu3yiLVvHfY8ah2wG0g,2759
- judgeval-0.0.31.dist-info/METADATA,sha256=g9288fIE7NDwXuqUylqCV0mby5hAY7yEztR8TOn5sNk,5418
- judgeval-0.0.31.dist-info/WHEEL,sha256=qtCwoSJWgHk21S1Kb4ihdzI2rlJ1ZKaIurTj_ngOhyQ,87
- judgeval-0.0.31.dist-info/licenses/LICENSE.md,sha256=tKmCg7k5QOmxPK19XMfzim04QiQJPmgIm0pAn55IJwk,11352
- judgeval-0.0.31.dist-info/RECORD,,
+ judgeval-0.0.32.dist-info/METADATA,sha256=RJzqlHJwfYiOXEcyEEO5WQBM0DC1zQDuoN-Plix6U38,5418
+ judgeval-0.0.32.dist-info/WHEEL,sha256=qtCwoSJWgHk21S1Kb4ihdzI2rlJ1ZKaIurTj_ngOhyQ,87
+ judgeval-0.0.32.dist-info/licenses/LICENSE.md,sha256=tKmCg7k5QOmxPK19XMfzim04QiQJPmgIm0pAn55IJwk,11352
+ judgeval-0.0.32.dist-info/RECORD,,