judgeval 0.0.29__py3-none-any.whl → 0.0.31__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- judgeval/common/tracer.py +93 -55
- judgeval/constants.py +4 -2
- judgeval/data/__init__.py +4 -0
- judgeval/data/custom_example.py +18 -0
- judgeval/data/datasets/eval_dataset_client.py +62 -3
- judgeval/data/example.py +1 -0
- judgeval/data/result.py +7 -6
- judgeval/data/sequence.py +59 -0
- judgeval/data/sequence_run.py +42 -0
- judgeval/evaluation_run.py +12 -7
- judgeval/integrations/langgraph.py +89 -72
- judgeval/judgment_client.py +77 -14
- judgeval/run_evaluation.py +87 -13
- judgeval/scorers/__init__.py +2 -0
- judgeval/scorers/judgeval_scorer.py +3 -0
- judgeval/scorers/judgeval_scorers/__init__.py +7 -0
- judgeval/scorers/judgeval_scorers/api_scorers/__init__.py +2 -1
- judgeval/scorers/judgeval_scorers/api_scorers/derailment_scorer.py +21 -0
- judgeval/scorers/score.py +6 -5
- {judgeval-0.0.29.dist-info → judgeval-0.0.31.dist-info}/METADATA +1 -1
- {judgeval-0.0.29.dist-info → judgeval-0.0.31.dist-info}/RECORD +23 -20
- judgeval/data/custom_api_example.py +0 -91
- {judgeval-0.0.29.dist-info → judgeval-0.0.31.dist-info}/WHEEL +0 -0
- {judgeval-0.0.29.dist-info → judgeval-0.0.31.dist-info}/licenses/LICENSE.md +0 -0
judgeval/common/tracer.py
CHANGED
@@ -24,9 +24,9 @@ import requests
 from litellm import cost_per_token
 from pydantic import BaseModel
 from rich import print as rprint
-from openai import OpenAI
-from together import Together
-from anthropic import Anthropic
+from openai import OpenAI, AsyncOpenAI
+from together import Together, AsyncTogether
+from anthropic import Anthropic, AsyncAnthropic
 
 # Local application/library-specific imports
 from judgeval.constants import (
@@ -37,7 +37,6 @@ from judgeval.constants import (
     RABBITMQ_QUEUE,
     JUDGMENT_TRACES_DELETE_API_URL,
     JUDGMENT_PROJECT_DELETE_API_URL,
-    JUDGMENT_TRACES_ADD_TO_EVAL_QUEUE_API_URL
 )
 from judgeval.judgment_client import JudgmentClient
 from judgeval.data import Example
@@ -54,7 +53,7 @@ current_trace_var = contextvars.ContextVar('current_trace', default=None)
 current_span_var = contextvars.ContextVar('current_span', default=None)  # NEW: ContextVar for the active span name
 
 # Define type aliases for better code readability and maintainability
-ApiClient: TypeAlias = Union[OpenAI, Together, Anthropic]  # Supported API clients
+ApiClient: TypeAlias = Union[OpenAI, Together, Anthropic, AsyncOpenAI, AsyncAnthropic, AsyncTogether]  # Supported API clients
 TraceEntryType = Literal['enter', 'exit', 'output', 'input', 'evaluation']  # Valid trace entry types
 SpanType = Literal['span', 'tool', 'llm', 'evaluation', 'chain']
 @dataclass
@@ -69,11 +68,11 @@ class TraceEntry:
         - evaluation: Evaluation: (evaluation results)
     """
     type: TraceEntryType
-    function: str  # Name of the function being traced
     span_id: str  # Unique ID for this specific span instance
     depth: int  # Indentation level for nested calls
-    message: str  # Human-readable description
     created_at: float  # Unix timestamp when entry was created, replacing the deprecated 'timestamp' field
+    function: Optional[str] = None  # Name of the function being traced
+    message: Optional[str] = None  # Human-readable description
     duration: Optional[float] = None  # Time taken (for exit/evaluation entries)
     trace_id: str = None  # ID of the trace this entry belongs to
     output: Any = None  # Function output value
@@ -229,6 +228,8 @@ class TraceManagerClient:
             raise ValueError(f"Failed to fetch traces: {response.text}")
 
         return response.json()
+
+
 
     def save_trace(self, trace_data: dict):
         """
@@ -356,6 +357,18 @@ class TraceClient:
         self.executed_tools = []
         self.executed_node_tools = []
         self._span_depths: Dict[str, int] = {}  # NEW: To track depth of active spans
+
+    def get_current_span(self):
+        """Get the current span from the context var"""
+        return current_span_var.get()
+
+    def set_current_span(self, span: Any):
+        """Set the current span from the context var"""
+        return current_span_var.set(span)
+
+    def reset_current_span(self, token: Any):
+        """Reset the current span from the context var"""
+        return current_span_var.reset(token)
 
     @contextmanager
     def span(self, name: str, span_type: SpanType = "span"):
@@ -874,27 +887,7 @@ class TraceClient:
             "overwrite": overwrite,
             "parent_trace_id": self.parent_trace_id,
             "parent_name": self.parent_name
-        }
-        # Execute asynchrous evaluation in the background
-        # if not empty_save: # Only send to RabbitMQ if the trace is not empty
-        #     # Send trace data to evaluation queue via API
-        #     try:
-        #         response = requests.post(
-        #             JUDGMENT_TRACES_ADD_TO_EVAL_QUEUE_API_URL,
-        #             json=trace_data,
-        #             headers={
-        #                 "Content-Type": "application/json",
-        #                 "Authorization": f"Bearer {self.tracer.api_key}",
-        #                 "X-Organization-Id": self.tracer.organization_id
-        #             },
-        #             verify=True
-        #         )
-
-        #         if response.status_code != HTTPStatus.OK:
-        #             warnings.warn(f"Failed to add trace to evaluation queue: {response.text}")
-        #     except Exception as e:
-        #         warnings.warn(f"Error sending trace to evaluation queue: {str(e)}")
-
+        }
         self.trace_manager_client.save_trace(trace_data)
 
         return self.trace_id, trace_data
@@ -941,6 +934,18 @@ class Tracer:
                 "To use a different project name, ensure the first Tracer initialization uses the desired project name.",
                 RuntimeWarning
             )
+
+    def set_current_trace(self, trace: TraceClient):
+        """
+        Set the current trace context in contextvars
+        """
+        current_trace_var.set(trace)
+
+    def get_current_trace(self):
+        """
+        Get the current trace context from contextvars
+        """
+        return current_trace_var.get()
 
     @contextmanager
     def trace(
@@ -1199,33 +1204,66 @@ def wrap(client: Any) -> Any:
     """
     # Get the appropriate configuration for this client type
    span_name, original_create = _get_client_config(client)
-
-
-
-
-
-
-    if not current_trace:
-        return original_create(*args, **kwargs)
-
-    with current_trace.span(span_name, span_type="llm") as span:
-        # Format and record the input parameters
-        input_data = _format_input_data(client, **kwargs)
-        span.record_input(input_data)
-
-        # Make the actual API call
-        response = original_create(*args, **kwargs)
+
+    # Handle async clients differently than synchronous clients (need an async function for async clients)
+    if (isinstance(client, (AsyncOpenAI, AsyncAnthropic, AsyncTogether))):
+        async def traced_create(*args, **kwargs):
+            # Get the current trace from contextvars
+            current_trace = current_trace_var.get()
 
-        #
-
-
+            # Skip tracing if no active trace
+            if not current_trace:
+                return original_create(*args, **kwargs)
+
+            with current_trace.span(span_name, span_type="llm") as span:
+                # Format and record the input parameters
+                input_data = _format_input_data(client, **kwargs)
+                span.record_input(input_data)
+
+                # Make the actual API call
+                try:
+                    response = await original_create(*args, **kwargs)
+                except Exception as e:
+                    print(f"Error during API call: {e}")
+                    raise
+
+                # Format and record the output
+                output_data = _format_output_data(client, response)
+                span.record_output(output_data)
+
+                return response
+    else:
+        def traced_create(*args, **kwargs):
+            # Get the current trace from contextvars
+            current_trace = current_trace_var.get()
 
+            # Skip tracing if no active trace
+            if not current_trace:
+                return original_create(*args, **kwargs)
+
+            with current_trace.span(span_name, span_type="llm") as span:
+                # Format and record the input parameters
+                input_data = _format_input_data(client, **kwargs)
+                span.record_input(input_data)
+
+                # Make the actual API call
+                try:
+                    response = original_create(*args, **kwargs)
+                except Exception as e:
+                    print(f"Error during API call: {e}")
+                    raise
+
+                # Format and record the output
+                output_data = _format_output_data(client, response)
+                span.record_output(output_data)
+
+                return response
+
 
     # Replace the original method with our traced version
-    if isinstance(client, (OpenAI, Together)):
+    if isinstance(client, (OpenAI, Together, AsyncOpenAI, AsyncTogether)):
         client.chat.completions.create = traced_create
-    elif isinstance(client, Anthropic):
+    elif isinstance(client, (Anthropic, AsyncAnthropic)):
         client.messages.create = traced_create
 
     return client
@@ -1246,11 +1284,11 @@ def _get_client_config(client: ApiClient) -> tuple[str, callable]:
     Raises:
         ValueError: If client type is not supported
     """
-    if isinstance(client, OpenAI):
+    if isinstance(client, (OpenAI, AsyncOpenAI)):
        return "OPENAI_API_CALL", client.chat.completions.create
-    elif isinstance(client, Together):
+    elif isinstance(client, (Together, AsyncTogether)):
        return "TOGETHER_API_CALL", client.chat.completions.create
-    elif isinstance(client, Anthropic):
+    elif isinstance(client, (Anthropic, AsyncAnthropic)):
        return "ANTHROPIC_API_CALL", client.messages.create
     raise ValueError(f"Unsupported client type: {type(client)}")
 
@@ -1260,7 +1298,7 @@ def _format_input_data(client: ApiClient, **kwargs) -> dict:
     Extracts relevant parameters from kwargs based on the client type
     to ensure consistent tracing across different APIs.
     """
-    if isinstance(client, (OpenAI, Together)):
+    if isinstance(client, (OpenAI, Together, AsyncOpenAI, AsyncTogether)):
         return {
             "model": kwargs.get("model"),
             "messages": kwargs.get("messages"),
@@ -1283,7 +1321,7 @@ def _format_output_data(client: ApiClient, response: Any) -> dict:
         - content: The generated text
         - usage: Token usage statistics
     """
-    if isinstance(client, (OpenAI, Together)):
+    if isinstance(client, (OpenAI, Together, AsyncOpenAI, AsyncTogether)):
         return {
             "content": response.choices[0].message.content,
             "usage": {
judgeval/constants.py
CHANGED
@@ -26,7 +26,8 @@ class APIScorer(str, Enum):
     JSON_CORRECTNESS = "json_correctness"
     COMPARISON = "comparison"
     GROUNDEDNESS = "groundedness"
-
+    DERAILMENT = "derailment"
+
     @classmethod
     def _missing_(cls, value):
         # Handle case-insensitive lookup
@@ -39,7 +40,9 @@ UNBOUNDED_SCORERS = set([APIScorer.COMPARISON])  # scorers whose scores are not
 ROOT_API = os.getenv("JUDGMENT_API_URL", "https://api.judgmentlabs.ai")
 # API URLs
 JUDGMENT_EVAL_API_URL = f"{ROOT_API}/evaluate/"
+JUDGMENT_SEQUENCE_EVAL_API_URL = f"{ROOT_API}/evaluate_sequence/"
 JUDGMENT_DATASETS_PUSH_API_URL = f"{ROOT_API}/datasets/push/"
+JUDGMENT_DATASETS_APPEND_API_URL = f"{ROOT_API}/datasets/insert_examples/"
 JUDGMENT_DATASETS_PULL_API_URL = f"{ROOT_API}/datasets/pull/"
 JUDGMENT_DATASETS_DELETE_API_URL = f"{ROOT_API}/datasets/delete/"
 JUDGMENT_DATASETS_EXPORT_JSONL_API_URL = f"{ROOT_API}/datasets/export_jsonl/"
@@ -54,7 +57,6 @@ JUDGMENT_PROJECT_CREATE_API_URL = f"{ROOT_API}/projects/add/"
 JUDGMENT_TRACES_FETCH_API_URL = f"{ROOT_API}/traces/fetch/"
 JUDGMENT_TRACES_SAVE_API_URL = f"{ROOT_API}/traces/save/"
 JUDGMENT_TRACES_DELETE_API_URL = f"{ROOT_API}/traces/delete/"
-JUDGMENT_TRACES_ADD_TO_EVAL_QUEUE_API_URL = f"{ROOT_API}/traces/add_to_trace_eval_queue/"
 JUDGMENT_ADD_TO_RUN_EVAL_QUEUE_API_URL = f"{ROOT_API}/add_to_run_eval_queue/"
 # RabbitMQ
 RABBITMQ_HOST = os.getenv("RABBITMQ_HOST", "rabbitmq-networklb-faa155df16ec9085.elb.us-west-1.amazonaws.com")
judgeval/data/__init__.py
CHANGED
@@ -1,12 +1,16 @@
 from judgeval.data.example import Example, ExampleParams
+from judgeval.data.custom_example import CustomExample
 from judgeval.data.scorer_data import ScorerData, create_scorer_data
 from judgeval.data.result import ScoringResult, generate_scoring_result
+from judgeval.data.sequence import Sequence
 
 __all__ = [
     "Example",
     "ExampleParams",
+    "CustomExample",
     "ScorerData",
     "create_scorer_data",
     "ScoringResult",
     "generate_scoring_result",
+    "Sequence",
 ]
judgeval/data/custom_example.py
ADDED
@@ -0,0 +1,18 @@
+from pydantic import BaseModel, Field
+from typing import Optional, Union, List, Dict, Any
+from uuid import uuid4
+
+class CustomExample(BaseModel):
+    input: Optional[Dict[str, Any]] = None
+    actual_output: Optional[Dict[str, Any]] = None
+    expected_output: Optional[Dict[str, Any]] = None
+    context: Optional[List[str]] = None
+    retrieval_context: Optional[List[str]] = None
+    additional_metadata: Optional[Dict[str, Any]] = None
+    tools_called: Optional[List[str]] = None
+    expected_tools: Optional[List[str]] = None
+    name: Optional[str] = None
+    example_id: str = Field(default_factory=lambda: str(uuid4()))
+    example_index: Optional[int] = None
+    timestamp: Optional[str] = None
+    trace_id: Optional[str] = None
judgeval/data/datasets/eval_dataset_client.py
CHANGED
@@ -6,6 +6,7 @@ from rich.progress import Progress, SpinnerColumn, TextColumn
 from judgeval.common.logger import debug, error, warning, info
 from judgeval.constants import (
     JUDGMENT_DATASETS_PUSH_API_URL,
+    JUDGMENT_DATASETS_APPEND_API_URL,
     JUDGMENT_DATASETS_PULL_API_URL,
     JUDGMENT_DATASETS_PROJECT_STATS_API_URL,
     JUDGMENT_DATASETS_DELETE_API_URL,
@@ -70,9 +71,9 @@ class EvalDatasetClient:
                 },
                 verify=True
             )
-            if response.status_code
-                error(f"Server error during push: {
-
+            if response.status_code != 200:
+                error(f"Server error during push: {response.json()}")
+                raise Exception(f"Server error during push: {response.json()}")
             response.raise_for_status()
         except requests.exceptions.HTTPError as err:
             if response.status_code == 422:
@@ -90,6 +91,64 @@ class EvalDatasetClient:
             )
             return True
 
+
+    def append(self, alias: str, examples: List[Example], project_name: str) -> bool:
+        debug(f"Appending dataset with alias '{alias}'")
+        """
+        Appends the dataset to Judgment platform
+
+        Mock request:
+        dataset = {
+            "alias": alias,
+            "examples": [...],
+            "project_name": project_name
+        } ==>
+        {
+            "_alias": alias,
+            "_id": "..."  # ID of the dataset
+        }
+        """
+        with Progress(
+            SpinnerColumn(style="rgb(106,0,255)"),
+            TextColumn("[progress.description]{task.description}"),
+            transient=False,
+        ) as progress:
+            task_id = progress.add_task(
+                f"Appending [rgb(106,0,255)]'{alias}' to Judgment...",
+                total=100,
+            )
+            content = {
+                "dataset_alias": alias,
+                "project_name": project_name,
+                "examples": [e.to_dict() for e in examples],
+            }
+            try:
+                response = requests.post(
+                    JUDGMENT_DATASETS_APPEND_API_URL,
+                    json=content,
+                    headers={
+                        "Content-Type": "application/json",
+                        "Authorization": f"Bearer {self.judgment_api_key}",
+                        "X-Organization-Id": self.organization_id
+                    },
+                    verify=True
+                )
+                if response.status_code != 200:
+                    error(f"Server error during append: {response.json()}")
+                    raise Exception(f"Server error during append: {response.json()}")
+                response.raise_for_status()
+            except requests.exceptions.HTTPError as err:
+                if response.status_code == 422:
+                    error(f"Validation error during append: {err.response.json()}")
+                else:
+                    error(f"HTTP error during append: {err}")
+
+            progress.update(
+                task_id,
+                description=f"{progress.tasks[task_id].description} [rgb(25,227,160)]Done!)",
+            )
+            return True
+
     def pull(self, alias: str, project_name: str) -> EvalDataset:
         debug(f"Pulling dataset with alias '{alias}'")
         """
judgeval/data/example.py
CHANGED
judgeval/data/result.py
CHANGED
@@ -2,7 +2,8 @@ from dataclasses import dataclass
 from typing import List, Union, Optional, Dict, Any, Union
 from judgeval.common.logger import debug, error
 from pydantic import BaseModel
-from judgeval.data import ScorerData, Example
+from judgeval.data import ScorerData, Example, CustomExample
+from judgeval.data.sequence import Sequence
 
 
 class ScoringResult(BaseModel):
@@ -23,7 +24,7 @@ class ScoringResult(BaseModel):
     name: Optional[str] = None
 
     # The original example object that was used to create the ScoringResult
-    data_object: Optional[Example] = None
+    data_object: Optional[Union[Sequence, CustomExample, Example]] = None
     trace_id: Optional[str] = None
 
     # Additional fields for internal use
@@ -48,7 +49,7 @@ class ScoringResult(BaseModel):
 
 
 def generate_scoring_result(
-
+    data_object: Union[Example, Sequence],
     scorers_data: List[ScorerData],
     run_duration: float,
     success: bool,
@@ -59,15 +60,15 @@ def generate_scoring_result(
     When an LLMTestCase is executed, it turns into an LLMApiTestCase and the progress of the evaluation run is tracked.
     At the end of the evaluation run, we create a TestResult object out of the completed LLMApiTestCase.
     """
-    if
-        name =
+    if data_object.name is not None:
+        name = data_object.name
     else:
         name = "Test Case Placeholder"
         debug(f"No name provided for example, using default name: {name}")
     debug(f"Creating ScoringResult for: {name}")
     scoring_result = ScoringResult(
         name=name,
-        data_object=
+        data_object=data_object,
         success=success,
         scorers_data=scorers_data,
         run_duration=run_duration,
judgeval/data/sequence.py
ADDED
@@ -0,0 +1,59 @@
+from pydantic import BaseModel, Field, field_validator, model_validator
+from typing import List, Optional, Union, Any
+from judgeval.data.example import Example
+from judgeval.scorers import ScorerWrapper, JudgevalScorer
+from uuid import uuid4
+from datetime import datetime, timezone
+
+class Sequence(BaseModel):
+    """
+    A sequence is a list of either Examples or nested Sequence objects.
+    """
+    sequence_id: str = Field(default_factory=lambda: str(uuid4()))
+    name: Optional[str] = "Sequence"
+    created_at: str = Field(default_factory=lambda: datetime.now(timezone.utc).strftime("%Y%m%d_%H%M%S"))
+    items: List[Union["Sequence", Example]]
+    scorers: Optional[Any] = None
+    parent_sequence_id: Optional[str] = None
+    sequence_order: Optional[int] = 0
+
+    @field_validator("scorers")
+    def validate_scorer(cls, v):
+        loaded_scorers = []
+        for scorer in v or []:
+            try:
+                if isinstance(scorer, ScorerWrapper):
+                    loaded_scorers.append(scorer.load_implementation())
+                else:
+                    loaded_scorers.append(scorer)
+            except Exception as e:
+                raise ValueError(f"Failed to load implementation for scorer {scorer}: {str(e)}")
+        return loaded_scorers
+
+    @model_validator(mode='after')
+    def set_parent_sequence_ids(self) -> "Sequence":
+        """Recursively set the parent_sequence_id for all nested Sequences."""
+        for item in self.items:
+            if isinstance(item, Sequence):
+                item.parent_sequence_id = self.sequence_id
+                # Recurse into deeper nested sequences
+                item.set_parent_sequence_ids()
+        return self
+
+    @model_validator(mode='after')
+    def set_parent_and_order(self) -> "Sequence":
+        """Set parent_sequence_id and sequence_order for all items."""
+        for idx, item in enumerate(self.items):
+            # Set sequence_order for both Example and Sequence objects
+            item.sequence_order = idx
+
+            if isinstance(item, Sequence):
+                item.parent_sequence_id = self.sequence_id
+                item.set_parent_and_order()  # Recurse for nested sequences
+        return self
+
+    class Config:
+        arbitrary_types_allowed = True
+
+# Update forward references so that "Sequence" inside items is resolved.
+Sequence.model_rebuild()
judgeval/data/sequence_run.py
ADDED
@@ -0,0 +1,42 @@
+
+from pydantic import BaseModel
+from typing import List, Optional, Dict, Any, Union
+from judgeval.data import Sequence
+from judgeval.scorers import APIJudgmentScorer, JudgevalScorer
+from judgeval.judges import JudgevalJudge
+from judgeval.rules import Rule
+
+
+class SequenceRun(BaseModel):
+    """
+    Stores example and evaluation scorers together for running an eval task
+
+    Args:
+        project_name (str): The name of the project the evaluation results belong to
+        eval_name (str): A name for this evaluation run
+        sequences (List[Sequence]): The sequences to evaluate
+        scorers (List[Union[JudgmentScorer, JudgevalScorer]]): A list of scorers to use for evaluation
+        model (str): The model used as a judge when using LLM as a Judge
+        aggregator (Optional[str]): The aggregator to use for evaluation if using Mixture of Judges
+        metadata (Optional[Dict[str, Any]]): Additional metadata to include for this evaluation run, e.g. comments, dataset name, purpose, etc.
+        judgment_api_key (Optional[str]): The API key for running evaluations on the Judgment API
+        rules (Optional[List[Rule]]): Rules to evaluate against scoring results
+    """
+
+    # The user will specify whether they want log_results when they call run_eval
+    log_results: bool = False  # NOTE: log_results has to be set first because it is used to validate project_name and eval_name
+    organization_id: Optional[str] = None
+    project_name: Optional[str] = None
+    eval_name: Optional[str] = None
+    sequences: List[Sequence]
+    model: Union[str, List[str], JudgevalJudge]
+    aggregator: Optional[str] = None
+    metadata: Optional[Dict[str, Any]] = None
+    trace_span_id: Optional[str] = None
+    # API Key will be "" until user calls client.run_eval(), then API Key will be set
+    judgment_api_key: Optional[str] = ""
+    override: Optional[bool] = False
+    rules: Optional[List[Rule]] = None
+
+    class Config:
+        arbitrary_types_allowed = True
judgeval/evaluation_run.py
CHANGED
@@ -1,7 +1,7 @@
 from typing import List, Optional, Dict, Any, Union
 from pydantic import BaseModel, field_validator
 
-from judgeval.data import Example
+from judgeval.data import Example, CustomExample
 from judgeval.scorers import JudgevalScorer, APIJudgmentScorer
 from judgeval.constants import ACCEPTABLE_MODELS
 from judgeval.common.logger import debug, error
@@ -15,7 +15,7 @@ class EvaluationRun(BaseModel):
     Args:
         project_name (str): The name of the project the evaluation results belong to
         eval_name (str): A name for this evaluation run
-        examples (List[Example]): The examples to evaluate
+        examples (Union[List[Example], List[CustomExample]]): The examples to evaluate
         scorers (List[Union[JudgmentScorer, JudgevalScorer]]): A list of scorers to use for evaluation
         model (str): The model used as a judge when using LLM as a Judge
         aggregator (Optional[str]): The aggregator to use for evaluation if using Mixture of Judges
@@ -29,7 +29,7 @@ class EvaluationRun(BaseModel):
     organization_id: Optional[str] = None
     project_name: Optional[str] = None
     eval_name: Optional[str] = None
-    examples: List[Example]
+    examples: Union[List[Example], List[CustomExample]]
     scorers: List[Union[APIJudgmentScorer, JudgevalScorer]]
     model: Union[str, List[str], JudgevalJudge]
     aggregator: Optional[str] = None
@@ -38,6 +38,7 @@ class EvaluationRun(BaseModel):
     # API Key will be "" until user calls client.run_eval(), then API Key will be set
     judgment_api_key: Optional[str] = ""
     override: Optional[bool] = False
+    append: Optional[bool] = False
     rules: Optional[List[Rule]] = None
 
     def model_dump(self, **kwargs):
@@ -78,13 +79,17 @@ class EvaluationRun(BaseModel):
             raise ValueError("Eval name is required when log_results is True. Please include the eval_run_name argument.")
         return v
 
-    @field_validator('examples')
+    @field_validator('examples', mode='before')
     def validate_examples(cls, v):
         if not v:
             raise ValueError("Examples cannot be empty.")
-
-
-
+
+        first_type = type(v[0])
+        if first_type not in (Example, CustomExample):
+            raise ValueError(f"Invalid type for Example/CustomExample: {first_type}")
+        if not all(isinstance(ex, first_type) for ex in v):
+            raise ValueError("All examples must be of the same type, either all Example or all CustomExample.")
+
         return v
 
     @field_validator('scorers')