judgeval 0.0.30__py3-none-any.whl → 0.0.32__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
judgeval/constants.py CHANGED
@@ -26,7 +26,8 @@ class APIScorer(str, Enum):
     JSON_CORRECTNESS = "json_correctness"
     COMPARISON = "comparison"
     GROUNDEDNESS = "groundedness"
-
+    DERAILMENT = "derailment"
+
     @classmethod
     def _missing_(cls, value):
         # Handle case-insensitive lookup
@@ -39,8 +40,10 @@ UNBOUNDED_SCORERS = set([APIScorer.COMPARISON]) # scorers whose scores are not
 ROOT_API = os.getenv("JUDGMENT_API_URL", "https://api.judgmentlabs.ai")
 # API URLs
 JUDGMENT_EVAL_API_URL = f"{ROOT_API}/evaluate/"
+JUDGMENT_SEQUENCE_EVAL_API_URL = f"{ROOT_API}/evaluate_sequence/"
 JUDGMENT_DATASETS_PUSH_API_URL = f"{ROOT_API}/datasets/push/"
-JUDGMENT_DATASETS_PULL_API_URL = f"{ROOT_API}/datasets/pull/"
+JUDGMENT_DATASETS_APPEND_API_URL = f"{ROOT_API}/datasets/insert_examples/"
+JUDGMENT_DATASETS_PULL_API_URL = f"{ROOT_API}/datasets/pull_for_judgeval/"
 JUDGMENT_DATASETS_DELETE_API_URL = f"{ROOT_API}/datasets/delete/"
 JUDGMENT_DATASETS_EXPORT_JSONL_API_URL = f"{ROOT_API}/datasets/export_jsonl/"
 JUDGMENT_DATASETS_PROJECT_STATS_API_URL = f"{ROOT_API}/datasets/fetch_stats_by_project/"
@@ -54,7 +57,6 @@ JUDGMENT_PROJECT_CREATE_API_URL = f"{ROOT_API}/projects/add/"
 JUDGMENT_TRACES_FETCH_API_URL = f"{ROOT_API}/traces/fetch/"
 JUDGMENT_TRACES_SAVE_API_URL = f"{ROOT_API}/traces/save/"
 JUDGMENT_TRACES_DELETE_API_URL = f"{ROOT_API}/traces/delete/"
-JUDGMENT_TRACES_ADD_TO_EVAL_QUEUE_API_URL = f"{ROOT_API}/traces/add_to_trace_eval_queue/"
 JUDGMENT_ADD_TO_RUN_EVAL_QUEUE_API_URL = f"{ROOT_API}/add_to_run_eval_queue/"
 # RabbitMQ
 RABBITMQ_HOST = os.getenv("RABBITMQ_HOST", "rabbitmq-networklb-faa155df16ec9085.elb.us-west-1.amazonaws.com")
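For orientation, here is a small sketch (not part of the diff) showing how the reworked URL constants expand under the default host; it simply evaluates the f-strings above with JUDGMENT_API_URL unset:

    import os
    os.environ.pop("JUDGMENT_API_URL", None)  # fall back to the default api.judgmentlabs.ai host

    from judgeval.constants import (
        JUDGMENT_SEQUENCE_EVAL_API_URL,
        JUDGMENT_DATASETS_APPEND_API_URL,
        JUDGMENT_DATASETS_PULL_API_URL,
    )

    print(JUDGMENT_SEQUENCE_EVAL_API_URL)    # https://api.judgmentlabs.ai/evaluate_sequence/
    print(JUDGMENT_DATASETS_APPEND_API_URL)  # https://api.judgmentlabs.ai/datasets/insert_examples/
    print(JUDGMENT_DATASETS_PULL_API_URL)    # https://api.judgmentlabs.ai/datasets/pull_for_judgeval/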
judgeval/data/__init__.py CHANGED
@@ -1,12 +1,16 @@
 from judgeval.data.example import Example, ExampleParams
+from judgeval.data.custom_example import CustomExample
 from judgeval.data.scorer_data import ScorerData, create_scorer_data
 from judgeval.data.result import ScoringResult, generate_scoring_result
+from judgeval.data.sequence import Sequence
 
 __all__ = [
     "Example",
     "ExampleParams",
+    "CustomExample",
     "ScorerData",
     "create_scorer_data",
     "ScoringResult",
     "generate_scoring_result",
+    "Sequence",
 ]
judgeval/data/custom_example.py ADDED
@@ -0,0 +1,18 @@
+from pydantic import BaseModel, Field
+from typing import Optional, Union, List, Dict, Any
+from uuid import uuid4
+
+class CustomExample(BaseModel):
+    input: Optional[Dict[str, Any]] = None
+    actual_output: Optional[Dict[str, Any]] = None
+    expected_output: Optional[Dict[str, Any]] = None
+    context: Optional[List[str]] = None
+    retrieval_context: Optional[List[str]] = None
+    additional_metadata: Optional[Dict[str, Any]] = None
+    tools_called: Optional[List[str]] = None
+    expected_tools: Optional[List[str]] = None
+    name: Optional[str] = None
+    example_id: str = Field(default_factory=lambda: str(uuid4()))
+    example_index: Optional[int] = None
+    timestamp: Optional[str] = None
+    trace_id: Optional[str] = None
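Because CustomExample is now re-exported from judgeval.data (see the __init__.py change above), it can be constructed directly; a minimal sketch, noting that its input/actual_output/expected_output fields are dictionaries rather than strings:

    from judgeval.data import CustomExample

    example = CustomExample(
        input={"question": "What is the capital of France?"},
        actual_output={"answer": "Paris"},
        expected_output={"answer": "Paris"},
        name="capital-question",
    )
    print(example.example_id)  # auto-generated UUID string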
@@ -7,12 +7,13 @@ import yaml
 from dataclasses import dataclass, field
 from typing import List, Union, Literal
 
-from judgeval.data import Example
+from judgeval.data import Example, Sequence
 from judgeval.common.logger import debug, error, warning, info
 
 @dataclass
 class EvalDataset:
     examples: List[Example]
+    sequences: List[Sequence]
     _alias: Union[str, None] = field(default=None)
     _id: Union[str, None] = field(default=None)
     judgment_api_key: str = field(default="")
@@ -21,11 +22,13 @@ class EvalDataset:
         judgment_api_key: str = os.getenv("JUDGMENT_API_KEY"),
         organization_id: str = os.getenv("JUDGMENT_ORG_ID"),
         examples: List[Example] = [],
+        sequences: List[Sequence] = []
     ):
         debug(f"Initializing EvalDataset with {len(examples)} examples")
         if not judgment_api_key:
             warning("No judgment_api_key provided")
         self.examples = examples
+        self.sequences = sequences
         self._alias = None
         self._id = None
         self.judgment_api_key = judgment_api_key
@@ -309,6 +312,7 @@ class EvalDataset:
         return (
             f"{self.__class__.__name__}("
             f"examples={self.examples}, "
+            f"sequences={self.sequences}, "
             f"_alias={self._alias}, "
             f"_id={self._id}"
             f")"
@@ -6,13 +6,14 @@ from rich.progress import Progress, SpinnerColumn, TextColumn
 from judgeval.common.logger import debug, error, warning, info
 from judgeval.constants import (
     JUDGMENT_DATASETS_PUSH_API_URL,
+    JUDGMENT_DATASETS_APPEND_API_URL,
     JUDGMENT_DATASETS_PULL_API_URL,
     JUDGMENT_DATASETS_PROJECT_STATS_API_URL,
     JUDGMENT_DATASETS_DELETE_API_URL,
     JUDGMENT_DATASETS_INSERT_API_URL,
     JUDGMENT_DATASETS_EXPORT_JSONL_API_URL
 )
-from judgeval.data import Example
+from judgeval.data import Example, Sequence
 from judgeval.data.datasets import EvalDataset
 
 
@@ -70,9 +71,9 @@ class EvalDatasetClient:
                 },
                 verify=True
             )
-            if response.status_code == 500:
-                error(f"Server error during push: {content.get('message')}")
-                return False
+            if response.status_code != 200:
+                error(f"Server error during push: {response.json()}")
+                raise Exception(f"Server error during push: {response.json()}")
             response.raise_for_status()
         except requests.exceptions.HTTPError as err:
             if response.status_code == 422:
@@ -90,6 +91,64 @@ class EvalDatasetClient:
             )
         return True
 
+
+    def append(self, alias: str, examples: List[Example], project_name: str) -> bool:
+        debug(f"Appending dataset with alias '{alias}'")
+        """
+        Appends the dataset to Judgment platform
+
+        Mock request:
+        dataset = {
+            "alias": alias,
+            "examples": [...],
+            "project_name": project_name
+        } ==>
+        {
+            "_alias": alias,
+            "_id": "..." # ID of the dataset
+        }
+        """
+        with Progress(
+            SpinnerColumn(style="rgb(106,0,255)"),
+            TextColumn("[progress.description]{task.description}"),
+            transient=False,
+        ) as progress:
+            task_id = progress.add_task(
+                f"Appending [rgb(106,0,255)]'{alias}' to Judgment...",
+                total=100,
+            )
+            content = {
+                "dataset_alias": alias,
+                "project_name": project_name,
+                "examples": [e.to_dict() for e in examples],
+            }
+            try:
+                response = requests.post(
+                    JUDGMENT_DATASETS_APPEND_API_URL,
+                    json=content,
+                    headers={
+                        "Content-Type": "application/json",
+                        "Authorization": f"Bearer {self.judgment_api_key}",
+                        "X-Organization-Id": self.organization_id
+                    },
+                    verify=True
+                )
+                if response.status_code != 200:
+                    error(f"Server error during append: {response.json()}")
+                    raise Exception(f"Server error during append: {response.json()}")
+                response.raise_for_status()
+            except requests.exceptions.HTTPError as err:
+                if response.status_code == 422:
+                    error(f"Validation error during append: {err.response.json()}")
+                else:
+                    error(f"HTTP error during append: {err}")
+
+            progress.update(
+                task_id,
+                description=f"{progress.tasks[task_id].description} [rgb(25,227,160)]Done!)",
+            )
+        return True
+
     def pull(self, alias: str, project_name: str) -> EvalDataset:
         debug(f"Pulling dataset with alias '{alias}'")
         """
@@ -142,8 +201,8 @@ class EvalDatasetClient:
 
             info(f"Successfully pulled dataset with alias '{alias}'")
             payload = response.json()
-
             dataset.examples = [Example(**e) for e in payload.get("examples", [])]
+            dataset.sequences = [Sequence(**s) for s in payload.get("sequences", [])]
             dataset._alias = payload.get("alias")
             dataset._id = payload.get("id")
             progress.update(
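The new append() mirrors push(): it posts examples to JUDGMENT_DATASETS_APPEND_API_URL and, like push(), now raises on any non-200 response instead of returning False. A hedged usage sketch; the client's module path and constructor arguments are assumptions inferred from the headers it sends:

    from judgeval.data import Example
    from judgeval.data.datasets.eval_dataset_client import EvalDatasetClient  # assumed import path

    client = EvalDatasetClient(judgment_api_key="...", organization_id="...")
    client.append(
        alias="my-dataset",
        examples=[Example(input="Hi", actual_output="Hello!")],
        project_name="my-project",
    )

    dataset = client.pull(alias="my-dataset", project_name="my-project")
    print(len(dataset.examples), len(dataset.sequences))  # sequences are now populated from the payload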
judgeval/data/example.py CHANGED
@@ -37,6 +37,7 @@ class Example(BaseModel):
     example_index: Optional[int] = None
     timestamp: Optional[str] = None
     trace_id: Optional[str] = None
+    sequence_order: Optional[int] = 0
 
     def __init__(self, **data):
         if 'example_id' not in data:
judgeval/data/result.py CHANGED
@@ -2,7 +2,8 @@ from dataclasses import dataclass
 from typing import List, Union, Optional, Dict, Any, Union
 from judgeval.common.logger import debug, error
 from pydantic import BaseModel
-from judgeval.data import ScorerData, Example
+from judgeval.data import ScorerData, Example, CustomExample
+from judgeval.data.sequence import Sequence
 
 
 class ScoringResult(BaseModel):
@@ -23,7 +24,7 @@ class ScoringResult(BaseModel):
     name: Optional[str] = None
 
     # The original example object that was used to create the ScoringResult
-    data_object: Optional[Example] = None #can be Example, CustomExample (future), WorkflowRun (future)
+    data_object: Optional[Union[Sequence, CustomExample, Example]] = None
     trace_id: Optional[str] = None
 
     # Additional fields for internal use
@@ -48,7 +49,7 @@ class ScoringResult(BaseModel):
 
 
 def generate_scoring_result(
-    example: Example,
+    data_object: Union[Example, Sequence],
     scorers_data: List[ScorerData],
     run_duration: float,
     success: bool,
@@ -59,15 +60,15 @@ def generate_scoring_result(
     When an LLMTestCase is executed, it turns into an LLMApiTestCase and the progress of the evaluation run is tracked.
     At the end of the evaluation run, we create a TestResult object out of the completed LLMApiTestCase.
     """
-    if example.name is not None:
-        name = example.name
+    if data_object.name is not None:
+        name = data_object.name
     else:
         name = "Test Case Placeholder"
         debug(f"No name provided for example, using default name: {name}")
     debug(f"Creating ScoringResult for: {name}")
     scoring_result = ScoringResult(
         name=name,
-        data_object=example,
+        data_object=data_object,
         success=success,
         scorers_data=scorers_data,
         run_duration=run_duration,
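generate_scoring_result() now accepts a generic data_object, so a Sequence can flow through where only an Example was allowed before. A minimal sketch, assuming the parameters visible in the hunk are the full signature and that Example accepts input/actual_output keywords as in prior releases:

    from judgeval.data import Example, Sequence
    from judgeval.data.result import generate_scoring_result

    seq = Sequence(name="support-flow", items=[Example(input="Hi", actual_output="Hello!")])
    result = generate_scoring_result(
        data_object=seq,
        scorers_data=[],
        run_duration=0.42,
        success=True,
    )
    print(result.name)                        # "support-flow"
    print(type(result.data_object).__name__)  # Sequence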
judgeval/data/sequence.py ADDED
@@ -0,0 +1,55 @@
+from pydantic import BaseModel, Field, field_validator, model_validator
+from typing import List, Optional, Union, Any
+from judgeval.data.example import Example
+from judgeval.scorers import ScorerWrapper, JudgevalScorer
+from uuid import uuid4
+from datetime import datetime, timezone
+
+class Sequence(BaseModel):
+    """
+    A sequence is a list of either Examples or nested Sequence objects.
+    """
+    sequence_id: str = Field(default_factory=lambda: str(uuid4()))
+    name: Optional[str] = "Sequence"
+    created_at: str = Field(default_factory=lambda: datetime.now(timezone.utc).strftime("%Y%m%d_%H%M%S"))
+    items: List[Union["Sequence", Example]]
+    scorers: Optional[Any] = None
+    parent_sequence_id: Optional[str] = None
+    sequence_order: Optional[int] = 0
+    root_sequence_id: Optional[str] = None
+    inputs: Optional[str] = None
+    output: Optional[str] = None
+
+    @field_validator("scorers")
+    def validate_scorer(cls, v):
+        loaded_scorers = []
+        for scorer in v or []:
+            try:
+                if isinstance(scorer, ScorerWrapper):
+                    loaded_scorers.append(scorer.load_implementation())
+                else:
+                    loaded_scorers.append(scorer)
+            except Exception as e:
+                raise ValueError(f"Failed to load implementation for scorer {scorer}: {str(e)}")
+        return loaded_scorers
+
+    @model_validator(mode="after")
+    def populate_sequence_metadata(self) -> "Sequence":
+        """Recursively set parent_sequence_id, root_sequence_id, and sequence_order."""
+        # If root_sequence_id isn't already set, assign it to self
+        if self.root_sequence_id is None:
+            self.root_sequence_id = self.sequence_id
+
+        for idx, item in enumerate(self.items):
+            item.sequence_order = idx
+            if isinstance(item, Sequence):
+                item.parent_sequence_id = self.sequence_id
+                item.root_sequence_id = self.root_sequence_id
+                item.populate_sequence_metadata()
+        return self
+
+    class Config:
+        arbitrary_types_allowed = True
+
+# Update forward references so that "Sequence" inside items is resolved.
+Sequence.model_rebuild()
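Because populate_sequence_metadata() runs as an after-validator, nesting sequences stamps parent/root ids and per-item order automatically at construction time; a small sketch of that propagation (the Example keyword fields are illustrative):

    from judgeval.data import Example, Sequence

    inner = Sequence(name="inner", items=[Example(input="a", actual_output="b")])
    outer = Sequence(name="outer", items=[inner, Example(input="c", actual_output="d")])

    print(outer.root_sequence_id == outer.sequence_id)    # True: the root points at itself
    print(inner.parent_sequence_id == outer.sequence_id)  # True: set by the outer validator
    print(inner.root_sequence_id == outer.sequence_id)    # True: root id is pushed down recursively
    print([item.sequence_order for item in outer.items])  # [0, 1]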
@@ -0,0 +1,44 @@
+
+from pydantic import BaseModel
+from typing import List, Optional, Dict, Any, Union
+from judgeval.data import Sequence
+from judgeval.scorers import APIJudgmentScorer, JudgevalScorer
+from judgeval.judges import JudgevalJudge
+from judgeval.rules import Rule
+
+
+class SequenceRun(BaseModel):
+    """
+    Stores example and evaluation scorers together for running an eval task
+
+    Args:
+        project_name (str): The name of the project the evaluation results belong to
+        eval_name (str): A name for this evaluation run
+        sequences (List[Sequence]): The sequences to evaluate
+        scorers (List[Union[JudgmentScorer, JudgevalScorer]]): A list of scorers to use for evaluation
+        model (str): The model used as a judge when using LLM as a Judge
+        aggregator (Optional[str]): The aggregator to use for evaluation if using Mixture of Judges
+        metadata (Optional[Dict[str, Any]]): Additional metadata to include for this evaluation run, e.g. comments, dataset name, purpose, etc.
+        judgment_api_key (Optional[str]): The API key for running evaluations on the Judgment API
+        rules (Optional[List[Rule]]): Rules to evaluate against scoring results
+        append (Optional[bool]): Whether to append to existing evaluation results
+    """
+
+    # The user will specify whether they want log_results when they call run_eval
+    log_results: bool = False  # NOTE: log_results has to be set first because it is used to validate project_name and eval_name
+    organization_id: Optional[str] = None
+    project_name: Optional[str] = None
+    eval_name: Optional[str] = None
+    sequences: List[Sequence]
+    model: Union[str, List[str], JudgevalJudge]
+    aggregator: Optional[str] = None
+    metadata: Optional[Dict[str, Any]] = None
+    trace_span_id: Optional[str] = None
+    append: Optional[bool] = False
+    # API Key will be "" until user calls client.run_eval(), then API Key will be set
+    judgment_api_key: Optional[str] = ""
+    override: Optional[bool] = False
+    rules: Optional[List[Rule]] = None
+
+    class Config:
+        arbitrary_types_allowed = True
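SequenceRun parallels EvaluationRun but carries sequences instead of examples. A hedged construction sketch; the module path and the judge model name are assumptions, since neither is shown in this diff:

    from judgeval.data import Example, Sequence
    from judgeval.data.sequence_run import SequenceRun  # assumed import path

    run = SequenceRun(
        project_name="my-project",
        eval_name="sequence-smoke-test",
        sequences=[Sequence(name="flow", items=[Example(input="Hi", actual_output="Hello!")])],
        model="gpt-4o",  # illustrative judge model
    )
    print(run.append, run.override)  # False False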
@@ -1,7 +1,7 @@
 from typing import List, Optional, Dict, Any, Union
 from pydantic import BaseModel, field_validator
 
-from judgeval.data import Example
+from judgeval.data import Example, CustomExample
 from judgeval.scorers import JudgevalScorer, APIJudgmentScorer
 from judgeval.constants import ACCEPTABLE_MODELS
 from judgeval.common.logger import debug, error
@@ -15,7 +15,7 @@ class EvaluationRun(BaseModel):
     Args:
         project_name (str): The name of the project the evaluation results belong to
         eval_name (str): A name for this evaluation run
-        examples (List[Example]): The examples to evaluate
+        examples (Union[List[Example], List[CustomExample]]): The examples to evaluate
         scorers (List[Union[JudgmentScorer, JudgevalScorer]]): A list of scorers to use for evaluation
         model (str): The model used as a judge when using LLM as a Judge
         aggregator (Optional[str]): The aggregator to use for evaluation if using Mixture of Judges
@@ -29,7 +29,7 @@ class EvaluationRun(BaseModel):
     organization_id: Optional[str] = None
     project_name: Optional[str] = None
     eval_name: Optional[str] = None
-    examples: List[Example]
+    examples: Union[List[Example], List[CustomExample]]
     scorers: List[Union[APIJudgmentScorer, JudgevalScorer]]
     model: Union[str, List[str], JudgevalJudge]
     aggregator: Optional[str] = None
@@ -38,6 +38,7 @@ class EvaluationRun(BaseModel):
     # API Key will be "" until user calls client.run_eval(), then API Key will be set
     judgment_api_key: Optional[str] = ""
     override: Optional[bool] = False
+    append: Optional[bool] = False
     rules: Optional[List[Rule]] = None
 
     def model_dump(self, **kwargs):
@@ -78,13 +79,17 @@ class EvaluationRun(BaseModel):
             raise ValueError("Eval name is required when log_results is True. Please include the eval_run_name argument.")
         return v
 
-    @field_validator('examples')
+    @field_validator('examples', mode='before')
     def validate_examples(cls, v):
         if not v:
             raise ValueError("Examples cannot be empty.")
-        for ex in v:
-            if not isinstance(ex, Example):
-                raise ValueError(f"Invalid type for Example: {type(ex)}")
+
+        first_type = type(v[0])
+        if first_type not in (Example, CustomExample):
+            raise ValueError(f"Invalid type for Example/CustomExample: {first_type}")
+        if not all(isinstance(ex, first_type) for ex in v):
+            raise ValueError("All examples must be of the same type, either all Example or all CustomExample.")
+
         return v
 
     @field_validator('scorers')
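The examples validator now runs in 'before' mode and enforces a homogeneous list: either all Example or all CustomExample. A sketch of the implied behavior; the import path, scorer, and model name are illustrative assumptions, not taken from this diff:

    from judgeval.data import Example, CustomExample
    from judgeval.evaluation_run import EvaluationRun    # assumed import path
    from judgeval.scorers import AnswerRelevancyScorer   # illustrative scorer

    scorers = [AnswerRelevancyScorer(threshold=0.5)]

    # Homogeneous list: accepted.
    EvaluationRun(examples=[Example(input="Hi", actual_output="Hello!")],
                  scorers=scorers, model="gpt-4o")

    # Mixed list: rejected by the new validator.
    try:
        EvaluationRun(examples=[Example(input="Hi", actual_output="Hello!"),
                                CustomExample(input={"q": "Hi"}, actual_output={"a": "Hello!"})],
                      scorers=scorers, model="gpt-4o")
    except ValueError as err:
        print(err)  # "All examples must be of the same type, either all Example or all CustomExample."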