judgeval 0.0.36__py3-none-any.whl → 0.0.38__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- judgeval/common/tracer.py +663 -1105
- judgeval/common/utils.py +19 -1
- judgeval/constants.py +3 -3
- judgeval/data/__init__.py +4 -2
- judgeval/data/datasets/dataset.py +2 -11
- judgeval/data/datasets/eval_dataset_client.py +1 -62
- judgeval/data/example.py +29 -8
- judgeval/data/result.py +3 -3
- judgeval/data/trace.py +132 -0
- judgeval/data/{sequence_run.py → trace_run.py} +7 -6
- judgeval/evaluation_run.py +2 -2
- judgeval/integrations/langgraph.py +189 -1769
- judgeval/judges/litellm_judge.py +1 -1
- judgeval/judges/mixture_of_judges.py +1 -1
- judgeval/judges/utils.py +1 -1
- judgeval/judgment_client.py +85 -78
- judgeval/run_evaluation.py +98 -51
- judgeval/scorers/__init__.py +2 -0
- judgeval/scorers/judgeval_scorers/api_scorers/__init__.py +2 -0
- judgeval/scorers/judgeval_scorers/api_scorers/tool_order.py +20 -0
- judgeval/scorers/score.py +1 -1
- judgeval/utils/data_utils.py +57 -0
- judgeval-0.0.38.dist-info/METADATA +247 -0
- {judgeval-0.0.36.dist-info → judgeval-0.0.38.dist-info}/RECORD +26 -24
- judgeval/data/sequence.py +0 -49
- judgeval-0.0.36.dist-info/METADATA +0 -169
- {judgeval-0.0.36.dist-info → judgeval-0.0.38.dist-info}/WHEEL +0 -0
- {judgeval-0.0.36.dist-info → judgeval-0.0.38.dist-info}/licenses/LICENSE.md +0 -0
judgeval/common/utils.py
CHANGED
@@ -12,6 +12,7 @@ NOTE: any function beginning with 'a', e.g. 'afetch_together_api_response', is a
 import asyncio
 import concurrent.futures
 import os
+import requests
 import pprint
 from typing import Any, Dict, List, Literal, Mapping, Optional, Union

@@ -96,6 +97,23 @@ def read_file(file_path: str) -> str:
     with open(file_path, "r", encoding='utf-8') as file:
         return file.read()

+def validate_api_key(judgment_api_key: str):
+    """
+    Validates that the user api key is valid
+    """
+    response = requests.post(
+        f"{ROOT_API}/validate_api_key/",
+        headers={
+            "Content-Type": "application/json",
+            "Authorization": f"Bearer {judgment_api_key}",
+        },
+        json={},  # Empty body now
+        verify=True
+    )
+    if response.status_code == 200:
+        return True, response.json()
+    else:
+        return False, response.json().get("detail", "Error validating API key")

 def fetch_together_api_response(model: str, messages: List[Mapping], response_format: pydantic.BaseModel = None) -> str:
     """
@@ -747,7 +765,7 @@ if __name__ == "__main__":
     # Batched single completion to multiple models
     pprint.pprint(get_completion_multiple_models(
         models=[
-            "LLAMA3_70B_INSTRUCT_TURBO", "LLAMA3_405B_INSTRUCT_TURBO", "gpt-
+            "LLAMA3_70B_INSTRUCT_TURBO", "LLAMA3_405B_INSTRUCT_TURBO", "gpt-4.1-mini"
         ],
         messages=[
             [
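The new `validate_api_key` helper returns a `(bool, payload)` pair instead of raising, so callers can branch on the result. A minimal sketch of consuming it, assuming only what this hunk shows (the key below is a placeholder):

```python
from judgeval.common.utils import validate_api_key

# Placeholder key for illustration; a real call hits {ROOT_API}/validate_api_key/.
is_valid, detail = validate_api_key("YOUR_JUDGMENT_API_KEY")
if is_valid:
    print("Key accepted:", detail)   # detail is the server's JSON response
else:
    print("Key rejected:", detail)   # detail is the server's error message
```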
judgeval/constants.py
CHANGED
@@ -27,7 +27,7 @@ class APIScorer(str, Enum):
     COMPARISON = "comparison"
     GROUNDEDNESS = "groundedness"
     DERAILMENT = "derailment"
-
+    TOOL_ORDER = "tool_order"
     @classmethod
     def _missing_(cls, value):
         # Handle case-insensitive lookup
@@ -40,10 +40,9 @@ UNBOUNDED_SCORERS = set([APIScorer.COMPARISON]) # scorers whose scores are not
 ROOT_API = os.getenv("JUDGMENT_API_URL", "https://api.judgmentlabs.ai")
 # API URLs
 JUDGMENT_EVAL_API_URL = f"{ROOT_API}/evaluate/"
-
+JUDGMENT_TRACE_EVAL_API_URL = f"{ROOT_API}/evaluate_trace/"
 JUDGMENT_DATASETS_PUSH_API_URL = f"{ROOT_API}/datasets/push/"
 JUDGMENT_DATASETS_APPEND_EXAMPLES_API_URL = f"{ROOT_API}/datasets/insert_examples/"
-JUDGMENT_DATASETS_APPEND_SEQUENCES_API_URL = f"{ROOT_API}/datasets/insert_sequences/"
 JUDGMENT_DATASETS_PULL_API_URL = f"{ROOT_API}/datasets/pull_for_judgeval/"
 JUDGMENT_DATASETS_DELETE_API_URL = f"{ROOT_API}/datasets/delete/"
 JUDGMENT_DATASETS_EXPORT_JSONL_API_URL = f"{ROOT_API}/datasets/export_jsonl/"
@@ -58,6 +57,7 @@ JUDGMENT_PROJECT_CREATE_API_URL = f"{ROOT_API}/projects/add/"
 JUDGMENT_TRACES_FETCH_API_URL = f"{ROOT_API}/traces/fetch/"
 JUDGMENT_TRACES_SAVE_API_URL = f"{ROOT_API}/traces/save/"
 JUDGMENT_TRACES_DELETE_API_URL = f"{ROOT_API}/traces/delete/"
+JUDGMENT_TRACES_ADD_ANNOTATION_API_URL = f"{ROOT_API}/traces/add_annotation/"
 JUDGMENT_ADD_TO_RUN_EVAL_QUEUE_API_URL = f"{ROOT_API}/add_to_run_eval_queue/"
 # RabbitMQ
 RABBITMQ_HOST = os.getenv("RABBITMQ_HOST", "rabbitmq-networklb-faa155df16ec9085.elb.us-west-1.amazonaws.com")
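`APIScorer` gains a `TOOL_ORDER` member, and two endpoint constants are added for trace evaluation and trace annotations. A small sketch, relying only on names visible in this diff (the `_missing_` hook above keeps enum lookups case-insensitive):

```python
from judgeval.constants import (
    APIScorer,
    JUDGMENT_TRACE_EVAL_API_URL,
    JUDGMENT_TRACES_ADD_ANNOTATION_API_URL,
)

print(APIScorer.TOOL_ORDER.value)              # "tool_order"
print(APIScorer("TOOL_ORDER"))                 # resolves via the case-insensitive _missing_ lookup
print(JUDGMENT_TRACE_EVAL_API_URL)             # .../evaluate_trace/
print(JUDGMENT_TRACES_ADD_ANNOTATION_API_URL)  # .../traces/add_annotation/
```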
judgeval/data/__init__.py
CHANGED
@@ -2,7 +2,8 @@ from judgeval.data.example import Example, ExampleParams
 from judgeval.data.custom_example import CustomExample
 from judgeval.data.scorer_data import ScorerData, create_scorer_data
 from judgeval.data.result import ScoringResult, generate_scoring_result
-from judgeval.data.
+from judgeval.data.trace import Trace, TraceSpan
+

 __all__ = [
     "Example",
@@ -12,5 +13,6 @@ __all__ = [
     "create_scorer_data",
     "ScoringResult",
     "generate_scoring_result",
-    "
+    "Trace",
+    "TraceSpan",
 ]
judgeval/data/datasets/dataset.py
CHANGED
@@ -7,13 +7,12 @@ import yaml
 from dataclasses import dataclass, field
 from typing import List, Union, Literal

-from judgeval.data import Example
+from judgeval.data import Example
 from judgeval.common.logger import debug, error, warning, info

 @dataclass
 class EvalDataset:
     examples: List[Example]
-    sequences: List[Sequence]
     _alias: Union[str, None] = field(default=None)
     _id: Union[str, None] = field(default=None)
     judgment_api_key: str = field(default="")
@@ -22,13 +21,11 @@ class EvalDataset:
         judgment_api_key: str = os.getenv("JUDGMENT_API_KEY"),
         organization_id: str = os.getenv("JUDGMENT_ORG_ID"),
         examples: List[Example] = [],
-        sequences: List[Sequence] = []
     ):
         debug(f"Initializing EvalDataset with {len(examples)} examples")
         if not judgment_api_key:
             warning("No judgment_api_key provided")
         self.examples = examples
-        self.sequences = sequences
         self._alias = None
         self._id = None
         self.judgment_api_key = judgment_api_key
@@ -223,10 +220,7 @@ class EvalDataset:
     def add_example(self, e: Example) -> None:
         self.examples = self.examples + [e]
         # TODO if we need to add rank, then we need to do it here
-
-    def add_sequence(self, s: Sequence) -> None:
-        self.sequences = self.sequences + [s]
-
+
     def save_as(self, file_type: Literal["json", "csv", "yaml"], dir_path: str, save_name: str = None) -> None:
         """
         Saves the dataset as a file. Save only the examples.
@@ -273,7 +267,6 @@ class EvalDataset:
                 None, # Example does not have comments
                 None, # Example does not have source file
                 True, # Adding an Example
-                e.trace_id
             ]
         )

@@ -295,7 +288,6 @@ class EvalDataset:
                     "comments": None, # Example does not have comments
                     "source_file": None, # Example does not have source file
                     "example": True, # Adding an Example
-                    "trace_id": e.trace_id
                 }
                 for e in self.examples
             ],
@@ -315,7 +307,6 @@ class EvalDataset:
         return (
             f"{self.__class__.__name__}("
             f"examples={self.examples}, "
-            f"sequences={self.sequences}, "
             f"_alias={self._alias}, "
             f"_id={self._id}"
             f")"
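With sequence support removed, `EvalDataset` is an examples-only container. A minimal sketch of the updated surface, using only fields and methods visible in this diff (credentials default from the `JUDGMENT_API_KEY` / `JUDGMENT_ORG_ID` environment variables):

```python
from judgeval.data import Example
from judgeval.data.datasets import EvalDataset

dataset = EvalDataset(examples=[Example(input="What is 2 + 2?", actual_output="4")])
dataset.add_example(Example(input="Capital of France?", actual_output="Paris"))

# sequences / add_sequence are gone; save_as() now serializes examples only.
print(len(dataset.examples))  # 2
```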
judgeval/data/datasets/eval_dataset_client.py
CHANGED
@@ -7,14 +7,13 @@ from judgeval.common.logger import debug, error, warning, info
 from judgeval.constants import (
     JUDGMENT_DATASETS_PUSH_API_URL,
     JUDGMENT_DATASETS_APPEND_EXAMPLES_API_URL,
-    JUDGMENT_DATASETS_APPEND_SEQUENCES_API_URL,
     JUDGMENT_DATASETS_PULL_API_URL,
     JUDGMENT_DATASETS_PROJECT_STATS_API_URL,
     JUDGMENT_DATASETS_DELETE_API_URL,
     JUDGMENT_DATASETS_INSERT_API_URL,
     JUDGMENT_DATASETS_EXPORT_JSONL_API_URL
 )
-from judgeval.data import Example
+from judgeval.data import Example
 from judgeval.data.datasets import EvalDataset


@@ -59,8 +58,6 @@ class EvalDatasetClient:
             "dataset_alias": alias,
             "project_name": project_name,
             "examples": [e.to_dict() for e in dataset.examples],
-            "sequences": [s.model_dump() for s in dataset.sequences],
-            "is_sequence": len(dataset.sequences) > 0,
             "overwrite": overwrite,
         }
         try:
@@ -151,63 +148,6 @@ class EvalDatasetClient:
                 description=f"{progress.tasks[task_id].description} [rgb(25,227,160)]Done!)",
             )
         return True
-
-    def append_sequences(self, alias: str, sequences: List[Sequence], project_name: str) -> bool:
-        debug(f"Appending dataset with alias '{alias}'")
-        """
-        Appends the dataset to Judgment platform
-
-        Mock request:
-        dataset = {
-            "alias": alias,
-            "examples": [...],
-            "project_name": project_name
-        } ==>
-        {
-            "_alias": alias,
-            "_id": "..." # ID of the dataset
-        }
-        """
-        with Progress(
-            SpinnerColumn(style="rgb(106,0,255)"),
-            TextColumn("[progress.description]{task.description}"),
-            transient=False,
-        ) as progress:
-            task_id = progress.add_task(
-                f"Appending [rgb(106,0,255)]'{alias}' to Judgment...",
-                total=100,
-            )
-            content = {
-                "dataset_alias": alias,
-                "project_name": project_name,
-                "sequences": [s.model_dump() for s in sequences],
-            }
-            try:
-                response = requests.post(
-                    JUDGMENT_DATASETS_APPEND_SEQUENCES_API_URL,
-                    json=content,
-                    headers={
-                        "Content-Type": "application/json",
-                        "Authorization": f"Bearer {self.judgment_api_key}",
-                        "X-Organization-Id": self.organization_id
-                    },
-                    verify=True
-                )
-                if response.status_code != 200:
-                    error(f"Server error during append: {response.json()}")
-                    raise Exception(f"Server error during append: {response.json()}")
-                response.raise_for_status()
-            except requests.exceptions.HTTPError as err:
-                if response.status_code == 422:
-                    error(f"Validation error during append: {err.response.json()}")
-                else:
-                    error(f"HTTP error during append: {err}")
-
-            progress.update(
-                task_id,
-                description=f"{progress.tasks[task_id].description} [rgb(25,227,160)]Done!)",
-            )
-            return True

     def pull(self, alias: str, project_name: str) -> EvalDataset:
         debug(f"Pulling dataset with alias '{alias}'")
@@ -262,7 +202,6 @@ class EvalDatasetClient:
             info(f"Successfully pulled dataset with alias '{alias}'")
             payload = response.json()
             dataset.examples = [Example(**e) for e in payload.get("examples", [])]
-            dataset.sequences = [Sequence(**s) for s in payload.get("sequences", [])]
             dataset._alias = payload.get("alias")
             dataset._id = payload.get("id")
             progress.update(
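`append_sequences` is deleted and the push payload no longer carries `sequences` or `is_sequence`. A hedged sketch of the body the client now builds for `JUDGMENT_DATASETS_PUSH_API_URL` (field names come from the hunk above; the alias and project are placeholders):

```python
from judgeval.data import Example
from judgeval.data.datasets import EvalDataset

dataset = EvalDataset(examples=[Example(input="ping", actual_output="pong")])

content = {
    "dataset_alias": "my-dataset",   # placeholder alias
    "project_name": "my-project",    # placeholder project
    "examples": [e.to_dict() for e in dataset.examples],
    "overwrite": False,
}
```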
judgeval/data/example.py
CHANGED
@@ -24,20 +24,19 @@ class ExampleParams(Enum):


 class Example(BaseModel):
-    input: Optional[str] = None
+    input: Optional[Union[str, Dict[str, Any]]] = None
     actual_output: Optional[Union[str, List[str]]] = None
     expected_output: Optional[Union[str, List[str]]] = None
     context: Optional[List[str]] = None
     retrieval_context: Optional[List[str]] = None
     additional_metadata: Optional[Dict[str, Any]] = None
     tools_called: Optional[List[str]] = None
-    expected_tools: Optional[List[str]] = None
+    expected_tools: Optional[List[Dict[str, Any]]] = None
     name: Optional[str] = None
     example_id: str = Field(default_factory=lambda: str(uuid4()))
     example_index: Optional[int] = None
     timestamp: Optional[str] = None
     trace_id: Optional[str] = None
-    sequence_order: Optional[int] = 0

     def __init__(self, **data):
         if 'example_id' not in data:
@@ -50,8 +49,18 @@ class Example(BaseModel):
     @field_validator('input', mode='before')
     @classmethod
     def validate_input(cls, v):
-        if v is not None
-
+        if v is not None:
+            if not isinstance(v, (str, dict)):
+                raise ValueError(f"Input must be a string or dictionary but got {v} of type {type(v)}")
+
+            # If it's a string, check that it's not empty
+            if isinstance(v, str) and not v:
+                raise ValueError(f"Input string must be non-empty but got '{v}'")
+
+            # If it's a dictionary, check that it's not empty
+            if isinstance(v, dict) and not v:
+                raise ValueError(f"Input dictionary must be non-empty but got {v}")
+
         return v

     @field_validator('actual_output', mode='before')
@@ -73,7 +82,21 @@ class Example(BaseModel):
                 raise ValueError(f"All items in expected_output must be strings but got {v}")
         return v

-    @field_validator('
+    @field_validator('expected_tools', mode='before')
+    @classmethod
+    def validate_expected_tools(cls, v):
+        if v is not None:
+            if not isinstance(v, list):
+                raise ValueError(f"Expected tools must be a list of dictionaries or None but got {v} of type {type(v)}")
+
+            # Check that each item in the list is a dictionary
+            for i, item in enumerate(v):
+                if not isinstance(item, dict):
+                    raise ValueError(f"Expected tools must be a list of dictionaries, but item at index {i} is {item} of type {type(item)}")
+
+        return v
+
+    @field_validator('context', 'retrieval_context', 'tools_called', mode='before')
     @classmethod
     def validate_string_lists(cls, v, info):
         field_name = info.field_name
@@ -127,7 +150,6 @@ class Example(BaseModel):
             "example_id": self.example_id,
             "example_index": self.example_index,
             "timestamp": self.timestamp,
-            "trace_id": self.trace_id
         }

     def __str__(self):
@@ -144,5 +166,4 @@ class Example(BaseModel):
             f"example_id={self.example_id}, "
             f"example_index={self.example_index}, "
             f"timestamp={self.timestamp}, "
-            f"trace_id={self.trace_id})"
         )
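`Example.input` now accepts a dict as well as a non-empty string, and `expected_tools` becomes a list of dicts. A short sketch using only fields from this diff; the dict keys inside `expected_tools` are illustrative, since the validator only requires dictionaries:

```python
from judgeval.data import Example

example = Example(
    input={"query": "weather in Paris", "units": "metric"},  # dict inputs are now valid
    actual_output="It is 18°C and sunny.",
    tools_called=["get_weather"],
    expected_tools=[{"tool_name": "get_weather", "parameters": {"city": "Paris"}}],  # illustrative keys
)

# Both of these now raise ValueError:
#   Example(input="")                        -> empty string input
#   Example(expected_tools=["get_weather"])  -> items must be dictionaries
```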
judgeval/data/result.py
CHANGED
@@ -3,7 +3,7 @@ from typing import List, Union, Optional, Dict, Any, Union
 from judgeval.common.logger import debug, error
 from pydantic import BaseModel
 from judgeval.data import ScorerData, Example, CustomExample
-from judgeval.data.
+from judgeval.data.trace import TraceSpan


 class ScoringResult(BaseModel):
@@ -24,7 +24,7 @@ class ScoringResult(BaseModel):
     name: Optional[str] = None

     # The original example object that was used to create the ScoringResult
-    data_object: Optional[Union[
+    data_object: Optional[Union[TraceSpan, CustomExample, Example]] = None
     trace_id: Optional[str] = None

     # Additional fields for internal use
@@ -49,7 +49,7 @@ class ScoringResult(BaseModel):


 def generate_scoring_result(
-    data_object: Union[Example,
+    data_object: Union[Example, TraceSpan],
     scorers_data: List[ScorerData],
     run_duration: float,
     success: bool,
judgeval/data/trace.py
ADDED
@@ -0,0 +1,132 @@
+from pydantic import BaseModel
+from typing import Optional, Dict, Any, List
+from judgeval.evaluation_run import EvaluationRun
+import json
+from datetime import datetime, timezone
+
+class TraceSpan(BaseModel):
+    span_id: str
+    trace_id: str
+    function: Optional[str] = None
+    depth: int
+    created_at: Optional[Any] = None
+    parent_span_id: Optional[str] = None
+    span_type: Optional[str] = "span"
+    inputs: Optional[Dict[str, Any]] = None
+    output: Optional[Any] = None
+    duration: Optional[float] = None
+    annotation: Optional[List[Dict[str, Any]]] = None
+    evaluation_runs: Optional[List[EvaluationRun]] = []
+    expected_tools: Optional[List[Dict[str, Any]]] = None
+    additional_metadata: Optional[Dict[str, Any]] = None
+
+    def model_dump(self, **kwargs):
+        return {
+            "span_id": self.span_id,
+            "trace_id": self.trace_id,
+            "depth": self.depth,
+            # "created_at": datetime.fromtimestamp(self.created_at).isoformat(),
+            "created_at": datetime.fromtimestamp(self.created_at, tz=timezone.utc).isoformat(),
+            "inputs": self._serialize_inputs(),
+            "output": self._serialize_output(),
+            "evaluation_runs": [run.model_dump() for run in self.evaluation_runs] if self.evaluation_runs else [],
+            "parent_span_id": self.parent_span_id,
+            "function": self.function,
+            "duration": self.duration,
+            "span_type": self.span_type
+        }
+
+    def print_span(self):
+        """Print the span with proper formatting and parent relationship information."""
+        indent = " " * self.depth
+        parent_info = f" (parent_id: {self.parent_span_id})" if self.parent_span_id else ""
+        print(f"{indent}→ {self.function} (id: {self.span_id}){parent_info}")
+
+    def _serialize_inputs(self) -> dict:
+        """Helper method to serialize input data safely."""
+        if self.inputs is None:
+            return {}
+
+        serialized_inputs = {}
+        for key, value in self.inputs.items():
+            if isinstance(value, BaseModel):
+                serialized_inputs[key] = value.model_dump()
+            elif isinstance(value, (list, tuple)):
+                # Handle lists/tuples of arguments
+                serialized_inputs[key] = [
+                    item.model_dump() if isinstance(item, BaseModel)
+                    else None if not self._is_json_serializable(item)
+                    else item
+                    for item in value
+                ]
+            else:
+                if self._is_json_serializable(value):
+                    serialized_inputs[key] = value
+                else:
+                    serialized_inputs[key] = self.safe_stringify(value, self.function)
+        return serialized_inputs
+
+    def _is_json_serializable(self, obj: Any) -> bool:
+        """Helper method to check if an object is JSON serializable."""
+        try:
+            json.dumps(obj)
+            return True
+        except (TypeError, OverflowError, ValueError):
+            return False
+
+    def safe_stringify(self, output, function_name):
+        """
+        Safely converts an object to a string or repr, handling serialization issues gracefully.
+        """
+        try:
+            return str(output)
+        except (TypeError, OverflowError, ValueError):
+            pass
+
+        try:
+            return repr(output)
+        except (TypeError, OverflowError, ValueError):
+            pass
+
+        warnings.warn(
+            f"Output for function {function_name} is not JSON serializable and could not be converted to string. Setting to None."
+        )
+        return None
+
+    def _serialize_output(self) -> Any:
+        """Helper method to serialize output data safely."""
+        if self.output is None:
+            return None
+
+        def serialize_value(value):
+            if isinstance(value, BaseModel):
+                return value.model_dump()
+            elif isinstance(value, dict):
+                # Recursively serialize dictionary values
+                return {k: serialize_value(v) for k, v in value.items()}
+            elif isinstance(value, (list, tuple)):
+                # Recursively serialize list/tuple items
+                return [serialize_value(item) for item in value]
+            else:
+                # Try direct JSON serialization first
+                try:
+                    json.dumps(value)
+                    return value
+                except (TypeError, OverflowError, ValueError):
+                    # Fallback to safe stringification
+                    return self.safe_stringify(value, self.function)
+
+        # Start serialization with the top-level output
+        return serialize_value(self.output)
+
+class Trace(BaseModel):
+    trace_id: str
+    name: str
+    created_at: str
+    duration: float
+    entries: List[TraceSpan]
+    overwrite: bool = False
+    offline_mode: bool = False
+    rules: Optional[Dict[str, Any]] = None
+    has_notification: Optional[bool] = False
+
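`Trace` and `TraceSpan` replace `Sequence` as the unit that gets evaluated. A minimal sketch of building and serializing a span, assuming `created_at` is a Unix timestamp (which `model_dump` converts to UTC ISO-8601):

```python
import time
from judgeval.data import Trace, TraceSpan

span = TraceSpan(
    span_id="span-1",
    trace_id="trace-1",
    function="search_tool",
    depth=0,
    created_at=time.time(),   # Unix timestamp; rendered as UTC ISO-8601 by model_dump()
    inputs={"query": "weather in Paris"},
    output={"result": "18°C and sunny"},
    duration=0.42,
)

trace = Trace(
    trace_id="trace-1",
    name="weather-agent-run",
    created_at="2025-01-01T00:00:00Z",  # Trace.created_at is a plain string
    duration=0.42,
    entries=[span],
)

span.print_span()         # → search_tool (id: span-1)
print(span.model_dump())  # JSON-safe dict with serialized inputs/output
```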
judgeval/data/{sequence_run.py → trace_run.py}
RENAMED
@@ -1,20 +1,20 @@

 from pydantic import BaseModel
-from typing import List, Optional, Dict, Any, Union
-from judgeval.data import
+from typing import List, Optional, Dict, Any, Union, Callable
+from judgeval.data import Trace
 from judgeval.scorers import APIJudgmentScorer, JudgevalScorer
 from judgeval.judges import JudgevalJudge
 from judgeval.rules import Rule


-class SequenceRun(BaseModel):
+class TraceRun(BaseModel):
     """
     Stores example and evaluation scorers together for running an eval task

     Args:
         project_name (str): The name of the project the evaluation results belong to
         eval_name (str): A name for this evaluation run
-
+        traces (List[Trace]): The traces to evaluate
         scorers (List[Union[JudgmentScorer, JudgevalScorer]]): A list of scorers to use for evaluation
         model (str): The model used as a judge when using LLM as a Judge
         aggregator (Optional[str]): The aggregator to use for evaluation if using Mixture of Judges
@@ -29,8 +29,9 @@ class SequenceRun(BaseModel):
     organization_id: Optional[str] = None
     project_name: Optional[str] = None
     eval_name: Optional[str] = None
-
-
+    traces: Optional[List[Trace]] = None
+    scorers: List[Union[APIJudgmentScorer, JudgevalScorer]]
+    model: Optional[Union[str, List[str], JudgevalJudge]] = "gpt-4.1"
     aggregator: Optional[str] = None
     metadata: Optional[Dict[str, Any]] = None
     trace_span_id: Optional[str] = None
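`SequenceRun` becomes `TraceRun`: `traces` replaces `sequences`, and `model` now defaults to `"gpt-4.1"`. A hedged sketch; the import path follows the file rename, and it assumes no additional validators reject an empty scorer list (real runs pass `APIJudgmentScorer` / `JudgevalScorer` instances):

```python
from judgeval.data import Trace
from judgeval.data.trace_run import TraceRun  # path follows the sequence_run.py → trace_run.py rename

trace = Trace(trace_id="trace-1", name="weather-agent-run",
              created_at="2025-01-01T00:00:00Z", duration=0.42, entries=[])

run = TraceRun(
    project_name="weather-agent",   # placeholder project
    eval_name="tool-order-check",   # placeholder run name
    traces=[trace],
    scorers=[],                     # real runs supply scorer instances
)
print(run.model)  # "gpt-4.1" by default
```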
judgeval/evaluation_run.py
CHANGED
@@ -31,7 +31,7 @@ class EvaluationRun(BaseModel):
     eval_name: Optional[str] = None
     examples: Union[List[Example], List[CustomExample]]
     scorers: List[Union[APIJudgmentScorer, JudgevalScorer]]
-    model: Union[str, List[str], JudgevalJudge]
+    model: Optional[Union[str, List[str], JudgevalJudge]] = "gpt-4.1"
     aggregator: Optional[str] = None
     metadata: Optional[Dict[str, Any]] = None
     trace_span_id: Optional[str] = None
@@ -79,7 +79,7 @@ class EvaluationRun(BaseModel):
             raise ValueError("Eval name is required when log_results is True. Please include the eval_run_name argument.")
         return v

-    @field_validator('examples'
+    @field_validator('examples')
     def validate_examples(cls, v):
         if not v:
             raise ValueError("Examples cannot be empty.")