judgeval 0.0.35__py3-none-any.whl → 0.0.37__py3-none-any.whl

This diff compares two publicly available versions of the package as released to a supported registry. It is provided for informational purposes only and reflects the changes between the versions exactly as they appear in the public registry.
judgeval/common/utils.py CHANGED
@@ -12,6 +12,7 @@ NOTE: any function beginning with 'a', e.g. 'afetch_together_api_response', is a
 import asyncio
 import concurrent.futures
 import os
+import requests
 import pprint
 from typing import Any, Dict, List, Literal, Mapping, Optional, Union
 
@@ -96,6 +97,23 @@ def read_file(file_path: str) -> str:
     with open(file_path, "r", encoding='utf-8') as file:
         return file.read()
 
+def validate_api_key(judgment_api_key: str):
+    """
+    Validates that the user api key is valid
+    """
+    response = requests.post(
+        f"{ROOT_API}/validate_api_key/",
+        headers={
+            "Content-Type": "application/json",
+            "Authorization": f"Bearer {judgment_api_key}",
+        },
+        json={}, # Empty body now
+        verify=True
+    )
+    if response.status_code == 200:
+        return True, response.json()
+    else:
+        return False, response.json().get("detail", "Error validating API key")
 
 def fetch_together_api_response(model: str, messages: List[Mapping], response_format: pydantic.BaseModel = None) -> str:
     """
judgeval/constants.py CHANGED
@@ -27,7 +27,7 @@ class APIScorer(str, Enum):
     COMPARISON = "comparison"
     GROUNDEDNESS = "groundedness"
     DERAILMENT = "derailment"
-
+    TOOL_ORDER = "tool_order"
     @classmethod
     def _missing_(cls, value):
         # Handle case-insensitive lookup
@@ -42,14 +42,16 @@ ROOT_API = os.getenv("JUDGMENT_API_URL", "https://api.judgmentlabs.ai")
 JUDGMENT_EVAL_API_URL = f"{ROOT_API}/evaluate/"
 JUDGMENT_SEQUENCE_EVAL_API_URL = f"{ROOT_API}/evaluate_sequence/"
 JUDGMENT_DATASETS_PUSH_API_URL = f"{ROOT_API}/datasets/push/"
-JUDGMENT_DATASETS_APPEND_API_URL = f"{ROOT_API}/datasets/insert_examples/"
+JUDGMENT_DATASETS_APPEND_EXAMPLES_API_URL = f"{ROOT_API}/datasets/insert_examples/"
+JUDGMENT_DATASETS_APPEND_SEQUENCES_API_URL = f"{ROOT_API}/datasets/insert_sequences/"
 JUDGMENT_DATASETS_PULL_API_URL = f"{ROOT_API}/datasets/pull_for_judgeval/"
 JUDGMENT_DATASETS_DELETE_API_URL = f"{ROOT_API}/datasets/delete/"
 JUDGMENT_DATASETS_EXPORT_JSONL_API_URL = f"{ROOT_API}/datasets/export_jsonl/"
 JUDGMENT_DATASETS_PROJECT_STATS_API_URL = f"{ROOT_API}/datasets/fetch_stats_by_project/"
 JUDGMENT_DATASETS_INSERT_API_URL = f"{ROOT_API}/datasets/insert_examples/"
 JUDGMENT_EVAL_LOG_API_URL = f"{ROOT_API}/log_eval_results/"
-JUDGMENT_EVAL_FETCH_API_URL = f"{ROOT_API}/fetch_eval_results/"
+JUDGMENT_RETRIEVE_SEQUENCE_FROM_TRACE_API_URL = f"{ROOT_API}/traces/convert_trace_to_sequence/"
+JUDGMENT_EVAL_FETCH_API_URL = f"{ROOT_API}/fetch_experiment_run/"
 JUDGMENT_EVAL_DELETE_API_URL = f"{ROOT_API}/delete_eval_results_by_project_and_run_names/"
 JUDGMENT_EVAL_DELETE_PROJECT_API_URL = f"{ROOT_API}/delete_eval_results_by_project/"
 JUDGMENT_PROJECT_DELETE_API_URL = f"{ROOT_API}/projects/delete/"
@@ -57,6 +59,7 @@ JUDGMENT_PROJECT_CREATE_API_URL = f"{ROOT_API}/projects/add/"
 JUDGMENT_TRACES_FETCH_API_URL = f"{ROOT_API}/traces/fetch/"
 JUDGMENT_TRACES_SAVE_API_URL = f"{ROOT_API}/traces/save/"
 JUDGMENT_TRACES_DELETE_API_URL = f"{ROOT_API}/traces/delete/"
+JUDGMENT_TRACES_ADD_ANNOTATION_API_URL = f"{ROOT_API}/traces/add_annotation/"
 JUDGMENT_ADD_TO_RUN_EVAL_QUEUE_API_URL = f"{ROOT_API}/add_to_run_eval_queue/"
 # RabbitMQ
 RABBITMQ_HOST = os.getenv("RABBITMQ_HOST", "rabbitmq-networklb-faa155df16ec9085.elb.us-west-1.amazonaws.com")
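
The new TOOL_ORDER member is a plain str-valued enum entry, and the _missing_ hook above is commented as a case-insensitive lookup. A small sketch, assuming that hook behaves as its comment says:

    from judgeval.constants import APIScorer

    assert APIScorer("tool_order") is APIScorer.TOOL_ORDER   # direct value lookup
    assert APIScorer("TOOL_ORDER") is APIScorer.TOOL_ORDER   # case-insensitive fallback via _missing_ (per its comment)
    assert APIScorer.TOOL_ORDER.value == "tool_order"
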
judgeval/data/__init__.py CHANGED
@@ -3,6 +3,8 @@ from judgeval.data.custom_example import CustomExample
 from judgeval.data.scorer_data import ScorerData, create_scorer_data
 from judgeval.data.result import ScoringResult, generate_scoring_result
 from judgeval.data.sequence import Sequence
+from judgeval.data.trace import Trace, TraceSpan
+
 
 __all__ = [
     "Example",
@@ -13,4 +15,6 @@ __all__ = [
     "ScoringResult",
     "generate_scoring_result",
     "Sequence",
+    "Trace",
+    "TraceSpan",
 ]
@@ -224,6 +224,9 @@ class EvalDataset:
         self.examples = self.examples + [e]
         # TODO if we need to add rank, then we need to do it here
 
+    def add_sequence(self, s: Sequence) -> None:
+        self.sequences = self.sequences + [s]
+
     def save_as(self, file_type: Literal["json", "csv", "yaml"], dir_path: str, save_name: str = None) -> None:
         """
         Saves the dataset as a file. Save only the examples.
@@ -270,7 +273,6 @@ class EvalDataset:
                         None, # Example does not have comments
                         None, # Example does not have source file
                         True, # Adding an Example
-                        e.trace_id
                     ]
                 )
 
@@ -292,7 +294,6 @@ class EvalDataset:
                        "comments": None, # Example does not have comments
                        "source_file": None, # Example does not have source file
                        "example": True, # Adding an Example
-                       "trace_id": e.trace_id
                    }
                    for e in self.examples
                ],
@@ -6,7 +6,8 @@ from rich.progress import Progress, SpinnerColumn, TextColumn
 from judgeval.common.logger import debug, error, warning, info
 from judgeval.constants import (
     JUDGMENT_DATASETS_PUSH_API_URL,
-    JUDGMENT_DATASETS_APPEND_API_URL,
+    JUDGMENT_DATASETS_APPEND_EXAMPLES_API_URL,
+    JUDGMENT_DATASETS_APPEND_SEQUENCES_API_URL,
     JUDGMENT_DATASETS_PULL_API_URL,
     JUDGMENT_DATASETS_PROJECT_STATS_API_URL,
     JUDGMENT_DATASETS_DELETE_API_URL,
@@ -58,6 +59,8 @@ class EvalDatasetClient:
             "dataset_alias": alias,
             "project_name": project_name,
             "examples": [e.to_dict() for e in dataset.examples],
+            "sequences": [s.model_dump() for s in dataset.sequences],
+            "is_sequence": len(dataset.sequences) > 0,
             "overwrite": overwrite,
         }
         try:
@@ -92,7 +95,7 @@ class EvalDatasetClient:
            return True
 
 
-    def append(self, alias: str, examples: List[Example], project_name: str) -> bool:
+    def append_examples(self, alias: str, examples: List[Example], project_name: str) -> bool:
        debug(f"Appending dataset with alias '{alias}'")
        """
        Appends the dataset to Judgment platform
@@ -124,7 +127,7 @@ class EvalDatasetClient:
        }
        try:
            response = requests.post(
-                JUDGMENT_DATASETS_APPEND_API_URL,
+                JUDGMENT_DATASETS_APPEND_EXAMPLES_API_URL,
                json=content,
                headers={
                    "Content-Type": "application/json",
@@ -149,6 +152,63 @@ class EvalDatasetClient:
            )
            return True
 
+    def append_sequences(self, alias: str, sequences: List[Sequence], project_name: str) -> bool:
+        debug(f"Appending dataset with alias '{alias}'")
+        """
+        Appends the dataset to Judgment platform
+
+        Mock request:
+        dataset = {
+            "alias": alias,
+            "examples": [...],
+            "project_name": project_name
+        } ==>
+        {
+            "_alias": alias,
+            "_id": "..." # ID of the dataset
+        }
+        """
+        with Progress(
+            SpinnerColumn(style="rgb(106,0,255)"),
+            TextColumn("[progress.description]{task.description}"),
+            transient=False,
+        ) as progress:
+            task_id = progress.add_task(
+                f"Appending [rgb(106,0,255)]'{alias}' to Judgment...",
+                total=100,
+            )
+            content = {
+                "dataset_alias": alias,
+                "project_name": project_name,
+                "sequences": [s.model_dump() for s in sequences],
+            }
+            try:
+                response = requests.post(
+                    JUDGMENT_DATASETS_APPEND_SEQUENCES_API_URL,
+                    json=content,
+                    headers={
+                        "Content-Type": "application/json",
+                        "Authorization": f"Bearer {self.judgment_api_key}",
+                        "X-Organization-Id": self.organization_id
+                    },
+                    verify=True
+                )
+                if response.status_code != 200:
+                    error(f"Server error during append: {response.json()}")
+                    raise Exception(f"Server error during append: {response.json()}")
+                response.raise_for_status()
+            except requests.exceptions.HTTPError as err:
+                if response.status_code == 422:
+                    error(f"Validation error during append: {err.response.json()}")
+                else:
+                    error(f"HTTP error during append: {err}")
+
+            progress.update(
+                task_id,
+                description=f"{progress.tasks[task_id].description} [rgb(25,227,160)]Done!)",
+            )
+            return True
+
     def pull(self, alias: str, project_name: str) -> EvalDataset:
        debug(f"Pulling dataset with alias '{alias}'")
        """
judgeval/data/example.py CHANGED
@@ -24,14 +24,14 @@ class ExampleParams(Enum):
 
 
 class Example(BaseModel):
-    input: Optional[str] = None
+    input: Optional[Union[str, Dict[str, Any]]] = None
     actual_output: Optional[Union[str, List[str]]] = None
     expected_output: Optional[Union[str, List[str]]] = None
     context: Optional[List[str]] = None
     retrieval_context: Optional[List[str]] = None
     additional_metadata: Optional[Dict[str, Any]] = None
     tools_called: Optional[List[str]] = None
-    expected_tools: Optional[List[str]] = None
+    expected_tools: Optional[List[Dict[str, Any]]] = None
     name: Optional[str] = None
     example_id: str = Field(default_factory=lambda: str(uuid4()))
     example_index: Optional[int] = None
@@ -50,8 +50,18 @@ class Example(BaseModel):
     @field_validator('input', mode='before')
     @classmethod
     def validate_input(cls, v):
-        if v is not None and (not v or not isinstance(v, str)):
-            raise ValueError(f"Input must be a non-empty string but got '{v}' of type {type(v)}")
+        if v is not None:
+            if not isinstance(v, (str, dict)):
+                raise ValueError(f"Input must be a string or dictionary but got {v} of type {type(v)}")
+
+            # If it's a string, check that it's not empty
+            if isinstance(v, str) and not v:
+                raise ValueError(f"Input string must be non-empty but got '{v}'")
+
+            # If it's a dictionary, check that it's not empty
+            if isinstance(v, dict) and not v:
+                raise ValueError(f"Input dictionary must be non-empty but got {v}")
+
         return v
 
     @field_validator('actual_output', mode='before')
@@ -73,7 +83,21 @@ class Example(BaseModel):
                 raise ValueError(f"All items in expected_output must be strings but got {v}")
         return v
 
-    @field_validator('context', 'retrieval_context', 'tools_called', 'expected_tools', mode='before')
+    @field_validator('expected_tools', mode='before')
+    @classmethod
+    def validate_expected_tools(cls, v):
+        if v is not None:
+            if not isinstance(v, list):
+                raise ValueError(f"Expected tools must be a list of dictionaries or None but got {v} of type {type(v)}")
+
+            # Check that each item in the list is a dictionary
+            for i, item in enumerate(v):
+                if not isinstance(item, dict):
+                    raise ValueError(f"Expected tools must be a list of dictionaries, but item at index {i} is {item} of type {type(item)}")
+
+        return v
+
+    @field_validator('context', 'retrieval_context', 'tools_called', mode='before')
     @classmethod
     def validate_string_lists(cls, v, info):
         field_name = info.field_name
@@ -127,7 +151,6 @@ class Example(BaseModel):
             "example_id": self.example_id,
             "example_index": self.example_index,
             "timestamp": self.timestamp,
-            "trace_id": self.trace_id
         }
 
     def __str__(self):
@@ -144,5 +167,4 @@ class Example(BaseModel):
            f"example_id={self.example_id}, "
            f"example_index={self.example_index}, "
            f"timestamp={self.timestamp}, "
-            f"trace_id={self.trace_id})"
        )
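
With the relaxed validators, input may now be a non-empty dict and expected_tools a list of dicts. A construction sketch; the keys inside the expected_tools entries are illustrative, since the validator only requires each item to be a dictionary:

    from judgeval.data import Example

    example = Example(
        input={"question": "What's the weather in Paris?", "units": "celsius"},  # dict input now allowed
        actual_output="It is 21°C and sunny.",
        tools_called=["weather_lookup"],
        expected_tools=[{"tool_name": "weather_lookup", "parameters": {"city": "Paris"}}],  # keys are illustrative
    )
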
judgeval/data/sequence.py CHANGED
@@ -1,5 +1,5 @@
 from pydantic import BaseModel, Field, field_validator, model_validator
-from typing import List, Optional, Union, Any
+from typing import List, Optional, Union, Any, Dict
 from judgeval.data.example import Example
 from judgeval.scorers import JudgevalScorer, APIJudgmentScorer
 from uuid import uuid4
@@ -12,13 +12,14 @@ class Sequence(BaseModel):
     sequence_id: str = Field(default_factory=lambda: str(uuid4()))
     name: Optional[str] = "Sequence"
     created_at: str = Field(default_factory=lambda: datetime.now(timezone.utc).strftime("%Y%m%d_%H%M%S"))
-    items: List[Union["Sequence", Example]]
+    items: List[Union["Sequence", Example]] = []
     scorers: Optional[Any] = None
     parent_sequence_id: Optional[str] = None
     sequence_order: Optional[int] = 0
     root_sequence_id: Optional[str] = None
-    inputs: Optional[str] = None
-    output: Optional[str] = None
+    inputs: Optional[Dict[str, Any]] = None
+    output: Optional[Any] = None
+    expected_tools: Optional[List[Dict[str, Any]]] = None
 
     @field_validator("scorers")
     def validate_scorer(cls, v):
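
Sequence now defaults items to an empty list and gains a structured inputs dict, a free-form output, and optional expected_tools. A construction sketch with illustrative values:

    from judgeval.data import Example, Sequence

    seq = Sequence(
        name="refund_flow",                               # illustrative name
        items=[Example(input="Refund order 123", actual_output="Refund issued")],
        inputs={"order_id": "123"},                       # now a dict rather than a string
        output="Refund issued",                           # now Any rather than a string
        expected_tools=[{"tool_name": "issue_refund"}],   # keys are illustrative
    )
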
@@ -1,6 +1,6 @@
 
 from pydantic import BaseModel
-from typing import List, Optional, Dict, Any, Union
+from typing import List, Optional, Dict, Any, Union, Callable
 from judgeval.data import Sequence
 from judgeval.scorers import APIJudgmentScorer, JudgevalScorer
 from judgeval.judges import JudgevalJudge
@@ -29,8 +29,9 @@ class SequenceRun(BaseModel):
     organization_id: Optional[str] = None
     project_name: Optional[str] = None
     eval_name: Optional[str] = None
-    sequences: List[Sequence]
-    model: Union[str, List[str], JudgevalJudge]
+    sequences: Optional[List[Sequence]] = None
+    scorers: List[Union[APIJudgmentScorer, JudgevalScorer]]
+    model: Optional[Union[str, List[str], JudgevalJudge]] = "gpt-4.1"
     aggregator: Optional[str] = None
     metadata: Optional[Dict[str, Any]] = None
     trace_span_id: Optional[str] = None
judgeval/data/trace.py ADDED
@@ -0,0 +1,129 @@
+from pydantic import BaseModel
+from typing import Optional, Dict, Any, List
+from judgeval.evaluation_run import EvaluationRun
+import json
+from datetime import datetime, timezone
+
+class TraceSpan(BaseModel):
+    span_id: str
+    trace_id: str
+    function: Optional[str] = None
+    depth: int
+    created_at: Optional[float] = None
+    parent_span_id: Optional[str] = None
+    span_type: Optional[str] = "span"
+    inputs: Optional[Dict[str, Any]] = None
+    output: Optional[Any] = None
+    duration: Optional[float] = None
+    annotation: Optional[List[Dict[str, Any]]] = None
+    evaluation_runs: Optional[List[EvaluationRun]] = []
+
+    def model_dump(self, **kwargs):
+        return {
+            "span_id": self.span_id,
+            "trace_id": self.trace_id,
+            "depth": self.depth,
+            # "created_at": datetime.fromtimestamp(self.created_at).isoformat(),
+            "created_at": datetime.fromtimestamp(self.created_at, tz=timezone.utc).isoformat(),
+            "inputs": self._serialize_inputs(),
+            "output": self._serialize_output(),
+            "evaluation_runs": [run.model_dump() for run in self.evaluation_runs] if self.evaluation_runs else [],
+            "parent_span_id": self.parent_span_id,
+            "function": self.function,
+            "duration": self.duration,
+            "span_type": self.span_type
+        }
+
+    def print_span(self):
+        """Print the span with proper formatting and parent relationship information."""
+        indent = "  " * self.depth
+        parent_info = f" (parent_id: {self.parent_span_id})" if self.parent_span_id else ""
+        print(f"{indent}→ {self.function} (id: {self.span_id}){parent_info}")
+
+    def _serialize_inputs(self) -> dict:
+        """Helper method to serialize input data safely."""
+        if self.inputs is None:
+            return {}
+
+        serialized_inputs = {}
+        for key, value in self.inputs.items():
+            if isinstance(value, BaseModel):
+                serialized_inputs[key] = value.model_dump()
+            elif isinstance(value, (list, tuple)):
+                # Handle lists/tuples of arguments
+                serialized_inputs[key] = [
+                    item.model_dump() if isinstance(item, BaseModel)
+                    else None if not self._is_json_serializable(item)
+                    else item
+                    for item in value
+                ]
+            else:
+                if self._is_json_serializable(value):
+                    serialized_inputs[key] = value
+                else:
+                    serialized_inputs[key] = self.safe_stringify(value, self.function)
+        return serialized_inputs
+
+    def _is_json_serializable(self, obj: Any) -> bool:
+        """Helper method to check if an object is JSON serializable."""
+        try:
+            json.dumps(obj)
+            return True
+        except (TypeError, OverflowError, ValueError):
+            return False
+
+    def safe_stringify(self, output, function_name):
+        """
+        Safely converts an object to a string or repr, handling serialization issues gracefully.
+        """
+        try:
+            return str(output)
+        except (TypeError, OverflowError, ValueError):
+            pass
+
+        try:
+            return repr(output)
+        except (TypeError, OverflowError, ValueError):
+            pass
+
+        warnings.warn(
+            f"Output for function {function_name} is not JSON serializable and could not be converted to string. Setting to None."
+        )
+        return None
+
+    def _serialize_output(self) -> Any:
+        """Helper method to serialize output data safely."""
+        if self.output is None:
+            return None
+
+        def serialize_value(value):
+            if isinstance(value, BaseModel):
+                return value.model_dump()
+            elif isinstance(value, dict):
+                # Recursively serialize dictionary values
+                return {k: serialize_value(v) for k, v in value.items()}
+            elif isinstance(value, (list, tuple)):
+                # Recursively serialize list/tuple items
+                return [serialize_value(item) for item in value]
+            else:
+                # Try direct JSON serialization first
+                try:
+                    json.dumps(value)
+                    return value
+                except (TypeError, OverflowError, ValueError):
+                    # Fallback to safe stringification
+                    return self.safe_stringify(value, self.function)
+
+        # Start serialization with the top-level output
+        return serialize_value(self.output)
+
+class Trace(BaseModel):
+    trace_id: str
+    name: str
+    created_at: str
+    duration: float
+    entries: List[TraceSpan]
+    overwrite: bool = False
+    rules: Optional[Dict[str, Any]] = None
+    has_notification: Optional[bool] = False
+
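
The overridden model_dump converts created_at from a POSIX timestamp to an ISO-8601 UTC string and JSON-sanitizes inputs and output. A quick sketch with illustrative values (Trace and TraceSpan are exported from judgeval.data per the __init__.py change above):

    import time
    from judgeval.data import TraceSpan

    span = TraceSpan(
        span_id="span-1",
        trace_id="trace-1",
        function="lookup_weather",
        depth=1,
        created_at=time.time(),     # POSIX timestamp, converted to ISO-8601 in model_dump
        inputs={"city": "Paris"},
        output={"temperature_c": 21},
        duration=0.42,
    )
    span.print_span()               # prints "→ lookup_weather (id: span-1)" indented by depth
    payload = span.model_dump()     # JSON-safe dict with a UTC created_at string
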
@@ -31,7 +31,7 @@ class EvaluationRun(BaseModel):
     eval_name: Optional[str] = None
     examples: Union[List[Example], List[CustomExample]]
     scorers: List[Union[APIJudgmentScorer, JudgevalScorer]]
-    model: Union[str, List[str], JudgevalJudge]
+    model: Optional[Union[str, List[str], JudgevalJudge]] = "gpt-4.1"
     aggregator: Optional[str] = None
     metadata: Optional[Dict[str, Any]] = None
     trace_span_id: Optional[str] = None
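
Both EvaluationRun and SequenceRun now default model to "gpt-4.1", so it can be omitted. A sketch; FaithfulnessScorer and its threshold argument are assumed to be available from judgeval.scorers and are not part of this diff:

    from judgeval.data import Example
    from judgeval.evaluation_run import EvaluationRun
    from judgeval.scorers import FaithfulnessScorer  # assumed scorer import

    run = EvaluationRun(
        eval_name="smoke-test",
        examples=[Example(input="What is 2 + 2?", actual_output="4", retrieval_context=["2 + 2 = 4"])],
        scorers=[FaithfulnessScorer(threshold=0.5)],  # illustrative scorer and threshold
        # model omitted: defaults to "gpt-4.1" as of this release
    )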