judgeval 0.0.37__py3-none-any.whl → 0.0.38__py3-none-any.whl
This diff compares two publicly available package versions released to a supported registry. It is provided for informational purposes only and reflects the changes between the versions as they appear in the public registry.
- judgeval/common/tracer.py +132 -281
- judgeval/common/utils.py +1 -1
- judgeval/constants.py +1 -3
- judgeval/data/__init__.py +0 -2
- judgeval/data/datasets/dataset.py +2 -9
- judgeval/data/datasets/eval_dataset_client.py +1 -62
- judgeval/data/example.py +0 -1
- judgeval/data/result.py +3 -3
- judgeval/data/trace.py +4 -1
- judgeval/data/{sequence_run.py → trace_run.py} +4 -4
- judgeval/evaluation_run.py +1 -1
- judgeval/integrations/langgraph.py +187 -1768
- judgeval/judges/litellm_judge.py +1 -1
- judgeval/judges/mixture_of_judges.py +1 -1
- judgeval/judges/utils.py +1 -1
- judgeval/judgment_client.py +15 -21
- judgeval/run_evaluation.py +31 -81
- judgeval/scorers/judgeval_scorers/api_scorers/tool_order.py +4 -2
- judgeval-0.0.38.dist-info/METADATA +247 -0
- {judgeval-0.0.37.dist-info → judgeval-0.0.38.dist-info}/RECORD +22 -23
- judgeval/data/sequence.py +0 -50
- judgeval-0.0.37.dist-info/METADATA +0 -214
- {judgeval-0.0.37.dist-info → judgeval-0.0.38.dist-info}/WHEEL +0 -0
- {judgeval-0.0.37.dist-info → judgeval-0.0.38.dist-info}/licenses/LICENSE.md +0 -0
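The dominant theme of this release is the move from "sequence"-based to "trace"-based evaluation: `sequence_run.py` becomes `trace_run.py`, `data/sequence.py` is deleted, and the dataset client loses its sequence endpoints. As a rough orientation, a caller-side import might change as sketched below; the exact public import paths are inferred from the file rename and are not confirmed by this diff.

```python
# judgeval 0.0.37 (module removed in 0.0.38) -- path inferred from the old file name
# from judgeval.data.sequence_run import SequenceRun

# judgeval 0.0.38 -- path inferred from the renamed file judgeval/data/trace_run.py
from judgeval.data.trace_run import TraceRun
```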
judgeval/data/datasets/eval_dataset_client.py CHANGED
@@ -7,14 +7,13 @@ from judgeval.common.logger import debug, error, warning, info
 from judgeval.constants import (
     JUDGMENT_DATASETS_PUSH_API_URL,
     JUDGMENT_DATASETS_APPEND_EXAMPLES_API_URL,
-    JUDGMENT_DATASETS_APPEND_SEQUENCES_API_URL,
     JUDGMENT_DATASETS_PULL_API_URL,
     JUDGMENT_DATASETS_PROJECT_STATS_API_URL,
     JUDGMENT_DATASETS_DELETE_API_URL,
     JUDGMENT_DATASETS_INSERT_API_URL,
     JUDGMENT_DATASETS_EXPORT_JSONL_API_URL
 )
-from judgeval.data import Example
+from judgeval.data import Example
 from judgeval.data.datasets import EvalDataset
 
 
@@ -59,8 +58,6 @@ class EvalDatasetClient:
             "dataset_alias": alias,
             "project_name": project_name,
             "examples": [e.to_dict() for e in dataset.examples],
-            "sequences": [s.model_dump() for s in dataset.sequences],
-            "is_sequence": len(dataset.sequences) > 0,
             "overwrite": overwrite,
         }
         try:
@@ -151,63 +148,6 @@ class EvalDatasetClient:
                 description=f"{progress.tasks[task_id].description} [rgb(25,227,160)]Done!)",
             )
             return True
-
-    def append_sequences(self, alias: str, sequences: List[Sequence], project_name: str) -> bool:
-        debug(f"Appending dataset with alias '{alias}'")
-        """
-        Appends the dataset to Judgment platform
-
-        Mock request:
-        dataset = {
-            "alias": alias,
-            "examples": [...],
-            "project_name": project_name
-        } ==>
-        {
-            "_alias": alias,
-            "_id": "..." # ID of the dataset
-        }
-        """
-        with Progress(
-            SpinnerColumn(style="rgb(106,0,255)"),
-            TextColumn("[progress.description]{task.description}"),
-            transient=False,
-        ) as progress:
-            task_id = progress.add_task(
-                f"Appending [rgb(106,0,255)]'{alias}' to Judgment...",
-                total=100,
-            )
-            content = {
-                "dataset_alias": alias,
-                "project_name": project_name,
-                "sequences": [s.model_dump() for s in sequences],
-            }
-            try:
-                response = requests.post(
-                    JUDGMENT_DATASETS_APPEND_SEQUENCES_API_URL,
-                    json=content,
-                    headers={
-                        "Content-Type": "application/json",
-                        "Authorization": f"Bearer {self.judgment_api_key}",
-                        "X-Organization-Id": self.organization_id
-                    },
-                    verify=True
-                )
-                if response.status_code != 200:
-                    error(f"Server error during append: {response.json()}")
-                    raise Exception(f"Server error during append: {response.json()}")
-                response.raise_for_status()
-            except requests.exceptions.HTTPError as err:
-                if response.status_code == 422:
-                    error(f"Validation error during append: {err.response.json()}")
-                else:
-                    error(f"HTTP error during append: {err}")
-
-            progress.update(
-                task_id,
-                description=f"{progress.tasks[task_id].description} [rgb(25,227,160)]Done!)",
-            )
-            return True
 
     def pull(self, alias: str, project_name: str) -> EvalDataset:
         debug(f"Pulling dataset with alias '{alias}'")
@@ -262,7 +202,6 @@ class EvalDatasetClient:
             info(f"Successfully pulled dataset with alias '{alias}'")
             payload = response.json()
             dataset.examples = [Example(**e) for e in payload.get("examples", [])]
-            dataset.sequences = [Sequence(**s) for s in payload.get("sequences", [])]
             dataset._alias = payload.get("alias")
             dataset._id = payload.get("id")
             progress.update(
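With `append_sequences()` and the `JUDGMENT_DATASETS_APPEND_SEQUENCES_API_URL` constant removed, the dataset push payload now carries only examples. The sketch below restates the request body using exactly the keys left in the hunk above; the `build_push_payload` helper name is illustrative, not part of the client, and the surrounding `requests.post` call is unchanged and not repeated here.

```python
from typing import Any, Dict


def build_push_payload(alias: str, project_name: str, dataset, overwrite: bool) -> Dict[str, Any]:
    """Sketch of the 0.0.38 push body; keys copied from the diff above.

    The "sequences" and "is_sequence" keys are no longer sent.
    """
    return {
        "dataset_alias": alias,
        "project_name": project_name,
        "examples": [e.to_dict() for e in dataset.examples],
        "overwrite": overwrite,
    }
```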
judgeval/data/example.py CHANGED
judgeval/data/result.py CHANGED
@@ -3,7 +3,7 @@ from typing import List, Union, Optional, Dict, Any, Union
 from judgeval.common.logger import debug, error
 from pydantic import BaseModel
 from judgeval.data import ScorerData, Example, CustomExample
-from judgeval.data.
+from judgeval.data.trace import TraceSpan
 
 
 class ScoringResult(BaseModel):
@@ -24,7 +24,7 @@ class ScoringResult(BaseModel):
     name: Optional[str] = None
 
     # The original example object that was used to create the ScoringResult
-    data_object: Optional[Union[
+    data_object: Optional[Union[TraceSpan, CustomExample, Example]] = None
     trace_id: Optional[str] = None
 
     # Additional fields for internal use
@@ -49,7 +49,7 @@ class ScoringResult(BaseModel):
 
 
 def generate_scoring_result(
-    data_object: Union[Example,
+    data_object: Union[Example, TraceSpan],
     scorers_data: List[ScorerData],
     run_duration: float,
     success: bool,
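The `Sequence` branch of `ScoringResult.data_object` is gone; the field and the corresponding `generate_scoring_result` parameter now accept trace spans instead. A minimal sketch of the resulting union, assuming only the imports visible in the hunks above:

```python
from typing import Optional, Union

from judgeval.data import CustomExample, Example
from judgeval.data.trace import TraceSpan

# The scored object attached to a ScoringResult is now one of these three
# types; in 0.0.37 a Sequence could appear here as well.
DataObject = Optional[Union[TraceSpan, CustomExample, Example]]
```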
judgeval/data/trace.py CHANGED
@@ -9,7 +9,7 @@ class TraceSpan(BaseModel):
     trace_id: str
     function: Optional[str] = None
     depth: int
-    created_at: Optional[
+    created_at: Optional[Any] = None
     parent_span_id: Optional[str] = None
     span_type: Optional[str] = "span"
     inputs: Optional[Dict[str, Any]] = None
@@ -17,6 +17,8 @@ class TraceSpan(BaseModel):
     duration: Optional[float] = None
     annotation: Optional[List[Dict[str, Any]]] = None
     evaluation_runs: Optional[List[EvaluationRun]] = []
+    expected_tools: Optional[List[Dict[str, Any]]] = None
+    additional_metadata: Optional[Dict[str, Any]] = None
 
     def model_dump(self, **kwargs):
         return {
@@ -124,6 +126,7 @@ class Trace(BaseModel):
     duration: float
     entries: List[TraceSpan]
     overwrite: bool = False
+    offline_mode: bool = False
     rules: Optional[Dict[str, Any]] = None
     has_notification: Optional[bool] = False
 
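`TraceSpan` picks up two optional fields, `expected_tools` and `additional_metadata`, and `Trace` gains an `offline_mode` flag. The dict below sketches how a serialized span might look with the new keys populated; the shape of each `expected_tools` entry and the purpose of `offline_mode` are assumptions, since neither is documented in this diff.

```python
# Sketch of a serialized span in 0.0.38; the two new keys are the point here,
# the other keys are fields visible in the TraceSpan hunks above.
span_payload = {
    "trace_id": "trace-123",
    "depth": 1,
    "span_type": "tool",
    "expected_tools": [{"tool_name": "search"}],   # entry shape is an assumption
    "additional_metadata": {"user_id": "u-42"},    # free-form metadata
}
```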
judgeval/data/sequence_run.py → judgeval/data/trace_run.py RENAMED
@@ -1,20 +1,20 @@
 
 from pydantic import BaseModel
 from typing import List, Optional, Dict, Any, Union, Callable
-from judgeval.data import
+from judgeval.data import Trace
 from judgeval.scorers import APIJudgmentScorer, JudgevalScorer
 from judgeval.judges import JudgevalJudge
 from judgeval.rules import Rule
 
 
-class SequenceRun(BaseModel):
+class TraceRun(BaseModel):
     """
     Stores example and evaluation scorers together for running an eval task
 
     Args:
         project_name (str): The name of the project the evaluation results belong to
         eval_name (str): A name for this evaluation run
-
+        traces (List[Trace]): The traces to evaluate
         scorers (List[Union[JudgmentScorer, JudgevalScorer]]): A list of scorers to use for evaluation
         model (str): The model used as a judge when using LLM as a Judge
         aggregator (Optional[str]): The aggregator to use for evaluation if using Mixture of Judges
@@ -29,7 +29,7 @@ class SequenceRun(BaseModel):
     organization_id: Optional[str] = None
     project_name: Optional[str] = None
     eval_name: Optional[str] = None
-
+    traces: Optional[List[Trace]] = None
     scorers: List[Union[APIJudgmentScorer, JudgevalScorer]]
     model: Optional[Union[str, List[str], JudgevalJudge]] = "gpt-4.1"
     aggregator: Optional[str] = None
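`SequenceRun` is renamed to `TraceRun` and its payload field becomes `traces: Optional[List[Trace]]`. A minimal construction sketch using only the fields visible in the hunks above; the import path mirrors the renamed file, the empty lists are placeholders, and any validators the model may enforce (for example on `scorers`) are not shown in this diff.

```python
from judgeval.data.trace_run import TraceRun  # path inferred from the file rename

# Placeholder run: in real use `traces` comes from the tracer / Judgment platform
# and `scorers` holds APIJudgmentScorer or JudgevalScorer instances.
run = TraceRun(
    project_name="my-project",
    eval_name="nightly-trace-eval",
    traces=[],          # Optional[List[Trace]]
    scorers=[],         # List[Union[APIJudgmentScorer, JudgevalScorer]]
    model="gpt-4.1",    # default judge model per the hunk above
)
```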
judgeval/evaluation_run.py CHANGED
@@ -79,7 +79,7 @@ class EvaluationRun(BaseModel):
             raise ValueError("Eval name is required when log_results is True. Please include the eval_run_name argument.")
         return v
 
-    @field_validator('examples'
+    @field_validator('examples')
     def validate_examples(cls, v):
         if not v:
             raise ValueError("Examples cannot be empty.")