judgeval 0.0.37__py3-none-any.whl → 0.0.39__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- judgeval/common/tracer.py +132 -281
- judgeval/common/utils.py +1 -1
- judgeval/constants.py +2 -3
- judgeval/data/__init__.py +0 -2
- judgeval/data/datasets/dataset.py +2 -9
- judgeval/data/datasets/eval_dataset_client.py +1 -62
- judgeval/data/example.py +7 -7
- judgeval/data/result.py +3 -3
- judgeval/data/tool.py +19 -0
- judgeval/data/trace.py +5 -1
- judgeval/data/{sequence_run.py → trace_run.py} +4 -4
- judgeval/evaluation_run.py +1 -1
- judgeval/integrations/langgraph.py +187 -1768
- judgeval/judges/litellm_judge.py +1 -1
- judgeval/judges/mixture_of_judges.py +1 -1
- judgeval/judges/utils.py +1 -1
- judgeval/judgment_client.py +21 -25
- judgeval/run_evaluation.py +381 -107
- judgeval/scorers/judgeval_scorers/api_scorers/tool_order.py +4 -2
- judgeval-0.0.39.dist-info/METADATA +247 -0
- {judgeval-0.0.37.dist-info → judgeval-0.0.39.dist-info}/RECORD +23 -23
- judgeval/data/sequence.py +0 -50
- judgeval-0.0.37.dist-info/METADATA +0 -214
- {judgeval-0.0.37.dist-info → judgeval-0.0.39.dist-info}/WHEEL +0 -0
- {judgeval-0.0.37.dist-info → judgeval-0.0.39.dist-info}/licenses/LICENSE.md +0 -0
judgeval/data/datasets/eval_dataset_client.py CHANGED

```diff
@@ -7,14 +7,13 @@ from judgeval.common.logger import debug, error, warning, info
 from judgeval.constants import (
     JUDGMENT_DATASETS_PUSH_API_URL,
     JUDGMENT_DATASETS_APPEND_EXAMPLES_API_URL,
-    JUDGMENT_DATASETS_APPEND_SEQUENCES_API_URL,
     JUDGMENT_DATASETS_PULL_API_URL,
     JUDGMENT_DATASETS_PROJECT_STATS_API_URL,
     JUDGMENT_DATASETS_DELETE_API_URL,
     JUDGMENT_DATASETS_INSERT_API_URL,
     JUDGMENT_DATASETS_EXPORT_JSONL_API_URL
 )
-from judgeval.data import Example
+from judgeval.data import Example
 from judgeval.data.datasets import EvalDataset
 
 
@@ -59,8 +58,6 @@ class EvalDatasetClient:
             "dataset_alias": alias,
             "project_name": project_name,
             "examples": [e.to_dict() for e in dataset.examples],
-            "sequences": [s.model_dump() for s in dataset.sequences],
-            "is_sequence": len(dataset.sequences) > 0,
             "overwrite": overwrite,
         }
         try:
@@ -151,63 +148,6 @@ class EvalDatasetClient:
                 description=f"{progress.tasks[task_id].description} [rgb(25,227,160)]Done!)",
             )
             return True
-
-    def append_sequences(self, alias: str, sequences: List[Sequence], project_name: str) -> bool:
-        debug(f"Appending dataset with alias '{alias}'")
-        """
-        Appends the dataset to Judgment platform
-
-        Mock request:
-        dataset = {
-            "alias": alias,
-            "examples": [...],
-            "project_name": project_name
-        } ==>
-        {
-            "_alias": alias,
-            "_id": "..." # ID of the dataset
-        }
-        """
-        with Progress(
-            SpinnerColumn(style="rgb(106,0,255)"),
-            TextColumn("[progress.description]{task.description}"),
-            transient=False,
-        ) as progress:
-            task_id = progress.add_task(
-                f"Appending [rgb(106,0,255)]'{alias}' to Judgment...",
-                total=100,
-            )
-            content = {
-                "dataset_alias": alias,
-                "project_name": project_name,
-                "sequences": [s.model_dump() for s in sequences],
-            }
-            try:
-                response = requests.post(
-                    JUDGMENT_DATASETS_APPEND_SEQUENCES_API_URL,
-                    json=content,
-                    headers={
-                        "Content-Type": "application/json",
-                        "Authorization": f"Bearer {self.judgment_api_key}",
-                        "X-Organization-Id": self.organization_id
-                    },
-                    verify=True
-                )
-                if response.status_code != 200:
-                    error(f"Server error during append: {response.json()}")
-                    raise Exception(f"Server error during append: {response.json()}")
-                response.raise_for_status()
-            except requests.exceptions.HTTPError as err:
-                if response.status_code == 422:
-                    error(f"Validation error during append: {err.response.json()}")
-                else:
-                    error(f"HTTP error during append: {err}")
-
-            progress.update(
-                task_id,
-                description=f"{progress.tasks[task_id].description} [rgb(25,227,160)]Done!)",
-            )
-            return True
 
     def pull(self, alias: str, project_name: str) -> EvalDataset:
         debug(f"Pulling dataset with alias '{alias}'")
@@ -262,7 +202,6 @@ class EvalDatasetClient:
             info(f"Successfully pulled dataset with alias '{alias}'")
             payload = response.json()
             dataset.examples = [Example(**e) for e in payload.get("examples", [])]
-            dataset.sequences = [Sequence(**s) for s in payload.get("sequences", [])]
             dataset._alias = payload.get("alias")
             dataset._id = payload.get("id")
             progress.update(
```
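For orientation, this appears to be the dataset push payload (the hunk sits in `EvalDatasetClient` and carries the `overwrite` flag): the `sequences` and `is_sequence` keys are no longer sent, and `append_sequences` is removed entirely. A minimal sketch of the new request-body shape, using placeholder values in place of the method arguments and `EvalDataset` contents:

```python
import json

# Placeholder values; in the client these come from the method arguments and
# from dataset.examples (each example serialized via e.to_dict()).
content = {
    "dataset_alias": "my-dataset",
    "project_name": "my-project",
    "examples": [{"name": "demo-example"}],
    "overwrite": False,
}

print(json.dumps(content, indent=2))
```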
judgeval/data/example.py CHANGED

```diff
@@ -8,6 +8,7 @@ from uuid import uuid4
 from pydantic import BaseModel, Field, field_validator
 from enum import Enum
 from datetime import datetime
+from judgeval.data.tool import Tool
 import time
 
 
@@ -31,13 +32,12 @@ class Example(BaseModel):
     retrieval_context: Optional[List[str]] = None
     additional_metadata: Optional[Dict[str, Any]] = None
     tools_called: Optional[List[str]] = None
-    expected_tools: Optional[List[
+    expected_tools: Optional[List[Tool]] = None
     name: Optional[str] = None
     example_id: str = Field(default_factory=lambda: str(uuid4()))
     example_index: Optional[int] = None
     timestamp: Optional[str] = None
     trace_id: Optional[str] = None
-    sequence_order: Optional[int] = 0
 
     def __init__(self, **data):
         if 'example_id' not in data:
@@ -83,17 +83,17 @@
             raise ValueError(f"All items in expected_output must be strings but got {v}")
         return v
 
-    @field_validator('expected_tools'
+    @field_validator('expected_tools')
     @classmethod
     def validate_expected_tools(cls, v):
         if v is not None:
             if not isinstance(v, list):
-                raise ValueError(f"Expected tools must be a list of
+                raise ValueError(f"Expected tools must be a list of Tools or None but got {v} of type {type(v)}")
 
-            # Check that each item in the list is a
+            # Check that each item in the list is a Tool
             for i, item in enumerate(v):
-                if not isinstance(item,
-                    raise ValueError(f"Expected tools must be a list of
+                if not isinstance(item, Tool):
+                    raise ValueError(f"Expected tools must be a list of Tools, but item at index {i} is {item} of type {type(item)}")
 
         return v
 
```
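With `expected_tools` now typed as `Optional[List[Tool]]` and checked by `validate_expected_tools`, callers pass `Tool` objects from the new `judgeval/data/tool.py` module. A minimal sketch, assuming the `Example` fields not shown in these hunks keep their defaults:

```python
from judgeval.data import Example
from judgeval.data.tool import Tool

example = Example(
    name="weather-query",
    tools_called=["get_weather"],
    # Plain strings are no longer valid here; each item must be a Tool instance.
    expected_tools=[Tool(tool_name="get_weather", parameters={"city": "San Francisco"})],
)
print(example.expected_tools[0].tool_name)
```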
judgeval/data/result.py CHANGED

```diff
@@ -3,7 +3,7 @@ from typing import List, Union, Optional, Dict, Any, Union
 from judgeval.common.logger import debug, error
 from pydantic import BaseModel
 from judgeval.data import ScorerData, Example, CustomExample
-from judgeval.data.
+from judgeval.data.trace import TraceSpan
 
 
 class ScoringResult(BaseModel):
@@ -24,7 +24,7 @@ class ScoringResult(BaseModel):
     name: Optional[str] = None
 
     # The original example object that was used to create the ScoringResult
-    data_object: Optional[Union[
+    data_object: Optional[Union[TraceSpan, CustomExample, Example]] = None
     trace_id: Optional[str] = None
 
     # Additional fields for internal use
@@ -49,7 +49,7 @@ class ScoringResult(BaseModel):
 
 
 def generate_scoring_result(
-    data_object: Union[Example,
+    data_object: Union[Example, TraceSpan],
     scorers_data: List[ScorerData],
     run_duration: float,
     success: bool,
```
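Since `ScoringResult.data_object` can now be a `TraceSpan` as well as an `Example` or `CustomExample`, downstream code may want to branch on the type. An illustrative helper (not part of judgeval) sketching that check:

```python
from judgeval.data import Example, CustomExample
from judgeval.data.trace import TraceSpan

def describe_data_object(obj) -> str:
    # Hypothetical consumer-side helper for the widened data_object union.
    if isinstance(obj, TraceSpan):
        return f"trace span in trace {obj.trace_id}"
    if isinstance(obj, (Example, CustomExample)):
        return "example-based result"
    return "no data object attached"
```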
judgeval/data/tool.py ADDED

```diff
@@ -0,0 +1,19 @@
+from pydantic import BaseModel, field_validator
+from typing import Dict, Any, Optional
+import warnings
+
+class Tool(BaseModel):
+    tool_name: str
+    parameters: Optional[Dict[str, Any]] = None
+
+    @field_validator('tool_name')
+    def validate_tool_name(cls, v):
+        if not v:
+            warnings.warn("Tool name is empty or None", UserWarning)
+        return v
+
+    @field_validator('parameters')
+    def validate_parameters(cls, v):
+        if v is not None and not isinstance(v, dict):
+            warnings.warn(f"Parameters should be a dictionary, got {type(v)}", UserWarning)
+        return v
```
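Because `tool.py` is added in full above, usage is straightforward: a `Tool` is a small pydantic model whose validators emit `UserWarning`s rather than raising. For example:

```python
import warnings
from judgeval.data.tool import Tool

# Normal construction: a name plus an optional parameters dict.
search = Tool(tool_name="web_search", parameters={"query": "largest moon of Saturn"})
print(search.model_dump())  # {'tool_name': 'web_search', 'parameters': {...}}

# Suspicious input is only warned about, not rejected.
with warnings.catch_warnings(record=True) as caught:
    warnings.simplefilter("always")
    Tool(tool_name="")  # triggers the "Tool name is empty or None" warning
    print([str(w.message) for w in caught])
```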
judgeval/data/trace.py CHANGED

```diff
@@ -1,6 +1,7 @@
 from pydantic import BaseModel
 from typing import Optional, Dict, Any, List
 from judgeval.evaluation_run import EvaluationRun
+from judgeval.data.tool import Tool
 import json
 from datetime import datetime, timezone
 
@@ -9,7 +10,7 @@ class TraceSpan(BaseModel):
     trace_id: str
     function: Optional[str] = None
     depth: int
-    created_at: Optional[
+    created_at: Optional[Any] = None
     parent_span_id: Optional[str] = None
     span_type: Optional[str] = "span"
     inputs: Optional[Dict[str, Any]] = None
@@ -17,6 +18,8 @@
     duration: Optional[float] = None
     annotation: Optional[List[Dict[str, Any]]] = None
    evaluation_runs: Optional[List[EvaluationRun]] = []
+    expected_tools: Optional[List[Tool]] = None
+    additional_metadata: Optional[Dict[str, Any]] = None
 
     def model_dump(self, **kwargs):
         return {
@@ -124,6 +127,7 @@ class Trace(BaseModel):
     duration: float
     entries: List[TraceSpan]
     overwrite: bool = False
+    offline_mode: bool = False
     rules: Optional[Dict[str, Any]] = None
     has_notification: Optional[bool] = False
 
```
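A quick way to confirm the new trace fields in this release is pydantic's `model_fields` mapping (pydantic v2 is assumed here, consistent with the `field_validator` imports elsewhere in the package):

```python
from judgeval.data.trace import Trace, TraceSpan

# New in 0.0.39 per the hunks above.
print("offline_mode" in Trace.model_fields)             # True
print("expected_tools" in TraceSpan.model_fields)       # True
print("additional_metadata" in TraceSpan.model_fields)  # True
```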
judgeval/data/{sequence_run.py → trace_run.py} RENAMED

```diff
@@ -1,20 +1,20 @@
 
 from pydantic import BaseModel
 from typing import List, Optional, Dict, Any, Union, Callable
-from judgeval.data import
+from judgeval.data import Trace
 from judgeval.scorers import APIJudgmentScorer, JudgevalScorer
 from judgeval.judges import JudgevalJudge
 from judgeval.rules import Rule
 
 
-class SequenceRun(BaseModel):
+class TraceRun(BaseModel):
     """
     Stores example and evaluation scorers together for running an eval task
 
     Args:
         project_name (str): The name of the project the evaluation results belong to
         eval_name (str): A name for this evaluation run
-
+        traces (List[Trace]): The traces to evaluate
         scorers (List[Union[JudgmentScorer, JudgevalScorer]]): A list of scorers to use for evaluation
         model (str): The model used as a judge when using LLM as a Judge
         aggregator (Optional[str]): The aggregator to use for evaluation if using Mixture of Judges
@@ -29,7 +29,7 @@ class SequenceRun(BaseModel):
     organization_id: Optional[str] = None
     project_name: Optional[str] = None
     eval_name: Optional[str] = None
-
+    traces: Optional[List[Trace]] = None
     scorers: List[Union[APIJudgmentScorer, JudgevalScorer]]
     model: Optional[Union[str, List[str], JudgevalJudge]] = "gpt-4.1"
    aggregator: Optional[str] = None
```
judgeval/evaluation_run.py CHANGED

```diff
@@ -79,7 +79,7 @@ class EvaluationRun(BaseModel):
             raise ValueError("Eval name is required when log_results is True. Please include the eval_run_name argument.")
         return v
 
-    @field_validator('examples'
+    @field_validator('examples')
     def validate_examples(cls, v):
         if not v:
             raise ValueError("Examples cannot be empty.")
```