judgeval 0.0.37__py3-none-any.whl → 0.0.38__py3-none-any.whl
This diff compares two publicly available package versions released to a supported registry. It is provided for informational purposes only and reflects the changes between the versions as they appear in the public registry.
- judgeval/common/tracer.py +132 -281
- judgeval/common/utils.py +1 -1
- judgeval/constants.py +1 -3
- judgeval/data/__init__.py +0 -2
- judgeval/data/datasets/dataset.py +2 -9
- judgeval/data/datasets/eval_dataset_client.py +1 -62
- judgeval/data/example.py +0 -1
- judgeval/data/result.py +3 -3
- judgeval/data/trace.py +4 -1
- judgeval/data/{sequence_run.py → trace_run.py} +4 -4
- judgeval/evaluation_run.py +1 -1
- judgeval/integrations/langgraph.py +187 -1768
- judgeval/judges/litellm_judge.py +1 -1
- judgeval/judges/mixture_of_judges.py +1 -1
- judgeval/judges/utils.py +1 -1
- judgeval/judgment_client.py +15 -21
- judgeval/run_evaluation.py +31 -81
- judgeval/scorers/judgeval_scorers/api_scorers/tool_order.py +4 -2
- judgeval-0.0.38.dist-info/METADATA +247 -0
- {judgeval-0.0.37.dist-info → judgeval-0.0.38.dist-info}/RECORD +22 -23
- judgeval/data/sequence.py +0 -50
- judgeval-0.0.37.dist-info/METADATA +0 -214
- {judgeval-0.0.37.dist-info → judgeval-0.0.38.dist-info}/WHEEL +0 -0
- {judgeval-0.0.37.dist-info → judgeval-0.0.38.dist-info}/licenses/LICENSE.md +0 -0
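The dominant theme of this release is the move from "sequence"-based to "trace"-based evaluation: `sequence_run.py` becomes `trace_run.py`, `data/sequence.py` is deleted, and the dataset client loses its sequence endpoints. As a rough orientation, a caller-side import might change as sketched below; the exact public import paths are inferred from the file rename and are not confirmed by this diff.

```python
# judgeval 0.0.37 (module removed in 0.0.38) -- path inferred from the old file name
# from judgeval.data.sequence_run import SequenceRun

# judgeval 0.0.38 -- path inferred from the renamed file judgeval/data/trace_run.py
from judgeval.data.trace_run import TraceRun
```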
judgeval/data/datasets/eval_dataset_client.py CHANGED
@@ -7,14 +7,13 @@ from judgeval.common.logger import debug, error, warning, info
 from judgeval.constants import (
     JUDGMENT_DATASETS_PUSH_API_URL,
     JUDGMENT_DATASETS_APPEND_EXAMPLES_API_URL,
-    JUDGMENT_DATASETS_APPEND_SEQUENCES_API_URL,
     JUDGMENT_DATASETS_PULL_API_URL,
     JUDGMENT_DATASETS_PROJECT_STATS_API_URL,
     JUDGMENT_DATASETS_DELETE_API_URL,
     JUDGMENT_DATASETS_INSERT_API_URL,
     JUDGMENT_DATASETS_EXPORT_JSONL_API_URL
 )
-from judgeval.data import Example
+from judgeval.data import Example
 from judgeval.data.datasets import EvalDataset
 
 
@@ -59,8 +58,6 @@ class EvalDatasetClient:
             "dataset_alias": alias,
             "project_name": project_name,
             "examples": [e.to_dict() for e in dataset.examples],
-            "sequences": [s.model_dump() for s in dataset.sequences],
-            "is_sequence": len(dataset.sequences) > 0,
             "overwrite": overwrite,
         }
         try:
@@ -151,63 +148,6 @@ class EvalDatasetClient:
                 description=f"{progress.tasks[task_id].description} [rgb(25,227,160)]Done!)",
             )
             return True
-
-    def append_sequences(self, alias: str, sequences: List[Sequence], project_name: str) -> bool:
-        debug(f"Appending dataset with alias '{alias}'")
-        """
-        Appends the dataset to Judgment platform
-
-        Mock request:
-        dataset = {
-            "alias": alias,
-            "examples": [...],
-            "project_name": project_name
-        } ==>
-        {
-            "_alias": alias,
-            "_id": "..." # ID of the dataset
-        }
-        """
-        with Progress(
-            SpinnerColumn(style="rgb(106,0,255)"),
-            TextColumn("[progress.description]{task.description}"),
-            transient=False,
-        ) as progress:
-            task_id = progress.add_task(
-                f"Appending [rgb(106,0,255)]'{alias}' to Judgment...",
-                total=100,
-            )
-            content = {
-                "dataset_alias": alias,
-                "project_name": project_name,
-                "sequences": [s.model_dump() for s in sequences],
-            }
-            try:
-                response = requests.post(
-                    JUDGMENT_DATASETS_APPEND_SEQUENCES_API_URL,
-                    json=content,
-                    headers={
-                        "Content-Type": "application/json",
-                        "Authorization": f"Bearer {self.judgment_api_key}",
-                        "X-Organization-Id": self.organization_id
-                    },
-                    verify=True
-                )
-                if response.status_code != 200:
-                    error(f"Server error during append: {response.json()}")
-                    raise Exception(f"Server error during append: {response.json()}")
-                response.raise_for_status()
-            except requests.exceptions.HTTPError as err:
-                if response.status_code == 422:
-                    error(f"Validation error during append: {err.response.json()}")
-                else:
-                    error(f"HTTP error during append: {err}")
-
-            progress.update(
-                task_id,
-                description=f"{progress.tasks[task_id].description} [rgb(25,227,160)]Done!)",
-            )
-            return True
 
     def pull(self, alias: str, project_name: str) -> EvalDataset:
         debug(f"Pulling dataset with alias '{alias}'")
@@ -262,7 +202,6 @@ class EvalDatasetClient:
             info(f"Successfully pulled dataset with alias '{alias}'")
             payload = response.json()
             dataset.examples = [Example(**e) for e in payload.get("examples", [])]
-            dataset.sequences = [Sequence(**s) for s in payload.get("sequences", [])]
             dataset._alias = payload.get("alias")
             dataset._id = payload.get("id")
             progress.update(
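With `append_sequences()` and the `JUDGMENT_DATASETS_APPEND_SEQUENCES_API_URL` constant removed, the dataset push payload now carries only examples. The sketch below restates the request body using exactly the keys left in the hunk above; the `build_push_payload` helper name is illustrative, not part of the client, and the surrounding `requests.post` call is unchanged and not repeated here.

```python
from typing import Any, Dict


def build_push_payload(alias: str, project_name: str, dataset, overwrite: bool) -> Dict[str, Any]:
    """Sketch of the 0.0.38 push body; keys copied from the diff above.

    The "sequences" and "is_sequence" keys are no longer sent.
    """
    return {
        "dataset_alias": alias,
        "project_name": project_name,
        "examples": [e.to_dict() for e in dataset.examples],
        "overwrite": overwrite,
    }
```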
judgeval/data/example.py CHANGED
judgeval/data/result.py CHANGED
@@ -3,7 +3,7 @@ from typing import List, Union, Optional, Dict, Any, Union
 from judgeval.common.logger import debug, error
 from pydantic import BaseModel
 from judgeval.data import ScorerData, Example, CustomExample
-from judgeval.data.
+from judgeval.data.trace import TraceSpan
 
 
 class ScoringResult(BaseModel):
@@ -24,7 +24,7 @@ class ScoringResult(BaseModel):
     name: Optional[str] = None
 
     # The original example object that was used to create the ScoringResult
-    data_object: Optional[Union[
+    data_object: Optional[Union[TraceSpan, CustomExample, Example]] = None
     trace_id: Optional[str] = None
 
     # Additional fields for internal use
@@ -49,7 +49,7 @@ class ScoringResult(BaseModel):
 
 
 def generate_scoring_result(
-    data_object: Union[Example,
+    data_object: Union[Example, TraceSpan],
     scorers_data: List[ScorerData],
     run_duration: float,
     success: bool,
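The `Sequence` branch of `ScoringResult.data_object` is gone; the field and the corresponding `generate_scoring_result` parameter now accept trace spans instead. A minimal sketch of the resulting union, assuming only the imports visible in the hunks above:

```python
from typing import Optional, Union

from judgeval.data import CustomExample, Example
from judgeval.data.trace import TraceSpan

# The scored object attached to a ScoringResult is now one of these three
# types; in 0.0.37 a Sequence could appear here as well.
DataObject = Optional[Union[TraceSpan, CustomExample, Example]]
```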
judgeval/data/trace.py CHANGED
@@ -9,7 +9,7 @@ class TraceSpan(BaseModel):
     trace_id: str
     function: Optional[str] = None
     depth: int
-    created_at: Optional[
+    created_at: Optional[Any] = None
     parent_span_id: Optional[str] = None
     span_type: Optional[str] = "span"
     inputs: Optional[Dict[str, Any]] = None
@@ -17,6 +17,8 @@ class TraceSpan(BaseModel):
     duration: Optional[float] = None
     annotation: Optional[List[Dict[str, Any]]] = None
     evaluation_runs: Optional[List[EvaluationRun]] = []
+    expected_tools: Optional[List[Dict[str, Any]]] = None
+    additional_metadata: Optional[Dict[str, Any]] = None
 
     def model_dump(self, **kwargs):
         return {
@@ -124,6 +126,7 @@ class Trace(BaseModel):
     duration: float
     entries: List[TraceSpan]
     overwrite: bool = False
+    offline_mode: bool = False
     rules: Optional[Dict[str, Any]] = None
     has_notification: Optional[bool] = False
 
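`TraceSpan` picks up two optional fields, `expected_tools` and `additional_metadata`, and `Trace` gains an `offline_mode` flag. The dict below sketches how a serialized span might look with the new keys populated; the shape of each `expected_tools` entry and the purpose of `offline_mode` are assumptions, since neither is documented in this diff.

```python
# Sketch of a serialized span in 0.0.38; the two new keys are the point here,
# the other keys are fields visible in the TraceSpan hunks above.
span_payload = {
    "trace_id": "trace-123",
    "depth": 1,
    "span_type": "tool",
    "expected_tools": [{"tool_name": "search"}],   # entry shape is an assumption
    "additional_metadata": {"user_id": "u-42"},    # free-form metadata
}
```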
judgeval/data/sequence_run.py → judgeval/data/trace_run.py RENAMED
@@ -1,20 +1,20 @@
 
 from pydantic import BaseModel
 from typing import List, Optional, Dict, Any, Union, Callable
-from judgeval.data import
+from judgeval.data import Trace
 from judgeval.scorers import APIJudgmentScorer, JudgevalScorer
 from judgeval.judges import JudgevalJudge
 from judgeval.rules import Rule
 
 
-class SequenceRun(BaseModel):
+class TraceRun(BaseModel):
     """
     Stores example and evaluation scorers together for running an eval task
 
     Args:
         project_name (str): The name of the project the evaluation results belong to
         eval_name (str): A name for this evaluation run
-
+        traces (List[Trace]): The traces to evaluate
         scorers (List[Union[JudgmentScorer, JudgevalScorer]]): A list of scorers to use for evaluation
         model (str): The model used as a judge when using LLM as a Judge
         aggregator (Optional[str]): The aggregator to use for evaluation if using Mixture of Judges
@@ -29,7 +29,7 @@ class SequenceRun(BaseModel):
     organization_id: Optional[str] = None
     project_name: Optional[str] = None
     eval_name: Optional[str] = None
-
+    traces: Optional[List[Trace]] = None
     scorers: List[Union[APIJudgmentScorer, JudgevalScorer]]
     model: Optional[Union[str, List[str], JudgevalJudge]] = "gpt-4.1"
     aggregator: Optional[str] = None
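`SequenceRun` is renamed to `TraceRun` and its payload field becomes `traces: Optional[List[Trace]]`. A minimal construction sketch using only the fields visible in the hunks above; the import path mirrors the renamed file, the empty lists are placeholders, and any validators the model may enforce (for example on `scorers`) are not shown in this diff.

```python
from judgeval.data.trace_run import TraceRun  # path inferred from the file rename

# Placeholder run: in real use `traces` comes from the tracer / Judgment platform
# and `scorers` holds APIJudgmentScorer or JudgevalScorer instances.
run = TraceRun(
    project_name="my-project",
    eval_name="nightly-trace-eval",
    traces=[],          # Optional[List[Trace]]
    scorers=[],         # List[Union[APIJudgmentScorer, JudgevalScorer]]
    model="gpt-4.1",    # default judge model per the hunk above
)
```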
judgeval/evaluation_run.py CHANGED
@@ -79,7 +79,7 @@ class EvaluationRun(BaseModel):
             raise ValueError("Eval name is required when log_results is True. Please include the eval_run_name argument.")
         return v
 
-    @field_validator('examples'
+    @field_validator('examples')
     def validate_examples(cls, v):
         if not v:
             raise ValueError("Examples cannot be empty.")