judgeval 0.0.36__py3-none-any.whl → 0.0.38__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- judgeval/common/tracer.py +663 -1105
- judgeval/common/utils.py +19 -1
- judgeval/constants.py +3 -3
- judgeval/data/__init__.py +4 -2
- judgeval/data/datasets/dataset.py +2 -11
- judgeval/data/datasets/eval_dataset_client.py +1 -62
- judgeval/data/example.py +29 -8
- judgeval/data/result.py +3 -3
- judgeval/data/trace.py +132 -0
- judgeval/data/{sequence_run.py → trace_run.py} +7 -6
- judgeval/evaluation_run.py +2 -2
- judgeval/integrations/langgraph.py +189 -1769
- judgeval/judges/litellm_judge.py +1 -1
- judgeval/judges/mixture_of_judges.py +1 -1
- judgeval/judges/utils.py +1 -1
- judgeval/judgment_client.py +85 -78
- judgeval/run_evaluation.py +98 -51
- judgeval/scorers/__init__.py +2 -0
- judgeval/scorers/judgeval_scorers/api_scorers/__init__.py +2 -0
- judgeval/scorers/judgeval_scorers/api_scorers/tool_order.py +20 -0
- judgeval/scorers/score.py +1 -1
- judgeval/utils/data_utils.py +57 -0
- judgeval-0.0.38.dist-info/METADATA +247 -0
- {judgeval-0.0.36.dist-info → judgeval-0.0.38.dist-info}/RECORD +26 -24
- judgeval/data/sequence.py +0 -49
- judgeval-0.0.36.dist-info/METADATA +0 -169
- {judgeval-0.0.36.dist-info → judgeval-0.0.38.dist-info}/WHEEL +0 -0
- {judgeval-0.0.36.dist-info → judgeval-0.0.38.dist-info}/licenses/LICENSE.md +0 -0
judgeval/common/utils.py
CHANGED
@@ -12,6 +12,7 @@ NOTE: any function beginning with 'a', e.g. 'afetch_together_api_response', is a
 import asyncio
 import concurrent.futures
 import os
+import requests
 import pprint
 from typing import Any, Dict, List, Literal, Mapping, Optional, Union

@@ -96,6 +97,23 @@ def read_file(file_path: str) -> str:
     with open(file_path, "r", encoding='utf-8') as file:
         return file.read()

+def validate_api_key(judgment_api_key: str):
+    """
+    Validates that the user api key is valid
+    """
+    response = requests.post(
+        f"{ROOT_API}/validate_api_key/",
+        headers={
+            "Content-Type": "application/json",
+            "Authorization": f"Bearer {judgment_api_key}",
+        },
+        json={},  # Empty body now
+        verify=True
+    )
+    if response.status_code == 200:
+        return True, response.json()
+    else:
+        return False, response.json().get("detail", "Error validating API key")

 def fetch_together_api_response(model: str, messages: List[Mapping], response_format: pydantic.BaseModel = None) -> str:
     """
@@ -747,7 +765,7 @@ if __name__ == "__main__":
     # Batched single completion to multiple models
     pprint.pprint(get_completion_multiple_models(
         models=[
-            "LLAMA3_70B_INSTRUCT_TURBO", "LLAMA3_405B_INSTRUCT_TURBO", "gpt-
+            "LLAMA3_70B_INSTRUCT_TURBO", "LLAMA3_405B_INSTRUCT_TURBO", "gpt-4.1-mini"
         ],
         messages=[
             [
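The new `validate_api_key` helper returns a `(bool, payload)` pair instead of raising, so callers can branch on the result. A minimal sketch of consuming it, assuming only what this hunk shows (the key below is a placeholder):

```python
from judgeval.common.utils import validate_api_key

# Placeholder key for illustration; a real call hits {ROOT_API}/validate_api_key/.
is_valid, detail = validate_api_key("YOUR_JUDGMENT_API_KEY")
if is_valid:
    print("Key accepted:", detail)   # detail is the server's JSON response
else:
    print("Key rejected:", detail)   # detail is the server's error message
```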
judgeval/constants.py
CHANGED
@@ -27,7 +27,7 @@ class APIScorer(str, Enum):
     COMPARISON = "comparison"
     GROUNDEDNESS = "groundedness"
     DERAILMENT = "derailment"
-
+    TOOL_ORDER = "tool_order"
     @classmethod
     def _missing_(cls, value):
         # Handle case-insensitive lookup
@@ -40,10 +40,9 @@ UNBOUNDED_SCORERS = set([APIScorer.COMPARISON]) # scorers whose scores are not
 ROOT_API = os.getenv("JUDGMENT_API_URL", "https://api.judgmentlabs.ai")
 # API URLs
 JUDGMENT_EVAL_API_URL = f"{ROOT_API}/evaluate/"
-
+JUDGMENT_TRACE_EVAL_API_URL = f"{ROOT_API}/evaluate_trace/"
 JUDGMENT_DATASETS_PUSH_API_URL = f"{ROOT_API}/datasets/push/"
 JUDGMENT_DATASETS_APPEND_EXAMPLES_API_URL = f"{ROOT_API}/datasets/insert_examples/"
-JUDGMENT_DATASETS_APPEND_SEQUENCES_API_URL = f"{ROOT_API}/datasets/insert_sequences/"
 JUDGMENT_DATASETS_PULL_API_URL = f"{ROOT_API}/datasets/pull_for_judgeval/"
 JUDGMENT_DATASETS_DELETE_API_URL = f"{ROOT_API}/datasets/delete/"
 JUDGMENT_DATASETS_EXPORT_JSONL_API_URL = f"{ROOT_API}/datasets/export_jsonl/"
@@ -58,6 +57,7 @@ JUDGMENT_PROJECT_CREATE_API_URL = f"{ROOT_API}/projects/add/"
 JUDGMENT_TRACES_FETCH_API_URL = f"{ROOT_API}/traces/fetch/"
 JUDGMENT_TRACES_SAVE_API_URL = f"{ROOT_API}/traces/save/"
 JUDGMENT_TRACES_DELETE_API_URL = f"{ROOT_API}/traces/delete/"
+JUDGMENT_TRACES_ADD_ANNOTATION_API_URL = f"{ROOT_API}/traces/add_annotation/"
 JUDGMENT_ADD_TO_RUN_EVAL_QUEUE_API_URL = f"{ROOT_API}/add_to_run_eval_queue/"
 # RabbitMQ
 RABBITMQ_HOST = os.getenv("RABBITMQ_HOST", "rabbitmq-networklb-faa155df16ec9085.elb.us-west-1.amazonaws.com")
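`APIScorer` gains a `TOOL_ORDER` member, and two endpoint constants are added for trace evaluation and trace annotations. A small sketch, relying only on names visible in this diff (the `_missing_` hook above keeps enum lookups case-insensitive):

```python
from judgeval.constants import (
    APIScorer,
    JUDGMENT_TRACE_EVAL_API_URL,
    JUDGMENT_TRACES_ADD_ANNOTATION_API_URL,
)

print(APIScorer.TOOL_ORDER.value)              # "tool_order"
print(APIScorer("TOOL_ORDER"))                 # resolves via the case-insensitive _missing_ lookup
print(JUDGMENT_TRACE_EVAL_API_URL)             # .../evaluate_trace/
print(JUDGMENT_TRACES_ADD_ANNOTATION_API_URL)  # .../traces/add_annotation/
```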
judgeval/data/__init__.py
CHANGED
@@ -2,7 +2,8 @@ from judgeval.data.example import Example, ExampleParams
 from judgeval.data.custom_example import CustomExample
 from judgeval.data.scorer_data import ScorerData, create_scorer_data
 from judgeval.data.result import ScoringResult, generate_scoring_result
-from judgeval.data.
+from judgeval.data.trace import Trace, TraceSpan
+

 __all__ = [
     "Example",
@@ -12,5 +13,6 @@ __all__ = [
     "create_scorer_data",
     "ScoringResult",
     "generate_scoring_result",
-    "
+    "Trace",
+    "TraceSpan",
 ]
judgeval/data/datasets/dataset.py
CHANGED
@@ -7,13 +7,12 @@ import yaml
 from dataclasses import dataclass, field
 from typing import List, Union, Literal

-from judgeval.data import Example
+from judgeval.data import Example
 from judgeval.common.logger import debug, error, warning, info

 @dataclass
 class EvalDataset:
     examples: List[Example]
-    sequences: List[Sequence]
     _alias: Union[str, None] = field(default=None)
     _id: Union[str, None] = field(default=None)
     judgment_api_key: str = field(default="")
@@ -22,13 +21,11 @@ class EvalDataset:
         judgment_api_key: str = os.getenv("JUDGMENT_API_KEY"),
         organization_id: str = os.getenv("JUDGMENT_ORG_ID"),
         examples: List[Example] = [],
-        sequences: List[Sequence] = []
     ):
         debug(f"Initializing EvalDataset with {len(examples)} examples")
         if not judgment_api_key:
             warning("No judgment_api_key provided")
         self.examples = examples
-        self.sequences = sequences
         self._alias = None
         self._id = None
         self.judgment_api_key = judgment_api_key
@@ -223,10 +220,7 @@ class EvalDataset:
     def add_example(self, e: Example) -> None:
         self.examples = self.examples + [e]
         # TODO if we need to add rank, then we need to do it here
-
-    def add_sequence(self, s: Sequence) -> None:
-        self.sequences = self.sequences + [s]
-
+
     def save_as(self, file_type: Literal["json", "csv", "yaml"], dir_path: str, save_name: str = None) -> None:
         """
         Saves the dataset as a file. Save only the examples.
@@ -273,7 +267,6 @@ class EvalDataset:
                 None, # Example does not have comments
                 None, # Example does not have source file
                 True, # Adding an Example
-                e.trace_id
             ]
         )

@@ -295,7 +288,6 @@ class EvalDataset:
                     "comments": None, # Example does not have comments
                     "source_file": None, # Example does not have source file
                     "example": True, # Adding an Example
-                    "trace_id": e.trace_id
                 }
                 for e in self.examples
             ],
@@ -315,7 +307,6 @@ class EvalDataset:
         return (
             f"{self.__class__.__name__}("
             f"examples={self.examples}, "
-            f"sequences={self.sequences}, "
             f"_alias={self._alias}, "
             f"_id={self._id}"
             f")"
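With sequence support removed, `EvalDataset` is an examples-only container. A minimal sketch of the updated surface, using only fields and methods visible in this diff (credentials default from the `JUDGMENT_API_KEY` / `JUDGMENT_ORG_ID` environment variables):

```python
from judgeval.data import Example
from judgeval.data.datasets import EvalDataset

dataset = EvalDataset(examples=[Example(input="What is 2 + 2?", actual_output="4")])
dataset.add_example(Example(input="Capital of France?", actual_output="Paris"))

# sequences / add_sequence are gone; save_as() now serializes examples only.
print(len(dataset.examples))  # 2
```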
judgeval/data/datasets/eval_dataset_client.py
CHANGED
@@ -7,14 +7,13 @@ from judgeval.common.logger import debug, error, warning, info
 from judgeval.constants import (
     JUDGMENT_DATASETS_PUSH_API_URL,
     JUDGMENT_DATASETS_APPEND_EXAMPLES_API_URL,
-    JUDGMENT_DATASETS_APPEND_SEQUENCES_API_URL,
     JUDGMENT_DATASETS_PULL_API_URL,
     JUDGMENT_DATASETS_PROJECT_STATS_API_URL,
     JUDGMENT_DATASETS_DELETE_API_URL,
     JUDGMENT_DATASETS_INSERT_API_URL,
     JUDGMENT_DATASETS_EXPORT_JSONL_API_URL
 )
-from judgeval.data import Example
+from judgeval.data import Example
 from judgeval.data.datasets import EvalDataset


@@ -59,8 +58,6 @@ class EvalDatasetClient:
             "dataset_alias": alias,
             "project_name": project_name,
             "examples": [e.to_dict() for e in dataset.examples],
-            "sequences": [s.model_dump() for s in dataset.sequences],
-            "is_sequence": len(dataset.sequences) > 0,
             "overwrite": overwrite,
         }
         try:
@@ -151,63 +148,6 @@ class EvalDatasetClient:
                 description=f"{progress.tasks[task_id].description} [rgb(25,227,160)]Done!)",
             )
         return True
-
-    def append_sequences(self, alias: str, sequences: List[Sequence], project_name: str) -> bool:
-        debug(f"Appending dataset with alias '{alias}'")
-        """
-        Appends the dataset to Judgment platform
-
-        Mock request:
-        dataset = {
-            "alias": alias,
-            "examples": [...],
-            "project_name": project_name
-        } ==>
-        {
-            "_alias": alias,
-            "_id": "..." # ID of the dataset
-        }
-        """
-        with Progress(
-            SpinnerColumn(style="rgb(106,0,255)"),
-            TextColumn("[progress.description]{task.description}"),
-            transient=False,
-        ) as progress:
-            task_id = progress.add_task(
-                f"Appending [rgb(106,0,255)]'{alias}' to Judgment...",
-                total=100,
-            )
-            content = {
-                "dataset_alias": alias,
-                "project_name": project_name,
-                "sequences": [s.model_dump() for s in sequences],
-            }
-            try:
-                response = requests.post(
-                    JUDGMENT_DATASETS_APPEND_SEQUENCES_API_URL,
-                    json=content,
-                    headers={
-                        "Content-Type": "application/json",
-                        "Authorization": f"Bearer {self.judgment_api_key}",
-                        "X-Organization-Id": self.organization_id
-                    },
-                    verify=True
-                )
-                if response.status_code != 200:
-                    error(f"Server error during append: {response.json()}")
-                    raise Exception(f"Server error during append: {response.json()}")
-                response.raise_for_status()
-            except requests.exceptions.HTTPError as err:
-                if response.status_code == 422:
-                    error(f"Validation error during append: {err.response.json()}")
-                else:
-                    error(f"HTTP error during append: {err}")
-
-            progress.update(
-                task_id,
-                description=f"{progress.tasks[task_id].description} [rgb(25,227,160)]Done!)",
-            )
-            return True

     def pull(self, alias: str, project_name: str) -> EvalDataset:
         debug(f"Pulling dataset with alias '{alias}'")
@@ -262,7 +202,6 @@ class EvalDatasetClient:
             info(f"Successfully pulled dataset with alias '{alias}'")
             payload = response.json()
             dataset.examples = [Example(**e) for e in payload.get("examples", [])]
-            dataset.sequences = [Sequence(**s) for s in payload.get("sequences", [])]
             dataset._alias = payload.get("alias")
             dataset._id = payload.get("id")
             progress.update(
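`append_sequences` is deleted and the push payload no longer carries `sequences` or `is_sequence`. A hedged sketch of the body the client now builds for `JUDGMENT_DATASETS_PUSH_API_URL` (field names come from the hunk above; the alias and project are placeholders):

```python
from judgeval.data import Example
from judgeval.data.datasets import EvalDataset

dataset = EvalDataset(examples=[Example(input="ping", actual_output="pong")])

content = {
    "dataset_alias": "my-dataset",   # placeholder alias
    "project_name": "my-project",    # placeholder project
    "examples": [e.to_dict() for e in dataset.examples],
    "overwrite": False,
}
```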
judgeval/data/example.py
CHANGED
@@ -24,20 +24,19 @@ class ExampleParams(Enum):


 class Example(BaseModel):
-    input: Optional[str] = None
+    input: Optional[Union[str, Dict[str, Any]]] = None
     actual_output: Optional[Union[str, List[str]]] = None
     expected_output: Optional[Union[str, List[str]]] = None
     context: Optional[List[str]] = None
     retrieval_context: Optional[List[str]] = None
     additional_metadata: Optional[Dict[str, Any]] = None
     tools_called: Optional[List[str]] = None
-    expected_tools: Optional[List[str]] = None
+    expected_tools: Optional[List[Dict[str, Any]]] = None
     name: Optional[str] = None
     example_id: str = Field(default_factory=lambda: str(uuid4()))
     example_index: Optional[int] = None
     timestamp: Optional[str] = None
     trace_id: Optional[str] = None
-    sequence_order: Optional[int] = 0

     def __init__(self, **data):
         if 'example_id' not in data:
@@ -50,8 +49,18 @@ class Example(BaseModel):
     @field_validator('input', mode='before')
     @classmethod
     def validate_input(cls, v):
-        if v is not None
-
+        if v is not None:
+            if not isinstance(v, (str, dict)):
+                raise ValueError(f"Input must be a string or dictionary but got {v} of type {type(v)}")
+
+            # If it's a string, check that it's not empty
+            if isinstance(v, str) and not v:
+                raise ValueError(f"Input string must be non-empty but got '{v}'")
+
+            # If it's a dictionary, check that it's not empty
+            if isinstance(v, dict) and not v:
+                raise ValueError(f"Input dictionary must be non-empty but got {v}")
+
         return v

     @field_validator('actual_output', mode='before')
@@ -73,7 +82,21 @@ class Example(BaseModel):
                 raise ValueError(f"All items in expected_output must be strings but got {v}")
         return v

-    @field_validator('
+    @field_validator('expected_tools', mode='before')
+    @classmethod
+    def validate_expected_tools(cls, v):
+        if v is not None:
+            if not isinstance(v, list):
+                raise ValueError(f"Expected tools must be a list of dictionaries or None but got {v} of type {type(v)}")
+
+            # Check that each item in the list is a dictionary
+            for i, item in enumerate(v):
+                if not isinstance(item, dict):
+                    raise ValueError(f"Expected tools must be a list of dictionaries, but item at index {i} is {item} of type {type(item)}")
+
+        return v
+
+    @field_validator('context', 'retrieval_context', 'tools_called', mode='before')
     @classmethod
     def validate_string_lists(cls, v, info):
         field_name = info.field_name
@@ -127,7 +150,6 @@ class Example(BaseModel):
             "example_id": self.example_id,
             "example_index": self.example_index,
             "timestamp": self.timestamp,
-            "trace_id": self.trace_id
         }

     def __str__(self):
@@ -144,5 +166,4 @@ class Example(BaseModel):
             f"example_id={self.example_id}, "
             f"example_index={self.example_index}, "
             f"timestamp={self.timestamp}, "
-            f"trace_id={self.trace_id})"
         )
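`Example.input` now accepts a dict as well as a non-empty string, and `expected_tools` becomes a list of dicts. A short sketch using only fields from this diff; the dict keys inside `expected_tools` are illustrative, since the validator only requires dictionaries:

```python
from judgeval.data import Example

example = Example(
    input={"query": "weather in Paris", "units": "metric"},  # dict inputs are now valid
    actual_output="It is 18°C and sunny.",
    tools_called=["get_weather"],
    expected_tools=[{"tool_name": "get_weather", "parameters": {"city": "Paris"}}],  # illustrative keys
)

# Both of these now raise ValueError:
#   Example(input="")                        -> empty string input
#   Example(expected_tools=["get_weather"])  -> items must be dictionaries
```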
judgeval/data/result.py
CHANGED
@@ -3,7 +3,7 @@ from typing import List, Union, Optional, Dict, Any, Union
 from judgeval.common.logger import debug, error
 from pydantic import BaseModel
 from judgeval.data import ScorerData, Example, CustomExample
-from judgeval.data.
+from judgeval.data.trace import TraceSpan


 class ScoringResult(BaseModel):
@@ -24,7 +24,7 @@ class ScoringResult(BaseModel):
     name: Optional[str] = None

     # The original example object that was used to create the ScoringResult
-    data_object: Optional[Union[
+    data_object: Optional[Union[TraceSpan, CustomExample, Example]] = None
     trace_id: Optional[str] = None

     # Additional fields for internal use
@@ -49,7 +49,7 @@ class ScoringResult(BaseModel):


 def generate_scoring_result(
-    data_object: Union[Example,
+    data_object: Union[Example, TraceSpan],
     scorers_data: List[ScorerData],
     run_duration: float,
     success: bool,
judgeval/data/trace.py
ADDED
@@ -0,0 +1,132 @@
+from pydantic import BaseModel
+from typing import Optional, Dict, Any, List
+from judgeval.evaluation_run import EvaluationRun
+import json
+from datetime import datetime, timezone
+
+class TraceSpan(BaseModel):
+    span_id: str
+    trace_id: str
+    function: Optional[str] = None
+    depth: int
+    created_at: Optional[Any] = None
+    parent_span_id: Optional[str] = None
+    span_type: Optional[str] = "span"
+    inputs: Optional[Dict[str, Any]] = None
+    output: Optional[Any] = None
+    duration: Optional[float] = None
+    annotation: Optional[List[Dict[str, Any]]] = None
+    evaluation_runs: Optional[List[EvaluationRun]] = []
+    expected_tools: Optional[List[Dict[str, Any]]] = None
+    additional_metadata: Optional[Dict[str, Any]] = None
+
+    def model_dump(self, **kwargs):
+        return {
+            "span_id": self.span_id,
+            "trace_id": self.trace_id,
+            "depth": self.depth,
+            # "created_at": datetime.fromtimestamp(self.created_at).isoformat(),
+            "created_at": datetime.fromtimestamp(self.created_at, tz=timezone.utc).isoformat(),
+            "inputs": self._serialize_inputs(),
+            "output": self._serialize_output(),
+            "evaluation_runs": [run.model_dump() for run in self.evaluation_runs] if self.evaluation_runs else [],
+            "parent_span_id": self.parent_span_id,
+            "function": self.function,
+            "duration": self.duration,
+            "span_type": self.span_type
+        }
+
+    def print_span(self):
+        """Print the span with proper formatting and parent relationship information."""
+        indent = " " * self.depth
+        parent_info = f" (parent_id: {self.parent_span_id})" if self.parent_span_id else ""
+        print(f"{indent}→ {self.function} (id: {self.span_id}){parent_info}")
+
+    def _serialize_inputs(self) -> dict:
+        """Helper method to serialize input data safely."""
+        if self.inputs is None:
+            return {}
+
+        serialized_inputs = {}
+        for key, value in self.inputs.items():
+            if isinstance(value, BaseModel):
+                serialized_inputs[key] = value.model_dump()
+            elif isinstance(value, (list, tuple)):
+                # Handle lists/tuples of arguments
+                serialized_inputs[key] = [
+                    item.model_dump() if isinstance(item, BaseModel)
+                    else None if not self._is_json_serializable(item)
+                    else item
+                    for item in value
+                ]
+            else:
+                if self._is_json_serializable(value):
+                    serialized_inputs[key] = value
+                else:
+                    serialized_inputs[key] = self.safe_stringify(value, self.function)
+        return serialized_inputs
+
+    def _is_json_serializable(self, obj: Any) -> bool:
+        """Helper method to check if an object is JSON serializable."""
+        try:
+            json.dumps(obj)
+            return True
+        except (TypeError, OverflowError, ValueError):
+            return False
+
+    def safe_stringify(self, output, function_name):
+        """
+        Safely converts an object to a string or repr, handling serialization issues gracefully.
+        """
+        try:
+            return str(output)
+        except (TypeError, OverflowError, ValueError):
+            pass
+
+        try:
+            return repr(output)
+        except (TypeError, OverflowError, ValueError):
+            pass
+
+        warnings.warn(
+            f"Output for function {function_name} is not JSON serializable and could not be converted to string. Setting to None."
+        )
+        return None
+
+    def _serialize_output(self) -> Any:
+        """Helper method to serialize output data safely."""
+        if self.output is None:
+            return None
+
+        def serialize_value(value):
+            if isinstance(value, BaseModel):
+                return value.model_dump()
+            elif isinstance(value, dict):
+                # Recursively serialize dictionary values
+                return {k: serialize_value(v) for k, v in value.items()}
+            elif isinstance(value, (list, tuple)):
+                # Recursively serialize list/tuple items
+                return [serialize_value(item) for item in value]
+            else:
+                # Try direct JSON serialization first
+                try:
+                    json.dumps(value)
+                    return value
+                except (TypeError, OverflowError, ValueError):
+                    # Fallback to safe stringification
+                    return self.safe_stringify(value, self.function)
+
+        # Start serialization with the top-level output
+        return serialize_value(self.output)
+
+class Trace(BaseModel):
+    trace_id: str
+    name: str
+    created_at: str
+    duration: float
+    entries: List[TraceSpan]
+    overwrite: bool = False
+    offline_mode: bool = False
+    rules: Optional[Dict[str, Any]] = None
+    has_notification: Optional[bool] = False
+
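`Trace` and `TraceSpan` replace `Sequence` as the unit that gets evaluated. A minimal sketch of building and serializing a span, assuming `created_at` is a Unix timestamp (which `model_dump` converts to UTC ISO-8601):

```python
import time
from judgeval.data import Trace, TraceSpan

span = TraceSpan(
    span_id="span-1",
    trace_id="trace-1",
    function="search_tool",
    depth=0,
    created_at=time.time(),   # Unix timestamp; rendered as UTC ISO-8601 by model_dump()
    inputs={"query": "weather in Paris"},
    output={"result": "18°C and sunny"},
    duration=0.42,
)

trace = Trace(
    trace_id="trace-1",
    name="weather-agent-run",
    created_at="2025-01-01T00:00:00Z",  # Trace.created_at is a plain string
    duration=0.42,
    entries=[span],
)

span.print_span()         # → search_tool (id: span-1)
print(span.model_dump())  # JSON-safe dict with serialized inputs/output
```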
judgeval/data/{sequence_run.py → trace_run.py}
RENAMED
@@ -1,20 +1,20 @@

 from pydantic import BaseModel
-from typing import List, Optional, Dict, Any, Union
-from judgeval.data import
+from typing import List, Optional, Dict, Any, Union, Callable
+from judgeval.data import Trace
 from judgeval.scorers import APIJudgmentScorer, JudgevalScorer
 from judgeval.judges import JudgevalJudge
 from judgeval.rules import Rule


-class SequenceRun(BaseModel):
+class TraceRun(BaseModel):
     """
     Stores example and evaluation scorers together for running an eval task

     Args:
         project_name (str): The name of the project the evaluation results belong to
         eval_name (str): A name for this evaluation run
-
+        traces (List[Trace]): The traces to evaluate
         scorers (List[Union[JudgmentScorer, JudgevalScorer]]): A list of scorers to use for evaluation
         model (str): The model used as a judge when using LLM as a Judge
         aggregator (Optional[str]): The aggregator to use for evaluation if using Mixture of Judges
@@ -29,8 +29,9 @@ class SequenceRun(BaseModel):
     organization_id: Optional[str] = None
     project_name: Optional[str] = None
     eval_name: Optional[str] = None
-
-
+    traces: Optional[List[Trace]] = None
+    scorers: List[Union[APIJudgmentScorer, JudgevalScorer]]
+    model: Optional[Union[str, List[str], JudgevalJudge]] = "gpt-4.1"
     aggregator: Optional[str] = None
     metadata: Optional[Dict[str, Any]] = None
     trace_span_id: Optional[str] = None
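`SequenceRun` becomes `TraceRun`: `traces` replaces `sequences`, and `model` now defaults to `"gpt-4.1"`. A hedged sketch; the import path follows the file rename, and it assumes no additional validators reject an empty scorer list (real runs pass `APIJudgmentScorer` / `JudgevalScorer` instances):

```python
from judgeval.data import Trace
from judgeval.data.trace_run import TraceRun  # path follows the sequence_run.py → trace_run.py rename

trace = Trace(trace_id="trace-1", name="weather-agent-run",
              created_at="2025-01-01T00:00:00Z", duration=0.42, entries=[])

run = TraceRun(
    project_name="weather-agent",   # placeholder project
    eval_name="tool-order-check",   # placeholder run name
    traces=[trace],
    scorers=[],                     # real runs supply scorer instances
)
print(run.model)  # "gpt-4.1" by default
```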
judgeval/evaluation_run.py
CHANGED
@@ -31,7 +31,7 @@ class EvaluationRun(BaseModel):
     eval_name: Optional[str] = None
     examples: Union[List[Example], List[CustomExample]]
     scorers: List[Union[APIJudgmentScorer, JudgevalScorer]]
-    model: Union[str, List[str], JudgevalJudge]
+    model: Optional[Union[str, List[str], JudgevalJudge]] = "gpt-4.1"
     aggregator: Optional[str] = None
     metadata: Optional[Dict[str, Any]] = None
     trace_span_id: Optional[str] = None
@@ -79,7 +79,7 @@ class EvaluationRun(BaseModel):
             raise ValueError("Eval name is required when log_results is True. Please include the eval_run_name argument.")
         return v

-    @field_validator('examples'
+    @field_validator('examples')
     def validate_examples(cls, v):
         if not v:
             raise ValueError("Examples cannot be empty.")