judgeval 0.0.35__py3-none-any.whl → 0.0.37__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- judgeval/common/tracer.py +869 -928
- judgeval/common/utils.py +18 -0
- judgeval/constants.py +6 -3
- judgeval/data/__init__.py +4 -0
- judgeval/data/datasets/dataset.py +3 -2
- judgeval/data/datasets/eval_dataset_client.py +63 -3
- judgeval/data/example.py +29 -7
- judgeval/data/sequence.py +5 -4
- judgeval/data/sequence_run.py +4 -3
- judgeval/data/trace.py +129 -0
- judgeval/evaluation_run.py +1 -1
- judgeval/integrations/langgraph.py +1962 -299
- judgeval/judgment_client.py +85 -66
- judgeval/run_evaluation.py +191 -45
- judgeval/scorers/__init__.py +2 -0
- judgeval/scorers/judgeval_scorers/api_scorers/__init__.py +2 -0
- judgeval/scorers/judgeval_scorers/api_scorers/tool_order.py +18 -0
- judgeval/scorers/score.py +2 -1
- judgeval/utils/data_utils.py +57 -0
- judgeval-0.0.37.dist-info/METADATA +214 -0
- {judgeval-0.0.35.dist-info → judgeval-0.0.37.dist-info}/RECORD +23 -20
- judgeval-0.0.35.dist-info/METADATA +0 -170
- {judgeval-0.0.35.dist-info → judgeval-0.0.37.dist-info}/WHEEL +0 -0
- {judgeval-0.0.35.dist-info → judgeval-0.0.37.dist-info}/licenses/LICENSE.md +0 -0
judgeval/common/utils.py
CHANGED
@@ -12,6 +12,7 @@ NOTE: any function beginning with 'a', e.g. 'afetch_together_api_response', is a
 import asyncio
 import concurrent.futures
 import os
+import requests
 import pprint
 from typing import Any, Dict, List, Literal, Mapping, Optional, Union
 
@@ -96,6 +97,23 @@ def read_file(file_path: str) -> str:
     with open(file_path, "r", encoding='utf-8') as file:
         return file.read()
 
+def validate_api_key(judgment_api_key: str):
+    """
+    Validates that the user api key is valid
+    """
+    response = requests.post(
+        f"{ROOT_API}/validate_api_key/",
+        headers={
+            "Content-Type": "application/json",
+            "Authorization": f"Bearer {judgment_api_key}",
+        },
+        json={},  # Empty body now
+        verify=True
+    )
+    if response.status_code == 200:
+        return True, response.json()
+    else:
+        return False, response.json().get("detail", "Error validating API key")
 
 def fetch_together_api_response(model: str, messages: List[Mapping], response_format: pydantic.BaseModel = None) -> str:
     """
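The new `validate_api_key` helper returns a `(bool, detail)` pair instead of raising. A minimal usage sketch, assuming `judgeval.common.utils` imports cleanly in your environment and using a placeholder key value:

```python
# Sketch only: the key below is a placeholder, not a real credential.
from judgeval.common.utils import validate_api_key

is_valid, detail = validate_api_key("YOUR_JUDGMENT_API_KEY")
if is_valid:
    print("API key accepted:", detail)  # detail is the server's JSON response
else:
    print("API key rejected:", detail)  # detail is the server's error message
```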
judgeval/constants.py
CHANGED
@@ -27,7 +27,7 @@ class APIScorer(str, Enum):
     COMPARISON = "comparison"
     GROUNDEDNESS = "groundedness"
     DERAILMENT = "derailment"
-
+    TOOL_ORDER = "tool_order"
     @classmethod
     def _missing_(cls, value):
         # Handle case-insensitive lookup
@@ -42,14 +42,16 @@ ROOT_API = os.getenv("JUDGMENT_API_URL", "https://api.judgmentlabs.ai")
 JUDGMENT_EVAL_API_URL = f"{ROOT_API}/evaluate/"
 JUDGMENT_SEQUENCE_EVAL_API_URL = f"{ROOT_API}/evaluate_sequence/"
 JUDGMENT_DATASETS_PUSH_API_URL = f"{ROOT_API}/datasets/push/"
-
+JUDGMENT_DATASETS_APPEND_EXAMPLES_API_URL = f"{ROOT_API}/datasets/insert_examples/"
+JUDGMENT_DATASETS_APPEND_SEQUENCES_API_URL = f"{ROOT_API}/datasets/insert_sequences/"
 JUDGMENT_DATASETS_PULL_API_URL = f"{ROOT_API}/datasets/pull_for_judgeval/"
 JUDGMENT_DATASETS_DELETE_API_URL = f"{ROOT_API}/datasets/delete/"
 JUDGMENT_DATASETS_EXPORT_JSONL_API_URL = f"{ROOT_API}/datasets/export_jsonl/"
 JUDGMENT_DATASETS_PROJECT_STATS_API_URL = f"{ROOT_API}/datasets/fetch_stats_by_project/"
 JUDGMENT_DATASETS_INSERT_API_URL = f"{ROOT_API}/datasets/insert_examples/"
 JUDGMENT_EVAL_LOG_API_URL = f"{ROOT_API}/log_eval_results/"
-
+JUDGMENT_RETRIEVE_SEQUENCE_FROM_TRACE_API_URL = f"{ROOT_API}/traces/convert_trace_to_sequence/"
+JUDGMENT_EVAL_FETCH_API_URL = f"{ROOT_API}/fetch_experiment_run/"
 JUDGMENT_EVAL_DELETE_API_URL = f"{ROOT_API}/delete_eval_results_by_project_and_run_names/"
 JUDGMENT_EVAL_DELETE_PROJECT_API_URL = f"{ROOT_API}/delete_eval_results_by_project/"
 JUDGMENT_PROJECT_DELETE_API_URL = f"{ROOT_API}/projects/delete/"
@@ -57,6 +59,7 @@ JUDGMENT_PROJECT_CREATE_API_URL = f"{ROOT_API}/projects/add/"
 JUDGMENT_TRACES_FETCH_API_URL = f"{ROOT_API}/traces/fetch/"
 JUDGMENT_TRACES_SAVE_API_URL = f"{ROOT_API}/traces/save/"
 JUDGMENT_TRACES_DELETE_API_URL = f"{ROOT_API}/traces/delete/"
+JUDGMENT_TRACES_ADD_ANNOTATION_API_URL = f"{ROOT_API}/traces/add_annotation/"
 JUDGMENT_ADD_TO_RUN_EVAL_QUEUE_API_URL = f"{ROOT_API}/add_to_run_eval_queue/"
 # RabbitMQ
 RABBITMQ_HOST = os.getenv("RABBITMQ_HOST", "rabbitmq-networklb-faa155df16ec9085.elb.us-west-1.amazonaws.com")
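The new `TOOL_ORDER` member behaves like the existing scorer values; a small sketch, assuming the case-insensitive lookup in `APIScorer._missing_` (referenced above) is unchanged:

```python
from judgeval.constants import APIScorer

assert APIScorer.TOOL_ORDER.value == "tool_order"
# Mixed-case strings resolve through _missing_'s case-insensitive lookup.
assert APIScorer("TOOL_ORDER") is APIScorer.TOOL_ORDER
```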
judgeval/data/__init__.py
CHANGED
@@ -3,6 +3,8 @@ from judgeval.data.custom_example import CustomExample
 from judgeval.data.scorer_data import ScorerData, create_scorer_data
 from judgeval.data.result import ScoringResult, generate_scoring_result
 from judgeval.data.sequence import Sequence
+from judgeval.data.trace import Trace, TraceSpan
+
 
 __all__ = [
     "Example",
@@ -13,4 +15,6 @@ __all__ = [
     "ScoringResult",
     "generate_scoring_result",
     "Sequence",
+    "Trace",
+    "TraceSpan",
 ]
judgeval/data/datasets/dataset.py
CHANGED
@@ -224,6 +224,9 @@ class EvalDataset:
         self.examples = self.examples + [e]
         # TODO if we need to add rank, then we need to do it here
 
+    def add_sequence(self, s: Sequence) -> None:
+        self.sequences = self.sequences + [s]
+
     def save_as(self, file_type: Literal["json", "csv", "yaml"], dir_path: str, save_name: str = None) -> None:
         """
         Saves the dataset as a file. Save only the examples.
@@ -270,7 +273,6 @@ class EvalDataset:
                         None,  # Example does not have comments
                         None,  # Example does not have source file
                         True,  # Adding an Example
-                        e.trace_id
                     ]
                 )
 
@@ -292,7 +294,6 @@ class EvalDataset:
                     "comments": None,  # Example does not have comments
                     "source_file": None,  # Example does not have source file
                    "example": True,  # Adding an Example
-                    "trace_id": e.trace_id
                 }
                 for e in self.examples
             ],
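A hedged sketch of the new `add_sequence` method next to the existing example path; it assumes `EvalDataset` can be constructed with its defaults, is importable from `judgeval.data.datasets`, and exposes an `add_example` counterpart (none of which is shown in this diff):

```python
from judgeval.data import Example, Sequence
from judgeval.data.datasets import EvalDataset  # import path assumed

dataset = EvalDataset()
dataset.add_example(Example(input="What is 2 + 2?", actual_output="4"))
dataset.add_sequence(Sequence(name="math-flow"))  # Sequence.items now defaults to []
```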
judgeval/data/datasets/eval_dataset_client.py
CHANGED
@@ -6,7 +6,8 @@ from rich.progress import Progress, SpinnerColumn, TextColumn
 from judgeval.common.logger import debug, error, warning, info
 from judgeval.constants import (
     JUDGMENT_DATASETS_PUSH_API_URL,
-
+    JUDGMENT_DATASETS_APPEND_EXAMPLES_API_URL,
+    JUDGMENT_DATASETS_APPEND_SEQUENCES_API_URL,
     JUDGMENT_DATASETS_PULL_API_URL,
     JUDGMENT_DATASETS_PROJECT_STATS_API_URL,
     JUDGMENT_DATASETS_DELETE_API_URL,
@@ -58,6 +59,8 @@ class EvalDatasetClient:
             "dataset_alias": alias,
             "project_name": project_name,
             "examples": [e.to_dict() for e in dataset.examples],
+            "sequences": [s.model_dump() for s in dataset.sequences],
+            "is_sequence": len(dataset.sequences) > 0,
             "overwrite": overwrite,
         }
         try:
@@ -92,7 +95,7 @@ class EvalDatasetClient:
         return True
 
 
-    def
+    def append_examples(self, alias: str, examples: List[Example], project_name: str) -> bool:
         debug(f"Appending dataset with alias '{alias}'")
         """
         Appends the dataset to Judgment platform
@@ -124,7 +127,7 @@ class EvalDatasetClient:
         }
         try:
             response = requests.post(
-
+                JUDGMENT_DATASETS_APPEND_EXAMPLES_API_URL,
                 json=content,
                 headers={
                     "Content-Type": "application/json",
@@ -149,6 +152,63 @@ class EvalDatasetClient:
         )
         return True
 
+    def append_sequences(self, alias: str, sequences: List[Sequence], project_name: str) -> bool:
+        debug(f"Appending dataset with alias '{alias}'")
+        """
+        Appends the dataset to Judgment platform
+
+        Mock request:
+        dataset = {
+            "alias": alias,
+            "examples": [...],
+            "project_name": project_name
+        } ==>
+        {
+            "_alias": alias,
+            "_id": "..."  # ID of the dataset
+        }
+        """
+        with Progress(
+            SpinnerColumn(style="rgb(106,0,255)"),
+            TextColumn("[progress.description]{task.description}"),
+            transient=False,
+        ) as progress:
+            task_id = progress.add_task(
+                f"Appending [rgb(106,0,255)]'{alias}' to Judgment...",
+                total=100,
+            )
+            content = {
+                "dataset_alias": alias,
+                "project_name": project_name,
+                "sequences": [s.model_dump() for s in sequences],
+            }
+            try:
+                response = requests.post(
+                    JUDGMENT_DATASETS_APPEND_SEQUENCES_API_URL,
+                    json=content,
+                    headers={
+                        "Content-Type": "application/json",
+                        "Authorization": f"Bearer {self.judgment_api_key}",
+                        "X-Organization-Id": self.organization_id
+                    },
+                    verify=True
+                )
+                if response.status_code != 200:
+                    error(f"Server error during append: {response.json()}")
+                    raise Exception(f"Server error during append: {response.json()}")
+                response.raise_for_status()
+            except requests.exceptions.HTTPError as err:
+                if response.status_code == 422:
+                    error(f"Validation error during append: {err.response.json()}")
+                else:
+                    error(f"HTTP error during append: {err}")
+
+            progress.update(
+                task_id,
+                description=f"{progress.tasks[task_id].description} [rgb(25,227,160)]Done!)",
+            )
+            return True
+
     def pull(self, alias: str, project_name: str) -> EvalDataset:
         debug(f"Pulling dataset with alias '{alias}'")
         """
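For orientation, a sketch of calling the new `append_sequences` wrapper; the `EvalDatasetClient` constructor arguments and import path are assumptions, since neither appears in this diff:

```python
from judgeval.data import Sequence
from judgeval.data.datasets.eval_dataset_client import EvalDatasetClient  # import path assumed

client = EvalDatasetClient(
    judgment_api_key="YOUR_JUDGMENT_API_KEY",  # placeholder
    organization_id="YOUR_ORG_ID",             # placeholder
)
client.append_sequences(
    alias="my-dataset",
    sequences=[Sequence(name="checkout-flow")],
    project_name="my-project",
)
```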
judgeval/data/example.py
CHANGED
@@ -24,14 +24,14 @@ class ExampleParams(Enum):
 
 
 class Example(BaseModel):
-    input: Optional[str] = None
+    input: Optional[Union[str, Dict[str, Any]]] = None
     actual_output: Optional[Union[str, List[str]]] = None
     expected_output: Optional[Union[str, List[str]]] = None
     context: Optional[List[str]] = None
     retrieval_context: Optional[List[str]] = None
     additional_metadata: Optional[Dict[str, Any]] = None
     tools_called: Optional[List[str]] = None
-    expected_tools: Optional[List[str]] = None
+    expected_tools: Optional[List[Dict[str, Any]]] = None
     name: Optional[str] = None
     example_id: str = Field(default_factory=lambda: str(uuid4()))
     example_index: Optional[int] = None
@@ -50,8 +50,18 @@ class Example(BaseModel):
     @field_validator('input', mode='before')
     @classmethod
     def validate_input(cls, v):
-        if v is not None
-
+        if v is not None:
+            if not isinstance(v, (str, dict)):
+                raise ValueError(f"Input must be a string or dictionary but got {v} of type {type(v)}")
+
+            # If it's a string, check that it's not empty
+            if isinstance(v, str) and not v:
+                raise ValueError(f"Input string must be non-empty but got '{v}'")
+
+            # If it's a dictionary, check that it's not empty
+            if isinstance(v, dict) and not v:
+                raise ValueError(f"Input dictionary must be non-empty but got {v}")
+
         return v
 
     @field_validator('actual_output', mode='before')
@@ -73,7 +83,21 @@ class Example(BaseModel):
             raise ValueError(f"All items in expected_output must be strings but got {v}")
         return v
 
-    @field_validator('
+    @field_validator('expected_tools', mode='before')
+    @classmethod
+    def validate_expected_tools(cls, v):
+        if v is not None:
+            if not isinstance(v, list):
+                raise ValueError(f"Expected tools must be a list of dictionaries or None but got {v} of type {type(v)}")
+
+            # Check that each item in the list is a dictionary
+            for i, item in enumerate(v):
+                if not isinstance(item, dict):
+                    raise ValueError(f"Expected tools must be a list of dictionaries, but item at index {i} is {item} of type {type(item)}")
+
+        return v
+
+    @field_validator('context', 'retrieval_context', 'tools_called', mode='before')
     @classmethod
     def validate_string_lists(cls, v, info):
         field_name = info.field_name
@@ -127,7 +151,6 @@ class Example(BaseModel):
             "example_id": self.example_id,
             "example_index": self.example_index,
             "timestamp": self.timestamp,
-            "trace_id": self.trace_id
         }
 
     def __str__(self):
@@ -144,5 +167,4 @@ class Example(BaseModel):
             f"example_id={self.example_id}, "
             f"example_index={self.example_index}, "
             f"timestamp={self.timestamp}, "
-            f"trace_id={self.trace_id})"
         )
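With the widened schema, `input` may now be a non-empty dict and `expected_tools` a list of dicts. An illustrative construction (the dict keys are made up, not a required shape):

```python
from judgeval.data import Example

example = Example(
    input={"question": "What's the weather in Paris?", "units": "metric"},
    actual_output="It is 18°C and sunny.",
    tools_called=["weather_api"],
    expected_tools=[{"tool_name": "weather_api", "parameters": {"city": "Paris"}}],
)
```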
judgeval/data/sequence.py
CHANGED
@@ -1,5 +1,5 @@
 from pydantic import BaseModel, Field, field_validator, model_validator
-from typing import List, Optional, Union, Any
+from typing import List, Optional, Union, Any, Dict
 from judgeval.data.example import Example
 from judgeval.scorers import JudgevalScorer, APIJudgmentScorer
 from uuid import uuid4
@@ -12,13 +12,14 @@ class Sequence(BaseModel):
     sequence_id: str = Field(default_factory=lambda: str(uuid4()))
     name: Optional[str] = "Sequence"
     created_at: str = Field(default_factory=lambda: datetime.now(timezone.utc).strftime("%Y%m%d_%H%M%S"))
-    items: List[Union["Sequence", Example]]
+    items: List[Union["Sequence", Example]] = []
     scorers: Optional[Any] = None
     parent_sequence_id: Optional[str] = None
     sequence_order: Optional[int] = 0
     root_sequence_id: Optional[str] = None
-    inputs: Optional[str] = None
-    output: Optional[
+    inputs: Optional[Dict[str, Any]] = None
+    output: Optional[Any] = None
+    expected_tools: Optional[List[Dict[str, Any]]] = None
 
     @field_validator("scorers")
     def validate_scorer(cls, v):
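A short sketch under the updated `Sequence` schema: `items` now defaults to an empty list, `inputs` is a dict, `output` is unconstrained, and `expected_tools` mirrors `Example.expected_tools` (field values here are illustrative):

```python
from judgeval.data import Example, Sequence

seq = Sequence(
    name="refund-flow",
    inputs={"customer_id": "c-123"},
    output="Refund issued",
    expected_tools=[{"tool_name": "issue_refund"}],
    items=[Example(input="Refund order #42", actual_output="Refund issued")],
)
```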
judgeval/data/sequence_run.py
CHANGED
@@ -1,6 +1,6 @@
 
 from pydantic import BaseModel
-from typing import List, Optional, Dict, Any, Union
+from typing import List, Optional, Dict, Any, Union, Callable
 from judgeval.data import Sequence
 from judgeval.scorers import APIJudgmentScorer, JudgevalScorer
 from judgeval.judges import JudgevalJudge
@@ -29,8 +29,9 @@ class SequenceRun(BaseModel):
     organization_id: Optional[str] = None
     project_name: Optional[str] = None
     eval_name: Optional[str] = None
-    sequences: List[Sequence]
-
+    sequences: Optional[List[Sequence]] = None
+    scorers: List[Union[APIJudgmentScorer, JudgevalScorer]]
+    model: Optional[Union[str, List[str], JudgevalJudge]] = "gpt-4.1"
     aggregator: Optional[str] = None
     metadata: Optional[Dict[str, Any]] = None
     trace_span_id: Optional[str] = None
judgeval/data/trace.py
ADDED
@@ -0,0 +1,129 @@
+from pydantic import BaseModel
+from typing import Optional, Dict, Any, List
+from judgeval.evaluation_run import EvaluationRun
+import json
+from datetime import datetime, timezone
+
+class TraceSpan(BaseModel):
+    span_id: str
+    trace_id: str
+    function: Optional[str] = None
+    depth: int
+    created_at: Optional[float] = None
+    parent_span_id: Optional[str] = None
+    span_type: Optional[str] = "span"
+    inputs: Optional[Dict[str, Any]] = None
+    output: Optional[Any] = None
+    duration: Optional[float] = None
+    annotation: Optional[List[Dict[str, Any]]] = None
+    evaluation_runs: Optional[List[EvaluationRun]] = []
+
+    def model_dump(self, **kwargs):
+        return {
+            "span_id": self.span_id,
+            "trace_id": self.trace_id,
+            "depth": self.depth,
+            # "created_at": datetime.fromtimestamp(self.created_at).isoformat(),
+            "created_at": datetime.fromtimestamp(self.created_at, tz=timezone.utc).isoformat(),
+            "inputs": self._serialize_inputs(),
+            "output": self._serialize_output(),
+            "evaluation_runs": [run.model_dump() for run in self.evaluation_runs] if self.evaluation_runs else [],
+            "parent_span_id": self.parent_span_id,
+            "function": self.function,
+            "duration": self.duration,
+            "span_type": self.span_type
+        }
+
+    def print_span(self):
+        """Print the span with proper formatting and parent relationship information."""
+        indent = "  " * self.depth
+        parent_info = f" (parent_id: {self.parent_span_id})" if self.parent_span_id else ""
+        print(f"{indent}→ {self.function} (id: {self.span_id}){parent_info}")
+
+    def _serialize_inputs(self) -> dict:
+        """Helper method to serialize input data safely."""
+        if self.inputs is None:
+            return {}
+
+        serialized_inputs = {}
+        for key, value in self.inputs.items():
+            if isinstance(value, BaseModel):
+                serialized_inputs[key] = value.model_dump()
+            elif isinstance(value, (list, tuple)):
+                # Handle lists/tuples of arguments
+                serialized_inputs[key] = [
+                    item.model_dump() if isinstance(item, BaseModel)
+                    else None if not self._is_json_serializable(item)
+                    else item
+                    for item in value
+                ]
+            else:
+                if self._is_json_serializable(value):
+                    serialized_inputs[key] = value
+                else:
+                    serialized_inputs[key] = self.safe_stringify(value, self.function)
+        return serialized_inputs
+
+    def _is_json_serializable(self, obj: Any) -> bool:
+        """Helper method to check if an object is JSON serializable."""
+        try:
+            json.dumps(obj)
+            return True
+        except (TypeError, OverflowError, ValueError):
+            return False
+
+    def safe_stringify(self, output, function_name):
+        """
+        Safely converts an object to a string or repr, handling serialization issues gracefully.
+        """
+        try:
+            return str(output)
+        except (TypeError, OverflowError, ValueError):
+            pass
+
+        try:
+            return repr(output)
+        except (TypeError, OverflowError, ValueError):
+            pass
+
+        warnings.warn(
+            f"Output for function {function_name} is not JSON serializable and could not be converted to string. Setting to None."
+        )
+        return None
+
+    def _serialize_output(self) -> Any:
+        """Helper method to serialize output data safely."""
+        if self.output is None:
+            return None
+
+        def serialize_value(value):
+            if isinstance(value, BaseModel):
+                return value.model_dump()
+            elif isinstance(value, dict):
+                # Recursively serialize dictionary values
+                return {k: serialize_value(v) for k, v in value.items()}
+            elif isinstance(value, (list, tuple)):
+                # Recursively serialize list/tuple items
+                return [serialize_value(item) for item in value]
+            else:
+                # Try direct JSON serialization first
+                try:
+                    json.dumps(value)
+                    return value
+                except (TypeError, OverflowError, ValueError):
+                    # Fallback to safe stringification
+                    return self.safe_stringify(value, self.function)
+
+        # Start serialization with the top-level output
+        return serialize_value(self.output)
+
+class Trace(BaseModel):
+    trace_id: str
+    name: str
+    created_at: str
+    duration: float
+    entries: List[TraceSpan]
+    overwrite: bool = False
+    rules: Optional[Dict[str, Any]] = None
+    has_notification: Optional[bool] = False
+
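To show how the new models fit together, a minimal sketch that builds one span and serializes it through the custom `model_dump` above (all field values are illustrative):

```python
import time
from judgeval.data import Trace, TraceSpan

span = TraceSpan(
    span_id="span-1",
    trace_id="trace-1",
    function="answer_question",
    depth=0,
    created_at=time.time(),
    inputs={"question": "What is 2 + 2?"},
    output="4",
    duration=0.12,
)
trace = Trace(
    trace_id="trace-1",
    name="demo-trace",
    created_at="2024-01-01T00:00:00+00:00",
    duration=0.12,
    entries=[span],
)
span.print_span()                       # "→ answer_question (id: span-1)"
print(span.model_dump()["created_at"])  # ISO-8601 UTC timestamp
```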
judgeval/evaluation_run.py
CHANGED
@@ -31,7 +31,7 @@ class EvaluationRun(BaseModel):
 eval_name: Optional[str] = None
 examples: Union[List[Example], List[CustomExample]]
 scorers: List[Union[APIJudgmentScorer, JudgevalScorer]]
-    model: Union[str, List[str], JudgevalJudge]
+    model: Optional[Union[str, List[str], JudgevalJudge]] = "gpt-4.1"
 aggregator: Optional[str] = None
 metadata: Optional[Dict[str, Any]] = None
 trace_span_id: Optional[str] = None