judgeval 0.0.36__py3-none-any.whl → 0.0.38__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
judgeval/common/utils.py CHANGED
@@ -12,6 +12,7 @@ NOTE: any function beginning with 'a', e.g. 'afetch_together_api_response', is a
 import asyncio
 import concurrent.futures
 import os
+import requests
 import pprint
 from typing import Any, Dict, List, Literal, Mapping, Optional, Union

@@ -96,6 +97,23 @@ def read_file(file_path: str) -> str:
     with open(file_path, "r", encoding='utf-8') as file:
         return file.read()

+def validate_api_key(judgment_api_key: str):
+    """
+    Validates that the user api key is valid
+    """
+    response = requests.post(
+        f"{ROOT_API}/validate_api_key/",
+        headers={
+            "Content-Type": "application/json",
+            "Authorization": f"Bearer {judgment_api_key}",
+        },
+        json={}, # Empty body now
+        verify=True
+    )
+    if response.status_code == 200:
+        return True, response.json()
+    else:
+        return False, response.json().get("detail", "Error validating API key")


 def fetch_together_api_response(model: str, messages: List[Mapping], response_format: pydantic.BaseModel = None) -> str:
     """
@@ -747,7 +765,7 @@ if __name__ == "__main__":
     # Batched single completion to multiple models
     pprint.pprint(get_completion_multiple_models(
         models=[
-            "LLAMA3_70B_INSTRUCT_TURBO", "LLAMA3_405B_INSTRUCT_TURBO", "gpt-4o-mini"
+            "LLAMA3_70B_INSTRUCT_TURBO", "LLAMA3_405B_INSTRUCT_TURBO", "gpt-4.1-mini"
         ],
         messages=[
             [
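
The new validate_api_key helper returns a (bool, payload) tuple instead of raising, so callers branch on the first element. A minimal usage sketch, assuming only what the function above shows (the key value is a placeholder):

    from judgeval.common.utils import validate_api_key

    is_valid, payload = validate_api_key("sk-judgment-placeholder")  # placeholder key
    if is_valid:
        print("API key accepted:", payload)   # payload is the server's JSON body
    else:
        print("API key rejected:", payload)   # payload is the server's "detail" message or a fallback string
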
judgeval/constants.py CHANGED
@@ -27,7 +27,7 @@ class APIScorer(str, Enum):
     COMPARISON = "comparison"
     GROUNDEDNESS = "groundedness"
     DERAILMENT = "derailment"
-
+    TOOL_ORDER = "tool_order"
     @classmethod
     def _missing_(cls, value):
         # Handle case-insensitive lookup
@@ -40,10 +40,9 @@ UNBOUNDED_SCORERS = set([APIScorer.COMPARISON]) # scorers whose scores are not
 ROOT_API = os.getenv("JUDGMENT_API_URL", "https://api.judgmentlabs.ai")
 # API URLs
 JUDGMENT_EVAL_API_URL = f"{ROOT_API}/evaluate/"
-JUDGMENT_SEQUENCE_EVAL_API_URL = f"{ROOT_API}/evaluate_sequence/"
+JUDGMENT_TRACE_EVAL_API_URL = f"{ROOT_API}/evaluate_trace/"
 JUDGMENT_DATASETS_PUSH_API_URL = f"{ROOT_API}/datasets/push/"
 JUDGMENT_DATASETS_APPEND_EXAMPLES_API_URL = f"{ROOT_API}/datasets/insert_examples/"
-JUDGMENT_DATASETS_APPEND_SEQUENCES_API_URL = f"{ROOT_API}/datasets/insert_sequences/"
 JUDGMENT_DATASETS_PULL_API_URL = f"{ROOT_API}/datasets/pull_for_judgeval/"
 JUDGMENT_DATASETS_DELETE_API_URL = f"{ROOT_API}/datasets/delete/"
 JUDGMENT_DATASETS_EXPORT_JSONL_API_URL = f"{ROOT_API}/datasets/export_jsonl/"
@@ -58,6 +57,7 @@ JUDGMENT_PROJECT_CREATE_API_URL = f"{ROOT_API}/projects/add/"
 JUDGMENT_TRACES_FETCH_API_URL = f"{ROOT_API}/traces/fetch/"
 JUDGMENT_TRACES_SAVE_API_URL = f"{ROOT_API}/traces/save/"
 JUDGMENT_TRACES_DELETE_API_URL = f"{ROOT_API}/traces/delete/"
+JUDGMENT_TRACES_ADD_ANNOTATION_API_URL = f"{ROOT_API}/traces/add_annotation/"
 JUDGMENT_ADD_TO_RUN_EVAL_QUEUE_API_URL = f"{ROOT_API}/add_to_run_eval_queue/"
 # RabbitMQ
 RABBITMQ_HOST = os.getenv("RABBITMQ_HOST", "rabbitmq-networklb-faa155df16ec9085.elb.us-west-1.amazonaws.com")
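
All endpoint constants, including the new JUDGMENT_TRACE_EVAL_API_URL and JUDGMENT_TRACES_ADD_ANNOTATION_API_URL, are built from ROOT_API, which honors the JUDGMENT_API_URL environment variable. A minimal sketch of pointing the SDK at another deployment (the URL is a placeholder, and the variable must be set before judgeval.constants is imported since ROOT_API is resolved at import time):

    import os
    os.environ["JUDGMENT_API_URL"] = "https://judgment.example.internal"  # placeholder URL

    from judgeval import constants
    print(constants.JUDGMENT_TRACE_EVAL_API_URL)              # .../evaluate_trace/
    print(constants.JUDGMENT_TRACES_ADD_ANNOTATION_API_URL)   # .../traces/add_annotation/
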
judgeval/data/__init__.py CHANGED
@@ -2,7 +2,8 @@ from judgeval.data.example import Example, ExampleParams
 from judgeval.data.custom_example import CustomExample
 from judgeval.data.scorer_data import ScorerData, create_scorer_data
 from judgeval.data.result import ScoringResult, generate_scoring_result
-from judgeval.data.sequence import Sequence
+from judgeval.data.trace import Trace, TraceSpan
+

 __all__ = [
     "Example",
@@ -12,5 +13,6 @@ __all__ = [
     "create_scorer_data",
     "ScoringResult",
     "generate_scoring_result",
-    "Sequence",
+    "Trace",
+    "TraceSpan",
 ]
@@ -7,13 +7,12 @@ import yaml
 from dataclasses import dataclass, field
 from typing import List, Union, Literal

-from judgeval.data import Example, Sequence
+from judgeval.data import Example
 from judgeval.common.logger import debug, error, warning, info

 @dataclass
 class EvalDataset:
     examples: List[Example]
-    sequences: List[Sequence]
     _alias: Union[str, None] = field(default=None)
     _id: Union[str, None] = field(default=None)
     judgment_api_key: str = field(default="")
@@ -22,13 +21,11 @@ class EvalDataset:
         judgment_api_key: str = os.getenv("JUDGMENT_API_KEY"),
         organization_id: str = os.getenv("JUDGMENT_ORG_ID"),
         examples: List[Example] = [],
-        sequences: List[Sequence] = []
     ):
         debug(f"Initializing EvalDataset with {len(examples)} examples")
         if not judgment_api_key:
             warning("No judgment_api_key provided")
         self.examples = examples
-        self.sequences = sequences
         self._alias = None
         self._id = None
         self.judgment_api_key = judgment_api_key
@@ -223,10 +220,7 @@ class EvalDataset:
     def add_example(self, e: Example) -> None:
         self.examples = self.examples + [e]
         # TODO if we need to add rank, then we need to do it here
-
-    def add_sequence(self, s: Sequence) -> None:
-        self.sequences = self.sequences + [s]
-
+
     def save_as(self, file_type: Literal["json", "csv", "yaml"], dir_path: str, save_name: str = None) -> None:
         """
         Saves the dataset as a file. Save only the examples.
@@ -273,7 +267,6 @@ class EvalDataset:
                     None, # Example does not have comments
                     None, # Example does not have source file
                     True, # Adding an Example
-                    e.trace_id
                 ]
             )

@@ -295,7 +288,6 @@ class EvalDataset:
                     "comments": None, # Example does not have comments
                     "source_file": None, # Example does not have source file
                     "example": True, # Adding an Example
-                    "trace_id": e.trace_id
                 }
                 for e in self.examples
             ],
@@ -315,7 +307,6 @@ class EvalDataset:
         return (
             f"{self.__class__.__name__}("
             f"examples={self.examples}, "
-            f"sequences={self.sequences}, "
             f"_alias={self._alias}, "
             f"_id={self._id}"
             f")"
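
EvalDataset is now examples-only: the sequences field, add_sequence, and the trace_id column are gone from save_as and the repr. A minimal sketch of the slimmed-down interface (field values are illustrative, and the export directory is assumed to exist):

    from judgeval.data import Example
    from judgeval.data.datasets import EvalDataset

    dataset = EvalDataset(examples=[Example(input="What is 2 + 2?", actual_output="4")])
    dataset.add_example(Example(input="What is the capital of France?", actual_output="Paris"))

    dataset.save_as("json", dir_path="./exports", save_name="demo")  # writes examples only
    print(dataset)  # repr no longer includes sequences
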
@@ -7,14 +7,13 @@ from judgeval.common.logger import debug, error, warning, info
 from judgeval.constants import (
     JUDGMENT_DATASETS_PUSH_API_URL,
     JUDGMENT_DATASETS_APPEND_EXAMPLES_API_URL,
-    JUDGMENT_DATASETS_APPEND_SEQUENCES_API_URL,
     JUDGMENT_DATASETS_PULL_API_URL,
     JUDGMENT_DATASETS_PROJECT_STATS_API_URL,
     JUDGMENT_DATASETS_DELETE_API_URL,
     JUDGMENT_DATASETS_INSERT_API_URL,
     JUDGMENT_DATASETS_EXPORT_JSONL_API_URL
 )
-from judgeval.data import Example, Sequence
+from judgeval.data import Example
 from judgeval.data.datasets import EvalDataset


@@ -59,8 +58,6 @@ class EvalDatasetClient:
             "dataset_alias": alias,
             "project_name": project_name,
             "examples": [e.to_dict() for e in dataset.examples],
-            "sequences": [s.model_dump() for s in dataset.sequences],
-            "is_sequence": len(dataset.sequences) > 0,
             "overwrite": overwrite,
         }
         try:
@@ -151,63 +148,6 @@ class EvalDatasetClient:
                 description=f"{progress.tasks[task_id].description} [rgb(25,227,160)]Done!)",
             )
         return True
-
-    def append_sequences(self, alias: str, sequences: List[Sequence], project_name: str) -> bool:
-        debug(f"Appending dataset with alias '{alias}'")
-        """
-        Appends the dataset to Judgment platform
-
-        Mock request:
-        dataset = {
-            "alias": alias,
-            "examples": [...],
-            "project_name": project_name
-        } ==>
-        {
-            "_alias": alias,
-            "_id": "..." # ID of the dataset
-        }
-        """
-        with Progress(
-            SpinnerColumn(style="rgb(106,0,255)"),
-            TextColumn("[progress.description]{task.description}"),
-            transient=False,
-        ) as progress:
-            task_id = progress.add_task(
-                f"Appending [rgb(106,0,255)]'{alias}' to Judgment...",
-                total=100,
-            )
-            content = {
-                "dataset_alias": alias,
-                "project_name": project_name,
-                "sequences": [s.model_dump() for s in sequences],
-            }
-            try:
-                response = requests.post(
-                    JUDGMENT_DATASETS_APPEND_SEQUENCES_API_URL,
-                    json=content,
-                    headers={
-                        "Content-Type": "application/json",
-                        "Authorization": f"Bearer {self.judgment_api_key}",
-                        "X-Organization-Id": self.organization_id
-                    },
-                    verify=True
-                )
-                if response.status_code != 200:
-                    error(f"Server error during append: {response.json()}")
-                    raise Exception(f"Server error during append: {response.json()}")
-                response.raise_for_status()
-            except requests.exceptions.HTTPError as err:
-                if response.status_code == 422:
-                    error(f"Validation error during append: {err.response.json()}")
-                else:
-                    error(f"HTTP error during append: {err}")
-
-            progress.update(
-                task_id,
-                description=f"{progress.tasks[task_id].description} [rgb(25,227,160)]Done!)",
-            )
-            return True

     def pull(self, alias: str, project_name: str) -> EvalDataset:
         debug(f"Pulling dataset with alias '{alias}'")
@@ -262,7 +202,6 @@ class EvalDatasetClient:
             info(f"Successfully pulled dataset with alias '{alias}'")
             payload = response.json()
             dataset.examples = [Example(**e) for e in payload.get("examples", [])]
-            dataset.sequences = [Sequence(**s) for s in payload.get("sequences", [])]
             dataset._alias = payload.get("alias")
             dataset._id = payload.get("id")
             progress.update(
judgeval/data/example.py CHANGED
@@ -24,20 +24,19 @@ class ExampleParams(Enum):


 class Example(BaseModel):
-    input: Optional[str] = None
+    input: Optional[Union[str, Dict[str, Any]]] = None
     actual_output: Optional[Union[str, List[str]]] = None
     expected_output: Optional[Union[str, List[str]]] = None
     context: Optional[List[str]] = None
     retrieval_context: Optional[List[str]] = None
     additional_metadata: Optional[Dict[str, Any]] = None
     tools_called: Optional[List[str]] = None
-    expected_tools: Optional[List[str]] = None
+    expected_tools: Optional[List[Dict[str, Any]]] = None
     name: Optional[str] = None
     example_id: str = Field(default_factory=lambda: str(uuid4()))
     example_index: Optional[int] = None
     timestamp: Optional[str] = None
     trace_id: Optional[str] = None
-    sequence_order: Optional[int] = 0

     def __init__(self, **data):
         if 'example_id' not in data:
@@ -50,8 +49,18 @@ class Example(BaseModel):
     @field_validator('input', mode='before')
     @classmethod
     def validate_input(cls, v):
-        if v is not None and (not v or not isinstance(v, str)):
-            raise ValueError(f"Input must be a non-empty string but got '{v}' of type {type(v)}")
+        if v is not None:
+            if not isinstance(v, (str, dict)):
+                raise ValueError(f"Input must be a string or dictionary but got {v} of type {type(v)}")
+
+            # If it's a string, check that it's not empty
+            if isinstance(v, str) and not v:
+                raise ValueError(f"Input string must be non-empty but got '{v}'")
+
+            # If it's a dictionary, check that it's not empty
+            if isinstance(v, dict) and not v:
+                raise ValueError(f"Input dictionary must be non-empty but got {v}")
+
         return v

     @field_validator('actual_output', mode='before')
@@ -73,7 +82,21 @@ class Example(BaseModel):
                 raise ValueError(f"All items in expected_output must be strings but got {v}")
         return v

-    @field_validator('context', 'retrieval_context', 'tools_called', 'expected_tools', mode='before')
+    @field_validator('expected_tools', mode='before')
+    @classmethod
+    def validate_expected_tools(cls, v):
+        if v is not None:
+            if not isinstance(v, list):
+                raise ValueError(f"Expected tools must be a list of dictionaries or None but got {v} of type {type(v)}")
+
+            # Check that each item in the list is a dictionary
+            for i, item in enumerate(v):
+                if not isinstance(item, dict):
+                    raise ValueError(f"Expected tools must be a list of dictionaries, but item at index {i} is {item} of type {type(item)}")
+
+        return v
+
+    @field_validator('context', 'retrieval_context', 'tools_called', mode='before')
     @classmethod
     def validate_string_lists(cls, v, info):
         field_name = info.field_name
@@ -127,7 +150,6 @@ class Example(BaseModel):
             "example_id": self.example_id,
             "example_index": self.example_index,
             "timestamp": self.timestamp,
-            "trace_id": self.trace_id
         }

     def __str__(self):
@@ -144,5 +166,4 @@ class Example(BaseModel):
             f"example_id={self.example_id}, "
             f"example_index={self.example_index}, "
             f"timestamp={self.timestamp}, "
-            f"trace_id={self.trace_id})"
         )
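
With these validators, input now accepts either a non-empty string or a non-empty dict, and expected_tools must be a list of dicts rather than a list of strings. A minimal sketch of the new shapes (the dict keys shown are illustrative only; the validators just require non-empty dicts):

    from judgeval.data import Example

    example = Example(
        input={"query": "What is the weather in Tokyo?", "units": "metric"},   # dict inputs now allowed
        actual_output="It is 18 C and partly cloudy in Tokyo.",
        expected_tools=[{"tool_name": "get_weather", "parameters": {"city": "Tokyo"}}],  # dicts, not strings
    )

    # Rejected by the validators above:
    #   Example(input="")                        -> ValueError (empty string)
    #   Example(input={})                        -> ValueError (empty dict)
    #   Example(expected_tools=["get_weather"])  -> ValueError (items must be dicts)
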
judgeval/data/result.py CHANGED
@@ -3,7 +3,7 @@ from typing import List, Union, Optional, Dict, Any, Union
 from judgeval.common.logger import debug, error
 from pydantic import BaseModel
 from judgeval.data import ScorerData, Example, CustomExample
-from judgeval.data.sequence import Sequence
+from judgeval.data.trace import TraceSpan


 class ScoringResult(BaseModel):
@@ -24,7 +24,7 @@ class ScoringResult(BaseModel):
     name: Optional[str] = None

     # The original example object that was used to create the ScoringResult
-    data_object: Optional[Union[Sequence, CustomExample, Example]] = None
+    data_object: Optional[Union[TraceSpan, CustomExample, Example]] = None
     trace_id: Optional[str] = None

     # Additional fields for internal use
@@ -49,7 +49,7 @@ class ScoringResult(BaseModel):


 def generate_scoring_result(
-    data_object: Union[Example, Sequence],
+    data_object: Union[Example, TraceSpan],
     scorers_data: List[ScorerData],
     run_duration: float,
     success: bool,
judgeval/data/trace.py ADDED
@@ -0,0 +1,132 @@
+from pydantic import BaseModel
+from typing import Optional, Dict, Any, List
+from judgeval.evaluation_run import EvaluationRun
+import json
+from datetime import datetime, timezone
+
+class TraceSpan(BaseModel):
+    span_id: str
+    trace_id: str
+    function: Optional[str] = None
+    depth: int
+    created_at: Optional[Any] = None
+    parent_span_id: Optional[str] = None
+    span_type: Optional[str] = "span"
+    inputs: Optional[Dict[str, Any]] = None
+    output: Optional[Any] = None
+    duration: Optional[float] = None
+    annotation: Optional[List[Dict[str, Any]]] = None
+    evaluation_runs: Optional[List[EvaluationRun]] = []
+    expected_tools: Optional[List[Dict[str, Any]]] = None
+    additional_metadata: Optional[Dict[str, Any]] = None
+
+    def model_dump(self, **kwargs):
+        return {
+            "span_id": self.span_id,
+            "trace_id": self.trace_id,
+            "depth": self.depth,
+            # "created_at": datetime.fromtimestamp(self.created_at).isoformat(),
+            "created_at": datetime.fromtimestamp(self.created_at, tz=timezone.utc).isoformat(),
+            "inputs": self._serialize_inputs(),
+            "output": self._serialize_output(),
+            "evaluation_runs": [run.model_dump() for run in self.evaluation_runs] if self.evaluation_runs else [],
+            "parent_span_id": self.parent_span_id,
+            "function": self.function,
+            "duration": self.duration,
+            "span_type": self.span_type
+        }
+
+    def print_span(self):
+        """Print the span with proper formatting and parent relationship information."""
+        indent = " " * self.depth
+        parent_info = f" (parent_id: {self.parent_span_id})" if self.parent_span_id else ""
+        print(f"{indent}→ {self.function} (id: {self.span_id}){parent_info}")
+
+    def _serialize_inputs(self) -> dict:
+        """Helper method to serialize input data safely."""
+        if self.inputs is None:
+            return {}
+
+        serialized_inputs = {}
+        for key, value in self.inputs.items():
+            if isinstance(value, BaseModel):
+                serialized_inputs[key] = value.model_dump()
+            elif isinstance(value, (list, tuple)):
+                # Handle lists/tuples of arguments
+                serialized_inputs[key] = [
+                    item.model_dump() if isinstance(item, BaseModel)
+                    else None if not self._is_json_serializable(item)
+                    else item
+                    for item in value
+                ]
+            else:
+                if self._is_json_serializable(value):
+                    serialized_inputs[key] = value
+                else:
+                    serialized_inputs[key] = self.safe_stringify(value, self.function)
+        return serialized_inputs
+
+    def _is_json_serializable(self, obj: Any) -> bool:
+        """Helper method to check if an object is JSON serializable."""
+        try:
+            json.dumps(obj)
+            return True
+        except (TypeError, OverflowError, ValueError):
+            return False
+
+    def safe_stringify(self, output, function_name):
+        """
+        Safely converts an object to a string or repr, handling serialization issues gracefully.
+        """
+        try:
+            return str(output)
+        except (TypeError, OverflowError, ValueError):
+            pass
+
+        try:
+            return repr(output)
+        except (TypeError, OverflowError, ValueError):
+            pass
+
+        warnings.warn(
+            f"Output for function {function_name} is not JSON serializable and could not be converted to string. Setting to None."
+        )
+        return None
+
+    def _serialize_output(self) -> Any:
+        """Helper method to serialize output data safely."""
+        if self.output is None:
+            return None
+
+        def serialize_value(value):
+            if isinstance(value, BaseModel):
+                return value.model_dump()
+            elif isinstance(value, dict):
+                # Recursively serialize dictionary values
+                return {k: serialize_value(v) for k, v in value.items()}
+            elif isinstance(value, (list, tuple)):
+                # Recursively serialize list/tuple items
+                return [serialize_value(item) for item in value]
+            else:
+                # Try direct JSON serialization first
+                try:
+                    json.dumps(value)
+                    return value
+                except (TypeError, OverflowError, ValueError):
+                    # Fallback to safe stringification
+                    return self.safe_stringify(value, self.function)
+
+        # Start serialization with the top-level output
+        return serialize_value(self.output)
+
+class Trace(BaseModel):
+    trace_id: str
+    name: str
+    created_at: str
+    duration: float
+    entries: List[TraceSpan]
+    overwrite: bool = False
+    offline_mode: bool = False
+    rules: Optional[Dict[str, Any]] = None
+    has_notification: Optional[bool] = False
+
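
A minimal sketch of the new TraceSpan in use; created_at is passed as a Unix timestamp because model_dump feeds it to datetime.fromtimestamp, and all field values are illustrative:

    import time
    from judgeval.data import TraceSpan

    span = TraceSpan(
        span_id="span-1",
        trace_id="trace-1",
        function="search_tool",
        depth=1,
        created_at=time.time(),            # model_dump expects a Unix timestamp here
        inputs={"query": "weather in Tokyo"},
        output={"temperature_c": 18},
        duration=0.42,
    )

    span.print_span()         # prints "→ search_tool (id: span-1)" indented by depth
    print(span.model_dump())  # JSON-safe dict with an ISO-8601 UTC created_at
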
@@ -1,20 +1,20 @@

 from pydantic import BaseModel
-from typing import List, Optional, Dict, Any, Union
-from judgeval.data import Sequence
+from typing import List, Optional, Dict, Any, Union, Callable
+from judgeval.data import Trace
 from judgeval.scorers import APIJudgmentScorer, JudgevalScorer
 from judgeval.judges import JudgevalJudge
 from judgeval.rules import Rule


-class SequenceRun(BaseModel):
+class TraceRun(BaseModel):
     """
     Stores example and evaluation scorers together for running an eval task

     Args:
         project_name (str): The name of the project the evaluation results belong to
         eval_name (str): A name for this evaluation run
-        sequences (List[Sequence]): The sequences to evaluate
+        traces (List[Trace]): The traces to evaluate
         scorers (List[Union[JudgmentScorer, JudgevalScorer]]): A list of scorers to use for evaluation
         model (str): The model used as a judge when using LLM as a Judge
         aggregator (Optional[str]): The aggregator to use for evaluation if using Mixture of Judges
@@ -29,8 +29,9 @@ class SequenceRun(BaseModel):
     organization_id: Optional[str] = None
     project_name: Optional[str] = None
     eval_name: Optional[str] = None
-    sequences: List[Sequence]
-    model: Union[str, List[str], JudgevalJudge]
+    traces: Optional[List[Trace]] = None
+    scorers: List[Union[APIJudgmentScorer, JudgevalScorer]]
+    model: Optional[Union[str, List[str], JudgevalJudge]] = "gpt-4.1"
     aggregator: Optional[str] = None
     metadata: Optional[Dict[str, Any]] = None
     trace_span_id: Optional[str] = None
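
Where SequenceRun bundled Sequence objects, TraceRun bundles whole Trace objects plus scorers, and model falls back to "gpt-4.1" when omitted. A rough construction sketch; the TraceRun import path is an assumption (the diff does not show which module defines the class), and an empty scorer list is used only to keep the example self-contained:

    from judgeval.data import Trace, TraceSpan
    from judgeval.data.trace_run import TraceRun  # import path assumed, not shown in this diff

    trace = Trace(
        trace_id="trace-1",
        name="weather-agent",
        created_at="2024-01-01T00:00:00+00:00",
        duration=1.3,
        entries=[TraceSpan(span_id="span-1", trace_id="trace-1", depth=0)],
    )

    run = TraceRun(
        project_name="demo-project",   # illustrative values
        eval_name="trace-run-1",
        traces=[trace],
        scorers=[],                    # a real run would pass APIJudgmentScorer/JudgevalScorer instances
    )
    print(run.model)  # "gpt-4.1" by default
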
@@ -31,7 +31,7 @@ class EvaluationRun(BaseModel):
     eval_name: Optional[str] = None
     examples: Union[List[Example], List[CustomExample]]
     scorers: List[Union[APIJudgmentScorer, JudgevalScorer]]
-    model: Union[str, List[str], JudgevalJudge]
+    model: Optional[Union[str, List[str], JudgevalJudge]] = "gpt-4.1"
     aggregator: Optional[str] = None
     metadata: Optional[Dict[str, Any]] = None
     trace_span_id: Optional[str] = None
@@ -79,7 +79,7 @@ class EvaluationRun(BaseModel):
             raise ValueError("Eval name is required when log_results is True. Please include the eval_run_name argument.")
         return v

-    @field_validator('examples', mode='before')
+    @field_validator('examples')
     def validate_examples(cls, v):
         if not v:
             raise ValueError("Examples cannot be empty.")