judgeval 0.0.29__py3-none-any.whl → 0.0.31__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
judgeval/common/tracer.py CHANGED
@@ -24,9 +24,9 @@ import requests
  from litellm import cost_per_token
  from pydantic import BaseModel
  from rich import print as rprint
- from openai import OpenAI
- from together import Together
- from anthropic import Anthropic
+ from openai import OpenAI, AsyncOpenAI
+ from together import Together, AsyncTogether
+ from anthropic import Anthropic, AsyncAnthropic

  # Local application/library-specific imports
  from judgeval.constants import (
@@ -37,7 +37,6 @@ from judgeval.constants import (
  RABBITMQ_QUEUE,
  JUDGMENT_TRACES_DELETE_API_URL,
  JUDGMENT_PROJECT_DELETE_API_URL,
- JUDGMENT_TRACES_ADD_TO_EVAL_QUEUE_API_URL
  )
  from judgeval.judgment_client import JudgmentClient
  from judgeval.data import Example
@@ -54,7 +53,7 @@ current_trace_var = contextvars.ContextVar('current_trace', default=None)
  current_span_var = contextvars.ContextVar('current_span', default=None) # NEW: ContextVar for the active span name

  # Define type aliases for better code readability and maintainability
- ApiClient: TypeAlias = Union[OpenAI, Together, Anthropic] # Supported API clients
+ ApiClient: TypeAlias = Union[OpenAI, Together, Anthropic, AsyncOpenAI, AsyncAnthropic, AsyncTogether] # Supported API clients
  TraceEntryType = Literal['enter', 'exit', 'output', 'input', 'evaluation'] # Valid trace entry types
  SpanType = Literal['span', 'tool', 'llm', 'evaluation', 'chain']
  @dataclass
@@ -69,11 +68,11 @@ class TraceEntry:
  - evaluation: Evaluation: (evaluation results)
  """
  type: TraceEntryType
- function: str # Name of the function being traced
  span_id: str # Unique ID for this specific span instance
  depth: int # Indentation level for nested calls
- message: str # Human-readable description
  created_at: float # Unix timestamp when entry was created, replacing the deprecated 'timestamp' field
+ function: Optional[str] = None # Name of the function being traced
+ message: Optional[str] = None # Human-readable description
  duration: Optional[float] = None # Time taken (for exit/evaluation entries)
  trace_id: str = None # ID of the trace this entry belongs to
  output: Any = None # Function output value
@@ -229,6 +228,8 @@ class TraceManagerClient:
  raise ValueError(f"Failed to fetch traces: {response.text}")

  return response.json()
+
+

  def save_trace(self, trace_data: dict):
  """
@@ -356,6 +357,18 @@ class TraceClient:
  self.executed_tools = []
  self.executed_node_tools = []
  self._span_depths: Dict[str, int] = {} # NEW: To track depth of active spans
+
+ def get_current_span(self):
+ """Get the current span from the context var"""
+ return current_span_var.get()
+
+ def set_current_span(self, span: Any):
+ """Set the current span from the context var"""
+ return current_span_var.set(span)
+
+ def reset_current_span(self, token: Any):
+ """Reset the current span from the context var"""
+ return current_span_var.reset(token)

  @contextmanager
  def span(self, name: str, span_type: SpanType = "span"):
@@ -874,27 +887,7 @@ class TraceClient:
  "overwrite": overwrite,
  "parent_trace_id": self.parent_trace_id,
  "parent_name": self.parent_name
- }
- # Execute asynchrous evaluation in the background
- # if not empty_save: # Only send to RabbitMQ if the trace is not empty
- # # Send trace data to evaluation queue via API
- # try:
- # response = requests.post(
- # JUDGMENT_TRACES_ADD_TO_EVAL_QUEUE_API_URL,
- # json=trace_data,
- # headers={
- # "Content-Type": "application/json",
- # "Authorization": f"Bearer {self.tracer.api_key}",
- # "X-Organization-Id": self.tracer.organization_id
- # },
- # verify=True
- # )
-
- # if response.status_code != HTTPStatus.OK:
- # warnings.warn(f"Failed to add trace to evaluation queue: {response.text}")
- # except Exception as e:
- # warnings.warn(f"Error sending trace to evaluation queue: {str(e)}")
-
+ }
  self.trace_manager_client.save_trace(trace_data)

  return self.trace_id, trace_data
@@ -941,6 +934,18 @@ class Tracer:
  "To use a different project name, ensure the first Tracer initialization uses the desired project name.",
  RuntimeWarning
  )
+
+ def set_current_trace(self, trace: TraceClient):
+ """
+ Set the current trace context in contextvars
+ """
+ current_trace_var.set(trace)
+
+ def get_current_trace(self):
+ """
+ Get the current trace context from contextvars
+ """
+ return current_trace_var.get()

  @contextmanager
  def trace(
@@ -1199,33 +1204,66 @@ def wrap(client: Any) -> Any:
  """
  # Get the appropriate configuration for this client type
  span_name, original_create = _get_client_config(client)
-
- def traced_create(*args, **kwargs):
- # Get the current trace from contextvars
- current_trace = current_trace_var.get()
-
- # Skip tracing if no active trace
- if not current_trace:
- return original_create(*args, **kwargs)
-
- with current_trace.span(span_name, span_type="llm") as span:
- # Format and record the input parameters
- input_data = _format_input_data(client, **kwargs)
- span.record_input(input_data)
-
- # Make the actual API call
- response = original_create(*args, **kwargs)
+
+ # Handle async clients differently than synchronous clients (need an async function for async clients)
+ if (isinstance(client, (AsyncOpenAI, AsyncAnthropic, AsyncTogether))):
+ async def traced_create(*args, **kwargs):
+ # Get the current trace from contextvars
+ current_trace = current_trace_var.get()

- # Format and record the output
- output_data = _format_output_data(client, response)
- span.record_output(output_data)
+ # Skip tracing if no active trace
+ if not current_trace:
+ return original_create(*args, **kwargs)
+
+ with current_trace.span(span_name, span_type="llm") as span:
+ # Format and record the input parameters
+ input_data = _format_input_data(client, **kwargs)
+ span.record_input(input_data)
+
+ # Make the actual API call
+ try:
+ response = await original_create(*args, **kwargs)
+ except Exception as e:
+ print(f"Error during API call: {e}")
+ raise
+
+ # Format and record the output
+ output_data = _format_output_data(client, response)
+ span.record_output(output_data)
+
+ return response
+ else:
+ def traced_create(*args, **kwargs):
+ # Get the current trace from contextvars
+ current_trace = current_trace_var.get()

- return response
+ # Skip tracing if no active trace
+ if not current_trace:
+ return original_create(*args, **kwargs)
+
+ with current_trace.span(span_name, span_type="llm") as span:
+ # Format and record the input parameters
+ input_data = _format_input_data(client, **kwargs)
+ span.record_input(input_data)
+
+ # Make the actual API call
+ try:
+ response = original_create(*args, **kwargs)
+ except Exception as e:
+ print(f"Error during API call: {e}")
+ raise
+
+ # Format and record the output
+ output_data = _format_output_data(client, response)
+ span.record_output(output_data)
+
+ return response
+

  # Replace the original method with our traced version
- if isinstance(client, (OpenAI, Together)):
+ if isinstance(client, (OpenAI, Together, AsyncOpenAI, AsyncTogether)):
  client.chat.completions.create = traced_create
- elif isinstance(client, Anthropic):
+ elif isinstance(client, (Anthropic, AsyncAnthropic)):
  client.messages.create = traced_create

  return client
@@ -1246,11 +1284,11 @@ def _get_client_config(client: ApiClient) -> tuple[str, callable]:
  Raises:
  ValueError: If client type is not supported
  """
- if isinstance(client, OpenAI):
+ if isinstance(client, (OpenAI, AsyncOpenAI)):
  return "OPENAI_API_CALL", client.chat.completions.create
- elif isinstance(client, Together):
+ elif isinstance(client, (Together, AsyncTogether)):
  return "TOGETHER_API_CALL", client.chat.completions.create
- elif isinstance(client, Anthropic):
+ elif isinstance(client, (Anthropic, AsyncAnthropic)):
  return "ANTHROPIC_API_CALL", client.messages.create
  raise ValueError(f"Unsupported client type: {type(client)}")

@@ -1260,7 +1298,7 @@ def _format_input_data(client: ApiClient, **kwargs) -> dict:
  Extracts relevant parameters from kwargs based on the client type
  to ensure consistent tracing across different APIs.
  """
- if isinstance(client, (OpenAI, Together)):
+ if isinstance(client, (OpenAI, Together, AsyncOpenAI, AsyncTogether)):
  return {
  "model": kwargs.get("model"),
  "messages": kwargs.get("messages"),
@@ -1283,7 +1321,7 @@ def _format_output_data(client: ApiClient, response: Any) -> dict:
  - content: The generated text
  - usage: Token usage statistics
  """
- if isinstance(client, (OpenAI, Together)):
+ if isinstance(client, (OpenAI, Together, AsyncOpenAI, AsyncTogether)):
  return {
  "content": response.choices[0].message.content,
  "usage": {
judgeval/constants.py CHANGED
@@ -26,7 +26,8 @@ class APIScorer(str, Enum):
  JSON_CORRECTNESS = "json_correctness"
  COMPARISON = "comparison"
  GROUNDEDNESS = "groundedness"
-
+ DERAILMENT = "derailment"
+
  @classmethod
  def _missing_(cls, value):
  # Handle case-insensitive lookup
@@ -39,7 +40,9 @@ UNBOUNDED_SCORERS = set([APIScorer.COMPARISON]) # scorers whose scores are not
  ROOT_API = os.getenv("JUDGMENT_API_URL", "https://api.judgmentlabs.ai")
  # API URLs
  JUDGMENT_EVAL_API_URL = f"{ROOT_API}/evaluate/"
+ JUDGMENT_SEQUENCE_EVAL_API_URL = f"{ROOT_API}/evaluate_sequence/"
  JUDGMENT_DATASETS_PUSH_API_URL = f"{ROOT_API}/datasets/push/"
+ JUDGMENT_DATASETS_APPEND_API_URL = f"{ROOT_API}/datasets/insert_examples/"
  JUDGMENT_DATASETS_PULL_API_URL = f"{ROOT_API}/datasets/pull/"
  JUDGMENT_DATASETS_DELETE_API_URL = f"{ROOT_API}/datasets/delete/"
  JUDGMENT_DATASETS_EXPORT_JSONL_API_URL = f"{ROOT_API}/datasets/export_jsonl/"
@@ -54,7 +57,6 @@ JUDGMENT_PROJECT_CREATE_API_URL = f"{ROOT_API}/projects/add/"
  JUDGMENT_TRACES_FETCH_API_URL = f"{ROOT_API}/traces/fetch/"
  JUDGMENT_TRACES_SAVE_API_URL = f"{ROOT_API}/traces/save/"
  JUDGMENT_TRACES_DELETE_API_URL = f"{ROOT_API}/traces/delete/"
- JUDGMENT_TRACES_ADD_TO_EVAL_QUEUE_API_URL = f"{ROOT_API}/traces/add_to_trace_eval_queue/"
  JUDGMENT_ADD_TO_RUN_EVAL_QUEUE_API_URL = f"{ROOT_API}/add_to_run_eval_queue/"
  # RabbitMQ
  RABBITMQ_HOST = os.getenv("RABBITMQ_HOST", "rabbitmq-networklb-faa155df16ec9085.elb.us-west-1.amazonaws.com")
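constants.py gains a DERAILMENT scorer value plus endpoints for sequence evaluation and dataset appends, and drops the trace eval-queue URL removed from tracer.py above. A small sketch of the new enum member; note that the enum's _missing_ hook (per the "Handle case-insensitive lookup" comment in the context above) also resolves other casings such as "DERAILMENT":

    from judgeval.constants import APIScorer

    assert APIScorer("derailment") is APIScorer.DERAILMENT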
judgeval/data/__init__.py CHANGED
@@ -1,12 +1,16 @@
  from judgeval.data.example import Example, ExampleParams
+ from judgeval.data.custom_example import CustomExample
  from judgeval.data.scorer_data import ScorerData, create_scorer_data
  from judgeval.data.result import ScoringResult, generate_scoring_result
+ from judgeval.data.sequence import Sequence

  __all__ = [
  "Example",
  "ExampleParams",
+ "CustomExample",
  "ScorerData",
  "create_scorer_data",
  "ScoringResult",
  "generate_scoring_result",
+ "Sequence",
  ]
@@ -0,0 +1,18 @@
+ from pydantic import BaseModel, Field
+ from typing import Optional, Union, List, Dict, Any
+ from uuid import uuid4
+
+ class CustomExample(BaseModel):
+ input: Optional[Dict[str, Any]] = None
+ actual_output: Optional[Dict[str, Any]] = None
+ expected_output: Optional[Dict[str, Any]] = None
+ context: Optional[List[str]] = None
+ retrieval_context: Optional[List[str]] = None
+ additional_metadata: Optional[Dict[str, Any]] = None
+ tools_called: Optional[List[str]] = None
+ expected_tools: Optional[List[str]] = None
+ name: Optional[str] = None
+ example_id: str = Field(default_factory=lambda: str(uuid4()))
+ example_index: Optional[int] = None
+ timestamp: Optional[str] = None
+ trace_id: Optional[str] = None
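The headerless hunk above is the new judgeval/data/custom_example.py (the path matches the import added to judgeval/data/__init__.py). CustomExample carries its input and outputs as Dict[str, Any] payloads and generates example_id automatically. A quick construction sketch with illustrative values:

    from judgeval.data import CustomExample

    ex = CustomExample(
        input={"question": "What is the capital of France?", "locale": "en"},
        actual_output={"answer": "Paris"},
        expected_output={"answer": "Paris"},
        retrieval_context=["France's capital is Paris."],
        name="capital-lookup",
    )
    print(ex.example_id)  # UUID string supplied by the Field default_factory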
@@ -6,6 +6,7 @@ from rich.progress import Progress, SpinnerColumn, TextColumn
  from judgeval.common.logger import debug, error, warning, info
  from judgeval.constants import (
  JUDGMENT_DATASETS_PUSH_API_URL,
+ JUDGMENT_DATASETS_APPEND_API_URL,
  JUDGMENT_DATASETS_PULL_API_URL,
  JUDGMENT_DATASETS_PROJECT_STATS_API_URL,
  JUDGMENT_DATASETS_DELETE_API_URL,
@@ -70,9 +71,9 @@ class EvalDatasetClient:
  },
  verify=True
  )
- if response.status_code == 500:
- error(f"Server error during push: {content.get('message')}")
- return False
+ if response.status_code != 200:
+ error(f"Server error during push: {response.json()}")
+ raise Exception(f"Server error during push: {response.json()}")
  response.raise_for_status()
  except requests.exceptions.HTTPError as err:
  if response.status_code == 422:
@@ -90,6 +91,64 @@ class EvalDatasetClient:
  )
  return True

+
+ def append(self, alias: str, examples: List[Example], project_name: str) -> bool:
+ debug(f"Appending dataset with alias '{alias}'")
+ """
+ Appends the dataset to Judgment platform
+
+ Mock request:
+ dataset = {
+ "alias": alias,
+ "examples": [...],
+ "project_name": project_name
+ } ==>
+ {
+ "_alias": alias,
+ "_id": "..." # ID of the dataset
+ }
+ """
+ with Progress(
+ SpinnerColumn(style="rgb(106,0,255)"),
+ TextColumn("[progress.description]{task.description}"),
+ transient=False,
+ ) as progress:
+ task_id = progress.add_task(
+ f"Appending [rgb(106,0,255)]'{alias}' to Judgment...",
+ total=100,
+ )
+ content = {
+ "dataset_alias": alias,
+ "project_name": project_name,
+ "examples": [e.to_dict() for e in examples],
+ }
+ try:
+ response = requests.post(
+ JUDGMENT_DATASETS_APPEND_API_URL,
+ json=content,
+ headers={
+ "Content-Type": "application/json",
+ "Authorization": f"Bearer {self.judgment_api_key}",
+ "X-Organization-Id": self.organization_id
+ },
+ verify=True
+ )
+ if response.status_code != 200:
+ error(f"Server error during append: {response.json()}")
+ raise Exception(f"Server error during append: {response.json()}")
+ response.raise_for_status()
+ except requests.exceptions.HTTPError as err:
+ if response.status_code == 422:
+ error(f"Validation error during append: {err.response.json()}")
+ else:
+ error(f"HTTP error during append: {err}")
+
+ progress.update(
+ task_id,
+ description=f"{progress.tasks[task_id].description} [rgb(25,227,160)]Done!)",
+ )
+ return True
+
  def pull(self, alias: str, project_name: str) -> EvalDataset:
  debug(f"Pulling dataset with alias '{alias}'")
  """
judgeval/data/example.py CHANGED
@@ -37,6 +37,7 @@ class Example(BaseModel):
  example_index: Optional[int] = None
  timestamp: Optional[str] = None
  trace_id: Optional[str] = None
+ sequence_order: Optional[int] = 0

  def __init__(self, **data):
  if 'example_id' not in data:
judgeval/data/result.py CHANGED
@@ -2,7 +2,8 @@ from dataclasses import dataclass
  from typing import List, Union, Optional, Dict, Any, Union
  from judgeval.common.logger import debug, error
  from pydantic import BaseModel
- from judgeval.data import ScorerData, Example
+ from judgeval.data import ScorerData, Example, CustomExample
+ from judgeval.data.sequence import Sequence


  class ScoringResult(BaseModel):
@@ -23,7 +24,7 @@ class ScoringResult(BaseModel):
  name: Optional[str] = None

  # The original example object that was used to create the ScoringResult
- data_object: Optional[Example] = None #can be Example, CustomExample (future), WorkflowRun (future)
+ data_object: Optional[Union[Sequence, CustomExample, Example]] = None
  trace_id: Optional[str] = None

  # Additional fields for internal use
@@ -48,7 +49,7 @@ class ScoringResult(BaseModel):


  def generate_scoring_result(
- example: Example,
+ data_object: Union[Example, Sequence],
  scorers_data: List[ScorerData],
  run_duration: float,
  success: bool,
@@ -59,15 +60,15 @@ def generate_scoring_result(
  When an LLMTestCase is executed, it turns into an LLMApiTestCase and the progress of the evaluation run is tracked.
  At the end of the evaluation run, we create a TestResult object out of the completed LLMApiTestCase.
  """
- if example.name is not None:
- name = example.name
+ if data_object.name is not None:
+ name = data_object.name
  else:
  name = "Test Case Placeholder"
  debug(f"No name provided for example, using default name: {name}")
  debug(f"Creating ScoringResult for: {name}")
  scoring_result = ScoringResult(
  name=name,
- data_object=example,
+ data_object=data_object,
  success=success,
  scorers_data=scorers_data,
  run_duration=run_duration,
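result.py generalizes ScoringResult.data_object to accept Sequence and CustomExample and renames generate_scoring_result's first parameter to data_object accordingly. A short sketch mirroring the constructor call shown in the diff, with a Sequence as the data object (field values are illustrative; Example's string fields come from the existing judgeval API):

    from judgeval.data import Example, Sequence, ScoringResult

    seq = Sequence(name="checkout-flow", items=[Example(input="Add to cart", actual_output="Added")])
    result = ScoringResult(
        name=seq.name,
        data_object=seq,      # may now be a Sequence or CustomExample, not just an Example
        success=True,
        scorers_data=[],
        run_duration=0.42,
    )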
@@ -0,0 +1,59 @@
+ from pydantic import BaseModel, Field, field_validator, model_validator
+ from typing import List, Optional, Union, Any
+ from judgeval.data.example import Example
+ from judgeval.scorers import ScorerWrapper, JudgevalScorer
+ from uuid import uuid4
+ from datetime import datetime, timezone
+
+ class Sequence(BaseModel):
+ """
+ A sequence is a list of either Examples or nested Sequence objects.
+ """
+ sequence_id: str = Field(default_factory=lambda: str(uuid4()))
+ name: Optional[str] = "Sequence"
+ created_at: str = Field(default_factory=lambda: datetime.now(timezone.utc).strftime("%Y%m%d_%H%M%S"))
+ items: List[Union["Sequence", Example]]
+ scorers: Optional[Any] = None
+ parent_sequence_id: Optional[str] = None
+ sequence_order: Optional[int] = 0
+
+ @field_validator("scorers")
+ def validate_scorer(cls, v):
+ loaded_scorers = []
+ for scorer in v or []:
+ try:
+ if isinstance(scorer, ScorerWrapper):
+ loaded_scorers.append(scorer.load_implementation())
+ else:
+ loaded_scorers.append(scorer)
+ except Exception as e:
+ raise ValueError(f"Failed to load implementation for scorer {scorer}: {str(e)}")
+ return loaded_scorers
+
+ @model_validator(mode='after')
+ def set_parent_sequence_ids(self) -> "Sequence":
+ """Recursively set the parent_sequence_id for all nested Sequences."""
+ for item in self.items:
+ if isinstance(item, Sequence):
+ item.parent_sequence_id = self.sequence_id
+ # Recurse into deeper nested sequences
+ item.set_parent_sequence_ids()
+ return self
+
+ @model_validator(mode='after')
+ def set_parent_and_order(self) -> "Sequence":
+ """Set parent_sequence_id and sequence_order for all items."""
+ for idx, item in enumerate(self.items):
+ # Set sequence_order for both Example and Sequence objects
+ item.sequence_order = idx
+
+ if isinstance(item, Sequence):
+ item.parent_sequence_id = self.sequence_id
+ item.set_parent_and_order() # Recurse for nested sequences
+ return self
+
+ class Config:
+ arbitrary_types_allowed = True
+
+ # Update forward references so that "Sequence" inside items is resolved.
+ Sequence.model_rebuild()
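The hunk above is the new judgeval/data/sequence.py (matching the import added to judgeval/data/__init__.py). Its model validators stamp sequence_order on every item and parent_sequence_id on nested sequences at construction time, so callers only supply items. A small sketch of the nesting behavior; Example's string fields come from the existing judgeval API rather than this diff:

    from judgeval.data import Example, Sequence

    inner = Sequence(name="retrieval", items=[Example(input="Find docs", actual_output="3 docs found")])
    outer = Sequence(
        name="agent-run",
        items=[Example(input="Plan the task", actual_output="Plan ready"), inner],
    )

    # Validators ran on construction: items are numbered in order and the
    # nested sequence points back at its parent.
    assert [item.sequence_order for item in outer.items] == [0, 1]
    assert outer.items[1].parent_sequence_id == outer.sequence_id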
@@ -0,0 +1,42 @@
+
+ from pydantic import BaseModel
+ from typing import List, Optional, Dict, Any, Union
+ from judgeval.data import Sequence
+ from judgeval.scorers import APIJudgmentScorer, JudgevalScorer
+ from judgeval.judges import JudgevalJudge
+ from judgeval.rules import Rule
+
+
+ class SequenceRun(BaseModel):
+ """
+ Stores example and evaluation scorers together for running an eval task
+
+ Args:
+ project_name (str): The name of the project the evaluation results belong to
+ eval_name (str): A name for this evaluation run
+ sequences (List[Sequence]): The sequences to evaluate
+ scorers (List[Union[JudgmentScorer, JudgevalScorer]]): A list of scorers to use for evaluation
+ model (str): The model used as a judge when using LLM as a Judge
+ aggregator (Optional[str]): The aggregator to use for evaluation if using Mixture of Judges
+ metadata (Optional[Dict[str, Any]]): Additional metadata to include for this evaluation run, e.g. comments, dataset name, purpose, etc.
+ judgment_api_key (Optional[str]): The API key for running evaluations on the Judgment API
+ rules (Optional[List[Rule]]): Rules to evaluate against scoring results
+ """
+
+ # The user will specify whether they want log_results when they call run_eval
+ log_results: bool = False # NOTE: log_results has to be set first because it is used to validate project_name and eval_name
+ organization_id: Optional[str] = None
+ project_name: Optional[str] = None
+ eval_name: Optional[str] = None
+ sequences: List[Sequence]
+ model: Union[str, List[str], JudgevalJudge]
+ aggregator: Optional[str] = None
+ metadata: Optional[Dict[str, Any]] = None
+ trace_span_id: Optional[str] = None
+ # API Key will be "" until user calls client.run_eval(), then API Key will be set
+ judgment_api_key: Optional[str] = ""
+ override: Optional[bool] = False
+ rules: Optional[List[Rule]] = None
+
+ class Config:
+ arbitrary_types_allowed = True
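The hunk above introduces a SequenceRun model, the sequence counterpart of EvaluationRun: it carries sequences rather than examples, and scorers travel on the Sequence objects themselves. A construction sketch; the module path is not shown in this diff and is assumed, and how a run is submitted (e.g. through JudgmentClient) is outside this hunk:

    from judgeval.data import Example, Sequence
    from judgeval.data.sequence_run import SequenceRun  # assumed module path

    run = SequenceRun(
        project_name="demo",
        eval_name="checkout-regression",
        sequences=[Sequence(name="checkout", items=[Example(input="Add to cart", actual_output="Added")])],
        model="gpt-4o-mini",
        log_results=True,
    )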
@@ -1,7 +1,7 @@
  from typing import List, Optional, Dict, Any, Union
  from pydantic import BaseModel, field_validator

- from judgeval.data import Example
+ from judgeval.data import Example, CustomExample
  from judgeval.scorers import JudgevalScorer, APIJudgmentScorer
  from judgeval.constants import ACCEPTABLE_MODELS
  from judgeval.common.logger import debug, error
@@ -15,7 +15,7 @@ class EvaluationRun(BaseModel):
  Args:
  project_name (str): The name of the project the evaluation results belong to
  eval_name (str): A name for this evaluation run
- examples (List[Example]): The examples to evaluate
+ examples (Union[List[Example], List[CustomExample]]): The examples to evaluate
  scorers (List[Union[JudgmentScorer, JudgevalScorer]]): A list of scorers to use for evaluation
  model (str): The model used as a judge when using LLM as a Judge
  aggregator (Optional[str]): The aggregator to use for evaluation if using Mixture of Judges
@@ -29,7 +29,7 @@ class EvaluationRun(BaseModel):
  organization_id: Optional[str] = None
  project_name: Optional[str] = None
  eval_name: Optional[str] = None
- examples: List[Example]
+ examples: Union[List[Example], List[CustomExample]]
  scorers: List[Union[APIJudgmentScorer, JudgevalScorer]]
  model: Union[str, List[str], JudgevalJudge]
  aggregator: Optional[str] = None
@@ -38,6 +38,7 @@ class EvaluationRun(BaseModel):
  # API Key will be "" until user calls client.run_eval(), then API Key will be set
  judgment_api_key: Optional[str] = ""
  override: Optional[bool] = False
+ append: Optional[bool] = False
  rules: Optional[List[Rule]] = None

  def model_dump(self, **kwargs):
@@ -78,13 +79,17 @@ class EvaluationRun(BaseModel):
  raise ValueError("Eval name is required when log_results is True. Please include the eval_run_name argument.")
  return v

- @field_validator('examples')
+ @field_validator('examples', mode='before')
  def validate_examples(cls, v):
  if not v:
  raise ValueError("Examples cannot be empty.")
- for ex in v:
- if not isinstance(ex, Example):
- raise ValueError(f"Invalid type for Example: {type(ex)}")
+
+ first_type = type(v[0])
+ if first_type not in (Example, CustomExample):
+ raise ValueError(f"Invalid type for Example/CustomExample: {first_type}")
+ if not all(isinstance(ex, first_type) for ex in v):
+ raise ValueError("All examples must be of the same type, either all Example or all CustomExample.")
+
  return v

  @field_validator('scorers')
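EvaluationRun now accepts CustomExample lists, gains an append flag, and its examples validator (now mode='before') requires a homogeneous run: all Example or all CustomExample, never a mix. A sketch of that behavior; the EvaluationRun module path and the FaithfulnessScorer used to satisfy the required scorers field come from the existing judgeval API and are assumptions relative to this diff:

    from pydantic import ValidationError
    from judgeval.data import Example, CustomExample
    from judgeval.evaluation_run import EvaluationRun   # assumed module path
    from judgeval.scorers import FaithfulnessScorer     # assumed scorer choice

    ok = EvaluationRun(
        examples=[CustomExample(input={"q": "2 + 2?"}, actual_output={"a": "4"})],
        scorers=[FaithfulnessScorer(threshold=0.5)],
        model="gpt-4o-mini",
    )

    try:
        EvaluationRun(
            examples=[Example(input="2 + 2?", actual_output="4"),
                      CustomExample(input={"q": "2 + 2?"}, actual_output={"a": "4"})],
            scorers=[FaithfulnessScorer(threshold=0.5)],
            model="gpt-4o-mini",
        )
    except ValidationError:
        print("Mixed Example/CustomExample lists are rejected")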