judgeval 0.0.29__py3-none-any.whl → 0.0.31__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- judgeval/common/tracer.py +93 -55
- judgeval/constants.py +4 -2
- judgeval/data/__init__.py +4 -0
- judgeval/data/custom_example.py +18 -0
- judgeval/data/datasets/eval_dataset_client.py +62 -3
- judgeval/data/example.py +1 -0
- judgeval/data/result.py +7 -6
- judgeval/data/sequence.py +59 -0
- judgeval/data/sequence_run.py +42 -0
- judgeval/evaluation_run.py +12 -7
- judgeval/integrations/langgraph.py +89 -72
- judgeval/judgment_client.py +77 -14
- judgeval/run_evaluation.py +87 -13
- judgeval/scorers/__init__.py +2 -0
- judgeval/scorers/judgeval_scorer.py +3 -0
- judgeval/scorers/judgeval_scorers/__init__.py +7 -0
- judgeval/scorers/judgeval_scorers/api_scorers/__init__.py +2 -1
- judgeval/scorers/judgeval_scorers/api_scorers/derailment_scorer.py +21 -0
- judgeval/scorers/score.py +6 -5
- {judgeval-0.0.29.dist-info → judgeval-0.0.31.dist-info}/METADATA +1 -1
- {judgeval-0.0.29.dist-info → judgeval-0.0.31.dist-info}/RECORD +23 -20
- judgeval/data/custom_api_example.py +0 -91
- {judgeval-0.0.29.dist-info → judgeval-0.0.31.dist-info}/WHEEL +0 -0
- {judgeval-0.0.29.dist-info → judgeval-0.0.31.dist-info}/licenses/LICENSE.md +0 -0
judgeval/common/tracer.py
CHANGED
@@ -24,9 +24,9 @@ import requests
 from litellm import cost_per_token
 from pydantic import BaseModel
 from rich import print as rprint
-from openai import OpenAI
-from together import Together
-from anthropic import Anthropic
+from openai import OpenAI, AsyncOpenAI
+from together import Together, AsyncTogether
+from anthropic import Anthropic, AsyncAnthropic
 
 # Local application/library-specific imports
 from judgeval.constants import (
@@ -37,7 +37,6 @@ from judgeval.constants import (
     RABBITMQ_QUEUE,
     JUDGMENT_TRACES_DELETE_API_URL,
     JUDGMENT_PROJECT_DELETE_API_URL,
-    JUDGMENT_TRACES_ADD_TO_EVAL_QUEUE_API_URL
 )
 from judgeval.judgment_client import JudgmentClient
 from judgeval.data import Example
@@ -54,7 +53,7 @@ current_trace_var = contextvars.ContextVar('current_trace', default=None)
 current_span_var = contextvars.ContextVar('current_span', default=None)  # NEW: ContextVar for the active span name
 
 # Define type aliases for better code readability and maintainability
-ApiClient: TypeAlias = Union[OpenAI, Together, Anthropic]  # Supported API clients
+ApiClient: TypeAlias = Union[OpenAI, Together, Anthropic, AsyncOpenAI, AsyncAnthropic, AsyncTogether]  # Supported API clients
 TraceEntryType = Literal['enter', 'exit', 'output', 'input', 'evaluation']  # Valid trace entry types
 SpanType = Literal['span', 'tool', 'llm', 'evaluation', 'chain']
 @dataclass
@@ -69,11 +68,11 @@ class TraceEntry:
         - evaluation: Evaluation: (evaluation results)
     """
     type: TraceEntryType
-    function: str  # Name of the function being traced
     span_id: str  # Unique ID for this specific span instance
     depth: int  # Indentation level for nested calls
-    message: str  # Human-readable description
     created_at: float  # Unix timestamp when entry was created, replacing the deprecated 'timestamp' field
+    function: Optional[str] = None  # Name of the function being traced
+    message: Optional[str] = None  # Human-readable description
     duration: Optional[float] = None  # Time taken (for exit/evaluation entries)
     trace_id: str = None  # ID of the trace this entry belongs to
     output: Any = None  # Function output value
@@ -229,6 +228,8 @@ class TraceManagerClient:
             raise ValueError(f"Failed to fetch traces: {response.text}")
 
         return response.json()
+
+
 
     def save_trace(self, trace_data: dict):
         """
@@ -356,6 +357,18 @@ class TraceClient:
         self.executed_tools = []
         self.executed_node_tools = []
         self._span_depths: Dict[str, int] = {}  # NEW: To track depth of active spans
+
+    def get_current_span(self):
+        """Get the current span from the context var"""
+        return current_span_var.get()
+
+    def set_current_span(self, span: Any):
+        """Set the current span from the context var"""
+        return current_span_var.set(span)
+
+    def reset_current_span(self, token: Any):
+        """Reset the current span from the context var"""
+        return current_span_var.reset(token)
 
     @contextmanager
     def span(self, name: str, span_type: SpanType = "span"):
@@ -874,27 +887,7 @@ class TraceClient:
             "overwrite": overwrite,
             "parent_trace_id": self.parent_trace_id,
             "parent_name": self.parent_name
-        }
-        # Execute asynchrous evaluation in the background
-        # if not empty_save: # Only send to RabbitMQ if the trace is not empty
-        #     # Send trace data to evaluation queue via API
-        #     try:
-        #         response = requests.post(
-        #             JUDGMENT_TRACES_ADD_TO_EVAL_QUEUE_API_URL,
-        #             json=trace_data,
-        #             headers={
-        #                 "Content-Type": "application/json",
-        #                 "Authorization": f"Bearer {self.tracer.api_key}",
-        #                 "X-Organization-Id": self.tracer.organization_id
-        #             },
-        #             verify=True
-        #         )
-
-        #         if response.status_code != HTTPStatus.OK:
-        #             warnings.warn(f"Failed to add trace to evaluation queue: {response.text}")
-        #     except Exception as e:
-        #         warnings.warn(f"Error sending trace to evaluation queue: {str(e)}")
-
+        }
         self.trace_manager_client.save_trace(trace_data)
 
         return self.trace_id, trace_data
@@ -941,6 +934,18 @@ class Tracer:
                 "To use a different project name, ensure the first Tracer initialization uses the desired project name.",
                 RuntimeWarning
             )
+
+    def set_current_trace(self, trace: TraceClient):
+        """
+        Set the current trace context in contextvars
+        """
+        current_trace_var.set(trace)
+
+    def get_current_trace(self):
+        """
+        Get the current trace context from contextvars
+        """
+        return current_trace_var.get()
 
     @contextmanager
     def trace(
@@ -1199,33 +1204,66 @@ def wrap(client: Any) -> Any:
     """
     # Get the appropriate configuration for this client type
    span_name, original_create = _get_client_config(client)
-
-
-
-
-
-
-    if not current_trace:
-        return original_create(*args, **kwargs)
-
-    with current_trace.span(span_name, span_type="llm") as span:
-        # Format and record the input parameters
-        input_data = _format_input_data(client, **kwargs)
-        span.record_input(input_data)
-
-        # Make the actual API call
-        response = original_create(*args, **kwargs)
+
+    # Handle async clients differently than synchronous clients (need an async function for async clients)
+    if (isinstance(client, (AsyncOpenAI, AsyncAnthropic, AsyncTogether))):
+        async def traced_create(*args, **kwargs):
+            # Get the current trace from contextvars
+            current_trace = current_trace_var.get()
 
-        #
-
-
+            # Skip tracing if no active trace
+            if not current_trace:
+                return original_create(*args, **kwargs)
+
+            with current_trace.span(span_name, span_type="llm") as span:
+                # Format and record the input parameters
+                input_data = _format_input_data(client, **kwargs)
+                span.record_input(input_data)
+
+                # Make the actual API call
+                try:
+                    response = await original_create(*args, **kwargs)
+                except Exception as e:
+                    print(f"Error during API call: {e}")
+                    raise
+
+                # Format and record the output
+                output_data = _format_output_data(client, response)
+                span.record_output(output_data)
+
+                return response
+    else:
+        def traced_create(*args, **kwargs):
+            # Get the current trace from contextvars
+            current_trace = current_trace_var.get()
 
+            # Skip tracing if no active trace
+            if not current_trace:
+                return original_create(*args, **kwargs)
+
+            with current_trace.span(span_name, span_type="llm") as span:
+                # Format and record the input parameters
+                input_data = _format_input_data(client, **kwargs)
+                span.record_input(input_data)
+
+                # Make the actual API call
+                try:
+                    response = original_create(*args, **kwargs)
+                except Exception as e:
+                    print(f"Error during API call: {e}")
+                    raise
+
+                # Format and record the output
+                output_data = _format_output_data(client, response)
+                span.record_output(output_data)
+
+                return response
+
 
     # Replace the original method with our traced version
-    if isinstance(client, (OpenAI, Together)):
+    if isinstance(client, (OpenAI, Together, AsyncOpenAI, AsyncTogether)):
         client.chat.completions.create = traced_create
-    elif isinstance(client, Anthropic):
+    elif isinstance(client, (Anthropic, AsyncAnthropic)):
         client.messages.create = traced_create
 
     return client
@@ -1246,11 +1284,11 @@ def _get_client_config(client: ApiClient) -> tuple[str, callable]:
     Raises:
         ValueError: If client type is not supported
     """
-    if isinstance(client, OpenAI):
+    if isinstance(client, (OpenAI, AsyncOpenAI)):
        return "OPENAI_API_CALL", client.chat.completions.create
-    elif isinstance(client, Together):
+    elif isinstance(client, (Together, AsyncTogether)):
        return "TOGETHER_API_CALL", client.chat.completions.create
-    elif isinstance(client, Anthropic):
+    elif isinstance(client, (Anthropic, AsyncAnthropic)):
        return "ANTHROPIC_API_CALL", client.messages.create
     raise ValueError(f"Unsupported client type: {type(client)}")
 
@@ -1260,7 +1298,7 @@ def _format_input_data(client: ApiClient, **kwargs) -> dict:
     Extracts relevant parameters from kwargs based on the client type
     to ensure consistent tracing across different APIs.
     """
-    if isinstance(client, (OpenAI, Together)):
+    if isinstance(client, (OpenAI, Together, AsyncOpenAI, AsyncTogether)):
         return {
             "model": kwargs.get("model"),
             "messages": kwargs.get("messages"),
@@ -1283,7 +1321,7 @@ def _format_output_data(client: ApiClient, response: Any) -> dict:
         - content: The generated text
         - usage: Token usage statistics
     """
-    if isinstance(client, (OpenAI, Together)):
+    if isinstance(client, (OpenAI, Together, AsyncOpenAI, AsyncTogether)):
         return {
             "content": response.choices[0].message.content,
             "usage": {
judgeval/constants.py
CHANGED
@@ -26,7 +26,8 @@ class APIScorer(str, Enum):
     JSON_CORRECTNESS = "json_correctness"
     COMPARISON = "comparison"
     GROUNDEDNESS = "groundedness"
-
+    DERAILMENT = "derailment"
+
     @classmethod
     def _missing_(cls, value):
         # Handle case-insensitive lookup
@@ -39,7 +40,9 @@ UNBOUNDED_SCORERS = set([APIScorer.COMPARISON])  # scorers whose scores are not
 ROOT_API = os.getenv("JUDGMENT_API_URL", "https://api.judgmentlabs.ai")
 # API URLs
 JUDGMENT_EVAL_API_URL = f"{ROOT_API}/evaluate/"
+JUDGMENT_SEQUENCE_EVAL_API_URL = f"{ROOT_API}/evaluate_sequence/"
 JUDGMENT_DATASETS_PUSH_API_URL = f"{ROOT_API}/datasets/push/"
+JUDGMENT_DATASETS_APPEND_API_URL = f"{ROOT_API}/datasets/insert_examples/"
 JUDGMENT_DATASETS_PULL_API_URL = f"{ROOT_API}/datasets/pull/"
 JUDGMENT_DATASETS_DELETE_API_URL = f"{ROOT_API}/datasets/delete/"
 JUDGMENT_DATASETS_EXPORT_JSONL_API_URL = f"{ROOT_API}/datasets/export_jsonl/"
@@ -54,7 +57,6 @@ JUDGMENT_PROJECT_CREATE_API_URL = f"{ROOT_API}/projects/add/"
 JUDGMENT_TRACES_FETCH_API_URL = f"{ROOT_API}/traces/fetch/"
 JUDGMENT_TRACES_SAVE_API_URL = f"{ROOT_API}/traces/save/"
 JUDGMENT_TRACES_DELETE_API_URL = f"{ROOT_API}/traces/delete/"
-JUDGMENT_TRACES_ADD_TO_EVAL_QUEUE_API_URL = f"{ROOT_API}/traces/add_to_trace_eval_queue/"
 JUDGMENT_ADD_TO_RUN_EVAL_QUEUE_API_URL = f"{ROOT_API}/add_to_run_eval_queue/"
 # RabbitMQ
 RABBITMQ_HOST = os.getenv("RABBITMQ_HOST", "rabbitmq-networklb-faa155df16ec9085.elb.us-west-1.amazonaws.com")
judgeval/data/__init__.py
CHANGED
@@ -1,12 +1,16 @@
 from judgeval.data.example import Example, ExampleParams
+from judgeval.data.custom_example import CustomExample
 from judgeval.data.scorer_data import ScorerData, create_scorer_data
 from judgeval.data.result import ScoringResult, generate_scoring_result
+from judgeval.data.sequence import Sequence
 
 __all__ = [
     "Example",
     "ExampleParams",
+    "CustomExample",
     "ScorerData",
     "create_scorer_data",
     "ScoringResult",
     "generate_scoring_result",
+    "Sequence",
 ]
judgeval/data/custom_example.py
ADDED
@@ -0,0 +1,18 @@
+from pydantic import BaseModel, Field
+from typing import Optional, Union, List, Dict, Any
+from uuid import uuid4
+
+class CustomExample(BaseModel):
+    input: Optional[Dict[str, Any]] = None
+    actual_output: Optional[Dict[str, Any]] = None
+    expected_output: Optional[Dict[str, Any]] = None
+    context: Optional[List[str]] = None
+    retrieval_context: Optional[List[str]] = None
+    additional_metadata: Optional[Dict[str, Any]] = None
+    tools_called: Optional[List[str]] = None
+    expected_tools: Optional[List[str]] = None
+    name: Optional[str] = None
+    example_id: str = Field(default_factory=lambda: str(uuid4()))
+    example_index: Optional[int] = None
+    timestamp: Optional[str] = None
+    trace_id: Optional[str] = None
judgeval/data/datasets/eval_dataset_client.py
CHANGED
@@ -6,6 +6,7 @@ from rich.progress import Progress, SpinnerColumn, TextColumn
 from judgeval.common.logger import debug, error, warning, info
 from judgeval.constants import (
     JUDGMENT_DATASETS_PUSH_API_URL,
+    JUDGMENT_DATASETS_APPEND_API_URL,
     JUDGMENT_DATASETS_PULL_API_URL,
     JUDGMENT_DATASETS_PROJECT_STATS_API_URL,
     JUDGMENT_DATASETS_DELETE_API_URL,
@@ -70,9 +71,9 @@ class EvalDatasetClient:
                 },
                 verify=True
             )
-            if response.status_code
-                error(f"Server error during push: {
-
+            if response.status_code != 200:
+                error(f"Server error during push: {response.json()}")
+                raise Exception(f"Server error during push: {response.json()}")
             response.raise_for_status()
         except requests.exceptions.HTTPError as err:
             if response.status_code == 422:
@@ -90,6 +91,64 @@ class EvalDatasetClient:
             )
             return True
 
+
+    def append(self, alias: str, examples: List[Example], project_name: str) -> bool:
+        debug(f"Appending dataset with alias '{alias}'")
+        """
+        Appends the dataset to Judgment platform
+
+        Mock request:
+        dataset = {
+            "alias": alias,
+            "examples": [...],
+            "project_name": project_name
+        } ==>
+        {
+            "_alias": alias,
+            "_id": "..."  # ID of the dataset
+        }
+        """
+        with Progress(
+            SpinnerColumn(style="rgb(106,0,255)"),
+            TextColumn("[progress.description]{task.description}"),
+            transient=False,
+        ) as progress:
+            task_id = progress.add_task(
+                f"Appending [rgb(106,0,255)]'{alias}' to Judgment...",
+                total=100,
+            )
+            content = {
+                "dataset_alias": alias,
+                "project_name": project_name,
+                "examples": [e.to_dict() for e in examples],
+            }
+            try:
+                response = requests.post(
+                    JUDGMENT_DATASETS_APPEND_API_URL,
+                    json=content,
+                    headers={
+                        "Content-Type": "application/json",
+                        "Authorization": f"Bearer {self.judgment_api_key}",
+                        "X-Organization-Id": self.organization_id
+                    },
+                    verify=True
+                )
+                if response.status_code != 200:
+                    error(f"Server error during append: {response.json()}")
+                    raise Exception(f"Server error during append: {response.json()}")
+                response.raise_for_status()
+            except requests.exceptions.HTTPError as err:
+                if response.status_code == 422:
+                    error(f"Validation error during append: {err.response.json()}")
+                else:
+                    error(f"HTTP error during append: {err}")
+
+            progress.update(
+                task_id,
+                description=f"{progress.tasks[task_id].description} [rgb(25,227,160)]Done!)",
+            )
+            return True
+
     def pull(self, alias: str, project_name: str) -> EvalDataset:
         debug(f"Pulling dataset with alias '{alias}'")
         """
judgeval/data/example.py
CHANGED
judgeval/data/result.py
CHANGED
@@ -2,7 +2,8 @@ from dataclasses import dataclass
 from typing import List, Union, Optional, Dict, Any, Union
 from judgeval.common.logger import debug, error
 from pydantic import BaseModel
-from judgeval.data import ScorerData, Example
+from judgeval.data import ScorerData, Example, CustomExample
+from judgeval.data.sequence import Sequence
 
 
 class ScoringResult(BaseModel):
@@ -23,7 +24,7 @@ class ScoringResult(BaseModel):
     name: Optional[str] = None
 
     # The original example object that was used to create the ScoringResult
-    data_object: Optional[Example] = None
+    data_object: Optional[Union[Sequence, CustomExample, Example]] = None
     trace_id: Optional[str] = None
 
     # Additional fields for internal use
@@ -48,7 +49,7 @@ class ScoringResult(BaseModel):
 
 
 def generate_scoring_result(
-
+    data_object: Union[Example, Sequence],
     scorers_data: List[ScorerData],
     run_duration: float,
     success: bool,
@@ -59,15 +60,15 @@ def generate_scoring_result(
     When an LLMTestCase is executed, it turns into an LLMApiTestCase and the progress of the evaluation run is tracked.
     At the end of the evaluation run, we create a TestResult object out of the completed LLMApiTestCase.
     """
-    if
-        name =
+    if data_object.name is not None:
+        name = data_object.name
     else:
         name = "Test Case Placeholder"
         debug(f"No name provided for example, using default name: {name}")
     debug(f"Creating ScoringResult for: {name}")
     scoring_result = ScoringResult(
         name=name,
-        data_object=
+        data_object=data_object,
         success=success,
         scorers_data=scorers_data,
         run_duration=run_duration,
judgeval/data/sequence.py
ADDED
@@ -0,0 +1,59 @@
+from pydantic import BaseModel, Field, field_validator, model_validator
+from typing import List, Optional, Union, Any
+from judgeval.data.example import Example
+from judgeval.scorers import ScorerWrapper, JudgevalScorer
+from uuid import uuid4
+from datetime import datetime, timezone
+
+class Sequence(BaseModel):
+    """
+    A sequence is a list of either Examples or nested Sequence objects.
+    """
+    sequence_id: str = Field(default_factory=lambda: str(uuid4()))
+    name: Optional[str] = "Sequence"
+    created_at: str = Field(default_factory=lambda: datetime.now(timezone.utc).strftime("%Y%m%d_%H%M%S"))
+    items: List[Union["Sequence", Example]]
+    scorers: Optional[Any] = None
+    parent_sequence_id: Optional[str] = None
+    sequence_order: Optional[int] = 0
+
+    @field_validator("scorers")
+    def validate_scorer(cls, v):
+        loaded_scorers = []
+        for scorer in v or []:
+            try:
+                if isinstance(scorer, ScorerWrapper):
+                    loaded_scorers.append(scorer.load_implementation())
+                else:
+                    loaded_scorers.append(scorer)
+            except Exception as e:
+                raise ValueError(f"Failed to load implementation for scorer {scorer}: {str(e)}")
+        return loaded_scorers
+
+    @model_validator(mode='after')
+    def set_parent_sequence_ids(self) -> "Sequence":
+        """Recursively set the parent_sequence_id for all nested Sequences."""
+        for item in self.items:
+            if isinstance(item, Sequence):
+                item.parent_sequence_id = self.sequence_id
+                # Recurse into deeper nested sequences
+                item.set_parent_sequence_ids()
+        return self
+
+    @model_validator(mode='after')
+    def set_parent_and_order(self) -> "Sequence":
+        """Set parent_sequence_id and sequence_order for all items."""
+        for idx, item in enumerate(self.items):
+            # Set sequence_order for both Example and Sequence objects
+            item.sequence_order = idx
+
+            if isinstance(item, Sequence):
+                item.parent_sequence_id = self.sequence_id
+                item.set_parent_and_order()  # Recurse for nested sequences
+        return self
+
+    class Config:
+        arbitrary_types_allowed = True
+
+# Update forward references so that "Sequence" inside items is resolved.
+Sequence.model_rebuild()
judgeval/data/sequence_run.py
ADDED
@@ -0,0 +1,42 @@
+
+from pydantic import BaseModel
+from typing import List, Optional, Dict, Any, Union
+from judgeval.data import Sequence
+from judgeval.scorers import APIJudgmentScorer, JudgevalScorer
+from judgeval.judges import JudgevalJudge
+from judgeval.rules import Rule
+
+
+class SequenceRun(BaseModel):
+    """
+    Stores example and evaluation scorers together for running an eval task
+
+    Args:
+        project_name (str): The name of the project the evaluation results belong to
+        eval_name (str): A name for this evaluation run
+        sequences (List[Sequence]): The sequences to evaluate
+        scorers (List[Union[JudgmentScorer, JudgevalScorer]]): A list of scorers to use for evaluation
+        model (str): The model used as a judge when using LLM as a Judge
+        aggregator (Optional[str]): The aggregator to use for evaluation if using Mixture of Judges
+        metadata (Optional[Dict[str, Any]]): Additional metadata to include for this evaluation run, e.g. comments, dataset name, purpose, etc.
+        judgment_api_key (Optional[str]): The API key for running evaluations on the Judgment API
+        rules (Optional[List[Rule]]): Rules to evaluate against scoring results
+    """
+
+    # The user will specify whether they want log_results when they call run_eval
+    log_results: bool = False  # NOTE: log_results has to be set first because it is used to validate project_name and eval_name
+    organization_id: Optional[str] = None
+    project_name: Optional[str] = None
+    eval_name: Optional[str] = None
+    sequences: List[Sequence]
+    model: Union[str, List[str], JudgevalJudge]
+    aggregator: Optional[str] = None
+    metadata: Optional[Dict[str, Any]] = None
+    trace_span_id: Optional[str] = None
+    # API Key will be "" until user calls client.run_eval(), then API Key will be set
+    judgment_api_key: Optional[str] = ""
+    override: Optional[bool] = False
+    rules: Optional[List[Rule]] = None
+
+    class Config:
+        arbitrary_types_allowed = True
judgeval/evaluation_run.py
CHANGED
@@ -1,7 +1,7 @@
 from typing import List, Optional, Dict, Any, Union
 from pydantic import BaseModel, field_validator
 
-from judgeval.data import Example
+from judgeval.data import Example, CustomExample
 from judgeval.scorers import JudgevalScorer, APIJudgmentScorer
 from judgeval.constants import ACCEPTABLE_MODELS
 from judgeval.common.logger import debug, error
@@ -15,7 +15,7 @@ class EvaluationRun(BaseModel):
     Args:
         project_name (str): The name of the project the evaluation results belong to
         eval_name (str): A name for this evaluation run
-        examples (List[Example]): The examples to evaluate
+        examples (Union[List[Example], List[CustomExample]]): The examples to evaluate
         scorers (List[Union[JudgmentScorer, JudgevalScorer]]): A list of scorers to use for evaluation
         model (str): The model used as a judge when using LLM as a Judge
         aggregator (Optional[str]): The aggregator to use for evaluation if using Mixture of Judges
@@ -29,7 +29,7 @@ class EvaluationRun(BaseModel):
     organization_id: Optional[str] = None
     project_name: Optional[str] = None
     eval_name: Optional[str] = None
-    examples: List[Example]
+    examples: Union[List[Example], List[CustomExample]]
     scorers: List[Union[APIJudgmentScorer, JudgevalScorer]]
     model: Union[str, List[str], JudgevalJudge]
     aggregator: Optional[str] = None
@@ -38,6 +38,7 @@ class EvaluationRun(BaseModel):
     # API Key will be "" until user calls client.run_eval(), then API Key will be set
     judgment_api_key: Optional[str] = ""
     override: Optional[bool] = False
+    append: Optional[bool] = False
     rules: Optional[List[Rule]] = None
 
     def model_dump(self, **kwargs):
@@ -78,13 +79,17 @@ class EvaluationRun(BaseModel):
             raise ValueError("Eval name is required when log_results is True. Please include the eval_run_name argument.")
         return v
 
-    @field_validator('examples')
+    @field_validator('examples', mode='before')
     def validate_examples(cls, v):
         if not v:
             raise ValueError("Examples cannot be empty.")
-
-
-
+
+        first_type = type(v[0])
+        if first_type not in (Example, CustomExample):
+            raise ValueError(f"Invalid type for Example/CustomExample: {first_type}")
+        if not all(isinstance(ex, first_type) for ex in v):
+            raise ValueError("All examples must be of the same type, either all Example or all CustomExample.")
+
         return v
 
     @field_validator('scorers')