judgeval 0.0.25__py3-none-any.whl → 0.0.27__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- judgeval/common/tracer.py +528 -166
- judgeval/constants.py +7 -4
- judgeval/data/__init__.py +0 -3
- judgeval/data/datasets/dataset.py +42 -19
- judgeval/data/datasets/eval_dataset_client.py +59 -20
- judgeval/data/result.py +34 -56
- judgeval/integrations/langgraph.py +16 -12
- judgeval/judgment_client.py +85 -23
- judgeval/rules.py +177 -60
- judgeval/run_evaluation.py +143 -122
- judgeval/scorers/score.py +21 -18
- judgeval/utils/alerts.py +32 -1
- {judgeval-0.0.25.dist-info → judgeval-0.0.27.dist-info}/METADATA +1 -1
- {judgeval-0.0.25.dist-info → judgeval-0.0.27.dist-info}/RECORD +16 -17
- judgeval/data/api_example.py +0 -98
- {judgeval-0.0.25.dist-info → judgeval-0.0.27.dist-info}/WHEEL +0 -0
- {judgeval-0.0.25.dist-info → judgeval-0.0.27.dist-info}/licenses/LICENSE.md +0 -0
judgeval/constants.py
CHANGED
@@ -41,18 +41,21 @@ ROOT_API = os.getenv("JUDGMENT_API_URL", "https://api.judgmentlabs.ai")
 JUDGMENT_EVAL_API_URL = f"{ROOT_API}/evaluate/"
 JUDGMENT_DATASETS_PUSH_API_URL = f"{ROOT_API}/datasets/push/"
 JUDGMENT_DATASETS_PULL_API_URL = f"{ROOT_API}/datasets/pull/"
+JUDGMENT_DATASETS_DELETE_API_URL = f"{ROOT_API}/datasets/delete/"
 JUDGMENT_DATASETS_EXPORT_JSONL_API_URL = f"{ROOT_API}/datasets/export_jsonl/"
-
-
+JUDGMENT_DATASETS_PROJECT_STATS_API_URL = f"{ROOT_API}/datasets/fetch_stats_by_project/"
+JUDGMENT_DATASETS_INSERT_API_URL = f"{ROOT_API}/datasets/insert_examples/"
 JUDGMENT_EVAL_LOG_API_URL = f"{ROOT_API}/log_eval_results/"
 JUDGMENT_EVAL_FETCH_API_URL = f"{ROOT_API}/fetch_eval_results/"
-JUDGMENT_EVAL_DELETE_API_URL = f"{ROOT_API}/
+JUDGMENT_EVAL_DELETE_API_URL = f"{ROOT_API}/delete_eval_results_by_project_and_run_names/"
 JUDGMENT_EVAL_DELETE_PROJECT_API_URL = f"{ROOT_API}/delete_eval_results_by_project/"
 JUDGMENT_PROJECT_DELETE_API_URL = f"{ROOT_API}/projects/delete/"
+JUDGMENT_PROJECT_CREATE_API_URL = f"{ROOT_API}/projects/add/"
 JUDGMENT_TRACES_FETCH_API_URL = f"{ROOT_API}/traces/fetch/"
 JUDGMENT_TRACES_SAVE_API_URL = f"{ROOT_API}/traces/save/"
 JUDGMENT_TRACES_DELETE_API_URL = f"{ROOT_API}/traces/delete/"
-JUDGMENT_TRACES_ADD_TO_EVAL_QUEUE_API_URL = f"{ROOT_API}/traces/
+JUDGMENT_TRACES_ADD_TO_EVAL_QUEUE_API_URL = f"{ROOT_API}/traces/add_to_trace_eval_queue/"
+JUDGMENT_ADD_TO_RUN_EVAL_QUEUE_API_URL = f"{ROOT_API}/add_to_run_eval_queue/"
 # RabbitMQ
 RABBITMQ_HOST = os.getenv("RABBITMQ_HOST", "rabbitmq-networklb-faa155df16ec9085.elb.us-west-1.amazonaws.com")
 RABBITMQ_PORT = os.getenv("RABBITMQ_PORT", 5672)
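The hunk above adds several dataset and project endpoint constants. A minimal sketch of how they resolve, assuming `JUDGMENT_API_URL` is unset so `ROOT_API` falls back to the default shown above; the paths are copied from the added lines:

```python
import os

# Assumption: JUDGMENT_API_URL is not set, so ROOT_API falls back to the default in the diff.
ROOT_API = os.getenv("JUDGMENT_API_URL", "https://api.judgmentlabs.ai")

# New in 0.0.27 (paths taken from the added lines above)
JUDGMENT_DATASETS_PROJECT_STATS_API_URL = f"{ROOT_API}/datasets/fetch_stats_by_project/"
JUDGMENT_DATASETS_INSERT_API_URL = f"{ROOT_API}/datasets/insert_examples/"
JUDGMENT_PROJECT_CREATE_API_URL = f"{ROOT_API}/projects/add/"
JUDGMENT_ADD_TO_RUN_EVAL_QUEUE_API_URL = f"{ROOT_API}/add_to_run_eval_queue/"

print(JUDGMENT_DATASETS_INSERT_API_URL)
# -> https://api.judgmentlabs.ai/datasets/insert_examples/
```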
judgeval/data/__init__.py
CHANGED
@@ -1,13 +1,10 @@
 from judgeval.data.example import Example, ExampleParams
-from judgeval.data.api_example import ProcessExample, create_process_example
 from judgeval.data.scorer_data import ScorerData, create_scorer_data
 from judgeval.data.result import ScoringResult, generate_scoring_result
 
 __all__ = [
     "Example",
     "ExampleParams",
-    "ProcessExample",
-    "create_process_example",
     "ScorerData",
     "create_scorer_data",
     "ScoringResult",
judgeval/data/datasets/dataset.py
CHANGED
@@ -90,9 +90,18 @@ class EvalDataset:
     def add_from_csv(
         self,
         file_path: str,
+        header_mapping: dict,
+        primary_delimiter: str = ",",
+        secondary_delimiter: str = ";"
     ) -> None:
         """
         Add Examples from a CSV file.
+
+        Args:
+            file_path (str): Path to the CSV file
+            header_mapping (dict): Dictionary mapping Example headers to custom headers
+            primary_delimiter (str, optional): Main delimiter used in CSV file. Defaults to ","
+            secondary_delimiter (str, optional): Secondary delimiter for list fields. Defaults to ";"
         """
         try:
             import pandas as pd
@@ -102,9 +111,10 @@ class EvalDataset:
             )
 
         # Pandas naturally reads numbers in data files as ints, not strings (can lead to unexpected behavior)
-        df = pd.read_csv(file_path, dtype={'trace_id': str})
+        df = pd.read_csv(file_path, dtype={'trace_id': str}, sep=primary_delimiter)
         """
-
+        The user should pass in a dict mapping from Judgment Example headers to their custom defined headers.
+        Available headers for Example objects are as follows:
 
         "input", "actual_output", "expected_output", "context", \
         "retrieval_context", "additional_metadata", "tools_called", \
@@ -113,35 +123,48 @@ class EvalDataset:
 
         We want to collect the examples separately which can
         be determined by the "example" column. If the value is True, then it is an
-        example
+        example, and we expect the `input` and `actual_output` fields to be non-null.
 
-        We also assume that if there are multiple retrieval contexts or
-        This can be adjusted using the `
+        We also assume that if there are multiple retrieval contexts, contexts, or tools called, they are separated by semicolons.
+        This can be adjusted using the `secondary_delimiter` parameter.
         """
         examples = []
-
+
+        def process_csv_row(value, header):
+            """
+            Maps a singular value in the CSV file to the appropriate type based on the header.
+            If value exists and can be split into type List[*], we will split upon the user's provided secondary delimiter.
+            """
+            # check that the CSV value is not null for entry
+            null_replacement = dict() if header == 'additional_metadata' else None
+            if pd.isna(value) or value == '':
+                return null_replacement
+            try:
+                value = ast.literal_eval(value) if header == 'additional_metadata' else str(value)
+            except (ValueError, SyntaxError):
+                value = str(value)
+            if header in ["context", "retrieval_context", "tools_called", "expected_tools"]:
+                # attempt to split the value by the secondary delimiter
+                value = value.split(secondary_delimiter)
+
+            return value
+
         for _, row in df.iterrows():
             data = {
-
-
-
-
-                "retrieval_context": row["retrieval_context"].split(";") if pd.notna(row["retrieval_context"]) else [],
-                "additional_metadata": ast.literal_eval(row["additional_metadata"]) if pd.notna(row["additional_metadata"]) else dict(),
-                "tools_called": row["tools_called"].split(";") if pd.notna(row["tools_called"]) else [],
-                "expected_tools": row["expected_tools"].split(";") if pd.notna(row["expected_tools"]) else [],
-                "trace_id": row["trace_id"] if pd.notna(row["trace_id"]) else None,
-                "example_id": str(row["example_id"]) if pd.notna(row["example_id"]) else None
+                header: process_csv_row(
+                    row[header_mapping[header]], header
+                )
+                for header in header_mapping
             }
-            if row["example"]:
-
+            if "example" in header_mapping and row[header_mapping["example"]]:
+                if "name" in header_mapping:
+                    data["name"] = row[header_mapping["name"]] if pd.notna(row[header_mapping["name"]]) else None
                 # every Example has `input` and `actual_output` fields
                 if data["input"] is not None and data["actual_output"] is not None:
                     e = Example(**data)
                     examples.append(e)
                 else:
                     raise ValueError("Every example must have an 'input' and 'actual_output' field.")
-
 
         for e in examples:
             self.add_example(e)
judgeval/data/datasets/eval_dataset_client.py
CHANGED
@@ -7,8 +7,9 @@ from judgeval.common.logger import debug, error, warning, info
 from judgeval.constants import (
     JUDGMENT_DATASETS_PUSH_API_URL,
     JUDGMENT_DATASETS_PULL_API_URL,
-
-
+    JUDGMENT_DATASETS_PROJECT_STATS_API_URL,
+    JUDGMENT_DATASETS_DELETE_API_URL,
+    JUDGMENT_DATASETS_INSERT_API_URL,
     JUDGMENT_DATASETS_EXPORT_JSONL_API_URL
 )
 from judgeval.data import Example
@@ -25,7 +26,7 @@ class EvalDatasetClient:
     def create_dataset(self) -> EvalDataset:
         return EvalDataset(judgment_api_key=self.judgment_api_key)
 
-    def push(self, dataset: EvalDataset, alias: str, overwrite: Optional[bool] = False) -> bool:
+    def push(self, dataset: EvalDataset, alias: str, project_name: str, overwrite: Optional[bool] = False) -> bool:
         debug(f"Pushing dataset with alias '{alias}' (overwrite={overwrite})")
         if overwrite:
             warning(f"Overwrite enabled for alias '{alias}'")
@@ -53,7 +54,8 @@ class EvalDatasetClient:
                 total=100,
             )
             content = {
-                "
+                "dataset_alias": alias,
+                "project_name": project_name,
                 "examples": [e.to_dict() for e in dataset.examples],
                 "overwrite": overwrite,
             }
@@ -88,7 +90,7 @@ class EvalDatasetClient:
             )
             return True
 
-    def pull(self, alias: str) -> EvalDataset:
+    def pull(self, alias: str, project_name: str) -> EvalDataset:
         debug(f"Pulling dataset with alias '{alias}'")
         """
         Pulls the dataset from Judgment platform
@@ -96,7 +98,7 @@ class EvalDatasetClient:
         Mock request:
         {
             "alias": alias,
-            "
+            "project_name": project_name
         }
         ==>
         {
@@ -118,7 +120,8 @@ class EvalDatasetClient:
                 total=100,
             )
             request_body = {
-                "
+                "dataset_alias": alias,
+                "project_name": project_name
             }
 
             try:
@@ -139,24 +142,58 @@ class EvalDatasetClient:
 
             info(f"Successfully pulled dataset with alias '{alias}'")
             payload = response.json()
+
             dataset.examples = [Example(**e) for e in payload.get("examples", [])]
-            dataset._alias = payload.get("
-            dataset._id = payload.get("
+            dataset._alias = payload.get("alias")
+            dataset._id = payload.get("id")
             progress.update(
                 task_id,
                 description=f"{progress.tasks[task_id].description} [rgb(25,227,160)]Done!)",
             )
 
             return dataset
+
+    def delete(self, alias: str, project_name: str) -> bool:
+        with Progress(
+            SpinnerColumn(style="rgb(106,0,255)"),
+            TextColumn("[progress.description]{task.description}"),
+            transient=False,
+        ) as progress:
+            task_id = progress.add_task(
+                f"Deleting [rgb(106,0,255)]'{alias}'[/rgb(106,0,255)] from Judgment...",
+                total=100,
+            )
+            request_body = {
+                "dataset_alias": alias,
+                "project_name": project_name
+            }
 
-
-
+            try:
+                response = requests.post(
+                    JUDGMENT_DATASETS_DELETE_API_URL,
+                    json=request_body,
+                    headers={
+                        "Content-Type": "application/json",
+                        "Authorization": f"Bearer {self.judgment_api_key}",
+                        "X-Organization-Id": self.organization_id
+                    },
+                    verify=True
+                )
+                response.raise_for_status()
+            except requests.exceptions.RequestException as e:
+                error(f"Error deleting dataset: {str(e)}")
+                raise
+
+            return True
+
+    def pull_project_dataset_stats(self, project_name: str) -> dict:
+        debug(f"Pulling project datasets stats for project_name: {project_name}'")
         """
-        Pulls the
+        Pulls the project datasets stats from Judgment platform
 
         Mock request:
         {
-            "
+            "project_name": project_name
         }
         ==>
         {
@@ -177,11 +214,12 @@ class EvalDatasetClient:
                 total=100,
             )
             request_body = {
+                "project_name": project_name
             }
 
             try:
                 response = requests.post(
-
+                    JUDGMENT_DATASETS_PROJECT_STATS_API_URL,
                     json=request_body,
                     headers={
                         "Content-Type": "application/json",
@@ -205,7 +243,7 @@ class EvalDatasetClient:
 
             return payload
 
-    def
+    def insert_dataset(self, alias: str, examples: List[Example], project_name: str) -> bool:
         """
         Edits the dataset on Judgment platform by adding new examples
 
@@ -213,7 +251,7 @@ class EvalDatasetClient:
         {
             "alias": alias,
             "examples": [...],
-            "
+            "project_name": project_name
         }
         """
         with Progress(
@@ -227,13 +265,14 @@ class EvalDatasetClient:
             )
 
             content = {
-                "
+                "dataset_alias": alias,
                 "examples": [e.to_dict() for e in examples],
+                "project_name": project_name
             }
 
             try:
                 response = requests.post(
-
+                    JUDGMENT_DATASETS_INSERT_API_URL,
                     json=content,
                     headers={
                         "Content-Type": "application/json",
@@ -250,7 +289,7 @@ class EvalDatasetClient:
             info(f"Successfully edited dataset '{alias}'")
             return True
 
-    def export_jsonl(self, alias: str) -> requests.Response:
+    def export_jsonl(self, alias: str, project_name: str) -> requests.Response:
         """Export dataset in JSONL format from Judgment platform"""
         debug(f"Exporting dataset with alias '{alias}' as JSONL")
         with Progress(
@@ -265,7 +304,7 @@ class EvalDatasetClient:
             try:
                 response = requests.post(
                     JUDGMENT_DATASETS_EXPORT_JSONL_API_URL,
-                    json={"alias":
+                    json={"dataset_alias": alias, "project_name": project_name},
                     headers={
                         "Content-Type": "application/json",
                         "Authorization": f"Bearer {self.judgment_api_key}",
judgeval/data/result.py
CHANGED
@@ -1,10 +1,11 @@
 from dataclasses import dataclass
 from typing import List, Union, Optional, Dict, Any, Union
+from judgeval.common.logger import debug, error
+from pydantic import BaseModel
+from judgeval.data import ScorerData, Example
 
-from judgeval.data import ScorerData, ProcessExample
 
-
-class ScoringResult:
+class ScoringResult(BaseModel):
     """
     A ScoringResult contains the output of one or more scorers applied to a single example.
     Ie: One input, one actual_output, one expected_output, etc..., and 1+ scorer (Faithfulness, Hallucination, Summarization, etc...)
@@ -13,69 +14,44 @@ class ScoringResult:
         success (bool): Whether the evaluation was successful.
             This means that all scorers applied to this example returned a success.
         scorer_data (List[ScorerData]): The scorers data for the evaluated example
-
-        actual_output (Optional[str]): The actual output of the example
-        expected_output (Optional[str]): The expected output of the example
-        context (Optional[List[str]]): The context of the example
-        retrieval_context (Optional[List[str]]): The retrieval context of the example
-        additional_metadata (Optional[Dict[str, Any]]): The additional metadata of the example
-        tools_called (Optional[List[str]]): The tools called by the example
-        expected_tools (Optional[List[str]]): The expected tools of the example
-        trace_id (Optional[str]): The trace id of the example
+        data_object (Optional[Example]): The original example object that was used to create the ScoringResult, can be Example, CustomExample (future), WorkflowRun (future)
 
     """
     # Fields for scoring outputs
     success: bool # used for unit testing
     scorers_data: Union[List[ScorerData], None]
+    name: Optional[str] = None
 
-    #
-
-    actual_output: Optional[Union[str, List[str]]] = None
-    expected_output: Optional[Union[str, List[str]]] = None
-    context: Optional[List[str]] = None
-    retrieval_context: Optional[List[str]] = None
-    additional_metadata: Optional[Dict[str, Any]] = None
-    tools_called: Optional[List[str]] = None
-    expected_tools: Optional[List[str]] = None
+    # The original example object that was used to create the ScoringResult
+    data_object: Optional[Example] = None #can be Example, CustomExample (future), WorkflowRun (future)
     trace_id: Optional[str] = None
 
-
-
+    # Additional fields for internal use
+    run_duration: Optional[float] = None
+    evaluation_cost: Optional[float] = None
 
     def to_dict(self) -> dict:
         """Convert the ScoringResult instance to a dictionary, properly serializing scorer_data."""
         return {
             "success": self.success,
             "scorers_data": [scorer_data.to_dict() for scorer_data in self.scorers_data] if self.scorers_data else None,
-            "
-            "actual_output": self.actual_output,
-            "expected_output": self.expected_output,
-            "context": self.context,
-            "retrieval_context": self.retrieval_context,
-            "additional_metadata": self.additional_metadata,
-            "tools_called": self.tools_called,
-            "expected_tools": self.expected_tools,
-            "trace_id": self.trace_id,
-            "example_id": self.example_id
+            "data_object": self.data_object.to_dict() if self.data_object else None,
         }
-
+
 
     def __str__(self) -> str:
         return f"ScoringResult(\
             success={self.success}, \
             scorer_data={self.scorers_data}, \
-
-
-
-            context={self.context}, \
-            retrieval_context={self.retrieval_context}, \
-            additional_metadata={self.additional_metadata}, \
-            tools_called={self.tools_called}, \
-            expected_tools={self.expected_tools}, \
-            trace_id={self.trace_id})"
+            data_object={self.data_object}, \
+            run_duration={self.run_duration}, \
+            evaluation_cost={self.evaluation_cost})"
 
 
 def generate_scoring_result(
-
+    example: Example,
+    success: bool,
+    scorers_data: List[ScorerData],
+    run_duration: float,
 ) -> ScoringResult:
     """
     Creates a final ScoringResult object for an evaluation run based on the results from a completed LLMApiTestCase.
@@ -83,16 +59,18 @@ def generate_scoring_result(
     When an LLMTestCase is executed, it turns into an LLMApiTestCase and the progress of the evaluation run is tracked.
     At the end of the evaluation run, we create a TestResult object out of the completed LLMApiTestCase.
     """
-
-
-
-
-
-
-
-
-
-
-
-
+    if example.name is not None:
+        name = example.name
+    else:
+        name = "Test Case Placeholder"
+        debug(f"No name provided for example, using default name: {name}")
+    debug(f"Creating ScoringResult for: {name}")
+    scoring_result = ScoringResult(
+        name=name,
+        data_object=example,
+        success=success,
+        scorers_data=scorers_data,
+        run_duration=run_duration,
+        evaluation_cost=None,
+    )
     )
+    return scoring_result
judgeval/integrations/langgraph.py
CHANGED
@@ -146,16 +146,17 @@ class JudgevalCallbackHandler(BaseCallbackHandler):
 
             self.start_span("LangGraph", span_type="Main Function")
 
-
-        if node
-            self.
-
-
-
-
-
-
+        metadata = kwargs.get("metadata", {})
+        if node := metadata.get("langgraph_node"):
+            if node != self.previous_node:
+                # Track node execution
+                self.trace_client.visited_nodes.append(node)
+                self.trace_client.executed_node_tools.append(node)
+            self.trace_client.record_input({
+                'args': inputs,
+                'kwargs': kwargs
+            })
+            self.previous_node = node
 
     def on_chain_end(
         self,
@@ -198,8 +199,11 @@ class JudgevalCallbackHandler(BaseCallbackHandler):
     ):
         name = serialized["name"]
         self.start_span(name, span_type="tool")
-
-
+        if name:
+            # Track tool execution
+            self.trace_client.executed_tools.append(name)
+            node_tool = f"{self.previous_node}:{name}" if self.previous_node else name
+            self.trace_client.executed_node_tools.append(node_tool)
         self.trace_client.record_input({
             'args': input_str,
             'kwargs': kwargs