judgeval 0.0.44__py3-none-any.whl → 0.0.46__py3-none-any.whl
This diff shows the changes between two publicly released versions of the package, as published to a supported public registry. It is provided for informational purposes only.
- judgeval/__init__.py +5 -4
- judgeval/clients.py +6 -6
- judgeval/common/__init__.py +7 -2
- judgeval/common/exceptions.py +2 -3
- judgeval/common/logger.py +74 -49
- judgeval/common/s3_storage.py +30 -23
- judgeval/common/tracer.py +1273 -939
- judgeval/common/utils.py +416 -244
- judgeval/constants.py +73 -61
- judgeval/data/__init__.py +1 -1
- judgeval/data/custom_example.py +3 -2
- judgeval/data/datasets/dataset.py +80 -54
- judgeval/data/datasets/eval_dataset_client.py +131 -181
- judgeval/data/example.py +67 -43
- judgeval/data/result.py +11 -9
- judgeval/data/scorer_data.py +4 -2
- judgeval/data/tool.py +25 -16
- judgeval/data/trace.py +57 -29
- judgeval/data/trace_run.py +5 -11
- judgeval/evaluation_run.py +22 -82
- judgeval/integrations/langgraph.py +546 -184
- judgeval/judges/base_judge.py +1 -2
- judgeval/judges/litellm_judge.py +33 -11
- judgeval/judges/mixture_of_judges.py +128 -78
- judgeval/judges/together_judge.py +22 -9
- judgeval/judges/utils.py +14 -5
- judgeval/judgment_client.py +259 -271
- judgeval/rules.py +169 -142
- judgeval/run_evaluation.py +462 -305
- judgeval/scorers/api_scorer.py +20 -11
- judgeval/scorers/exceptions.py +1 -0
- judgeval/scorers/judgeval_scorer.py +77 -58
- judgeval/scorers/judgeval_scorers/api_scorers/__init__.py +46 -15
- judgeval/scorers/judgeval_scorers/api_scorers/answer_correctness.py +3 -2
- judgeval/scorers/judgeval_scorers/api_scorers/answer_relevancy.py +3 -2
- judgeval/scorers/judgeval_scorers/api_scorers/classifier_scorer.py +12 -11
- judgeval/scorers/judgeval_scorers/api_scorers/comparison.py +7 -5
- judgeval/scorers/judgeval_scorers/api_scorers/contextual_precision.py +3 -2
- judgeval/scorers/judgeval_scorers/api_scorers/contextual_recall.py +3 -2
- judgeval/scorers/judgeval_scorers/api_scorers/contextual_relevancy.py +5 -2
- judgeval/scorers/judgeval_scorers/api_scorers/derailment_scorer.py +2 -1
- judgeval/scorers/judgeval_scorers/api_scorers/execution_order.py +17 -8
- judgeval/scorers/judgeval_scorers/api_scorers/faithfulness.py +3 -2
- judgeval/scorers/judgeval_scorers/api_scorers/groundedness.py +3 -2
- judgeval/scorers/judgeval_scorers/api_scorers/hallucination.py +3 -2
- judgeval/scorers/judgeval_scorers/api_scorers/instruction_adherence.py +3 -2
- judgeval/scorers/judgeval_scorers/api_scorers/json_correctness.py +8 -9
- judgeval/scorers/judgeval_scorers/api_scorers/summarization.py +4 -4
- judgeval/scorers/judgeval_scorers/api_scorers/tool_dependency.py +5 -5
- judgeval/scorers/judgeval_scorers/api_scorers/tool_order.py +5 -2
- judgeval/scorers/judgeval_scorers/classifiers/text2sql/text2sql_scorer.py +9 -10
- judgeval/scorers/prompt_scorer.py +48 -37
- judgeval/scorers/score.py +86 -53
- judgeval/scorers/utils.py +11 -7
- judgeval/tracer/__init__.py +1 -1
- judgeval/utils/alerts.py +23 -12
- judgeval/utils/{data_utils.py → file_utils.py} +5 -9
- judgeval/utils/requests.py +29 -0
- judgeval/version_check.py +5 -2
- {judgeval-0.0.44.dist-info → judgeval-0.0.46.dist-info}/METADATA +79 -135
- judgeval-0.0.46.dist-info/RECORD +69 -0
- judgeval-0.0.44.dist-info/RECORD +0 -68
- {judgeval-0.0.44.dist-info → judgeval-0.0.46.dist-info}/WHEEL +0 -0
- {judgeval-0.0.44.dist-info → judgeval-0.0.46.dist-info}/licenses/LICENSE.md +0 -0
judgeval/run_evaluation.py
CHANGED
```diff
@@ -1,26 +1,17 @@
 import asyncio
-import
+import concurrent.futures
+from requests import exceptions
+from judgeval.utils.requests import requests
 import time
 import json
 import sys
 import itertools
 import threading
 from typing import List, Dict, Any, Union, Optional, Callable
-from datetime import datetime
 from rich import print as rprint
 
-from judgeval.data import
-
-    ScoringResult,
-    Example,
-    CustomExample,
-    Trace
-)
-from judgeval.scorers import (
-    JudgevalScorer,
-    APIJudgmentScorer,
-    ClassifierScorer
-)
+from judgeval.data import ScorerData, ScoringResult, Example, Trace
+from judgeval.scorers import JudgevalScorer, APIJudgmentScorer, ClassifierScorer
 from judgeval.scorers.score import a_execute_scoring
 from judgeval.constants import (
     ROOT_API,
@@ -30,21 +21,39 @@ from judgeval.constants import (
     MAX_CONCURRENT_EVALUATIONS,
     JUDGMENT_ADD_TO_RUN_EVAL_QUEUE_API_URL,
     JUDGMENT_GET_EVAL_STATUS_API_URL,
-    JUDGMENT_EVAL_FETCH_API_URL
+    JUDGMENT_EVAL_FETCH_API_URL,
 )
 from judgeval.common.exceptions import JudgmentAPIError
-from judgeval.common.logger import
-    debug,
-    info,
-    error,
-    warning,
-    example_logging_context
-)
+from judgeval.common.logger import debug, info, error, warning, example_logging_context
 from judgeval.evaluation_run import EvaluationRun
 from judgeval.data.trace_run import TraceRun
 from judgeval.common.tracer import Tracer
 from langchain_core.callbacks import BaseCallbackHandler
 
+
+def safe_run_async(coro):
+    """
+    Safely run an async coroutine whether or not there's already an event loop running.
+
+    Args:
+        coro: The coroutine to run
+
+    Returns:
+        The result of the coroutine
+    """
+    try:
+        # Try to get the running loop
+        asyncio.get_running_loop()
+        # If we get here, there's already a loop running
+        # Run in a separate thread to avoid "asyncio.run() cannot be called from a running event loop"
+        with concurrent.futures.ThreadPoolExecutor() as executor:
+            future = executor.submit(asyncio.run, coro)
+            return future.result()
+    except RuntimeError:
+        # No event loop is running, safe to use asyncio.run()
+        return asyncio.run(coro)
+
+
 def send_to_rabbitmq(evaluation_run: EvaluationRun) -> None:
     """
     Sends an evaluation run to the RabbitMQ evaluation queue.
@@ -55,14 +64,15 @@ def send_to_rabbitmq(evaluation_run: EvaluationRun) -> None:
         headers={
             "Content-Type": "application/json",
             "Authorization": f"Bearer {evaluation_run.judgment_api_key}",
-            "X-Organization-Id": evaluation_run.organization_id
-        },
+            "X-Organization-Id": evaluation_run.organization_id,
+        },
         json=payload,
-        verify=True
+        verify=True,
     )
     return response.json()
 
-def execute_api_eval(evaluation_run: EvaluationRun) -> List[Dict]:
+
+def execute_api_eval(evaluation_run: EvaluationRun) -> Dict:
     """
     Executes an evaluation of a list of `Example`s using one or more `JudgmentScorer`s via the Judgment API.
 
@@ -71,67 +81,75 @@ def execute_api_eval(evaluation_run: EvaluationRun) -> List[Dict]:
 
     Returns:
         List[Dict]: The results of the evaluation. Each result is a dictionary containing the fields of a `ScoringResult`
-            object.
+            object.
     """
-
+
     try:
         # submit API request to execute evals
         payload = evaluation_run.model_dump(warnings=False)
         response = requests.post(
-            JUDGMENT_EVAL_API_URL,
+            JUDGMENT_EVAL_API_URL,
            headers={
                 "Content-Type": "application/json",
                 "Authorization": f"Bearer {evaluation_run.judgment_api_key}",
-                "X-Organization-Id": evaluation_run.organization_id
-            },
+                "X-Organization-Id": evaluation_run.organization_id,
+            },
            json=payload,
-            verify=True
+            verify=True,
        )
        response_data = response.json()
    except Exception as e:
        error(f"Error: {e}")
        details = response.json().get("detail", "No details provided")
-        raise JudgmentAPIError(
+        raise JudgmentAPIError(
+            "An error occurred while executing the Judgment API request: " + details
+        )
    # Check if the response status code is not 2XX
    # Add check for the duplicate eval run name
    if not response.ok:
-        error_message = response_data.get(
+        error_message = response_data.get("detail", "An unknown error occurred.")
        error(f"Error: {error_message=}")
        raise JudgmentAPIError(error_message)
    return response_data
 
+
 def execute_api_trace_eval(trace_run: TraceRun) -> Dict:
    """
    Executes an evaluation of a list of `Trace`s using one or more `JudgmentScorer`s via the Judgment API.
    """
-
+
    try:
        # submit API request to execute evals
        payload = trace_run.model_dump(warnings=False)
        response = requests.post(
-            JUDGMENT_TRACE_EVAL_API_URL,
+            JUDGMENT_TRACE_EVAL_API_URL,
            headers={
                "Content-Type": "application/json",
                "Authorization": f"Bearer {trace_run.judgment_api_key}",
-                "X-Organization-Id": trace_run.organization_id
-            },
+                "X-Organization-Id": trace_run.organization_id,
+            },
            json=payload,
-            verify=True
+            verify=True,
        )
        response_data = response.json()
    except Exception as e:
        error(f"Error: {e}")
        details = response.json().get("detail", "No details provided")
-        raise JudgmentAPIError(
+        raise JudgmentAPIError(
+            "An error occurred while executing the Judgment API request: " + details
+        )
    # Check if the response status code is not 2XX
    # Add check for the duplicate eval run name
    if not response.ok:
-        error_message = response_data.get(
+        error_message = response_data.get("detail", "An unknown error occurred.")
        error(f"Error: {error_message=}")
        raise JudgmentAPIError(error_message)
    return response_data
 
-
+
+def merge_results(
+    api_results: List[ScoringResult], local_results: List[ScoringResult]
+) -> List[ScoringResult]:
    """
    When executing scorers that come from both the Judgment API and local scorers, we're left with
    results for each type of scorer. This function merges the results from the API and local evaluations,
@@ -152,32 +170,52 @@ def merge_results(api_results: List[ScoringResult], local_results: List[ScoringR
 
    if len(api_results) != len(local_results):
        # Results should be of same length because each ScoringResult is a 1-1 mapping to an Example
-        raise ValueError(
-
+        raise ValueError(
+            f"The number of API and local results do not match: {len(api_results)} vs {len(local_results)}"
+        )
+
    # Create a copy of api_results to avoid modifying the input
    merged_results = [result.model_copy() for result in api_results]
-
+
    # Each ScoringResult in api and local have all the same fields besides `scorers_data`
    for merged_result, local_result in zip(merged_results, local_results):
        if not (merged_result.data_object and local_result.data_object):
            raise ValueError("Data object is None in one of the results.")
        if merged_result.data_object.input != local_result.data_object.input:
            raise ValueError("The API and local results are not aligned.")
-        if
+        if (
+            merged_result.data_object.actual_output
+            != local_result.data_object.actual_output
+        ):
            raise ValueError("The API and local results are not aligned.")
-        if
+        if (
+            merged_result.data_object.expected_output
+            != local_result.data_object.expected_output
+        ):
            raise ValueError("The API and local results are not aligned.")
        if merged_result.data_object.context != local_result.data_object.context:
            raise ValueError("The API and local results are not aligned.")
-        if
+        if (
+            merged_result.data_object.retrieval_context
+            != local_result.data_object.retrieval_context
+        ):
            raise ValueError("The API and local results are not aligned.")
-        if
+        if (
+            merged_result.data_object.additional_metadata
+            != local_result.data_object.additional_metadata
+        ):
            raise ValueError("The API and local results are not aligned.")
-        if
+        if (
+            merged_result.data_object.tools_called
+            != local_result.data_object.tools_called
+        ):
            raise ValueError("The API and local results are not aligned.")
-        if
+        if (
+            merged_result.data_object.expected_tools
+            != local_result.data_object.expected_tools
+        ):
            raise ValueError("The API and local results are not aligned.")
-
+
        # Merge ScorerData from the API and local scorers together
        api_scorer_data = merged_result.scorers_data
        local_scorer_data = local_result.scorers_data
@@ -185,7 +223,7 @@ def merge_results(api_results: List[ScoringResult], local_results: List[ScoringR
            merged_result.scorers_data = local_scorer_data
        elif api_scorer_data is not None and local_scorer_data is not None:
            merged_result.scorers_data = api_scorer_data + local_scorer_data
-
+
    return merged_results
 
 
@@ -206,7 +244,14 @@ def check_missing_scorer_data(results: List[ScoringResult]) -> List[ScoringResul
            )
    return results
 
-
+
+def check_experiment_type(
+    eval_name: str,
+    project_name: str,
+    judgment_api_key: str,
+    organization_id: str,
+    is_trace: bool,
+) -> None:
    """
    Checks if the current experiment, if one exists, has the same type (examples of traces)
    """
@@ -216,32 +261,35 @@ def check_experiment_type(eval_name: str, project_name: str, judgment_api_key: s
            headers={
                "Content-Type": "application/json",
                "Authorization": f"Bearer {judgment_api_key}",
-                "X-Organization-Id": organization_id
+                "X-Organization-Id": organization_id,
            },
            json={
                "eval_name": eval_name,
                "project_name": project_name,
                "judgment_api_key": judgment_api_key,
-                "is_trace": is_trace
+                "is_trace": is_trace,
            },
-            verify=True
+            verify=True,
        )
-
+
        if response.status_code == 422:
            error(f"{response.json()}")
            raise ValueError(f"{response.json()}")
-
+
        if not response.ok:
            response_data = response.json()
-            error_message = response_data.get(
+            error_message = response_data.get("detail", "An unknown error occurred.")
            error(f"Error checking eval run name: {error_message}")
            raise JudgmentAPIError(error_message)
-
-    except
+
+    except exceptions.RequestException as e:
        error(f"Failed to check if experiment type exists: {str(e)}")
        raise JudgmentAPIError(f"Failed to check if experiment type exists: {str(e)}")
 
-
+
+def check_eval_run_name_exists(
+    eval_name: str, project_name: str, judgment_api_key: str, organization_id: str
+) -> None:
    """
    Checks if an evaluation run name already exists for a given project.
 
@@ -260,32 +308,38 @@ def check_eval_run_name_exists(eval_name: str, project_name: str, judgment_api_k
            headers={
                "Content-Type": "application/json",
                "Authorization": f"Bearer {judgment_api_key}",
-                "X-Organization-Id": organization_id
+                "X-Organization-Id": organization_id,
            },
            json={
                "eval_name": eval_name,
                "project_name": project_name,
                "judgment_api_key": judgment_api_key,
            },
-            verify=True
+            verify=True,
        )
-
+
        if response.status_code == 409:
-            error(
-
-
+            error(
+                f"Eval run name '{eval_name}' already exists for this project. Please choose a different name, set the `override` flag to true, or set the `append` flag to true."
+            )
+            raise ValueError(
+                f"Eval run name '{eval_name}' already exists for this project. Please choose a different name, set the `override` flag to true, or set the `append` flag to true."
+            )
+
        if not response.ok:
            response_data = response.json()
-            error_message = response_data.get(
+            error_message = response_data.get("detail", "An unknown error occurred.")
            error(f"Error checking eval run name: {error_message}")
            raise JudgmentAPIError(error_message)
-
-    except
+
+    except exceptions.RequestException as e:
        error(f"Failed to check if eval run name exists: {str(e)}")
        raise JudgmentAPIError(f"Failed to check if eval run name exists: {str(e)}")
 
 
-def log_evaluation_results(
+def log_evaluation_results(
+    scoring_results: List[ScoringResult], run: Union[EvaluationRun, TraceRun]
+) -> str | None:
    """
    Logs evaluation results to the Judgment API database.
 
@@ -303,64 +357,73 @@ def log_evaluation_results(scoring_results: List[ScoringResult], run: Union[Eval
            headers={
                "Content-Type": "application/json",
                "Authorization": f"Bearer {run.judgment_api_key}",
-                "X-Organization-Id": run.organization_id
-            },
-            json={
-                "results": scoring_results,
-                "run": run.model_dump(warnings=False)
+                "X-Organization-Id": run.organization_id,
            },
-
+            json={"results": scoring_results, "run": run.model_dump(warnings=False)},
+            verify=True,
        )
-
+
        if not res.ok:
            response_data = res.json()
-            error_message = response_data.get(
+            error_message = response_data.get("detail", "An unknown error occurred.")
            error(f"Error {res.status_code}: {error_message}")
            raise JudgmentAPIError(error_message)
-
+
        if "ui_results_url" in res.json():
-            url = res.json()[
+            url = res.json()["ui_results_url"]
            pretty_str = f"\n🔍 You can view your evaluation results here: [rgb(106,0,255)][link={url}]View Results[/link]\n"
            return pretty_str
-
-
+
+        return None
+
+    except exceptions.RequestException as e:
        error(f"Request failed while saving evaluation results to DB: {str(e)}")
-        raise JudgmentAPIError(
+        raise JudgmentAPIError(
+            f"Request failed while saving evaluation results to DB: {str(e)}"
+        )
    except Exception as e:
        error(f"Failed to save evaluation results to DB: {str(e)}")
        raise ValueError(f"Failed to save evaluation results to DB: {str(e)}")
 
+
 def run_with_spinner(message: str, func, *args, **kwargs) -> Any:
-
-
+    """Run a function with a spinner in the terminal."""
+    spinner = itertools.cycle(["|", "/", "-", "\\"])
 
-
-
-
-
-
+    def display_spinner():
+        while not stop_spinner_event.is_set():
+            sys.stdout.write(f"\r{message}{next(spinner)}")
+            sys.stdout.flush()
+            time.sleep(0.1)
 
-
-
-
+    stop_spinner_event = threading.Event()
+    spinner_thread = threading.Thread(target=display_spinner)
+    spinner_thread.start()
 
-
+    try:
+        if asyncio.iscoroutinefunction(func):
+            coro = func(*args, **kwargs)
+            result = safe_run_async(coro)
+        else:
            result = func(*args, **kwargs)
-
-
-
-
-
-
-
-
-
-    sys.stdout.write('\r' + ' ' * (len(message) + 1) + '\r')
-    sys.stdout.flush()
+    except Exception as e:
+        error(f"An error occurred: {str(e)}")
+        stop_spinner_event.set()
+        spinner_thread.join()
+        raise e
+    finally:
+        stop_spinner_event.set()
+        spinner_thread.join()
 
-
+    sys.stdout.write("\r" + " " * (len(message) + 1) + "\r")
+    sys.stdout.flush()
+
+    return result
 
-
+
+
+def check_examples(
+    examples: List[Example], scorers: List[Union[APIJudgmentScorer, JudgevalScorer]]
+) -> None:
    """
    Checks if the example contains the necessary parameters for the scorer.
    """
@@ -372,27 +435,36 @@ def check_examples(examples: List[Example], scorers: List[Union[APIJudgmentScore
                if getattr(example, param.value) is None:
                    missing_params.append(f"{param.value}")
            if missing_params:
-                rprint(
+                rprint(
+                    f"[yellow]⚠️ WARNING:[/yellow] Example is missing required parameters for scorer [bold]{scorer.score_type.value}[/bold]"
+                )
                rprint(f"Missing parameters: {', '.join(missing_params)}")
                rprint(f"Example: {json.dumps(example.model_dump(), indent=2)}")
-                rprint("-"*40)
+                rprint("-" * 40)
                prompt_user = True
 
    if prompt_user:
        user_input = input("Do you want to continue? (y/n)")
        if user_input.lower() != "y":
-            sys.exit(0)
+            sys.exit(0)
        else:
            rprint("[green]Continuing...[/green]")
 
-
+
+def run_trace_eval(
+    trace_run: TraceRun,
+    override: bool = False,
+    function: Optional[Callable] = None,
+    tracer: Optional[Union[Tracer, BaseCallbackHandler]] = None,
+    examples: Optional[List[Example]] = None,
+) -> List[ScoringResult]:
    # Call endpoint to check to see if eval run name exists (if we DON'T want to override and DO want to log results)
-    if not override and
+    if not override and not trace_run.append:
        check_eval_run_name_exists(
            trace_run.eval_name,
            trace_run.project_name,
            trace_run.judgment_api_key,
-            trace_run.organization_id
+            trace_run.organization_id,
        )
 
    if trace_run.append:
@@ -402,31 +474,36 @@ def run_trace_eval(trace_run: TraceRun, override: bool = False, ignore_errors: b
            trace_run.project_name,
            trace_run.judgment_api_key,
            trace_run.organization_id,
-            True
+            True,
        )
-    if function and tracer:
+    if function and tracer and examples is not None:
        new_traces: List[Trace] = []
-
+
        # Handle case where tracer is actually a callback handler
        actual_tracer = tracer
-        if hasattr(tracer,
+        if hasattr(tracer, "tracer") and hasattr(tracer.tracer, "traces"):
            # This is a callback handler, get the underlying tracer
            actual_tracer = tracer.tracer
-
+
        actual_tracer.offline_mode = True
        actual_tracer.traces = []
        for example in examples:
            if example.input:
                if isinstance(example.input, str):
-
+                    run_with_spinner(
+                        "Running agent function: ", function, example.input
+                    )
                elif isinstance(example.input, dict):
-
+                    run_with_spinner(
+                        "Running agent function: ", function, **example.input
+                    )
                else:
-                    raise ValueError(
+                    raise ValueError(
+                        f"Input must be string or dict, got {type(example.input)}"
+                    )
            else:
-
-
-
+                run_with_spinner("Running agent function: ", function)
+
        for i, trace in enumerate(actual_tracer.traces):
            # We set the root-level trace span with the expected tools of the Trace
            trace = Trace(**trace)
@@ -434,35 +511,49 @@ def run_trace_eval(trace_run: TraceRun, override: bool = False, ignore_errors: b
            new_traces.append(trace)
        trace_run.traces = new_traces
        actual_tracer.traces = []
-
+
    # Execute evaluation using Judgment API
    info("Starting API evaluation")
    try: # execute an EvaluationRun with just JudgmentScorers
-        debug("Sending request to Judgment API")
-        response_data: Dict = run_with_spinner(
-
+        debug("Sending request to Judgment API")
+        response_data: Dict = run_with_spinner(
+            "Running Trace Evaluation: ", execute_api_trace_eval, trace_run
+        )
+        scoring_results = [
+            ScoringResult(**result) for result in response_data["results"]
+        ]
        info(f"Received {len(scoring_results)} results from API")
    except JudgmentAPIError as e:
        error(f"An error occurred while executing the Judgment API request: {str(e)}")
-        raise JudgmentAPIError(
+        raise JudgmentAPIError(
+            f"An error occurred while executing the Judgment API request: {str(e)}"
+        )
    except ValueError as e:
-        raise ValueError(
-
+        raise ValueError(
+            f"Please check your TraceRun object, one or more fields are invalid: {str(e)}"
+        )
+
    # Convert the response data to `ScoringResult` objects
    debug("Processing API results")
    # TODO: allow for custom scorer on traces
-
-
-
+
+    pretty_str = run_with_spinner(
+        "Logging Results: ",
+        log_evaluation_results,
+        response_data["agent_results"],
+        trace_run,
+    )
+    rprint(pretty_str)
 
    return scoring_results
-
-
 
-
+
+async def get_evaluation_status(
+    eval_name: str, project_name: str, judgment_api_key: str, organization_id: str
+) -> Dict:
    """
    Gets the status of an async evaluation run.
-
+
    Args:
        eval_name (str): Name of the evaluation run
        project_name (str): Name of the project
@@ -481,38 +572,46 @@ async def get_evaluation_status(eval_name: str, project_name: str, judgment_api_
            headers={
                "Content-Type": "application/json",
                "Authorization": f"Bearer {judgment_api_key}",
-                "X-Organization-Id": organization_id
+                "X-Organization-Id": organization_id,
            },
            params={
                "eval_name": eval_name,
                "project_name": project_name,
            },
-            verify=True
+            verify=True,
        )
-
+
        if not response.ok:
-            error_message = response.json().get(
+            error_message = response.json().get("detail", "An unknown error occurred.")
            error(f"Error checking evaluation status: {error_message}")
            raise JudgmentAPIError(error_message)
-
+
        return response.json()
-    except
+    except exceptions.RequestException as e:
        error(f"Failed to check evaluation status: {str(e)}")
        raise JudgmentAPIError(f"Failed to check evaluation status: {str(e)}")
 
-
+
+async def _poll_evaluation_until_complete(
+    eval_name: str,
+    project_name: str,
+    judgment_api_key: str,
+    organization_id: str,
+    poll_interval_seconds: int = 5,
+    original_examples: Optional[List[Example]] = None,
+) -> List[ScoringResult]:
    """
    Polls until the evaluation is complete and returns the results.
-
+
    Args:
        eval_name (str): Name of the evaluation run
        project_name (str): Name of the project
        judgment_api_key (str): API key for authentication
        organization_id (str): Organization ID for the evaluation
        poll_interval_seconds (int, optional): Time between status checks in seconds. Defaults to 5.
-        original_examples (List[Example], optional): The original examples sent for evaluation.
+        original_examples (List[Example], optional): The original examples sent for evaluation.
            If provided, will match results with original examples.
-
+
    Returns:
        List[ScoringResult]: The evaluation results
    """
@@ -522,7 +621,7 @@ async def _poll_evaluation_until_complete(eval_name: str, project_name: str, jud
    if original_examples:
        for example in original_examples:
            original_example_map[example.example_id] = example
-
+
    # Remove the expected scorer names extraction and checking
    # We'll instead verify all examples have consistent scorer data
    while True:
@@ -530,8 +629,10 @@ async def _poll_evaluation_until_complete(eval_name: str, project_name: str, jud
        try:
            # Log polling attempt
            if poll_count % 4 == 0: # Log every 4th poll to avoid excess logging
-                info(
-
+                info(
+                    f"Polling for evaluation '{eval_name}' in project '{project_name}' (attempt {poll_count})"
+                )
+
            # Check status
            response = await asyncio.to_thread(
                requests.get,
@@ -539,82 +640,89 @@ async def _poll_evaluation_until_complete(eval_name: str, project_name: str, jud
                headers={
                    "Content-Type": "application/json",
                    "Authorization": f"Bearer {judgment_api_key}",
-                    "X-Organization-Id": organization_id
+                    "X-Organization-Id": organization_id,
                },
-                params={
-
-                    "project_name": project_name
-                },
-                verify=True
+                params={"eval_name": eval_name, "project_name": project_name},
+                verify=True,
            )
-
+
            if not response.ok:
-                error_message = response.json().get(
+                error_message = response.json().get(
+                    "detail", "An unknown error occurred."
+                )
                error(f"Error checking evaluation status: {error_message}")
                # Don't raise exception immediately, just log and continue polling
                await asyncio.sleep(poll_interval_seconds)
                continue
-
+
            status_data = response.json()
            status = status_data.get("status")
-
+
            # If complete, get results and return
            if status == "completed" or status == "complete":
-                info(
+                info(
+                    f"Evaluation '{eval_name}' reported as completed, fetching and verifying results..."
+                )
                results_response = await asyncio.to_thread(
                    requests.post,
                    JUDGMENT_EVAL_FETCH_API_URL,
                    headers={
                        "Content-Type": "application/json",
                        "Authorization": f"Bearer {judgment_api_key}",
-                        "X-Organization-Id": organization_id
-                    },
-                    json={
-                        "project_name": project_name,
-                        "eval_name": eval_name
+                        "X-Organization-Id": organization_id,
                    },
-
+                    json={"project_name": project_name, "eval_name": eval_name},
+                    verify=True,
                )
-
+
                if not results_response.ok:
-                    error_message = results_response.json().get(
+                    error_message = results_response.json().get(
+                        "detail", "An unknown error occurred."
+                    )
                    error(f"Error fetching evaluation results: {error_message}")
                    raise JudgmentAPIError(error_message)
-
+
                result_data = results_response.json()
-
+
                if "examples" in result_data:
                    examples_data = result_data.get("examples", [])
-
-
-
-
+
+                    info(
+                        f"Successfully fetched {len(examples_data)} results for evaluation '{eval_name}'"
+                    )
+
                    # Check for result validity if original examples are provided
                    if original_example_map:
                        # Verify all returned examples have matching original examples
                        has_invalid_results = False
                        for example_data in examples_data:
                            example_id = example_data.get("example_id")
-
+
                            if example_id not in original_example_map:
-                                warning(
-
+                                warning(
+                                    f"Server returned example with ID {example_id} not found in original examples. "
+                                    + "This indicates stale or incorrect data. Continuing to poll..."
+                                )
                                has_invalid_results = True
                                break
-
+
                        # If any invalid examples found, continue polling
                        if has_invalid_results:
                            info("Detected stale data. Waiting before polling again...")
                            await asyncio.sleep(poll_interval_seconds)
                            continue
-
+
                        # Check if we received the expected number of results
-                        if len(original_examples) != len(
-
-
+                        if original_examples and len(original_examples) != len(
+                            examples_data
+                        ):
+                            warning(
+                                f"Expected {len(original_examples)} results but got {len(examples_data)} results. "
+                                + "This indicates incomplete data. Continuing to poll..."
+                            )
                            await asyncio.sleep(poll_interval_seconds)
                            continue
-
+
                        # Collect all example IDs from scorer data
                        scorer_example_ids = set()
                        for example_data in examples_data:
@@ -622,114 +730,135 @@ async def _poll_evaluation_until_complete(eval_name: str, project_name: str, jud
                            for scorer_data in scorer_data_list:
                                if "example_id" in scorer_data:
                                    scorer_example_ids.add(scorer_data["example_id"])
-
+
                        # Get the set of original example IDs
                        original_example_ids = set(original_example_map.keys())
-
+
                        # Check if the sets are equal
                        missing_in_scorer = original_example_ids - scorer_example_ids
                        extra_in_scorer = scorer_example_ids - original_example_ids
-
+
                        if missing_in_scorer or extra_in_scorer:
                            if missing_in_scorer:
-                                warning(
+                                warning(
+                                    f"Examples missing in scorer data: {missing_in_scorer}"
+                                )
                            if extra_in_scorer:
-                                warning(
-
+                                warning(
+                                    f"Extra examples in scorer data: {extra_in_scorer}"
+                                )
+                            info(
+                                "Detected mismatched example IDs in scorer data. Waiting before polling again..."
+                            )
                            await asyncio.sleep(poll_interval_seconds)
                            continue
-
+
                    # Create ScoringResult objects from the raw data
                    scoring_results = []
-
+
                    for example_data in examples_data:
                        # Extract example_id from the server response
                        example_id = example_data.get("example_id")
-
+
                        # Create ScorerData objects
                        scorer_data_list = []
                        for raw_scorer_data in example_data.get("scorer_data", []):
                            scorer_data_list.append(ScorerData(**raw_scorer_data))
-
+
                        # Use the original Example object if we have it and the ID matches
                        if original_example_map:
                            example = original_example_map[example_id]
                            debug(f"Matched result with original example {example_id}")
                        else:
                            # Create Example from example data (excluding scorer_data) if no original examples provided
-                            example_dict = {
+                            example_dict = {
+                                k: v
+                                for k, v in example_data.items()
+                                if k != "scorer_data"
+                            }
                            example = Example(**example_dict)
-
+
                        # Calculate success based on whether all scorer_data entries were successful
-                        success =
-
+                        success = (
+                            all(scorer_data.success for scorer_data in scorer_data_list)
+                            if scorer_data_list
+                            else False
+                        )
+
                        # Create ScoringResult
                        scoring_result = ScoringResult(
                            success=success, # Set based on all scorer data success values
                            scorers_data=scorer_data_list,
-                            data_object=example
+                            data_object=example,
                        )
                        scoring_results.append(scoring_result)
-
+
                    # If we got here, all validation checks passed
-                    info(
+                    info(
+                        f"Verified complete results for all {len(scoring_results)} examples with all expected scorer data"
+                    )
                    return scoring_results
                else:
                    # No examples found
-                    info(
+                    info(
+                        f"No example results found for completed evaluation '{eval_name}'"
+                    )
                    return []
-
+
            elif status == "failed":
                # Evaluation failed
                error_message = status_data.get("error", "Unknown error")
                error(f"Evaluation '{eval_name}' failed: {error_message}")
                raise JudgmentAPIError(f"Evaluation failed: {error_message}")
-
+
            elif status == "pending" or status == "running":
                # Only log occasionally for pending/running to avoid flooding logs
                if poll_count % 4 == 0:
                    info(f"Evaluation '{eval_name}' status: {status}")
-
+
            # Wait before checking again
            await asyncio.sleep(poll_interval_seconds)
-
+
        except Exception as e:
            if isinstance(e, JudgmentAPIError):
                raise
-
+
            # For other exceptions, log and continue polling
            error(f"Error checking evaluation status: {str(e)}")
            if poll_count > 20: # Only raise exception after many failed attempts
-                raise JudgmentAPIError(
-
+                raise JudgmentAPIError(
+                    f"Error checking evaluation status after {poll_count} attempts: {str(e)}"
+                )
+
            # Continue polling after a delay
            await asyncio.sleep(poll_interval_seconds)
 
+
 async def await_with_spinner(task, message: str = "Awaiting async task: "):
    """
    Display a spinner while awaiting an async task.
-
+
    Args:
        task: The asyncio task to await
        message (str): Message to display with the spinner
-
+
    Returns:
        Any: The result of the awaited task
    """
-    spinner = itertools.cycle([
-
+    spinner = itertools.cycle(["|", "/", "-", "\\"])
+
    # Create an event to signal when to stop the spinner
    stop_spinner_event = asyncio.Event()
-
+
    async def display_spinner():
        while not stop_spinner_event.is_set():
-            sys.stdout.write(f
+            sys.stdout.write(f"\r{message}{next(spinner)}")
            sys.stdout.flush()
            await asyncio.sleep(0.1)
-
+
    # Start the spinner in a separate task
    spinner_task = asyncio.create_task(display_spinner())
-
+
    try:
        # Await the actual task
        result = await task
@@ -737,66 +866,73 @@ async def await_with_spinner(task, message: str = "Awaiting async task: "):
    # Signal the spinner to stop and wait for it to finish
    stop_spinner_event.set()
    await spinner_task
-
+
    # Clear the spinner line
-    sys.stdout.write(
+    sys.stdout.write("\r" + " " * (len(message) + 1) + "\r")
    sys.stdout.flush()
-
+
    return result
 
+
 class SpinnerWrappedTask:
    """
    A wrapper for an asyncio task that displays a spinner when awaited.
    """
+
    def __init__(self, task, message: str):
        self.task = task
        self.message = message
-
+
    def __await__(self):
        async def _spin_and_await():
            # self.task resolves to (scoring_results, pretty_str_to_print)
            task_result_tuple = await await_with_spinner(self.task, self.message)
-
+
            # Unpack the tuple
            scoring_results, pretty_str_to_print = task_result_tuple
-
+
            # Print the pretty string if it exists, after spinner is cleared
            if pretty_str_to_print:
                rprint(pretty_str_to_print)
-
+
            # Return only the scoring_results to the original awaiter
            return scoring_results
+
        return _spin_and_await().__await__()
-
+
    # Proxy all Task attributes and methods to the underlying task
    def __getattr__(self, name):
        return getattr(self.task, name)
 
-
+
+def run_eval(
+    evaluation_run: EvaluationRun,
+    override: bool = False,
+    async_execution: bool = False,
+) -> Union[List[ScoringResult], asyncio.Task, SpinnerWrappedTask]:
    """
    Executes an evaluation of `Example`s using one or more `Scorer`s
 
    Args:
        evaluation_run (EvaluationRun): Stores example and evaluation together for running
        override (bool, optional): Whether to override existing evaluation run with same name. Defaults to False.
-        ignore_errors (bool, optional): Whether to ignore scorer errors during evaluation. Defaults to True.
        async_execution (bool, optional): Whether to execute the evaluation asynchronously. Defaults to False.
-
+
    Returns:
-        Union[List[ScoringResult], Union[asyncio.Task, SpinnerWrappedTask]]:
+        Union[List[ScoringResult], Union[asyncio.Task, SpinnerWrappedTask]]:
        - If async_execution is False, returns a list of ScoringResult objects
        - If async_execution is True, returns a Task that will resolve to a list of ScoringResult objects when awaited
    """
 
    # Call endpoint to check to see if eval run name exists (if we DON'T want to override and DO want to log results)
-    if not override and
+    if not override and not evaluation_run.append:
        check_eval_run_name_exists(
            evaluation_run.eval_name,
            evaluation_run.project_name,
            evaluation_run.judgment_api_key,
-            evaluation_run.organization_id
+            evaluation_run.organization_id,
        )
-
+
    if evaluation_run.append:
        # Check that the current experiment, if one exists, has the same type (examples of traces)
        check_experiment_type(
@@ -804,15 +940,17 @@ def run_eval(evaluation_run: EvaluationRun, override: bool = False, ignore_error
            evaluation_run.project_name,
            evaluation_run.judgment_api_key,
            evaluation_run.organization_id,
-            False
+            False,
        )
-
+
    # Set example IDs if not already set
    debug("Initializing examples with IDs and timestamps")
    for idx, example in enumerate(evaluation_run.examples):
        example.example_index = idx # Set numeric index
        with example_logging_context(example.created_at, example.example_id):
-            debug(
+            debug(
+                f"Initialized example {example.example_id} (index: {example.example_index})"
+            )
            debug(f"Input: {example.input}")
            debug(f"Actual output: {example.actual_output}")
            if example.expected_output:
@@ -827,9 +965,9 @@ def run_eval(evaluation_run: EvaluationRun, override: bool = False, ignore_error
                debug(f"Tools called: {example.tools_called}")
            if example.expected_tools:
                debug(f"Expected tools: {example.expected_tools}")
-
+
    debug(f"Starting evaluation run with {len(evaluation_run.examples)} examples")
-
+
    # Group APIJudgmentScorers and JudgevalScorers, then evaluate them in parallel
    debug("Grouping scorers by type")
    judgment_scorers: List[APIJudgmentScorer] = []
@@ -841,14 +979,16 @@ def run_eval(evaluation_run: EvaluationRun, override: bool = False, ignore_error
        else:
            local_scorers.append(scorer)
            debug(f"Added local scorer: {type(scorer).__name__}")
-
+
    custom_example_check = [scorer.custom_example for scorer in local_scorers]
    if any(custom_example_check) and not all(custom_example_check):
        error("All scorers must be custom scorers if using custom examples")
        raise ValueError("All scorers must be custom scorers if using custom examples")
-
-    debug(
-
+
+    debug(
+        f"Found {len(judgment_scorers)} judgment scorers and {len(local_scorers)} local scorers"
+    )
+
    api_results: List[ScoringResult] = []
    local_results: List[ScoringResult] = []
 
@@ -856,14 +996,14 @@ def run_eval(evaluation_run: EvaluationRun, override: bool = False, ignore_error
        if len(local_scorers) > 0:
            error("Local scorers are not supported in async execution")
            raise ValueError("Local scorers are not supported in async execution")
-
+
        check_examples(evaluation_run.examples, evaluation_run.scorers)
        info("Starting async evaluation")
-
+
        async def _async_evaluation_workflow():
            # Create a payload
            payload = evaluation_run.model_dump(warnings=False)
-
+
            # Send the evaluation to the queue
            response = await asyncio.to_thread(
                requests.post,
@@ -871,50 +1011,52 @@ def run_eval(evaluation_run: EvaluationRun, override: bool = False, ignore_error
                headers={
                    "Content-Type": "application/json",
                    "Authorization": f"Bearer {evaluation_run.judgment_api_key}",
-                    "X-Organization-Id": evaluation_run.organization_id
+                    "X-Organization-Id": evaluation_run.organization_id,
                },
                json=payload,
-                verify=True
+                verify=True,
            )
-
+
            if not response.ok:
-                error_message = response.json().get(
+                error_message = response.json().get(
+                    "detail", "An unknown error occurred."
+                )
                error(f"Error adding evaluation to queue: {error_message}")
                raise JudgmentAPIError(error_message)
-
+
            info(f"Successfully added evaluation '{evaluation_run.eval_name}' to queue")
-
+
            # Poll until the evaluation is complete
            results = await _poll_evaluation_until_complete(
                eval_name=evaluation_run.eval_name,
                project_name=evaluation_run.project_name,
                judgment_api_key=evaluation_run.judgment_api_key,
                organization_id=evaluation_run.organization_id,
-                original_examples=evaluation_run.examples # Pass the original examples
+                original_examples=evaluation_run.examples, # Pass the original examples
            )
 
            pretty_str_to_print = None
-            if
-            send_results = [
+            if results: # Ensure results exist before logging
+                send_results = [
+                    scoring_result.model_dump(warnings=False)
+                    for scoring_result in results
+                ]
                try:
                    # Run the blocking log_evaluation_results in a separate thread
                    pretty_str_to_print = await asyncio.to_thread(
-                        log_evaluation_results,
-                        send_results,
-                        evaluation_run
+                        log_evaluation_results, send_results, evaluation_run
                    )
                except Exception as e:
                    error(f"Error logging results after async evaluation: {str(e)}")
-
+
            return results, pretty_str_to_print
-
+
        # Create a regular task
        task = asyncio.create_task(_async_evaluation_workflow())
-
+
        # Wrap it in our custom awaitable that will show a spinner only when awaited
        return SpinnerWrappedTask(
-            task,
-            f"Processing evaluation '{evaluation_run.eval_name}': "
+            task, f"Processing evaluation '{evaluation_run.eval_name}': "
        )
    else:
        check_examples(evaluation_run.examples, evaluation_run.scorers)
@@ -929,25 +1071,31 @@ def run_eval(evaluation_run: EvaluationRun, override: bool = False, ignore_error
                examples=evaluation_run.examples,
                scorers=judgment_scorers,
                model=evaluation_run.model,
-                aggregator=evaluation_run.aggregator,
-                metadata=evaluation_run.metadata,
                judgment_api_key=evaluation_run.judgment_api_key,
                organization_id=evaluation_run.organization_id,
-                log_results=evaluation_run.log_results,
-                rules=evaluation_run.rules
            )
-            debug("Sending request to Judgment API")
-            response_data:
+            debug("Sending request to Judgment API")
+            response_data: Dict = run_with_spinner(
+                "Running Evaluation: ", execute_api_eval, api_evaluation_run
+            )
            info(f"Received {len(response_data['results'])} results from API")
        except JudgmentAPIError as e:
-            error(
-
+            error(
+                f"An error occurred while executing the Judgment API request: {str(e)}"
+            )
+            raise JudgmentAPIError(
+                f"An error occurred while executing the Judgment API request: {str(e)}"
+            )
        except ValueError as e:
-            raise ValueError(
-
+            raise ValueError(
+                f"Please check your EvaluationRun object, one or more fields are invalid: {str(e)}"
+            )
+
        # Convert the response data to `ScoringResult` objects
        debug("Processing API results")
-        api_results = [
+        api_results = [
+            ScoringResult(**result) for result in response_data["results"]
+        ]
        # Run local evals
        if local_scorers: # List[JudgevalScorer]
            # We should be removing local scorers soon
@@ -955,13 +1103,12 @@ def run_eval(evaluation_run: EvaluationRun, override: bool = False, ignore_error
            for example in evaluation_run.examples:
                with example_logging_context(example.created_at, example.example_id):
                    debug(f"Processing example {example.example_id}: {example.input}")
-
-            results: List[ScoringResult] =
+
+            results: List[ScoringResult] = safe_run_async(
                a_execute_scoring(
                    evaluation_run.examples,
                    local_scorers,
                    model=evaluation_run.model,
-                    ignore_errors=ignore_errors,
                    skip_on_missing_params=True,
                    show_indicator=True,
                    _use_bar_indicator=True,
@@ -981,22 +1128,34 @@ def run_eval(evaluation_run: EvaluationRun, override: bool = False, ignore_error
        # Evaluate rules against local scoring results if rules exist (this cant be done just yet)
        # if evaluation_run.rules and merged_results:
        #     run_rules(
-        #     local_results=merged_results,
-        #     rules=evaluation_run.rules,
+        #     local_results=merged_results,
+        #     rules=evaluation_run.rules,
        #     judgment_api_key=evaluation_run.judgment_api_key,
        #     organization_id=evaluation_run.organization_id
        # )
        # print(merged_results)
-
-
-
-
+        send_results = [
+            scoring_result.model_dump(warnings=False)
+            for scoring_result in merged_results
+        ]
+        pretty_str = run_with_spinner(
+            "Logging Results: ",
+            log_evaluation_results,
+            send_results,
+            evaluation_run,
+        )
+        rprint(pretty_str)
 
        for i, result in enumerate(merged_results):
-            if
-
+            if (
+                not result.scorers_data
+            ): # none of the scorers could be executed on this example
+                info(
+                    f"None of the scorers could be executed on example {i}. This is usually because the Example is missing the fields needed by the scorers. Try checking that the Example has the necessary fields for your scorers."
+                )
        return merged_results
 
+
 def assert_test(scoring_results: List[ScoringResult]) -> None:
    """
    Collects all failed scorers from the scoring results.
@@ -1011,11 +1170,8 @@ def assert_test(scoring_results: List[ScoringResult]) -> None:
 
    for result in scoring_results:
        if not result.success:
-
            # Create a test case context with all relevant fields
-            test_case = {
-                "failed_scorers": []
-            }
+            test_case: Dict = {"failed_scorers": []}
            if result.scorers_data:
                # If the result was not successful, check each scorer_data
                for scorer_data in result.scorers_data:
@@ -1024,12 +1180,11 @@ def assert_test(scoring_results: List[ScoringResult]) -> None:
                        # Remove threshold, evaluation model for Tool Order scorer
                        scorer_data.threshold = None
                        scorer_data.evaluation_model = None
-                    test_case[
+                    test_case["failed_scorers"].append(scorer_data)
            failed_cases.append(test_case)
 
    if failed_cases:
-
-        error_msg = f"The following test cases failed: \n"
+        error_msg = "The following test cases failed: \n"
        for fail_case in failed_cases:
            # error_msg += f"\nInput: {fail_case['input']}\n"
            # error_msg += f"Actual Output: {fail_case['actual_output']}\n"
@@ -1039,13 +1194,12 @@ def assert_test(scoring_results: List[ScoringResult]) -> None:
            # error_msg += f"Additional Metadata: {fail_case['additional_metadata']}\n"
            # error_msg += f"Tools Called: {fail_case['tools_called']}\n"
            # error_msg += f"Expected Tools: {fail_case['expected_tools']}\n"
-
-            for fail_scorer in fail_case['failed_scorers']:
 
+            for fail_scorer in fail_case["failed_scorers"]:
                error_msg += (
                    f"\nScorer Name: {fail_scorer.name}\n"
                    f"Threshold: {fail_scorer.threshold}\n"
-                    f"Success: {fail_scorer.success}\n"
+                    f"Success: {fail_scorer.success}\n"
                    f"Score: {fail_scorer.score}\n"
                    f"Reason: {fail_scorer.reason}\n"
                    f"Strict Mode: {fail_scorer.strict_mode}\n"
@@ -1055,19 +1209,23 @@ def assert_test(scoring_results: List[ScoringResult]) -> None:
                    f"Verbose Logs: {fail_scorer.verbose_logs}\n"
                    f"Additional Metadata: {fail_scorer.additional_metadata}\n"
                )
-            error_msg += "-"*100
+            error_msg += "-" * 100
 
        total_tests = len(scoring_results)
        failed_tests = len(failed_cases)
        passed_tests = total_tests - failed_tests
 
        # Print summary with colors
-        rprint("\n" + "="*80)
+        rprint("\n" + "=" * 80)
        if failed_tests == 0:
-            rprint(
+            rprint(
+                f"[bold green]🎉 ALL TESTS PASSED! {passed_tests}/{total_tests} tests successful[/bold green]"
+            )
        else:
-            rprint(
-
+            rprint(
+                f"[bold red]⚠️ TEST RESULTS: {passed_tests}/{total_tests} passed ({failed_tests} failed)[/bold red]"
+            )
+        rprint("=" * 80 + "\n")
 
        # Print individual test cases
        for i, result in enumerate(scoring_results):
@@ -1084,9 +1242,8 @@ def assert_test(scoring_results: List[ScoringResult]) -> None:
                    rprint(f" [red] Reason: {scorer_data.reason}[/red]")
                    if scorer_data.error:
                        rprint(f" [red] Error: {scorer_data.error}[/red]")
-            rprint(" " + "-"*40)
+            rprint(" " + "-" * 40)
 
-    rprint("\n" + "="*80)
+    rprint("\n" + "=" * 80)
    if failed_tests > 0:
        raise AssertionError(failed_cases)
-
```

Note: some removed (`-`) lines above show only the prefix that survived in the diff viewer; the highlighted remainder of those old lines was not recoverable and has been left as shown.