judgeval 0.0.55__py3-none-any.whl → 0.1.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- judgeval/common/api/__init__.py +3 -0
- judgeval/common/api/api.py +352 -0
- judgeval/common/api/constants.py +165 -0
- judgeval/common/storage/__init__.py +6 -0
- judgeval/common/tracer/__init__.py +31 -0
- judgeval/common/tracer/constants.py +22 -0
- judgeval/common/tracer/core.py +1916 -0
- judgeval/common/tracer/otel_exporter.py +108 -0
- judgeval/common/tracer/otel_span_processor.py +234 -0
- judgeval/common/tracer/span_processor.py +37 -0
- judgeval/common/tracer/span_transformer.py +211 -0
- judgeval/common/tracer/trace_manager.py +92 -0
- judgeval/common/utils.py +2 -2
- judgeval/constants.py +3 -30
- judgeval/data/datasets/eval_dataset_client.py +29 -156
- judgeval/data/judgment_types.py +4 -12
- judgeval/data/result.py +1 -1
- judgeval/data/scorer_data.py +2 -2
- judgeval/data/scripts/openapi_transform.py +1 -1
- judgeval/data/trace.py +66 -1
- judgeval/data/trace_run.py +0 -3
- judgeval/evaluation_run.py +0 -2
- judgeval/integrations/langgraph.py +43 -164
- judgeval/judgment_client.py +17 -211
- judgeval/run_evaluation.py +209 -611
- judgeval/scorers/__init__.py +2 -6
- judgeval/scorers/base_scorer.py +4 -23
- judgeval/scorers/judgeval_scorers/api_scorers/__init__.py +3 -3
- judgeval/scorers/judgeval_scorers/api_scorers/prompt_scorer.py +215 -0
- judgeval/scorers/score.py +2 -1
- judgeval/scorers/utils.py +1 -13
- judgeval/utils/requests.py +21 -0
- judgeval-0.1.0.dist-info/METADATA +202 -0
- {judgeval-0.0.55.dist-info → judgeval-0.1.0.dist-info}/RECORD +37 -29
- judgeval/common/tracer.py +0 -3215
- judgeval/scorers/judgeval_scorers/api_scorers/classifier_scorer.py +0 -73
- judgeval/scorers/judgeval_scorers/classifiers/__init__.py +0 -3
- judgeval/scorers/judgeval_scorers/classifiers/text2sql/__init__.py +0 -3
- judgeval/scorers/judgeval_scorers/classifiers/text2sql/text2sql_scorer.py +0 -53
- judgeval-0.0.55.dist-info/METADATA +0 -1384
- /judgeval/common/{s3_storage.py → storage/s3_storage.py} +0 -0
- {judgeval-0.0.55.dist-info → judgeval-0.1.0.dist-info}/WHEEL +0 -0
- {judgeval-0.0.55.dist-info → judgeval-0.1.0.dist-info}/licenses/LICENSE.md +0 -0
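The common thread across these files is that judgeval/run_evaluation.py (and the other modules listed above) stop issuing raw requests calls with hand-built headers, verify=True, and manual response.ok checks, and instead go through the new judgeval.common.api.JudgmentApiClient, with JudgmentAPIException carrying the error detail. The sketch below illustrates that calling pattern as it appears in the diff that follows; it is a simplified illustration rather than code from the package, and the wrapper name submit_eval and the shape of the payload dict are assumptions.

    # Illustrative sketch only: mirrors the 0.1.0 pattern visible in the diff below.
    # JudgmentApiClient, JudgmentAPIException, and JudgmentAPIError come from the
    # package; submit_eval and the payload argument are hypothetical stand-ins.
    from judgeval.common.api import JudgmentApiClient
    from judgeval.common.api.api import JudgmentAPIException
    from judgeval.common.exceptions import JudgmentAPIError


    def submit_eval(payload: dict, api_key: str, org_id: str) -> dict:
        if not api_key or not org_id:
            raise ValueError("API key and organization ID are required")
        client = JudgmentApiClient(api_key, org_id)
        try:
            # Headers, verify=True, and response.ok checks now live inside the client.
            return client.run_evaluation(payload)
        except JudgmentAPIException as e:
            detail = e.response_json.get("detail", "No details provided")
            raise JudgmentAPIError(
                "An error occurred while executing the Judgment API request: " + detail
            )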
judgeval/run_evaluation.py
CHANGED
@@ -1,29 +1,21 @@
 import asyncio
 import concurrent.futures
-from requests import exceptions
-from judgeval.utils.requests import requests
 import time
 import json
 import sys
-import itertools
 import threading
-from typing import List, Dict,
+from typing import List, Dict, Union, Optional, Callable, Tuple, Any
 from rich import print as rprint

 from judgeval.data import ScorerData, ScoringResult, Example, Trace
 from judgeval.scorers import BaseScorer, APIScorerConfig
 from judgeval.scorers.score import a_execute_scoring
+from judgeval.common.api import JudgmentApiClient
 from judgeval.constants import (
-    ROOT_API,
-    JUDGMENT_EVAL_API_URL,
-    JUDGMENT_TRACE_EVAL_API_URL,
-    JUDGMENT_EVAL_LOG_API_URL,
     MAX_CONCURRENT_EVALUATIONS,
-    JUDGMENT_ADD_TO_RUN_EVAL_QUEUE_API_URL,
-    JUDGMENT_GET_EVAL_STATUS_API_URL,
-    JUDGMENT_EVAL_FETCH_API_URL,
 )
 from judgeval.common.exceptions import JudgmentAPIError
+from judgeval.common.api.api import JudgmentAPIException
 from judgeval.common.logger import judgeval_logger
 from judgeval.evaluation_run import EvaluationRun
 from judgeval.data.trace_run import TraceRun
@@ -54,22 +46,20 @@ def safe_run_async(coro):
         return asyncio.run(coro)


-def send_to_rabbitmq(evaluation_run: EvaluationRun) ->
+def send_to_rabbitmq(evaluation_run: EvaluationRun) -> Dict[str, Any]:
     """
     Sends an evaluation run to the RabbitMQ evaluation queue.
     """
-    ...
-        verify=True,
+    if not evaluation_run.judgment_api_key or not evaluation_run.organization_id:
+        raise ValueError("API key and organization ID are required")
+    if not evaluation_run.eval_name or not evaluation_run.project_name:
+        raise ValueError("Eval name and project name are required")
+    api_client = JudgmentApiClient(
+        evaluation_run.judgment_api_key, evaluation_run.organization_id
+    )
+    return api_client.add_to_evaluation_queue(
+        evaluation_run.eval_name, evaluation_run.project_name
     )
-    return response.json()


 def execute_api_eval(evaluation_run: EvaluationRun) -> Dict:
@@ -86,146 +76,46 @@ def execute_api_eval(evaluation_run: EvaluationRun) -> Dict:

     try:
         # submit API request to execute evals
-        ...
-                "Content-Type": "application/json",
-                "Authorization": f"Bearer {evaluation_run.judgment_api_key}",
-                "X-Organization-Id": evaluation_run.organization_id,
-            },
-            json=payload,
-            verify=True,
+        if not evaluation_run.judgment_api_key or not evaluation_run.organization_id:
+            raise ValueError("API key and organization ID are required")
+        api_client = JudgmentApiClient(
+            evaluation_run.judgment_api_key, evaluation_run.organization_id
         )
-        ...
+        return api_client.run_evaluation(evaluation_run.model_dump())
     except Exception as e:
         judgeval_logger.error(f"Error: {e}")
-        ...
+
+        details = "No details provided"
+        if isinstance(e, JudgmentAPIException):
+            details = e.response_json.get("detail", "No details provided")
+
         raise JudgmentAPIError(
             "An error occurred while executing the Judgment API request: " + details
         )
-    # Check if the response status code is not 2XX
-    # Add check for the duplicate eval run name
-    if not response.ok:
-        error_message = response_data.get("detail", "An unknown error occurred.")
-        judgeval_logger.error(f"Error: {error_message=}")
-        raise JudgmentAPIError(error_message)
-    return response_data


-def execute_api_trace_eval(trace_run: TraceRun) -> Dict:
+def execute_api_trace_eval(trace_run: TraceRun, judgment_api_key: str) -> Dict:
     """
     Executes an evaluation of a list of `Trace`s using one or more `JudgmentScorer`s via the Judgment API.
     """

     try:
         # submit API request to execute evals
-        ...
-                "Content-Type": "application/json",
-                "Authorization": f"Bearer {trace_run.judgment_api_key}",
-                "X-Organization-Id": trace_run.organization_id,
-            },
-            json=payload,
-            verify=True,
-        )
-        response_data = response.json()
+        if not judgment_api_key or not trace_run.organization_id:
+            raise ValueError("API key and organization ID are required")
+        api_client = JudgmentApiClient(judgment_api_key, trace_run.organization_id)
+        return api_client.run_trace_evaluation(trace_run.model_dump(warnings=False))
     except Exception as e:
         judgeval_logger.error(f"Error: {e}")
-        details = response.json().get("detail", "No details provided")
-        raise JudgmentAPIError(
-            "An error occurred while executing the Judgment API request: " + details
-        )
-    # Check if the response status code is not 2XX
-    # Add check for the duplicate eval run name
-    if not response.ok:
-        error_message = response_data.get("detail", "An unknown error occurred.")
-        judgeval_logger.error(f"Error: {error_message=}")
-        raise JudgmentAPIError(error_message)
-    return response_data
-
-
-def merge_results(
-    api_results: List[ScoringResult], local_results: List[ScoringResult]
-) -> List[ScoringResult]:
-    """
-    When executing scorers that come from both the Judgment API and local scorers, we're left with
-    results for each type of scorer. This function merges the results from the API and local evaluations,
-    grouped by example. In particular, we merge the `scorers_data` field of each `ScoringResult` object.

-    ...
+        details = "An unknown error occurred."
+        if isinstance(e, JudgmentAPIException):
+            details = e.response_json.get("detail", "An unknown error occurred.")

-    ...
-    """
-    # No merge required
-    if not local_results and api_results:
-        return [result.model_copy() for result in api_results]
-    if not api_results and local_results:
-        return [result.model_copy() for result in local_results]
-
-    if len(api_results) != len(local_results):
-        # Results should be of same length because each ScoringResult is a 1-1 mapping to an Example
-        raise ValueError(
-            f"The number of API and local results do not match: {len(api_results)} vs {len(local_results)}"
+        raise JudgmentAPIError(
+            "An error occurred while executing the Judgment API request: " + details
         )

-    # Create a copy of api_results to avoid modifying the input
-    merged_results = [result.model_copy() for result in api_results]
-
-    # Each ScoringResult in api and local have all the same fields besides `scorers_data`
-    for merged_result, local_result in zip(merged_results, local_results):
-        if not (merged_result.data_object and local_result.data_object):
-            raise ValueError("Data object is None in one of the results.")
-        if merged_result.data_object.input != local_result.data_object.input:
-            raise ValueError("The API and local results are not aligned.")
-        if (
-            merged_result.data_object.actual_output
-            != local_result.data_object.actual_output
-        ):
-            raise ValueError("The API and local results are not aligned.")
-        if (
-            merged_result.data_object.expected_output
-            != local_result.data_object.expected_output
-        ):
-            raise ValueError("The API and local results are not aligned.")
-        if merged_result.data_object.context != local_result.data_object.context:
-            raise ValueError("The API and local results are not aligned.")
-        if (
-            merged_result.data_object.retrieval_context
-            != local_result.data_object.retrieval_context
-        ):
-            raise ValueError("The API and local results are not aligned.")
-        if (
-            merged_result.data_object.additional_metadata
-            != local_result.data_object.additional_metadata
-        ):
-            raise ValueError("The API and local results are not aligned.")
-        if (
-            merged_result.data_object.tools_called
-            != local_result.data_object.tools_called
-        ):
-            raise ValueError("The API and local results are not aligned.")
-        if (
-            merged_result.data_object.expected_tools
-            != local_result.data_object.expected_tools
-        ):
-            raise ValueError("The API and local results are not aligned.")
-
-        # Merge ScorerData from the API and local scorers together
-        api_scorer_data = merged_result.scorers_data
-        local_scorer_data = local_result.scorers_data
-        if api_scorer_data is None and local_scorer_data is not None:
-            merged_result.scorers_data = local_scorer_data
-        elif api_scorer_data is not None and local_scorer_data is not None:
-            merged_result.scorers_data = api_scorer_data + local_scorer_data
-
-    return merged_results
-

 def check_missing_scorer_data(results: List[ScoringResult]) -> List[ScoringResult]:
     """
@@ -255,34 +145,17 @@ def check_experiment_type(
     """
     Checks if the current experiment, if one exists, has the same type (examples of traces)
     """
-    ...
-        response = requests.post(
-            f"{ROOT_API}/check_experiment_type/",
-            headers={
-                "Content-Type": "application/json",
-                "Authorization": f"Bearer {judgment_api_key}",
-                "X-Organization-Id": organization_id,
-            },
-            json={
-                "eval_name": eval_name,
-                "project_name": project_name,
-                "judgment_api_key": judgment_api_key,
-                "is_trace": is_trace,
-            },
-            verify=True,
-        )
-
-        if response.status_code == 422:
-            judgeval_logger.error(f"{response.json()}")
-            raise ValueError(f"{response.json()}")
+    api_client = JudgmentApiClient(judgment_api_key, organization_id)

-        ...
+    try:
+        api_client.check_experiment_type(eval_name, project_name, is_trace)
+    except JudgmentAPIException as e:
+        if e.response.status_code == 422:
+            judgeval_logger.error(f"{e.response_json}")
+            raise ValueError(f"{e.response_json}")
+        else:
+            raise e
+    except Exception as e:
         judgeval_logger.error(f"Failed to check if experiment type exists: {str(e)}")
         raise JudgmentAPIError(f"Failed to check if experiment type exists: {str(e)}")

@@ -302,125 +175,56 @@ def check_eval_run_name_exists(
         ValueError: If the evaluation run name already exists
         JudgmentAPIError: If there's an API error during the check
     """
+    api_client = JudgmentApiClient(judgment_api_key, organization_id)
     try:
-        ...
-                "eval_name": eval_name,
-                "project_name": project_name,
-                "judgment_api_key": judgment_api_key,
-            },
-            verify=True,
-        )
-
-        if response.status_code == 409:
-            judgeval_logger.error(
-                f"Eval run name '{eval_name}' already exists for this project. Please choose a different name, set the `override` flag to true, or set the `append` flag to true."
-            )
-            raise ValueError(
-                f"Eval run name '{eval_name}' already exists for this project. Please choose a different name, set the `override` flag to true, or set the `append` flag to true."
-            )
-
-        if not response.ok:
-            response_data = response.json()
-            error_message = response_data.get("detail", "An unknown error occurred.")
-            judgeval_logger.error(f"Error checking eval run name: {error_message}")
-            raise JudgmentAPIError(error_message)
+        api_client.check_eval_run_name_exists(eval_name, project_name)
+    except JudgmentAPIException as e:
+        if e.response.status_code == 409:
+            error_str = f"Eval run name '{eval_name}' already exists for this project. Please choose a different name, set the `override` flag to true, or set the `append` flag to true. See https://docs.judgmentlabs.ai/sdk-reference/judgment-client#override for more information."
+            judgeval_logger.error(error_str)
+            raise ValueError(error_str)
+        else:
+            raise e

-    except
+    except Exception as e:
         judgeval_logger.error(f"Failed to check if eval run name exists: {str(e)}")
         raise JudgmentAPIError(f"Failed to check if eval run name exists: {str(e)}")


 def log_evaluation_results(
-    scoring_results: List[ScoringResult],
-    ...
+    scoring_results: List[ScoringResult],
+    run: Union[EvaluationRun, TraceRun],
+    judgment_api_key: str,
+) -> str:
     """
     Logs evaluation results to the Judgment API database.

     Args:
         merged_results (List[ScoringResult]): The results to log
         evaluation_run (EvaluationRun): The evaluation run containing project info and API key
+        judgment_api_key (str): The API key for the Judgment API

     Raises:
         JudgmentAPIError: If there's an API error during logging
         ValueError: If there's a validation error with the results
     """
     try:
-        ...
-            headers={
-                "Content-Type": "application/json",
-                "Authorization": f"Bearer {run.judgment_api_key}",
-                "X-Organization-Id": run.organization_id,
-            },
-            json={"results": scoring_results, "run": run.model_dump(warnings=False)},
-            verify=True,
-        )
+        if not judgment_api_key or not run.organization_id:
+            raise ValueError("API key and organization ID are required")

-        ...
-            raise JudgmentAPIError(error_message)
-
-        if "ui_results_url" in res.json():
-            url = res.json()["ui_results_url"]
-            pretty_str = f"\n🔍 You can view your evaluation results here: [rgb(106,0,255)][link={url}]View Results[/link]\n"
-            return pretty_str
-
-        return None
-
-    except exceptions.RequestException as e:
-        judgeval_logger.error(
-            f"Request failed while saving evaluation results to DB: {str(e)}"
+        api_client = JudgmentApiClient(judgment_api_key, run.organization_id)
+        response = api_client.log_evaluation_results(
+            scoring_results,
+            run.model_dump(warnings=False),
         )
+        url = response.get("ui_results_url")
+        return url
+
+    except Exception as e:
+        judgeval_logger.error(f"Failed to save evaluation results to DB: {str(e)}")
         raise JudgmentAPIError(
             f"Request failed while saving evaluation results to DB: {str(e)}"
         )
-    except Exception as e:
-        judgeval_logger.error(f"Failed to save evaluation results to DB: {str(e)}")
-        raise ValueError(f"Failed to save evaluation results to DB: {str(e)}")
-
-
-def run_with_spinner(message: str, func, *args, **kwargs) -> Any:
-    """Run a function with a spinner in the terminal."""
-    spinner = itertools.cycle(["|", "/", "-", "\\"])
-
-    def display_spinner():
-        while not stop_spinner_event.is_set():
-            sys.stdout.write(f"\r{message}{next(spinner)}")
-            sys.stdout.flush()
-            time.sleep(0.1)
-
-    stop_spinner_event = threading.Event()
-    spinner_thread = threading.Thread(target=display_spinner)
-    spinner_thread.start()
-
-    try:
-        if asyncio.iscoroutinefunction(func):
-            coro = func(*args, **kwargs)
-            result = safe_run_async(coro)
-        else:
-            result = func(*args, **kwargs)
-    except Exception as e:
-        judgeval_logger.error(f"An error occurred: {str(e)}")
-        stop_spinner_event.set()
-        spinner_thread.join()
-        raise e
-    finally:
-        stop_spinner_event.set()
-        spinner_thread.join()
-
-    sys.stdout.write("\r" + " " * (len(message) + 1) + "\r")
-    sys.stdout.flush()
-
-    return result


 def check_examples(
@@ -455,6 +259,7 @@ def check_examples(

 def run_trace_eval(
     trace_run: TraceRun,
+    judgment_api_key: str,
     override: bool = False,
     function: Optional[Callable] = None,
     tracer: Optional[Union[Tracer, BaseCallbackHandler]] = None,
@@ -465,7 +270,7 @@
         check_eval_run_name_exists(
             trace_run.eval_name,
             trace_run.project_name,
-            ...
+            judgment_api_key,
             trace_run.organization_id,
         )

@@ -474,7 +279,7 @@
         check_experiment_type(
             trace_run.eval_name,
             trace_run.project_name,
-            ...
+            judgment_api_key,
             trace_run.organization_id,
             True,
         )
@@ -489,22 +294,19 @@

     actual_tracer.offline_mode = True
     actual_tracer.traces = []
+    judgeval_logger.info("Running agent function: ")
     for example in examples:
         if example.input:
             if isinstance(example.input, str):
-                ...
-                    "Running agent function: ", function, example.input
-                )
+                function(example.input)
             elif isinstance(example.input, dict):
-                ...
-                    "Running agent function: ", function, **example.input
-                )
+                function(**example.input)
             else:
                 raise ValueError(
                     f"Input must be string or dict, got {type(example.input)}"
                 )
         else:
-            ...
+            function()

     for i, trace in enumerate(actual_tracer.traces):
         # We set the root-level trace span with the expected tools of the Trace
@@ -516,9 +318,8 @@

     # Execute evaluation using Judgment API
     try: # execute an EvaluationRun with just JudgmentScorers
-        ...
-        )
+        judgeval_logger.info("Executing Trace Evaluation... ")
+        response_data: Dict = execute_api_trace_eval(trace_run, judgment_api_key)
         scoring_results = [
             ScoringResult(**result) for result in response_data["results"]
         ]
@@ -534,14 +335,12 @@
         # Convert the response data to `ScoringResult` objects
         # TODO: allow for custom scorer on traces

-        ...
+        url = log_evaluation_results(
+            response_data["agent_results"], trace_run, judgment_api_key
+        )
+        rprint(
+            f"\n🔍 You can view your evaluation results here: [rgb(106,0,255)][link={url}]View Results[/link]\n"
         )
-        rprint(pretty_str)
-
     return scoring_results


@@ -563,41 +362,33 @@ async def get_evaluation_status(
         - results: List of ScoringResult objects if completed
         - error: Error message if failed
     """
+    api_client = JudgmentApiClient(judgment_api_key, organization_id)
     try:
-        ...
-                "Authorization": f"Bearer {judgment_api_key}",
-                "X-Organization-Id": organization_id,
-            },
-            params={
-                "eval_name": eval_name,
-                "project_name": project_name,
-            },
-            verify=True,
+        return api_client.get_evaluation_status(eval_name, project_name)
+    except Exception as e:
+        raise JudgmentAPIError(
+            f"An error occurred while checking evaluation status: {str(e)}"
        )

-    if not response.ok:
-        error_message = response.json().get("detail", "An unknown error occurred.")
-        judgeval_logger.error(f"Error checking evaluation status: {error_message}")
-        raise JudgmentAPIError(error_message)

-    ...
+def retrieve_counts(result: Dict):
+    scorer_data_count = 0
+    for example in result.get("examples", []):
+        for scorer in example.get("scorer_data", []):
+            scorer_data_count += 1
+    return scorer_data_count


-...
+def _poll_evaluation_until_complete(
     eval_name: str,
     project_name: str,
     judgment_api_key: str,
     organization_id: str,
-    ...
+    expected_scorer_data_count: int,
+    poll_interval_seconds: float = 5,
+    max_failures: int = 5,
+    max_poll_count: int = 24, # This should be equivalent to 120 seconds
+) -> Tuple[List[ScoringResult], str]:
     """
     Polls until the evaluation is complete and returns the results.

@@ -614,210 +405,93 @@ async def _poll_evaluation_until_complete(
         List[ScoringResult]: The evaluation results
     """
     poll_count = 0
-    ...
+    exception_count = 0
+    api_client = JudgmentApiClient(judgment_api_key, organization_id)
+    while poll_count < max_poll_count:
         poll_count += 1
         try:
             # Check status
-            ...
-                requests.get,
-                JUDGMENT_GET_EVAL_STATUS_API_URL,
-                headers={
-                    "Content-Type": "application/json",
-                    "Authorization": f"Bearer {judgment_api_key}",
-                    "X-Organization-Id": organization_id,
-                },
-                params={"eval_name": eval_name, "project_name": project_name},
-                verify=True,
-            )
+            status_response = api_client.get_evaluation_status(eval_name, project_name)

-            if
-                ...
-                    "detail", "An unknown error occurred."
-                )
-                judgeval_logger.error(
-                    f"Error checking evaluation status: {error_message}"
-                )
-                # Don't raise exception immediately, just log and continue polling
-                await asyncio.sleep(poll_interval_seconds)
+            if status_response.get("status") != "completed":
+                time.sleep(poll_interval_seconds)
                 continue

-            ...
-            if status == "completed" or status == "complete":
-                results_response = await asyncio.to_thread(
-                    requests.post,
-                    JUDGMENT_EVAL_FETCH_API_URL,
-                    headers={
-                        "Content-Type": "application/json",
-                        "Authorization": f"Bearer {judgment_api_key}",
-                        "X-Organization-Id": organization_id,
-                    },
-                    json={"project_name": project_name, "eval_name": eval_name},
-                    verify=True,
-                )
-
-                if not results_response.ok:
-                    error_message = results_response.json().get(
-                        "detail", "An unknown error occurred."
-                    )
-                    judgeval_logger.error(
-                        f"Error fetching evaluation results: {error_message}"
-                    )
-                    raise JudgmentAPIError(error_message)
-
-                result_data = results_response.json()
-
-                if result_data.get("examples") is None:
-                    continue
-
-                examples_data = result_data.get("examples", [])
-                scoring_results = []
-
-                for example_data in examples_data:
-                    # Create ScorerData objects
-                    scorer_data_list = []
-                    for raw_scorer_data in example_data.get("scorer_data", []):
-                        scorer_data_list.append(ScorerData(**raw_scorer_data))
+            results_response = api_client.fetch_evaluation_results(
+                project_name, eval_name
+            )
+            url = results_response.get("ui_results_url")

-            ...
+            if results_response.get("examples") is None:
+                time.sleep(poll_interval_seconds)
+                continue

-            ...
+            examples_data = results_response.get("examples", [])
+            scoring_results = []
+            scorer_data_count = 0

-            ...
-                    )
-            ...
-                        scorers_data=scorer_data_list,
-                        data_object=example,
-                    )
-                    scoring_results.append(scoring_result)
+            for example_data in examples_data:
+                scorer_data_list = []
+                for raw_scorer_data in example_data.get("scorer_data", []):
+                    scorer_data = ScorerData(**raw_scorer_data)
+                    scorer_data_list.append(scorer_data)
+                    scorer_data_count += 1

-            ...
-                # This means that not all examples were evaluated
-                continue
+                example = Example(**example_data)

-            ...
-                    f"Evaluation '{eval_name}' failed: {error_message}"
+                success = all(scorer_data.success for scorer_data in scorer_data_list)
+                scoring_result = ScoringResult(
+                    success=success,
+                    scorers_data=scorer_data_list,
+                    data_object=example,
                 )
-            ...
+                scoring_results.append(scoring_result)

-            ...
+            if scorer_data_count != expected_scorer_data_count:
+                time.sleep(poll_interval_seconds)
+                continue

+            return scoring_results, url
         except Exception as e:
+            exception_count += 1
             if isinstance(e, JudgmentAPIError):
                 raise

-            # For other exceptions, log and continue polling
             judgeval_logger.error(f"Error checking evaluation status: {str(e)}")
-            if
+            if exception_count > max_failures:
                 raise JudgmentAPIError(
                     f"Error checking evaluation status after {poll_count} attempts: {str(e)}"
                 )

-            ...
-            await asyncio.sleep(poll_interval_seconds)
-
-
-async def await_with_spinner(task, message: str = "Awaiting async task: "):
-    """
-    Display a spinner while awaiting an async task.
-
-    Args:
-        task: The asyncio task to await
-        message (str): Message to display with the spinner
-
-    Returns:
-        Any: The result of the awaited task
-    """
-    spinner = itertools.cycle(["|", "/", "-", "\\"])
-
-    # Create an event to signal when to stop the spinner
-    stop_spinner_event = asyncio.Event()
-
-    async def display_spinner():
-        while not stop_spinner_event.is_set():
-            sys.stdout.write(f"\r{message}{next(spinner)}")
-            sys.stdout.flush()
-            await asyncio.sleep(0.1)
-
-    # Start the spinner in a separate task
-    spinner_task = asyncio.create_task(display_spinner())
-
-    try:
-        # Await the actual task
-        result = await task
-    finally:
-        # Signal the spinner to stop and wait for it to finish
-        stop_spinner_event.set()
-        await spinner_task
-
-        # Clear the spinner line
-        sys.stdout.write("\r" + " " * (len(message) + 1) + "\r")
-        sys.stdout.flush()
-
-    return result
-
-
-class SpinnerWrappedTask:
-    """
-    A wrapper for an asyncio task that displays a spinner when awaited.
-    """
-
-    def __init__(self, task, message: str):
-        self.task = task
-        self.message = message
-
-    def __await__(self):
-        async def _spin_and_await():
-            # self.task resolves to (scoring_results, pretty_str_to_print)
-            task_result_tuple = await await_with_spinner(self.task, self.message)
-
-            # Unpack the tuple
-            scoring_results, pretty_str_to_print = task_result_tuple
-
-            # Print the pretty string if it exists, after spinner is cleared
-            if pretty_str_to_print:
-                rprint(pretty_str_to_print)
+            time.sleep(poll_interval_seconds)

-
-
+    raise JudgmentAPIError(
+        f"Error checking evaluation status after {poll_count} attempts"
+    )

-        return _spin_and_await().__await__()

-
-
-
+def progress_logger(stop_event, msg="Working...", interval=5):
+    start = time.time()
+    while not stop_event.is_set():
+        elapsed = int(time.time() - start)
+        judgeval_logger.info(f"{msg} ({elapsed} sec)")
+        stop_event.wait(interval)


 def run_eval(
     evaluation_run: EvaluationRun,
+    judgment_api_key: str,
     override: bool = False,
-    ...
-) -> Union[List[ScoringResult], asyncio.Task, SpinnerWrappedTask]:
+) -> List[ScoringResult]:
     """
     Executes an evaluation of `Example`s using one or more `Scorer`s

     Args:
         evaluation_run (EvaluationRun): Stores example and evaluation together for running
         override (bool, optional): Whether to override existing evaluation run with same name. Defaults to False.
-        async_execution (bool, optional): Whether to execute the evaluation asynchronously. Defaults to False.

     Returns:
-        ...
-        - If async_execution is False, returns a list of ScoringResult objects
-        - If async_execution is True, returns a Task that will resolve to a list of ScoringResult objects when awaited
+        List[ScoringResult]: A list of ScoringResult objects
     """

     # Call endpoint to check to see if eval run name exists (if we DON'T want to override and DO want to log results)
@@ -825,7 +499,7 @@ def run_eval(
         check_eval_run_name_exists(
             evaluation_run.eval_name,
             evaluation_run.project_name,
-            ...
+            judgment_api_key,
             evaluation_run.organization_id,
         )

@@ -834,7 +508,7 @@
         check_experiment_type(
             evaluation_run.eval_name,
             evaluation_run.project_name,
-            ...
+            judgment_api_key,
             evaluation_run.organization_id,
             False,
         )
@@ -851,148 +525,81 @@
         else:
             local_scorers.append(scorer)

-    ...
-    if
-        ...
-                    "X-Organization-Id": evaluation_run.organization_id,
-                },
-                json=payload,
-                verify=True,
+    results: List[ScoringResult] = []
+    url = ""
+
+    if len(local_scorers) > 0 and len(judgment_scorers) > 0:
+        error_msg = "We currently do not support running both local and Judgment API scorers at the same time. Please run your evaluation with either local scorers or Judgment API scorers, but not both."
+        judgeval_logger.error(error_msg)
+        raise ValueError(error_msg)
+
+    if len(judgment_scorers) > 0:
+        check_examples(evaluation_run.examples, judgment_scorers)
+        stop_event = threading.Event()
+        t = threading.Thread(
+            target=progress_logger, args=(stop_event, "Running evaluation...")
+        )
+        t.start()
+        try:
+            api_client = JudgmentApiClient(
+                judgment_api_key, evaluation_run.organization_id
+            )
+            response = api_client.add_to_evaluation_queue(
+                evaluation_run.model_dump(warnings=False)
             )

-            if not response.
-                error_message = response.
-                    "detail", "An unknown error occurred."
-                )
+            if not response.get("success", False):
+                error_message = response.error
                 judgeval_logger.error(
                     f"Error adding evaluation to queue: {error_message}"
                 )
                 raise JudgmentAPIError(error_message)

-            ...
+            old_scorer_data_count = 0
+            if evaluation_run.append:
+                try:
+                    results_response = api_client.fetch_evaluation_results(
+                        evaluation_run.project_name, evaluation_run.eval_name
+                    )
+                    old_scorer_data_count = retrieve_counts(results_response)
+                except Exception:
+                    # This usually means the user did append = True but the eval run name doesn't exist yet
+                    pass
+
+            results, url = _poll_evaluation_until_complete(
                 eval_name=evaluation_run.eval_name,
                 project_name=evaluation_run.project_name,
-                judgment_api_key=
+                judgment_api_key=judgment_api_key,
                 organization_id=evaluation_run.organization_id,
-                ...
+                expected_scorer_data_count=(
+                    len(evaluation_run.scorers) * len(evaluation_run.examples)
+                )
+                + old_scorer_data_count,
+            )
+        finally:
+            stop_event.set()
+            t.join()
+
+    if len(local_scorers) > 0:
+        results = safe_run_async(
+            a_execute_scoring(
+                evaluation_run.examples,
+                local_scorers,
+                model=evaluation_run.model,
+                throttle_value=0,
+                max_concurrent=MAX_CONCURRENT_EVALUATIONS,
             )
-
-            pretty_str_to_print = None
-            if results: # Ensure results exist before logging
-                send_results = [
-                    scoring_result.model_dump(warnings=False)
-                    for scoring_result in results
-                ]
-                try:
-                    # Run the blocking log_evaluation_results in a separate thread
-                    pretty_str_to_print = await asyncio.to_thread(
-                        log_evaluation_results, send_results, evaluation_run
-                    )
-                except Exception as e:
-                    judgeval_logger.error(
-                        f"Error logging results after async evaluation: {str(e)}"
-                    )
-
-            return results, pretty_str_to_print
-
-        # Create a regular task
-        task = asyncio.create_task(_async_evaluation_workflow())
-
-        # Wrap it in our custom awaitable that will show a spinner only when awaited
-        return SpinnerWrappedTask(
-            task, f"Processing evaluation '{evaluation_run.eval_name}': "
         )
-    else:
-        check_examples(evaluation_run.examples, evaluation_run.scorers)
-        if judgment_scorers:
-            # Execute evaluation using Judgment API
-            try: # execute an EvaluationRun with just JudgmentScorers
-                api_evaluation_run: EvaluationRun = EvaluationRun(
-                    eval_name=evaluation_run.eval_name,
-                    project_name=evaluation_run.project_name,
-                    examples=evaluation_run.examples,
-                    scorers=judgment_scorers,
-                    model=evaluation_run.model,
-                    judgment_api_key=evaluation_run.judgment_api_key,
-                    organization_id=evaluation_run.organization_id,
-                )
-                response_data: Dict = run_with_spinner(
-                    "Running Evaluation: ", execute_api_eval, api_evaluation_run
-                )
-            except JudgmentAPIError as e:
-                judgeval_logger.error(
-                    f"An error occurred while executing the Judgment API request: {str(e)}"
-                )
-                raise JudgmentAPIError(
-                    f"An error occurred while executing the Judgment API request: {str(e)}"
-                )
-            except ValueError as e:
-                raise ValueError(
-                    f"Please check your EvaluationRun object, one or more fields are invalid: {str(e)}"
-                )

-        # Convert the response data to `ScoringResult` objects
-        api_results = [
-            ScoringResult(**result) for result in response_data["results"]
-        ]
-        # Run local evals
-        if local_scorers: # List[BaseScorer]
-            results: List[ScoringResult] = safe_run_async(
-                a_execute_scoring(
-                    evaluation_run.examples,
-                    local_scorers,
-                    model=evaluation_run.model,
-                    throttle_value=0,
-                    max_concurrent=MAX_CONCURRENT_EVALUATIONS,
-                )
-            )
-            local_results = results
-        # Aggregate the ScorerData from the API and local evaluations
-        merged_results: List[ScoringResult] = merge_results(api_results, local_results)
-        merged_results = check_missing_scorer_data(merged_results)
-
-        # Evaluate rules against local scoring results if rules exist (this cant be done just yet)
-        # if evaluation_run.rules and merged_results:
-        #     run_rules(
-        #         local_results=merged_results,
-        #         rules=evaluation_run.rules,
-        #         judgment_api_key=evaluation_run.judgment_api_key,
-        #         organization_id=evaluation_run.organization_id
-        #     )
-        # print(merged_results)
     send_results = [
-        scoring_result.model_dump(warnings=False)
-        for scoring_result in merged_results
+        scoring_result.model_dump(warnings=False) for scoring_result in results
     ]
-    pretty_str = run_with_spinner(
-        "Logging Results: ",
-        log_evaluation_results,
-        send_results,
-        evaluation_run,
-    )
-    rprint(pretty_str)

-
+    url = log_evaluation_results(send_results, evaluation_run, judgment_api_key)
+    rprint(
+        f"\n🔍 You can view your evaluation results here: [rgb(106,0,255)][link={url}]View Results[/link]\n"
+    )
+    return results


 def assert_test(scoring_results: List[ScoringResult]) -> None:
@@ -1025,15 +632,6 @@ def assert_test(scoring_results: List[ScoringResult]) -> None:
     if failed_cases:
         error_msg = "The following test cases failed: \n"
         for fail_case in failed_cases:
-            # error_msg += f"\nInput: {fail_case['input']}\n"
-            # error_msg += f"Actual Output: {fail_case['actual_output']}\n"
-            # error_msg += f"Expected Output: {fail_case['expected_output']}\n"
-            # error_msg += f"Context: {fail_case['context']}\n"
-            # error_msg += f"Retrieval Context: {fail_case['retrieval_context']}\n"
-            # error_msg += f"Additional Metadata: {fail_case['additional_metadata']}\n"
-            # error_msg += f"Tools Called: {fail_case['tools_called']}\n"
-            # error_msg += f"Expected Tools: {fail_case['expected_tools']}\n"
-
             for fail_scorer in fail_case["failed_scorers"]:
                 error_msg += (
                     f"\nScorer Name: {fail_scorer.name}\n"