judgeval 0.0.55__py3-none-any.whl → 0.2.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- judgeval/common/api/__init__.py +3 -0
- judgeval/common/api/api.py +352 -0
- judgeval/common/api/constants.py +165 -0
- judgeval/common/storage/__init__.py +6 -0
- judgeval/common/tracer/__init__.py +31 -0
- judgeval/common/tracer/constants.py +22 -0
- judgeval/common/tracer/core.py +1916 -0
- judgeval/common/tracer/otel_exporter.py +108 -0
- judgeval/common/tracer/otel_span_processor.py +234 -0
- judgeval/common/tracer/span_processor.py +37 -0
- judgeval/common/tracer/span_transformer.py +211 -0
- judgeval/common/tracer/trace_manager.py +92 -0
- judgeval/common/utils.py +2 -2
- judgeval/constants.py +3 -30
- judgeval/data/datasets/eval_dataset_client.py +29 -156
- judgeval/data/judgment_types.py +4 -12
- judgeval/data/result.py +1 -1
- judgeval/data/scorer_data.py +2 -2
- judgeval/data/scripts/openapi_transform.py +1 -1
- judgeval/data/trace.py +66 -1
- judgeval/data/trace_run.py +0 -3
- judgeval/evaluation_run.py +0 -2
- judgeval/integrations/langgraph.py +43 -164
- judgeval/judgment_client.py +17 -211
- judgeval/run_evaluation.py +216 -611
- judgeval/scorers/__init__.py +2 -6
- judgeval/scorers/base_scorer.py +4 -23
- judgeval/scorers/judgeval_scorers/api_scorers/__init__.py +3 -3
- judgeval/scorers/judgeval_scorers/api_scorers/prompt_scorer.py +215 -0
- judgeval/scorers/score.py +2 -1
- judgeval/scorers/utils.py +1 -13
- judgeval/utils/requests.py +21 -0
- judgeval-0.2.0.dist-info/METADATA +202 -0
- {judgeval-0.0.55.dist-info → judgeval-0.2.0.dist-info}/RECORD +37 -29
- judgeval/common/tracer.py +0 -3215
- judgeval/scorers/judgeval_scorers/api_scorers/classifier_scorer.py +0 -73
- judgeval/scorers/judgeval_scorers/classifiers/__init__.py +0 -3
- judgeval/scorers/judgeval_scorers/classifiers/text2sql/__init__.py +0 -3
- judgeval/scorers/judgeval_scorers/classifiers/text2sql/text2sql_scorer.py +0 -53
- judgeval-0.0.55.dist-info/METADATA +0 -1384
- /judgeval/common/{s3_storage.py → storage/s3_storage.py} +0 -0
- {judgeval-0.0.55.dist-info → judgeval-0.2.0.dist-info}/WHEEL +0 -0
- {judgeval-0.0.55.dist-info → judgeval-0.2.0.dist-info}/licenses/LICENSE.md +0 -0
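Most of the changes below follow a single pattern: the hand-rolled `requests` calls in 0.0.55 (manual auth headers, `verify=True`, ad-hoc `response.ok`/`detail` checks) are replaced by the new `judgeval.common.api.JudgmentApiClient`, errors surface as `JudgmentAPIException` carrying the parsed response body, and the Judgment API key is passed to functions explicitly instead of being read off the run object. A minimal sketch of that calling convention, inferred only from the calls visible in the `run_evaluation.py` diff below; the `submit_eval` helper itself is hypothetical and not part of the package:

from judgeval.common.api import JudgmentApiClient
from judgeval.common.api.api import JudgmentAPIException
from judgeval.common.exceptions import JudgmentAPIError


def submit_eval(evaluation_run, judgment_api_key: str) -> dict:
    # Hypothetical helper mirroring execute_api_eval() in 0.2.0: the client
    # wraps the auth headers and TLS verification that 0.0.55 built by hand.
    api_client = JudgmentApiClient(judgment_api_key, evaluation_run.organization_id)
    try:
        return api_client.run_evaluation(evaluation_run.model_dump())
    except JudgmentAPIException as e:
        # The exception exposes the parsed response body (response_json),
        # replacing the old manual response.ok / "detail" checks.
        details = e.response_json.get("detail", "No details provided")
        raise JudgmentAPIError(
            "An error occurred while executing the Judgment API request: " + details
        )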
judgeval/run_evaluation.py
CHANGED
@@ -1,29 +1,21 @@
 import asyncio
 import concurrent.futures
-from requests import exceptions
-from judgeval.utils.requests import requests
 import time
 import json
 import sys
-import itertools
 import threading
-from typing import List, Dict,
+from typing import List, Dict, Union, Optional, Callable, Tuple, Any
 from rich import print as rprint

 from judgeval.data import ScorerData, ScoringResult, Example, Trace
 from judgeval.scorers import BaseScorer, APIScorerConfig
 from judgeval.scorers.score import a_execute_scoring
+from judgeval.common.api import JudgmentApiClient
 from judgeval.constants import (
-    ROOT_API,
-    JUDGMENT_EVAL_API_URL,
-    JUDGMENT_TRACE_EVAL_API_URL,
-    JUDGMENT_EVAL_LOG_API_URL,
     MAX_CONCURRENT_EVALUATIONS,
-    JUDGMENT_ADD_TO_RUN_EVAL_QUEUE_API_URL,
-    JUDGMENT_GET_EVAL_STATUS_API_URL,
-    JUDGMENT_EVAL_FETCH_API_URL,
 )
 from judgeval.common.exceptions import JudgmentAPIError
+from judgeval.common.api.api import JudgmentAPIException
 from judgeval.common.logger import judgeval_logger
 from judgeval.evaluation_run import EvaluationRun
 from judgeval.data.trace_run import TraceRun
@@ -54,22 +46,20 @@ def safe_run_async(coro):
     return asyncio.run(coro)


-def send_to_rabbitmq(evaluation_run: EvaluationRun) ->
+def send_to_rabbitmq(evaluation_run: EvaluationRun) -> Dict[str, Any]:
     """
     Sends an evaluation run to the RabbitMQ evaluation queue.
     """
-
-
-
-
-
-
-
-
-
-        verify=True,
+    if not evaluation_run.judgment_api_key or not evaluation_run.organization_id:
+        raise ValueError("API key and organization ID are required")
+    if not evaluation_run.eval_name or not evaluation_run.project_name:
+        raise ValueError("Eval name and project name are required")
+    api_client = JudgmentApiClient(
+        evaluation_run.judgment_api_key, evaluation_run.organization_id
+    )
+    return api_client.add_to_evaluation_queue(
+        evaluation_run.eval_name, evaluation_run.project_name
     )
-    return response.json()


 def execute_api_eval(evaluation_run: EvaluationRun) -> Dict:
@@ -86,146 +76,46 @@ def execute_api_eval(evaluation_run: EvaluationRun) -> Dict:

     try:
         # submit API request to execute evals
-
-
-
-
-                "Content-Type": "application/json",
-                "Authorization": f"Bearer {evaluation_run.judgment_api_key}",
-                "X-Organization-Id": evaluation_run.organization_id,
-            },
-            json=payload,
-            verify=True,
+        if not evaluation_run.judgment_api_key or not evaluation_run.organization_id:
+            raise ValueError("API key and organization ID are required")
+        api_client = JudgmentApiClient(
+            evaluation_run.judgment_api_key, evaluation_run.organization_id
         )
-
+        return api_client.run_evaluation(evaluation_run.model_dump())
     except Exception as e:
         judgeval_logger.error(f"Error: {e}")
-
+
+        details = "No details provided"
+        if isinstance(e, JudgmentAPIException):
+            details = e.response_json.get("detail", "No details provided")
+
         raise JudgmentAPIError(
             "An error occurred while executing the Judgment API request: " + details
         )
-    # Check if the response status code is not 2XX
-    # Add check for the duplicate eval run name
-    if not response.ok:
-        error_message = response_data.get("detail", "An unknown error occurred.")
-        judgeval_logger.error(f"Error: {error_message=}")
-        raise JudgmentAPIError(error_message)
-    return response_data


-def execute_api_trace_eval(trace_run: TraceRun) -> Dict:
+def execute_api_trace_eval(trace_run: TraceRun, judgment_api_key: str) -> Dict:
     """
     Executes an evaluation of a list of `Trace`s using one or more `JudgmentScorer`s via the Judgment API.
     """

     try:
         # submit API request to execute evals
-
-
-
-
-                "Content-Type": "application/json",
-                "Authorization": f"Bearer {trace_run.judgment_api_key}",
-                "X-Organization-Id": trace_run.organization_id,
-            },
-            json=payload,
-            verify=True,
-        )
-        response_data = response.json()
+        if not judgment_api_key or not trace_run.organization_id:
+            raise ValueError("API key and organization ID are required")
+        api_client = JudgmentApiClient(judgment_api_key, trace_run.organization_id)
+        return api_client.run_trace_evaluation(trace_run.model_dump(warnings=False))
     except Exception as e:
         judgeval_logger.error(f"Error: {e}")
-        details = response.json().get("detail", "No details provided")
-        raise JudgmentAPIError(
-            "An error occurred while executing the Judgment API request: " + details
-        )
-    # Check if the response status code is not 2XX
-    # Add check for the duplicate eval run name
-    if not response.ok:
-        error_message = response_data.get("detail", "An unknown error occurred.")
-        judgeval_logger.error(f"Error: {error_message=}")
-        raise JudgmentAPIError(error_message)
-    return response_data
-
-
-def merge_results(
-    api_results: List[ScoringResult], local_results: List[ScoringResult]
-) -> List[ScoringResult]:
-    """
-    When executing scorers that come from both the Judgment API and local scorers, we're left with
-    results for each type of scorer. This function merges the results from the API and local evaluations,
-    grouped by example. In particular, we merge the `scorers_data` field of each `ScoringResult` object.

-
-
-
+        details = "An unknown error occurred."
+        if isinstance(e, JudgmentAPIException):
+            details = e.response_json.get("detail", "An unknown error occurred.")

-
-
-    """
-    # No merge required
-    if not local_results and api_results:
-        return [result.model_copy() for result in api_results]
-    if not api_results and local_results:
-        return [result.model_copy() for result in local_results]
-
-    if len(api_results) != len(local_results):
-        # Results should be of same length because each ScoringResult is a 1-1 mapping to an Example
-        raise ValueError(
-            f"The number of API and local results do not match: {len(api_results)} vs {len(local_results)}"
+        raise JudgmentAPIError(
+            "An error occurred while executing the Judgment API request: " + details
         )

-    # Create a copy of api_results to avoid modifying the input
-    merged_results = [result.model_copy() for result in api_results]
-
-    # Each ScoringResult in api and local have all the same fields besides `scorers_data`
-    for merged_result, local_result in zip(merged_results, local_results):
-        if not (merged_result.data_object and local_result.data_object):
-            raise ValueError("Data object is None in one of the results.")
-        if merged_result.data_object.input != local_result.data_object.input:
-            raise ValueError("The API and local results are not aligned.")
-        if (
-            merged_result.data_object.actual_output
-            != local_result.data_object.actual_output
-        ):
-            raise ValueError("The API and local results are not aligned.")
-        if (
-            merged_result.data_object.expected_output
-            != local_result.data_object.expected_output
-        ):
-            raise ValueError("The API and local results are not aligned.")
-        if merged_result.data_object.context != local_result.data_object.context:
-            raise ValueError("The API and local results are not aligned.")
-        if (
-            merged_result.data_object.retrieval_context
-            != local_result.data_object.retrieval_context
-        ):
-            raise ValueError("The API and local results are not aligned.")
-        if (
-            merged_result.data_object.additional_metadata
-            != local_result.data_object.additional_metadata
-        ):
-            raise ValueError("The API and local results are not aligned.")
-        if (
-            merged_result.data_object.tools_called
-            != local_result.data_object.tools_called
-        ):
-            raise ValueError("The API and local results are not aligned.")
-        if (
-            merged_result.data_object.expected_tools
-            != local_result.data_object.expected_tools
-        ):
-            raise ValueError("The API and local results are not aligned.")
-
-        # Merge ScorerData from the API and local scorers together
-        api_scorer_data = merged_result.scorers_data
-        local_scorer_data = local_result.scorers_data
-        if api_scorer_data is None and local_scorer_data is not None:
-            merged_result.scorers_data = local_scorer_data
-        elif api_scorer_data is not None and local_scorer_data is not None:
-            merged_result.scorers_data = api_scorer_data + local_scorer_data
-
-    return merged_results
-

 def check_missing_scorer_data(results: List[ScoringResult]) -> List[ScoringResult]:
     """
@@ -255,34 +145,17 @@ def check_experiment_type(
     """
    Checks if the current experiment, if one exists, has the same type (examples of traces)
     """
-
-    response = requests.post(
-        f"{ROOT_API}/check_experiment_type/",
-        headers={
-            "Content-Type": "application/json",
-            "Authorization": f"Bearer {judgment_api_key}",
-            "X-Organization-Id": organization_id,
-        },
-        json={
-            "eval_name": eval_name,
-            "project_name": project_name,
-            "judgment_api_key": judgment_api_key,
-            "is_trace": is_trace,
-        },
-        verify=True,
-    )
-
-    if response.status_code == 422:
-        judgeval_logger.error(f"{response.json()}")
-        raise ValueError(f"{response.json()}")
+    api_client = JudgmentApiClient(judgment_api_key, organization_id)

-
-
-
-
-
-
-
+    try:
+        api_client.check_experiment_type(eval_name, project_name, is_trace)
+    except JudgmentAPIException as e:
+        if e.response.status_code == 422:
+            judgeval_logger.error(f"{e.response_json}")
+            raise ValueError(f"{e.response_json}")
+        else:
+            raise e
+    except Exception as e:
         judgeval_logger.error(f"Failed to check if experiment type exists: {str(e)}")
         raise JudgmentAPIError(f"Failed to check if experiment type exists: {str(e)}")

@@ -302,125 +175,56 @@ def check_eval_run_name_exists(
         ValueError: If the evaluation run name already exists
         JudgmentAPIError: If there's an API error during the check
     """
+    api_client = JudgmentApiClient(judgment_api_key, organization_id)
     try:
-
-
-
-
-
-
-
-
-                "eval_name": eval_name,
-                "project_name": project_name,
-                "judgment_api_key": judgment_api_key,
-            },
-            verify=True,
-        )
-
-        if response.status_code == 409:
-            judgeval_logger.error(
-                f"Eval run name '{eval_name}' already exists for this project. Please choose a different name, set the `override` flag to true, or set the `append` flag to true."
-            )
-            raise ValueError(
-                f"Eval run name '{eval_name}' already exists for this project. Please choose a different name, set the `override` flag to true, or set the `append` flag to true."
-            )
-
-        if not response.ok:
-            response_data = response.json()
-            error_message = response_data.get("detail", "An unknown error occurred.")
-            judgeval_logger.error(f"Error checking eval run name: {error_message}")
-            raise JudgmentAPIError(error_message)
+        api_client.check_eval_run_name_exists(eval_name, project_name)
+    except JudgmentAPIException as e:
+        if e.response.status_code == 409:
+            error_str = f"Eval run name '{eval_name}' already exists for this project. Please choose a different name, set the `override` flag to true, or set the `append` flag to true. See https://docs.judgmentlabs.ai/sdk-reference/judgment-client#override for more information."
+            judgeval_logger.error(error_str)
+            raise ValueError(error_str)
+        else:
+            raise e

-    except
+    except Exception as e:
         judgeval_logger.error(f"Failed to check if eval run name exists: {str(e)}")
         raise JudgmentAPIError(f"Failed to check if eval run name exists: {str(e)}")


 def log_evaluation_results(
-    scoring_results: List[ScoringResult],
-
+    scoring_results: List[ScoringResult],
+    run: Union[EvaluationRun, TraceRun],
+    judgment_api_key: str,
+) -> str:
     """
     Logs evaluation results to the Judgment API database.

     Args:
         merged_results (List[ScoringResult]): The results to log
         evaluation_run (EvaluationRun): The evaluation run containing project info and API key
+        judgment_api_key (str): The API key for the Judgment API

     Raises:
         JudgmentAPIError: If there's an API error during logging
         ValueError: If there's a validation error with the results
     """
     try:
-
-
-            headers={
-                "Content-Type": "application/json",
-                "Authorization": f"Bearer {run.judgment_api_key}",
-                "X-Organization-Id": run.organization_id,
-            },
-            json={"results": scoring_results, "run": run.model_dump(warnings=False)},
-            verify=True,
-        )
+        if not judgment_api_key or not run.organization_id:
+            raise ValueError("API key and organization ID are required")

-
-
-
-
-            raise JudgmentAPIError(error_message)
-
-        if "ui_results_url" in res.json():
-            url = res.json()["ui_results_url"]
-            pretty_str = f"\n🔍 You can view your evaluation results here: [rgb(106,0,255)][link={url}]View Results[/link]\n"
-            return pretty_str
-
-        return None
-
-    except exceptions.RequestException as e:
-        judgeval_logger.error(
-            f"Request failed while saving evaluation results to DB: {str(e)}"
+        api_client = JudgmentApiClient(judgment_api_key, run.organization_id)
+        response = api_client.log_evaluation_results(
+            scoring_results,
+            run.model_dump(warnings=False),
         )
+        url = response.get("ui_results_url")
+        return url
+
+    except Exception as e:
+        judgeval_logger.error(f"Failed to save evaluation results to DB: {str(e)}")
         raise JudgmentAPIError(
             f"Request failed while saving evaluation results to DB: {str(e)}"
         )
-    except Exception as e:
-        judgeval_logger.error(f"Failed to save evaluation results to DB: {str(e)}")
-        raise ValueError(f"Failed to save evaluation results to DB: {str(e)}")
-
-
-def run_with_spinner(message: str, func, *args, **kwargs) -> Any:
-    """Run a function with a spinner in the terminal."""
-    spinner = itertools.cycle(["|", "/", "-", "\\"])
-
-    def display_spinner():
-        while not stop_spinner_event.is_set():
-            sys.stdout.write(f"\r{message}{next(spinner)}")
-            sys.stdout.flush()
-            time.sleep(0.1)
-
-    stop_spinner_event = threading.Event()
-    spinner_thread = threading.Thread(target=display_spinner)
-    spinner_thread.start()
-
-    try:
-        if asyncio.iscoroutinefunction(func):
-            coro = func(*args, **kwargs)
-            result = safe_run_async(coro)
-        else:
-            result = func(*args, **kwargs)
-    except Exception as e:
-        judgeval_logger.error(f"An error occurred: {str(e)}")
-        stop_spinner_event.set()
-        spinner_thread.join()
-        raise e
-    finally:
-        stop_spinner_event.set()
-        spinner_thread.join()
-
-    sys.stdout.write("\r" + " " * (len(message) + 1) + "\r")
-    sys.stdout.flush()
-
-    return result


 def check_examples(
@@ -455,6 +259,7 @@ def check_examples(

 def run_trace_eval(
     trace_run: TraceRun,
+    judgment_api_key: str,
     override: bool = False,
     function: Optional[Callable] = None,
     tracer: Optional[Union[Tracer, BaseCallbackHandler]] = None,
@@ -465,7 +270,7 @@
         check_eval_run_name_exists(
             trace_run.eval_name,
             trace_run.project_name,
-
+            judgment_api_key,
             trace_run.organization_id,
         )

@@ -474,7 +279,7 @@
         check_experiment_type(
             trace_run.eval_name,
             trace_run.project_name,
-
+            judgment_api_key,
             trace_run.organization_id,
             True,
         )
@@ -487,24 +292,28 @@
             # This is a callback handler, get the underlying tracer
             actual_tracer = tracer.tracer

+        if trace_run.project_name != actual_tracer.project_name:
+            raise ValueError(
+                f"Project name mismatch between run_trace_eval and tracer. "
+                f"Trace run: {trace_run.project_name}, "
+                f"Tracer: {actual_tracer.project_name}"
+            )
+
         actual_tracer.offline_mode = True
         actual_tracer.traces = []
+        judgeval_logger.info("Running agent function: ")
         for example in examples:
             if example.input:
                 if isinstance(example.input, str):
-
-                        "Running agent function: ", function, example.input
-                    )
+                    function(example.input)
                 elif isinstance(example.input, dict):
-
-                        "Running agent function: ", function, **example.input
-                    )
+                    function(**example.input)
                 else:
                     raise ValueError(
                         f"Input must be string or dict, got {type(example.input)}"
                     )
             else:
-
+                function()

         for i, trace in enumerate(actual_tracer.traces):
             # We set the root-level trace span with the expected tools of the Trace
@@ -516,9 +325,8 @@

     # Execute evaluation using Judgment API
     try:  # execute an EvaluationRun with just JudgmentScorers
-
-
-        )
+        judgeval_logger.info("Executing Trace Evaluation... ")
+        response_data: Dict = execute_api_trace_eval(trace_run, judgment_api_key)
         scoring_results = [
             ScoringResult(**result) for result in response_data["results"]
         ]
@@ -534,14 +342,12 @@
     # Convert the response data to `ScoringResult` objects
     # TODO: allow for custom scorer on traces

-
-    "
-
-
-
+    url = log_evaluation_results(
+        response_data["agent_results"], trace_run, judgment_api_key
+    )
+    rprint(
+        f"\n🔍 You can view your evaluation results here: [rgb(106,0,255)][link={url}]View Results[/link]\n"
     )
-    rprint(pretty_str)
-
     return scoring_results

@@ -563,41 +369,33 @@ async def get_evaluation_status(
         - results: List of ScoringResult objects if completed
         - error: Error message if failed
     """
+    api_client = JudgmentApiClient(judgment_api_key, organization_id)
     try:
-
-
-
-
-                "Authorization": f"Bearer {judgment_api_key}",
-                "X-Organization-Id": organization_id,
-            },
-            params={
-                "eval_name": eval_name,
-                "project_name": project_name,
-            },
-            verify=True,
+        return api_client.get_evaluation_status(eval_name, project_name)
+    except Exception as e:
+        raise JudgmentAPIError(
+            f"An error occurred while checking evaluation status: {str(e)}"
         )

-    if not response.ok:
-        error_message = response.json().get("detail", "An unknown error occurred.")
-        judgeval_logger.error(f"Error checking evaluation status: {error_message}")
-        raise JudgmentAPIError(error_message)

-
-
-
-
+def retrieve_counts(result: Dict):
+    scorer_data_count = 0
+    for example in result.get("examples", []):
+        for scorer in example.get("scorer_data", []):
+            scorer_data_count += 1
+    return scorer_data_count


-
+def _poll_evaluation_until_complete(
     eval_name: str,
     project_name: str,
     judgment_api_key: str,
     organization_id: str,
-
-
-
-
+    expected_scorer_data_count: int,
+    poll_interval_seconds: float = 5,
+    max_failures: int = 5,
+    max_poll_count: int = 24,  # This should be equivalent to 120 seconds
+) -> Tuple[List[ScoringResult], str]:
     """
     Polls until the evaluation is complete and returns the results.

@@ -614,210 +412,93 @@ async def _poll_evaluation_until_complete(
         List[ScoringResult]: The evaluation results
     """
     poll_count = 0
-
-
+    exception_count = 0
+    api_client = JudgmentApiClient(judgment_api_key, organization_id)
+    while poll_count < max_poll_count:
         poll_count += 1
         try:
             # Check status
-
-                requests.get,
-                JUDGMENT_GET_EVAL_STATUS_API_URL,
-                headers={
-                    "Content-Type": "application/json",
-                    "Authorization": f"Bearer {judgment_api_key}",
-                    "X-Organization-Id": organization_id,
-                },
-                params={"eval_name": eval_name, "project_name": project_name},
-                verify=True,
-            )
+            status_response = api_client.get_evaluation_status(eval_name, project_name)

-            if
-
-                    "detail", "An unknown error occurred."
-                )
-                judgeval_logger.error(
-                    f"Error checking evaluation status: {error_message}"
-                )
-                # Don't raise exception immediately, just log and continue polling
-                await asyncio.sleep(poll_interval_seconds)
+            if status_response.get("status") != "completed":
+                time.sleep(poll_interval_seconds)
                 continue

-
-
-
-
-            if status == "completed" or status == "complete":
-                results_response = await asyncio.to_thread(
-                    requests.post,
-                    JUDGMENT_EVAL_FETCH_API_URL,
-                    headers={
-                        "Content-Type": "application/json",
-                        "Authorization": f"Bearer {judgment_api_key}",
-                        "X-Organization-Id": organization_id,
-                    },
-                    json={"project_name": project_name, "eval_name": eval_name},
-                    verify=True,
-                )
-
-                if not results_response.ok:
-                    error_message = results_response.json().get(
-                        "detail", "An unknown error occurred."
-                    )
-                    judgeval_logger.error(
-                        f"Error fetching evaluation results: {error_message}"
-                    )
-                    raise JudgmentAPIError(error_message)
-
-                result_data = results_response.json()
-
-                if result_data.get("examples") is None:
-                    continue
-
-                examples_data = result_data.get("examples", [])
-                scoring_results = []
-
-                for example_data in examples_data:
-                    # Create ScorerData objects
-                    scorer_data_list = []
-                    for raw_scorer_data in example_data.get("scorer_data", []):
-                        scorer_data_list.append(ScorerData(**raw_scorer_data))
+            results_response = api_client.fetch_evaluation_results(
+                project_name, eval_name
+            )
+            url = results_response.get("ui_results_url")

-
-
-
+            if results_response.get("examples") is None:
+                time.sleep(poll_interval_seconds)
+                continue

-
+            examples_data = results_response.get("examples", [])
+            scoring_results = []
+            scorer_data_count = 0

-
-
-
-                    )
-
-
-                        scorers_data=scorer_data_list,
-                        data_object=example,
-                    )
-                    scoring_results.append(scoring_result)
+            for example_data in examples_data:
+                scorer_data_list = []
+                for raw_scorer_data in example_data.get("scorer_data", []):
+                    scorer_data = ScorerData(**raw_scorer_data)
+                    scorer_data_list.append(scorer_data)
+                    scorer_data_count += 1

-
-                # This means that not all examples were evaluated
-                continue
+                example = Example(**example_data)

-
-
-
-
-
-                    f"Evaluation '{eval_name}' failed: {error_message}"
+                success = all(scorer_data.success for scorer_data in scorer_data_list)
+                scoring_result = ScoringResult(
+                    success=success,
+                    scorers_data=scorer_data_list,
+                    data_object=example,
                 )
-
+                scoring_results.append(scoring_result)

-
-
+            if scorer_data_count != expected_scorer_data_count:
+                time.sleep(poll_interval_seconds)
+                continue

+            return scoring_results, url
         except Exception as e:
+            exception_count += 1
             if isinstance(e, JudgmentAPIError):
                 raise

-            # For other exceptions, log and continue polling
             judgeval_logger.error(f"Error checking evaluation status: {str(e)}")
-            if
+            if exception_count > max_failures:
                 raise JudgmentAPIError(
                     f"Error checking evaluation status after {poll_count} attempts: {str(e)}"
                 )

-
-            await asyncio.sleep(poll_interval_seconds)
-
-
-async def await_with_spinner(task, message: str = "Awaiting async task: "):
-    """
-    Display a spinner while awaiting an async task.
-
-    Args:
-        task: The asyncio task to await
-        message (str): Message to display with the spinner
-
-    Returns:
-        Any: The result of the awaited task
-    """
-    spinner = itertools.cycle(["|", "/", "-", "\\"])
-
-    # Create an event to signal when to stop the spinner
-    stop_spinner_event = asyncio.Event()
-
-    async def display_spinner():
-        while not stop_spinner_event.is_set():
-            sys.stdout.write(f"\r{message}{next(spinner)}")
-            sys.stdout.flush()
-            await asyncio.sleep(0.1)
-
-    # Start the spinner in a separate task
-    spinner_task = asyncio.create_task(display_spinner())
-
-    try:
-        # Await the actual task
-        result = await task
-    finally:
-        # Signal the spinner to stop and wait for it to finish
-        stop_spinner_event.set()
-        await spinner_task
-
-    # Clear the spinner line
-    sys.stdout.write("\r" + " " * (len(message) + 1) + "\r")
-    sys.stdout.flush()
-
-    return result
-
-
-class SpinnerWrappedTask:
-    """
-    A wrapper for an asyncio task that displays a spinner when awaited.
-    """
-
-    def __init__(self, task, message: str):
-        self.task = task
-        self.message = message
-
-    def __await__(self):
-        async def _spin_and_await():
-            # self.task resolves to (scoring_results, pretty_str_to_print)
-            task_result_tuple = await await_with_spinner(self.task, self.message)
-
-            # Unpack the tuple
-            scoring_results, pretty_str_to_print = task_result_tuple
+            time.sleep(poll_interval_seconds)

-
-
-
-
-            # Return only the scoring_results to the original awaiter
-            return scoring_results
+    raise JudgmentAPIError(
+        f"Error checking evaluation status after {poll_count} attempts"
+    )

-        return _spin_and_await().__await__()

-
-
-
+def progress_logger(stop_event, msg="Working...", interval=5):
+    start = time.time()
+    while not stop_event.is_set():
+        elapsed = int(time.time() - start)
+        judgeval_logger.info(f"{msg} ({elapsed} sec)")
+        stop_event.wait(interval)


 def run_eval(
     evaluation_run: EvaluationRun,
+    judgment_api_key: str,
     override: bool = False,
-
-) -> Union[List[ScoringResult], asyncio.Task, SpinnerWrappedTask]:
+) -> List[ScoringResult]:
     """
     Executes an evaluation of `Example`s using one or more `Scorer`s

     Args:
         evaluation_run (EvaluationRun): Stores example and evaluation together for running
         override (bool, optional): Whether to override existing evaluation run with same name. Defaults to False.
-        async_execution (bool, optional): Whether to execute the evaluation asynchronously. Defaults to False.

     Returns:
-
-        - If async_execution is False, returns a list of ScoringResult objects
-        - If async_execution is True, returns a Task that will resolve to a list of ScoringResult objects when awaited
+        List[ScoringResult]: A list of ScoringResult objects
     """

     # Call endpoint to check to see if eval run name exists (if we DON'T want to override and DO want to log results)
@@ -825,7 +506,7 @@ def run_eval(
         check_eval_run_name_exists(
             evaluation_run.eval_name,
             evaluation_run.project_name,
-
+            judgment_api_key,
             evaluation_run.organization_id,
         )

@@ -834,7 +515,7 @@
         check_experiment_type(
             evaluation_run.eval_name,
             evaluation_run.project_name,
-
+            judgment_api_key,
             evaluation_run.organization_id,
             False,
         )
@@ -851,148 +532,81 @@
         else:
             local_scorers.append(scorer)

-
-
-
-    if
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-                "X-Organization-Id": evaluation_run.organization_id,
-            },
-            json=payload,
-            verify=True,
+    results: List[ScoringResult] = []
+    url = ""
+
+    if len(local_scorers) > 0 and len(judgment_scorers) > 0:
+        error_msg = "We currently do not support running both local and Judgment API scorers at the same time. Please run your evaluation with either local scorers or Judgment API scorers, but not both."
+        judgeval_logger.error(error_msg)
+        raise ValueError(error_msg)
+
+    if len(judgment_scorers) > 0:
+        check_examples(evaluation_run.examples, judgment_scorers)
+        stop_event = threading.Event()
+        t = threading.Thread(
+            target=progress_logger, args=(stop_event, "Running evaluation...")
+        )
+        t.start()
+        try:
+            api_client = JudgmentApiClient(
+                judgment_api_key, evaluation_run.organization_id
+            )
+            response = api_client.add_to_evaluation_queue(
+                evaluation_run.model_dump(warnings=False)
             )

-            if not response.
-                error_message = response.
-                    "detail", "An unknown error occurred."
-                )
+            if not response.get("success", False):
+                error_message = response.error
                 judgeval_logger.error(
                     f"Error adding evaluation to queue: {error_message}"
                 )
                 raise JudgmentAPIError(error_message)

-
-
+            old_scorer_data_count = 0
+            if evaluation_run.append:
+                try:
+                    results_response = api_client.fetch_evaluation_results(
+                        evaluation_run.project_name, evaluation_run.eval_name
+                    )
+                    old_scorer_data_count = retrieve_counts(results_response)
+                except Exception:
+                    # This usually means the user did append = True but the eval run name doesn't exist yet
+                    pass
+
+            results, url = _poll_evaluation_until_complete(
                 eval_name=evaluation_run.eval_name,
                 project_name=evaluation_run.project_name,
-                judgment_api_key=
+                judgment_api_key=judgment_api_key,
                 organization_id=evaluation_run.organization_id,
-
-
+                expected_scorer_data_count=(
+                    len(evaluation_run.scorers) * len(evaluation_run.examples)
+                )
+                + old_scorer_data_count,
+            )
+        finally:
+            stop_event.set()
+            t.join()
+
+    if len(local_scorers) > 0:
+        results = safe_run_async(
+            a_execute_scoring(
+                evaluation_run.examples,
+                local_scorers,
+                model=evaluation_run.model,
+                throttle_value=0,
+                max_concurrent=MAX_CONCURRENT_EVALUATIONS,
            )
-
-        pretty_str_to_print = None
-        if results:  # Ensure results exist before logging
-            send_results = [
-                scoring_result.model_dump(warnings=False)
-                for scoring_result in results
-            ]
-            try:
-                # Run the blocking log_evaluation_results in a separate thread
-                pretty_str_to_print = await asyncio.to_thread(
-                    log_evaluation_results, send_results, evaluation_run
-                )
-            except Exception as e:
-                judgeval_logger.error(
-                    f"Error logging results after async evaluation: {str(e)}"
-                )
-
-        return results, pretty_str_to_print
-
-        # Create a regular task
-        task = asyncio.create_task(_async_evaluation_workflow())
-
-        # Wrap it in our custom awaitable that will show a spinner only when awaited
-        return SpinnerWrappedTask(
-            task, f"Processing evaluation '{evaluation_run.eval_name}': "
         )
-    else:
-        check_examples(evaluation_run.examples, evaluation_run.scorers)
-        if judgment_scorers:
-            # Execute evaluation using Judgment API
-            try:  # execute an EvaluationRun with just JudgmentScorers
-                api_evaluation_run: EvaluationRun = EvaluationRun(
-                    eval_name=evaluation_run.eval_name,
-                    project_name=evaluation_run.project_name,
-                    examples=evaluation_run.examples,
-                    scorers=judgment_scorers,
-                    model=evaluation_run.model,
-                    judgment_api_key=evaluation_run.judgment_api_key,
-                    organization_id=evaluation_run.organization_id,
-                )
-                response_data: Dict = run_with_spinner(
-                    "Running Evaluation: ", execute_api_eval, api_evaluation_run
-                )
-            except JudgmentAPIError as e:
-                judgeval_logger.error(
-                    f"An error occurred while executing the Judgment API request: {str(e)}"
-                )
-                raise JudgmentAPIError(
-                    f"An error occurred while executing the Judgment API request: {str(e)}"
-                )
-            except ValueError as e:
-                raise ValueError(
-                    f"Please check your EvaluationRun object, one or more fields are invalid: {str(e)}"
-                )

-        # Convert the response data to `ScoringResult` objects
-        api_results = [
-            ScoringResult(**result) for result in response_data["results"]
-        ]
-        # Run local evals
-        if local_scorers:  # List[BaseScorer]
-            results: List[ScoringResult] = safe_run_async(
-                a_execute_scoring(
-                    evaluation_run.examples,
-                    local_scorers,
-                    model=evaluation_run.model,
-                    throttle_value=0,
-                    max_concurrent=MAX_CONCURRENT_EVALUATIONS,
-                )
-            )
-            local_results = results
-        # Aggregate the ScorerData from the API and local evaluations
-        merged_results: List[ScoringResult] = merge_results(api_results, local_results)
-        merged_results = check_missing_scorer_data(merged_results)
-
-        # Evaluate rules against local scoring results if rules exist (this cant be done just yet)
-        # if evaluation_run.rules and merged_results:
-        # run_rules(
-        # local_results=merged_results,
-        # rules=evaluation_run.rules,
-        # judgment_api_key=evaluation_run.judgment_api_key,
-        # organization_id=evaluation_run.organization_id
-        # )
-        # print(merged_results)
     send_results = [
-        scoring_result.model_dump(warnings=False)
-        for scoring_result in merged_results
+        scoring_result.model_dump(warnings=False) for scoring_result in results
     ]
-    pretty_str = run_with_spinner(
-        "Logging Results: ",
-        log_evaluation_results,
-        send_results,
-        evaluation_run,
-    )
-    rprint(pretty_str)

-
+    url = log_evaluation_results(send_results, evaluation_run, judgment_api_key)
+    rprint(
+        f"\n🔍 You can view your evaluation results here: [rgb(106,0,255)][link={url}]View Results[/link]\n"
+    )
+    return results


 def assert_test(scoring_results: List[ScoringResult]) -> None:
@@ -1025,15 +639,6 @@ def assert_test(scoring_results: List[ScoringResult]) -> None:
     if failed_cases:
         error_msg = "The following test cases failed: \n"
         for fail_case in failed_cases:
-            # error_msg += f"\nInput: {fail_case['input']}\n"
-            # error_msg += f"Actual Output: {fail_case['actual_output']}\n"
-            # error_msg += f"Expected Output: {fail_case['expected_output']}\n"
-            # error_msg += f"Context: {fail_case['context']}\n"
-            # error_msg += f"Retrieval Context: {fail_case['retrieval_context']}\n"
-            # error_msg += f"Additional Metadata: {fail_case['additional_metadata']}\n"
-            # error_msg += f"Tools Called: {fail_case['tools_called']}\n"
-            # error_msg += f"Expected Tools: {fail_case['expected_tools']}\n"
-
             for fail_scorer in fail_case["failed_scorers"]:
                 error_msg += (
                     f"\nScorer Name: {fail_scorer.name}\n"