judgeval 0.0.44__py3-none-any.whl → 0.0.46__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (64)
  1. judgeval/__init__.py +5 -4
  2. judgeval/clients.py +6 -6
  3. judgeval/common/__init__.py +7 -2
  4. judgeval/common/exceptions.py +2 -3
  5. judgeval/common/logger.py +74 -49
  6. judgeval/common/s3_storage.py +30 -23
  7. judgeval/common/tracer.py +1273 -939
  8. judgeval/common/utils.py +416 -244
  9. judgeval/constants.py +73 -61
  10. judgeval/data/__init__.py +1 -1
  11. judgeval/data/custom_example.py +3 -2
  12. judgeval/data/datasets/dataset.py +80 -54
  13. judgeval/data/datasets/eval_dataset_client.py +131 -181
  14. judgeval/data/example.py +67 -43
  15. judgeval/data/result.py +11 -9
  16. judgeval/data/scorer_data.py +4 -2
  17. judgeval/data/tool.py +25 -16
  18. judgeval/data/trace.py +57 -29
  19. judgeval/data/trace_run.py +5 -11
  20. judgeval/evaluation_run.py +22 -82
  21. judgeval/integrations/langgraph.py +546 -184
  22. judgeval/judges/base_judge.py +1 -2
  23. judgeval/judges/litellm_judge.py +33 -11
  24. judgeval/judges/mixture_of_judges.py +128 -78
  25. judgeval/judges/together_judge.py +22 -9
  26. judgeval/judges/utils.py +14 -5
  27. judgeval/judgment_client.py +259 -271
  28. judgeval/rules.py +169 -142
  29. judgeval/run_evaluation.py +462 -305
  30. judgeval/scorers/api_scorer.py +20 -11
  31. judgeval/scorers/exceptions.py +1 -0
  32. judgeval/scorers/judgeval_scorer.py +77 -58
  33. judgeval/scorers/judgeval_scorers/api_scorers/__init__.py +46 -15
  34. judgeval/scorers/judgeval_scorers/api_scorers/answer_correctness.py +3 -2
  35. judgeval/scorers/judgeval_scorers/api_scorers/answer_relevancy.py +3 -2
  36. judgeval/scorers/judgeval_scorers/api_scorers/classifier_scorer.py +12 -11
  37. judgeval/scorers/judgeval_scorers/api_scorers/comparison.py +7 -5
  38. judgeval/scorers/judgeval_scorers/api_scorers/contextual_precision.py +3 -2
  39. judgeval/scorers/judgeval_scorers/api_scorers/contextual_recall.py +3 -2
  40. judgeval/scorers/judgeval_scorers/api_scorers/contextual_relevancy.py +5 -2
  41. judgeval/scorers/judgeval_scorers/api_scorers/derailment_scorer.py +2 -1
  42. judgeval/scorers/judgeval_scorers/api_scorers/execution_order.py +17 -8
  43. judgeval/scorers/judgeval_scorers/api_scorers/faithfulness.py +3 -2
  44. judgeval/scorers/judgeval_scorers/api_scorers/groundedness.py +3 -2
  45. judgeval/scorers/judgeval_scorers/api_scorers/hallucination.py +3 -2
  46. judgeval/scorers/judgeval_scorers/api_scorers/instruction_adherence.py +3 -2
  47. judgeval/scorers/judgeval_scorers/api_scorers/json_correctness.py +8 -9
  48. judgeval/scorers/judgeval_scorers/api_scorers/summarization.py +4 -4
  49. judgeval/scorers/judgeval_scorers/api_scorers/tool_dependency.py +5 -5
  50. judgeval/scorers/judgeval_scorers/api_scorers/tool_order.py +5 -2
  51. judgeval/scorers/judgeval_scorers/classifiers/text2sql/text2sql_scorer.py +9 -10
  52. judgeval/scorers/prompt_scorer.py +48 -37
  53. judgeval/scorers/score.py +86 -53
  54. judgeval/scorers/utils.py +11 -7
  55. judgeval/tracer/__init__.py +1 -1
  56. judgeval/utils/alerts.py +23 -12
  57. judgeval/utils/{data_utils.py → file_utils.py} +5 -9
  58. judgeval/utils/requests.py +29 -0
  59. judgeval/version_check.py +5 -2
  60. {judgeval-0.0.44.dist-info → judgeval-0.0.46.dist-info}/METADATA +79 -135
  61. judgeval-0.0.46.dist-info/RECORD +69 -0
  62. judgeval-0.0.44.dist-info/RECORD +0 -68
  63. {judgeval-0.0.44.dist-info → judgeval-0.0.46.dist-info}/WHEEL +0 -0
  64. {judgeval-0.0.44.dist-info → judgeval-0.0.46.dist-info}/licenses/LICENSE.md +0 -0
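Among the files listed above, judgeval/utils/requests.py is new in 0.0.46 (+29 lines) and its contents are not part of this diff; the run_evaluation.py hunks below only show that HTTP calls now go through an object imported as `from judgeval.utils.requests import requests`, while RequestException is still caught via `from requests import exceptions`. Purely as an illustration, a thin wrapper with that import surface might look like the sketch below; every name and default in it is an assumption, not the actual 0.0.46 module:

    # Hypothetical sketch of judgeval/utils/requests.py -- NOT the real module.
    # Assumption: it exposes a drop-in `requests` object that delegates to the
    # real library, so call sites keep using requests.post(...) / requests.get(...).
    import requests as _requests


    class _RequestsWrapper:
        """Delegates to the requests library, optionally adding shared defaults."""

        def post(self, url, **kwargs):
            kwargs.setdefault("timeout", 30)  # illustrative default only
            return _requests.post(url, **kwargs)

        def get(self, url, **kwargs):
            kwargs.setdefault("timeout", 30)  # illustrative default only
            return _requests.get(url, **kwargs)


    requests = _RequestsWrapper()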
@@ -1,26 +1,17 @@
  import asyncio
- import requests
+ import concurrent.futures
+ from requests import exceptions
+ from judgeval.utils.requests import requests
  import time
  import json
  import sys
  import itertools
  import threading
  from typing import List, Dict, Any, Union, Optional, Callable
- from datetime import datetime
  from rich import print as rprint
 
- from judgeval.data import (
- ScorerData,
- ScoringResult,
- Example,
- CustomExample,
- Trace
- )
- from judgeval.scorers import (
- JudgevalScorer,
- APIJudgmentScorer,
- ClassifierScorer
- )
+ from judgeval.data import ScorerData, ScoringResult, Example, Trace
+ from judgeval.scorers import JudgevalScorer, APIJudgmentScorer, ClassifierScorer
  from judgeval.scorers.score import a_execute_scoring
  from judgeval.constants import (
  ROOT_API,
@@ -30,21 +21,39 @@ from judgeval.constants import (
  MAX_CONCURRENT_EVALUATIONS,
  JUDGMENT_ADD_TO_RUN_EVAL_QUEUE_API_URL,
  JUDGMENT_GET_EVAL_STATUS_API_URL,
- JUDGMENT_EVAL_FETCH_API_URL
+ JUDGMENT_EVAL_FETCH_API_URL,
  )
  from judgeval.common.exceptions import JudgmentAPIError
- from judgeval.common.logger import (
- debug,
- info,
- error,
- warning,
- example_logging_context
- )
+ from judgeval.common.logger import debug, info, error, warning, example_logging_context
  from judgeval.evaluation_run import EvaluationRun
  from judgeval.data.trace_run import TraceRun
  from judgeval.common.tracer import Tracer
  from langchain_core.callbacks import BaseCallbackHandler
 
+
+ def safe_run_async(coro):
+ """
+ Safely run an async coroutine whether or not there's already an event loop running.
+
+ Args:
+ coro: The coroutine to run
+
+ Returns:
+ The result of the coroutine
+ """
+ try:
+ # Try to get the running loop
+ asyncio.get_running_loop()
+ # If we get here, there's already a loop running
+ # Run in a separate thread to avoid "asyncio.run() cannot be called from a running event loop"
+ with concurrent.futures.ThreadPoolExecutor() as executor:
+ future = executor.submit(asyncio.run, coro)
+ return future.result()
+ except RuntimeError:
+ # No event loop is running, safe to use asyncio.run()
+ return asyncio.run(coro)
+
+
  def send_to_rabbitmq(evaluation_run: EvaluationRun) -> None:
  """
  Sends an evaluation run to the RabbitMQ evaluation queue.
@@ -55,14 +64,15 @@ def send_to_rabbitmq(evaluation_run: EvaluationRun) -> None:
  headers={
  "Content-Type": "application/json",
  "Authorization": f"Bearer {evaluation_run.judgment_api_key}",
- "X-Organization-Id": evaluation_run.organization_id
- },
+ "X-Organization-Id": evaluation_run.organization_id,
+ },
  json=payload,
- verify=True
+ verify=True,
  )
  return response.json()
 
- def execute_api_eval(evaluation_run: EvaluationRun) -> List[Dict]:
+
+ def execute_api_eval(evaluation_run: EvaluationRun) -> Dict:
  """
  Executes an evaluation of a list of `Example`s using one or more `JudgmentScorer`s via the Judgment API.
 
@@ -71,67 +81,75 @@ def execute_api_eval(evaluation_run: EvaluationRun) -> List[Dict]:
71
81
 
72
82
  Returns:
73
83
  List[Dict]: The results of the evaluation. Each result is a dictionary containing the fields of a `ScoringResult`
74
- object.
84
+ object.
75
85
  """
76
-
86
+
77
87
  try:
78
88
  # submit API request to execute evals
79
89
  payload = evaluation_run.model_dump(warnings=False)
80
90
  response = requests.post(
81
- JUDGMENT_EVAL_API_URL,
91
+ JUDGMENT_EVAL_API_URL,
82
92
  headers={
83
93
  "Content-Type": "application/json",
84
94
  "Authorization": f"Bearer {evaluation_run.judgment_api_key}",
85
- "X-Organization-Id": evaluation_run.organization_id
86
- },
95
+ "X-Organization-Id": evaluation_run.organization_id,
96
+ },
87
97
  json=payload,
88
- verify=True
98
+ verify=True,
89
99
  )
90
100
  response_data = response.json()
91
101
  except Exception as e:
92
102
  error(f"Error: {e}")
93
103
  details = response.json().get("detail", "No details provided")
94
- raise JudgmentAPIError("An error occurred while executing the Judgment API request: " + details)
104
+ raise JudgmentAPIError(
105
+ "An error occurred while executing the Judgment API request: " + details
106
+ )
95
107
  # Check if the response status code is not 2XX
96
108
  # Add check for the duplicate eval run name
97
109
  if not response.ok:
98
- error_message = response_data.get('detail', 'An unknown error occurred.')
110
+ error_message = response_data.get("detail", "An unknown error occurred.")
99
111
  error(f"Error: {error_message=}")
100
112
  raise JudgmentAPIError(error_message)
101
113
  return response_data
102
114
 
115
+
103
116
  def execute_api_trace_eval(trace_run: TraceRun) -> Dict:
104
117
  """
105
118
  Executes an evaluation of a list of `Trace`s using one or more `JudgmentScorer`s via the Judgment API.
106
119
  """
107
-
120
+
108
121
  try:
109
122
  # submit API request to execute evals
110
123
  payload = trace_run.model_dump(warnings=False)
111
124
  response = requests.post(
112
- JUDGMENT_TRACE_EVAL_API_URL,
125
+ JUDGMENT_TRACE_EVAL_API_URL,
113
126
  headers={
114
127
  "Content-Type": "application/json",
115
128
  "Authorization": f"Bearer {trace_run.judgment_api_key}",
116
- "X-Organization-Id": trace_run.organization_id
117
- },
129
+ "X-Organization-Id": trace_run.organization_id,
130
+ },
118
131
  json=payload,
119
- verify=True
132
+ verify=True,
120
133
  )
121
134
  response_data = response.json()
122
135
  except Exception as e:
123
136
  error(f"Error: {e}")
124
137
  details = response.json().get("detail", "No details provided")
125
- raise JudgmentAPIError("An error occurred while executing the Judgment API request: " + details)
138
+ raise JudgmentAPIError(
139
+ "An error occurred while executing the Judgment API request: " + details
140
+ )
126
141
  # Check if the response status code is not 2XX
127
142
  # Add check for the duplicate eval run name
128
143
  if not response.ok:
129
- error_message = response_data.get('detail', 'An unknown error occurred.')
144
+ error_message = response_data.get("detail", "An unknown error occurred.")
130
145
  error(f"Error: {error_message=}")
131
146
  raise JudgmentAPIError(error_message)
132
147
  return response_data
133
148
 
134
- def merge_results(api_results: List[ScoringResult], local_results: List[ScoringResult]) -> List[ScoringResult]:
149
+
150
+ def merge_results(
151
+ api_results: List[ScoringResult], local_results: List[ScoringResult]
152
+ ) -> List[ScoringResult]:
135
153
  """
136
154
  When executing scorers that come from both the Judgment API and local scorers, we're left with
137
155
  results for each type of scorer. This function merges the results from the API and local evaluations,
@@ -152,32 +170,52 @@ def merge_results(api_results: List[ScoringResult], local_results: List[ScoringR
152
170
 
153
171
  if len(api_results) != len(local_results):
154
172
  # Results should be of same length because each ScoringResult is a 1-1 mapping to an Example
155
- raise ValueError(f"The number of API and local results do not match: {len(api_results)} vs {len(local_results)}")
156
-
173
+ raise ValueError(
174
+ f"The number of API and local results do not match: {len(api_results)} vs {len(local_results)}"
175
+ )
176
+
157
177
  # Create a copy of api_results to avoid modifying the input
158
178
  merged_results = [result.model_copy() for result in api_results]
159
-
179
+
160
180
  # Each ScoringResult in api and local have all the same fields besides `scorers_data`
161
181
  for merged_result, local_result in zip(merged_results, local_results):
162
182
  if not (merged_result.data_object and local_result.data_object):
163
183
  raise ValueError("Data object is None in one of the results.")
164
184
  if merged_result.data_object.input != local_result.data_object.input:
165
185
  raise ValueError("The API and local results are not aligned.")
166
- if merged_result.data_object.actual_output != local_result.data_object.actual_output:
186
+ if (
187
+ merged_result.data_object.actual_output
188
+ != local_result.data_object.actual_output
189
+ ):
167
190
  raise ValueError("The API and local results are not aligned.")
168
- if merged_result.data_object.expected_output != local_result.data_object.expected_output:
191
+ if (
192
+ merged_result.data_object.expected_output
193
+ != local_result.data_object.expected_output
194
+ ):
169
195
  raise ValueError("The API and local results are not aligned.")
170
196
  if merged_result.data_object.context != local_result.data_object.context:
171
197
  raise ValueError("The API and local results are not aligned.")
172
- if merged_result.data_object.retrieval_context != local_result.data_object.retrieval_context:
198
+ if (
199
+ merged_result.data_object.retrieval_context
200
+ != local_result.data_object.retrieval_context
201
+ ):
173
202
  raise ValueError("The API and local results are not aligned.")
174
- if merged_result.data_object.additional_metadata != local_result.data_object.additional_metadata:
203
+ if (
204
+ merged_result.data_object.additional_metadata
205
+ != local_result.data_object.additional_metadata
206
+ ):
175
207
  raise ValueError("The API and local results are not aligned.")
176
- if merged_result.data_object.tools_called != local_result.data_object.tools_called:
208
+ if (
209
+ merged_result.data_object.tools_called
210
+ != local_result.data_object.tools_called
211
+ ):
177
212
  raise ValueError("The API and local results are not aligned.")
178
- if merged_result.data_object.expected_tools != local_result.data_object.expected_tools:
213
+ if (
214
+ merged_result.data_object.expected_tools
215
+ != local_result.data_object.expected_tools
216
+ ):
179
217
  raise ValueError("The API and local results are not aligned.")
180
-
218
+
181
219
  # Merge ScorerData from the API and local scorers together
182
220
  api_scorer_data = merged_result.scorers_data
183
221
  local_scorer_data = local_result.scorers_data
@@ -185,7 +223,7 @@ def merge_results(api_results: List[ScoringResult], local_results: List[ScoringR
185
223
  merged_result.scorers_data = local_scorer_data
186
224
  elif api_scorer_data is not None and local_scorer_data is not None:
187
225
  merged_result.scorers_data = api_scorer_data + local_scorer_data
188
-
226
+
189
227
  return merged_results
190
228
 
191
229
 
@@ -206,7 +244,14 @@ def check_missing_scorer_data(results: List[ScoringResult]) -> List[ScoringResul
206
244
  )
207
245
  return results
208
246
 
209
- def check_experiment_type(eval_name: str, project_name: str, judgment_api_key: str, organization_id: str, is_trace: bool) -> None:
247
+
248
+ def check_experiment_type(
249
+ eval_name: str,
250
+ project_name: str,
251
+ judgment_api_key: str,
252
+ organization_id: str,
253
+ is_trace: bool,
254
+ ) -> None:
210
255
  """
211
256
  Checks if the current experiment, if one exists, has the same type (examples of traces)
212
257
  """
@@ -216,32 +261,35 @@ def check_experiment_type(eval_name: str, project_name: str, judgment_api_key: s
216
261
  headers={
217
262
  "Content-Type": "application/json",
218
263
  "Authorization": f"Bearer {judgment_api_key}",
219
- "X-Organization-Id": organization_id
264
+ "X-Organization-Id": organization_id,
220
265
  },
221
266
  json={
222
267
  "eval_name": eval_name,
223
268
  "project_name": project_name,
224
269
  "judgment_api_key": judgment_api_key,
225
- "is_trace": is_trace
270
+ "is_trace": is_trace,
226
271
  },
227
- verify=True
272
+ verify=True,
228
273
  )
229
-
274
+
230
275
  if response.status_code == 422:
231
276
  error(f"{response.json()}")
232
277
  raise ValueError(f"{response.json()}")
233
-
278
+
234
279
  if not response.ok:
235
280
  response_data = response.json()
236
- error_message = response_data.get('detail', 'An unknown error occurred.')
281
+ error_message = response_data.get("detail", "An unknown error occurred.")
237
282
  error(f"Error checking eval run name: {error_message}")
238
283
  raise JudgmentAPIError(error_message)
239
-
240
- except requests.exceptions.RequestException as e:
284
+
285
+ except exceptions.RequestException as e:
241
286
  error(f"Failed to check if experiment type exists: {str(e)}")
242
287
  raise JudgmentAPIError(f"Failed to check if experiment type exists: {str(e)}")
243
288
 
244
- def check_eval_run_name_exists(eval_name: str, project_name: str, judgment_api_key: str, organization_id: str) -> None:
289
+
290
+ def check_eval_run_name_exists(
291
+ eval_name: str, project_name: str, judgment_api_key: str, organization_id: str
292
+ ) -> None:
245
293
  """
246
294
  Checks if an evaluation run name already exists for a given project.
247
295
 
@@ -260,32 +308,38 @@ def check_eval_run_name_exists(eval_name: str, project_name: str, judgment_api_k
260
308
  headers={
261
309
  "Content-Type": "application/json",
262
310
  "Authorization": f"Bearer {judgment_api_key}",
263
- "X-Organization-Id": organization_id
311
+ "X-Organization-Id": organization_id,
264
312
  },
265
313
  json={
266
314
  "eval_name": eval_name,
267
315
  "project_name": project_name,
268
316
  "judgment_api_key": judgment_api_key,
269
317
  },
270
- verify=True
318
+ verify=True,
271
319
  )
272
-
320
+
273
321
  if response.status_code == 409:
274
- error(f"Eval run name '{eval_name}' already exists for this project. Please choose a different name, set the `override` flag to true, or set the `append` flag to true.")
275
- raise ValueError(f"Eval run name '{eval_name}' already exists for this project. Please choose a different name, set the `override` flag to true, or set the `append` flag to true.")
276
-
322
+ error(
323
+ f"Eval run name '{eval_name}' already exists for this project. Please choose a different name, set the `override` flag to true, or set the `append` flag to true."
324
+ )
325
+ raise ValueError(
326
+ f"Eval run name '{eval_name}' already exists for this project. Please choose a different name, set the `override` flag to true, or set the `append` flag to true."
327
+ )
328
+
277
329
  if not response.ok:
278
330
  response_data = response.json()
279
- error_message = response_data.get('detail', 'An unknown error occurred.')
331
+ error_message = response_data.get("detail", "An unknown error occurred.")
280
332
  error(f"Error checking eval run name: {error_message}")
281
333
  raise JudgmentAPIError(error_message)
282
-
283
- except requests.exceptions.RequestException as e:
334
+
335
+ except exceptions.RequestException as e:
284
336
  error(f"Failed to check if eval run name exists: {str(e)}")
285
337
  raise JudgmentAPIError(f"Failed to check if eval run name exists: {str(e)}")
286
338
 
287
339
 
288
- def log_evaluation_results(scoring_results: List[ScoringResult], run: Union[EvaluationRun, TraceRun]) -> str:
340
+ def log_evaluation_results(
341
+ scoring_results: List[ScoringResult], run: Union[EvaluationRun, TraceRun]
342
+ ) -> str | None:
289
343
  """
290
344
  Logs evaluation results to the Judgment API database.
291
345
 
@@ -303,64 +357,73 @@ def log_evaluation_results(scoring_results: List[ScoringResult], run: Union[Eval
303
357
  headers={
304
358
  "Content-Type": "application/json",
305
359
  "Authorization": f"Bearer {run.judgment_api_key}",
306
- "X-Organization-Id": run.organization_id
307
- },
308
- json={
309
- "results": scoring_results,
310
- "run": run.model_dump(warnings=False)
360
+ "X-Organization-Id": run.organization_id,
311
361
  },
312
- verify=True
362
+ json={"results": scoring_results, "run": run.model_dump(warnings=False)},
363
+ verify=True,
313
364
  )
314
-
365
+
315
366
  if not res.ok:
316
367
  response_data = res.json()
317
- error_message = response_data.get('detail', 'An unknown error occurred.')
368
+ error_message = response_data.get("detail", "An unknown error occurred.")
318
369
  error(f"Error {res.status_code}: {error_message}")
319
370
  raise JudgmentAPIError(error_message)
320
-
371
+
321
372
  if "ui_results_url" in res.json():
322
- url = res.json()['ui_results_url']
373
+ url = res.json()["ui_results_url"]
323
374
  pretty_str = f"\n🔍 You can view your evaluation results here: [rgb(106,0,255)][link={url}]View Results[/link]\n"
324
375
  return pretty_str
325
-
326
- except requests.exceptions.RequestException as e:
376
+
377
+ return None
378
+
379
+ except exceptions.RequestException as e:
327
380
  error(f"Request failed while saving evaluation results to DB: {str(e)}")
328
- raise JudgmentAPIError(f"Request failed while saving evaluation results to DB: {str(e)}")
381
+ raise JudgmentAPIError(
382
+ f"Request failed while saving evaluation results to DB: {str(e)}"
383
+ )
329
384
  except Exception as e:
330
385
  error(f"Failed to save evaluation results to DB: {str(e)}")
331
386
  raise ValueError(f"Failed to save evaluation results to DB: {str(e)}")
332
387
 
388
+
333
389
  def run_with_spinner(message: str, func, *args, **kwargs) -> Any:
334
- """Run a function with a spinner in the terminal."""
335
- spinner = itertools.cycle(['|', '/', '-', '\\'])
390
+ """Run a function with a spinner in the terminal."""
391
+ spinner = itertools.cycle(["|", "/", "-", "\\"])
336
392
 
337
- def display_spinner():
338
- while not stop_spinner_event.is_set():
339
- sys.stdout.write(f'\r{message}{next(spinner)}')
340
- sys.stdout.flush()
341
- time.sleep(0.1)
393
+ def display_spinner():
394
+ while not stop_spinner_event.is_set():
395
+ sys.stdout.write(f"\r{message}{next(spinner)}")
396
+ sys.stdout.flush()
397
+ time.sleep(0.1)
342
398
 
343
- stop_spinner_event = threading.Event()
344
- spinner_thread = threading.Thread(target=display_spinner)
345
- spinner_thread.start()
399
+ stop_spinner_event = threading.Event()
400
+ spinner_thread = threading.Thread(target=display_spinner)
401
+ spinner_thread.start()
346
402
 
347
- try:
403
+ try:
404
+ if asyncio.iscoroutinefunction(func):
405
+ coro = func(*args, **kwargs)
406
+ result = safe_run_async(coro)
407
+ else:
348
408
  result = func(*args, **kwargs)
349
- except Exception as e:
350
- error(f"An error occurred: {str(e)}")
351
- stop_spinner_event.set()
352
- spinner_thread.join()
353
- raise e
354
- finally:
355
- stop_spinner_event.set()
356
- spinner_thread.join()
357
-
358
- sys.stdout.write('\r' + ' ' * (len(message) + 1) + '\r')
359
- sys.stdout.flush()
409
+ except Exception as e:
410
+ error(f"An error occurred: {str(e)}")
411
+ stop_spinner_event.set()
412
+ spinner_thread.join()
413
+ raise e
414
+ finally:
415
+ stop_spinner_event.set()
416
+ spinner_thread.join()
360
417
 
361
- return result
418
+ sys.stdout.write("\r" + " " * (len(message) + 1) + "\r")
419
+ sys.stdout.flush()
420
+
421
+ return result
362
422
 
363
- def check_examples(examples: List[Example], scorers: List[Union[APIJudgmentScorer, JudgevalScorer]]) -> None:
423
+
424
+ def check_examples(
425
+ examples: List[Example], scorers: List[Union[APIJudgmentScorer, JudgevalScorer]]
426
+ ) -> None:
364
427
  """
365
428
  Checks if the example contains the necessary parameters for the scorer.
366
429
  """
@@ -372,27 +435,36 @@ def check_examples(examples: List[Example], scorers: List[Union[APIJudgmentScore
372
435
  if getattr(example, param.value) is None:
373
436
  missing_params.append(f"{param.value}")
374
437
  if missing_params:
375
- rprint(f"[yellow]⚠️ WARNING:[/yellow] Example is missing required parameters for scorer [bold]{scorer.score_type.value}[/bold]")
438
+ rprint(
439
+ f"[yellow]⚠️ WARNING:[/yellow] Example is missing required parameters for scorer [bold]{scorer.score_type.value}[/bold]"
440
+ )
376
441
  rprint(f"Missing parameters: {', '.join(missing_params)}")
377
442
  rprint(f"Example: {json.dumps(example.model_dump(), indent=2)}")
378
- rprint("-"*40)
443
+ rprint("-" * 40)
379
444
  prompt_user = True
380
445
 
381
446
  if prompt_user:
382
447
  user_input = input("Do you want to continue? (y/n)")
383
448
  if user_input.lower() != "y":
384
- sys.exit(0)
449
+ sys.exit(0)
385
450
  else:
386
451
  rprint("[green]Continuing...[/green]")
387
452
 
388
- def run_trace_eval(trace_run: TraceRun, override: bool = False, ignore_errors: bool = True, function: Optional[Callable] = None, tracer: Optional[Union[Tracer, BaseCallbackHandler]] = None, examples: Optional[List[Example]] = None) -> List[ScoringResult]:
453
+
454
+ def run_trace_eval(
455
+ trace_run: TraceRun,
456
+ override: bool = False,
457
+ function: Optional[Callable] = None,
458
+ tracer: Optional[Union[Tracer, BaseCallbackHandler]] = None,
459
+ examples: Optional[List[Example]] = None,
460
+ ) -> List[ScoringResult]:
389
461
  # Call endpoint to check to see if eval run name exists (if we DON'T want to override and DO want to log results)
390
- if not override and trace_run.log_results and not trace_run.append:
462
+ if not override and not trace_run.append:
391
463
  check_eval_run_name_exists(
392
464
  trace_run.eval_name,
393
465
  trace_run.project_name,
394
466
  trace_run.judgment_api_key,
395
- trace_run.organization_id
467
+ trace_run.organization_id,
396
468
  )
397
469
 
398
470
  if trace_run.append:
@@ -402,31 +474,36 @@ def run_trace_eval(trace_run: TraceRun, override: bool = False, ignore_errors: b
402
474
  trace_run.project_name,
403
475
  trace_run.judgment_api_key,
404
476
  trace_run.organization_id,
405
- True
477
+ True,
406
478
  )
407
- if function and tracer:
479
+ if function and tracer and examples is not None:
408
480
  new_traces: List[Trace] = []
409
-
481
+
410
482
  # Handle case where tracer is actually a callback handler
411
483
  actual_tracer = tracer
412
- if hasattr(tracer, 'tracer') and hasattr(tracer.tracer, 'traces'):
484
+ if hasattr(tracer, "tracer") and hasattr(tracer.tracer, "traces"):
413
485
  # This is a callback handler, get the underlying tracer
414
486
  actual_tracer = tracer.tracer
415
-
487
+
416
488
  actual_tracer.offline_mode = True
417
489
  actual_tracer.traces = []
418
490
  for example in examples:
419
491
  if example.input:
420
492
  if isinstance(example.input, str):
421
- result = run_with_spinner("Running agent function: ", function, example.input)
493
+ run_with_spinner(
494
+ "Running agent function: ", function, example.input
495
+ )
422
496
  elif isinstance(example.input, dict):
423
- result = run_with_spinner("Running agent function: ", function, **example.input)
497
+ run_with_spinner(
498
+ "Running agent function: ", function, **example.input
499
+ )
424
500
  else:
425
- raise ValueError(f"Input must be string or dict, got {type(example.input)}")
501
+ raise ValueError(
502
+ f"Input must be string or dict, got {type(example.input)}"
503
+ )
426
504
  else:
427
- result = run_with_spinner("Running agent function: ", function)
428
-
429
-
505
+ run_with_spinner("Running agent function: ", function)
506
+
430
507
  for i, trace in enumerate(actual_tracer.traces):
431
508
  # We set the root-level trace span with the expected tools of the Trace
432
509
  trace = Trace(**trace)
@@ -434,35 +511,49 @@ def run_trace_eval(trace_run: TraceRun, override: bool = False, ignore_errors: b
434
511
  new_traces.append(trace)
435
512
  trace_run.traces = new_traces
436
513
  actual_tracer.traces = []
437
-
514
+
438
515
  # Execute evaluation using Judgment API
439
516
  info("Starting API evaluation")
440
517
  try: # execute an EvaluationRun with just JudgmentScorers
441
- debug("Sending request to Judgment API")
442
- response_data: Dict = run_with_spinner("Running Trace Evaluation: ", execute_api_trace_eval, trace_run)
443
- scoring_results = [ScoringResult(**result) for result in response_data["results"]]
518
+ debug("Sending request to Judgment API")
519
+ response_data: Dict = run_with_spinner(
520
+ "Running Trace Evaluation: ", execute_api_trace_eval, trace_run
521
+ )
522
+ scoring_results = [
523
+ ScoringResult(**result) for result in response_data["results"]
524
+ ]
444
525
  info(f"Received {len(scoring_results)} results from API")
445
526
  except JudgmentAPIError as e:
446
527
  error(f"An error occurred while executing the Judgment API request: {str(e)}")
447
- raise JudgmentAPIError(f"An error occurred while executing the Judgment API request: {str(e)}")
528
+ raise JudgmentAPIError(
529
+ f"An error occurred while executing the Judgment API request: {str(e)}"
530
+ )
448
531
  except ValueError as e:
449
- raise ValueError(f"Please check your TraceRun object, one or more fields are invalid: {str(e)}")
450
-
532
+ raise ValueError(
533
+ f"Please check your TraceRun object, one or more fields are invalid: {str(e)}"
534
+ )
535
+
451
536
  # Convert the response data to `ScoringResult` objects
452
537
  debug("Processing API results")
453
538
  # TODO: allow for custom scorer on traces
454
- if trace_run.log_results:
455
- pretty_str = run_with_spinner("Logging Results: ", log_evaluation_results, response_data["agent_results"], trace_run)
456
- rprint(pretty_str)
539
+
540
+ pretty_str = run_with_spinner(
541
+ "Logging Results: ",
542
+ log_evaluation_results,
543
+ response_data["agent_results"],
544
+ trace_run,
545
+ )
546
+ rprint(pretty_str)
457
547
 
458
548
  return scoring_results
459
-
460
-
461
549
 
462
- async def get_evaluation_status(eval_name: str, project_name: str, judgment_api_key: str, organization_id: str) -> Dict:
550
+
551
+ async def get_evaluation_status(
552
+ eval_name: str, project_name: str, judgment_api_key: str, organization_id: str
553
+ ) -> Dict:
463
554
  """
464
555
  Gets the status of an async evaluation run.
465
-
556
+
466
557
  Args:
467
558
  eval_name (str): Name of the evaluation run
468
559
  project_name (str): Name of the project
@@ -481,38 +572,46 @@ async def get_evaluation_status(eval_name: str, project_name: str, judgment_api_
481
572
  headers={
482
573
  "Content-Type": "application/json",
483
574
  "Authorization": f"Bearer {judgment_api_key}",
484
- "X-Organization-Id": organization_id
575
+ "X-Organization-Id": organization_id,
485
576
  },
486
577
  params={
487
578
  "eval_name": eval_name,
488
579
  "project_name": project_name,
489
580
  },
490
- verify=True
581
+ verify=True,
491
582
  )
492
-
583
+
493
584
  if not response.ok:
494
- error_message = response.json().get('detail', 'An unknown error occurred.')
585
+ error_message = response.json().get("detail", "An unknown error occurred.")
495
586
  error(f"Error checking evaluation status: {error_message}")
496
587
  raise JudgmentAPIError(error_message)
497
-
588
+
498
589
  return response.json()
499
- except requests.exceptions.RequestException as e:
590
+ except exceptions.RequestException as e:
500
591
  error(f"Failed to check evaluation status: {str(e)}")
501
592
  raise JudgmentAPIError(f"Failed to check evaluation status: {str(e)}")
502
593
 
503
- async def _poll_evaluation_until_complete(eval_name: str, project_name: str, judgment_api_key: str, organization_id: str, poll_interval_seconds: int = 5, original_examples: Optional[List[Example]] = None) -> List[ScoringResult]:
594
+
595
+ async def _poll_evaluation_until_complete(
596
+ eval_name: str,
597
+ project_name: str,
598
+ judgment_api_key: str,
599
+ organization_id: str,
600
+ poll_interval_seconds: int = 5,
601
+ original_examples: Optional[List[Example]] = None,
602
+ ) -> List[ScoringResult]:
504
603
  """
505
604
  Polls until the evaluation is complete and returns the results.
506
-
605
+
507
606
  Args:
508
607
  eval_name (str): Name of the evaluation run
509
608
  project_name (str): Name of the project
510
609
  judgment_api_key (str): API key for authentication
511
610
  organization_id (str): Organization ID for the evaluation
512
611
  poll_interval_seconds (int, optional): Time between status checks in seconds. Defaults to 5.
513
- original_examples (List[Example], optional): The original examples sent for evaluation.
612
+ original_examples (List[Example], optional): The original examples sent for evaluation.
514
613
  If provided, will match results with original examples.
515
-
614
+
516
615
  Returns:
517
616
  List[ScoringResult]: The evaluation results
518
617
  """
@@ -522,7 +621,7 @@ async def _poll_evaluation_until_complete(eval_name: str, project_name: str, jud
522
621
  if original_examples:
523
622
  for example in original_examples:
524
623
  original_example_map[example.example_id] = example
525
-
624
+
526
625
  # Remove the expected scorer names extraction and checking
527
626
  # We'll instead verify all examples have consistent scorer data
528
627
  while True:
@@ -530,8 +629,10 @@ async def _poll_evaluation_until_complete(eval_name: str, project_name: str, jud
530
629
  try:
531
630
  # Log polling attempt
532
631
  if poll_count % 4 == 0: # Log every 4th poll to avoid excess logging
533
- info(f"Polling for evaluation '{eval_name}' in project '{project_name}' (attempt {poll_count})")
534
-
632
+ info(
633
+ f"Polling for evaluation '{eval_name}' in project '{project_name}' (attempt {poll_count})"
634
+ )
635
+
535
636
  # Check status
536
637
  response = await asyncio.to_thread(
537
638
  requests.get,
@@ -539,82 +640,89 @@ async def _poll_evaluation_until_complete(eval_name: str, project_name: str, jud
539
640
  headers={
540
641
  "Content-Type": "application/json",
541
642
  "Authorization": f"Bearer {judgment_api_key}",
542
- "X-Organization-Id": organization_id
643
+ "X-Organization-Id": organization_id,
543
644
  },
544
- params={
545
- "eval_name": eval_name,
546
- "project_name": project_name
547
- },
548
- verify=True
645
+ params={"eval_name": eval_name, "project_name": project_name},
646
+ verify=True,
549
647
  )
550
-
648
+
551
649
  if not response.ok:
552
- error_message = response.json().get('detail', 'An unknown error occurred.')
650
+ error_message = response.json().get(
651
+ "detail", "An unknown error occurred."
652
+ )
553
653
  error(f"Error checking evaluation status: {error_message}")
554
654
  # Don't raise exception immediately, just log and continue polling
555
655
  await asyncio.sleep(poll_interval_seconds)
556
656
  continue
557
-
657
+
558
658
  status_data = response.json()
559
659
  status = status_data.get("status")
560
-
660
+
561
661
  # If complete, get results and return
562
662
  if status == "completed" or status == "complete":
563
- info(f"Evaluation '{eval_name}' reported as completed, fetching and verifying results...")
663
+ info(
664
+ f"Evaluation '{eval_name}' reported as completed, fetching and verifying results..."
665
+ )
564
666
  results_response = await asyncio.to_thread(
565
667
  requests.post,
566
668
  JUDGMENT_EVAL_FETCH_API_URL,
567
669
  headers={
568
670
  "Content-Type": "application/json",
569
671
  "Authorization": f"Bearer {judgment_api_key}",
570
- "X-Organization-Id": organization_id
571
- },
572
- json={
573
- "project_name": project_name,
574
- "eval_name": eval_name
672
+ "X-Organization-Id": organization_id,
575
673
  },
576
- verify=True
674
+ json={"project_name": project_name, "eval_name": eval_name},
675
+ verify=True,
577
676
  )
578
-
677
+
579
678
  if not results_response.ok:
580
- error_message = results_response.json().get('detail', 'An unknown error occurred.')
679
+ error_message = results_response.json().get(
680
+ "detail", "An unknown error occurred."
681
+ )
581
682
  error(f"Error fetching evaluation results: {error_message}")
582
683
  raise JudgmentAPIError(error_message)
583
-
684
+
584
685
  result_data = results_response.json()
585
-
686
+
586
687
  if "examples" in result_data:
587
688
  examples_data = result_data.get("examples", [])
588
-
589
-
590
- info(f"Successfully fetched {len(examples_data)} results for evaluation '{eval_name}'")
591
-
689
+
690
+ info(
691
+ f"Successfully fetched {len(examples_data)} results for evaluation '{eval_name}'"
692
+ )
693
+
592
694
  # Check for result validity if original examples are provided
593
695
  if original_example_map:
594
696
  # Verify all returned examples have matching original examples
595
697
  has_invalid_results = False
596
698
  for example_data in examples_data:
597
699
  example_id = example_data.get("example_id")
598
-
700
+
599
701
  if example_id not in original_example_map:
600
- warning(f"Server returned example with ID {example_id} not found in original examples. " +
601
- f"This indicates stale or incorrect data. Continuing to poll...")
702
+ warning(
703
+ f"Server returned example with ID {example_id} not found in original examples. "
704
+ + "This indicates stale or incorrect data. Continuing to poll..."
705
+ )
602
706
  has_invalid_results = True
603
707
  break
604
-
708
+
605
709
  # If any invalid examples found, continue polling
606
710
  if has_invalid_results:
607
711
  info("Detected stale data. Waiting before polling again...")
608
712
  await asyncio.sleep(poll_interval_seconds)
609
713
  continue
610
-
714
+
611
715
  # Check if we received the expected number of results
612
- if len(original_examples) != len(examples_data):
613
- warning(f"Expected {len(original_examples)} results but got {len(examples_data)} results. " +
614
- f"This indicates incomplete data. Continuing to poll...")
716
+ if original_examples and len(original_examples) != len(
717
+ examples_data
718
+ ):
719
+ warning(
720
+ f"Expected {len(original_examples)} results but got {len(examples_data)} results. "
721
+ + "This indicates incomplete data. Continuing to poll..."
722
+ )
615
723
  await asyncio.sleep(poll_interval_seconds)
616
724
  continue
617
-
725
+
618
726
  # Collect all example IDs from scorer data
619
727
  scorer_example_ids = set()
620
728
  for example_data in examples_data:
@@ -622,114 +730,135 @@ async def _poll_evaluation_until_complete(eval_name: str, project_name: str, jud
622
730
  for scorer_data in scorer_data_list:
623
731
  if "example_id" in scorer_data:
624
732
  scorer_example_ids.add(scorer_data["example_id"])
625
-
733
+
626
734
  # Get the set of original example IDs
627
735
  original_example_ids = set(original_example_map.keys())
628
-
736
+
629
737
  # Check if the sets are equal
630
738
  missing_in_scorer = original_example_ids - scorer_example_ids
631
739
  extra_in_scorer = scorer_example_ids - original_example_ids
632
-
740
+
633
741
  if missing_in_scorer or extra_in_scorer:
634
742
  if missing_in_scorer:
635
- warning(f"Examples missing in scorer data: {missing_in_scorer}")
743
+ warning(
744
+ f"Examples missing in scorer data: {missing_in_scorer}"
745
+ )
636
746
  if extra_in_scorer:
637
- warning(f"Extra examples in scorer data: {extra_in_scorer}")
638
- info("Detected mismatched example IDs in scorer data. Waiting before polling again...")
747
+ warning(
748
+ f"Extra examples in scorer data: {extra_in_scorer}"
749
+ )
750
+ info(
751
+ "Detected mismatched example IDs in scorer data. Waiting before polling again..."
752
+ )
639
753
  await asyncio.sleep(poll_interval_seconds)
640
754
  continue
641
-
755
+
642
756
  # Create ScoringResult objects from the raw data
643
757
  scoring_results = []
644
-
758
+
645
759
  for example_data in examples_data:
646
760
  # Extract example_id from the server response
647
761
  example_id = example_data.get("example_id")
648
-
762
+
649
763
  # Create ScorerData objects
650
764
  scorer_data_list = []
651
765
  for raw_scorer_data in example_data.get("scorer_data", []):
652
766
  scorer_data_list.append(ScorerData(**raw_scorer_data))
653
-
767
+
654
768
  # Use the original Example object if we have it and the ID matches
655
769
  if original_example_map:
656
770
  example = original_example_map[example_id]
657
771
  debug(f"Matched result with original example {example_id}")
658
772
  else:
659
773
  # Create Example from example data (excluding scorer_data) if no original examples provided
660
- example_dict = {k: v for k, v in example_data.items() if k != "scorer_data"}
774
+ example_dict = {
775
+ k: v
776
+ for k, v in example_data.items()
777
+ if k != "scorer_data"
778
+ }
661
779
  example = Example(**example_dict)
662
-
780
+
663
781
  # Calculate success based on whether all scorer_data entries were successful
664
- success = all(scorer_data.success for scorer_data in scorer_data_list) if scorer_data_list else False
665
-
782
+ success = (
783
+ all(scorer_data.success for scorer_data in scorer_data_list)
784
+ if scorer_data_list
785
+ else False
786
+ )
787
+
666
788
  # Create ScoringResult
667
789
  scoring_result = ScoringResult(
668
790
  success=success, # Set based on all scorer data success values
669
791
  scorers_data=scorer_data_list,
670
- data_object=example
792
+ data_object=example,
671
793
  )
672
794
  scoring_results.append(scoring_result)
673
-
795
+
674
796
  # If we got here, all validation checks passed
675
- info(f"Verified complete results for all {len(scoring_results)} examples with all expected scorer data")
797
+ info(
798
+ f"Verified complete results for all {len(scoring_results)} examples with all expected scorer data"
799
+ )
676
800
  return scoring_results
677
801
  else:
678
802
  # No examples found
679
- info(f"No example results found for completed evaluation '{eval_name}'")
803
+ info(
804
+ f"No example results found for completed evaluation '{eval_name}'"
805
+ )
680
806
  return []
681
-
807
+
682
808
  elif status == "failed":
683
809
  # Evaluation failed
684
810
  error_message = status_data.get("error", "Unknown error")
685
811
  error(f"Evaluation '{eval_name}' failed: {error_message}")
686
812
  raise JudgmentAPIError(f"Evaluation failed: {error_message}")
687
-
813
+
688
814
  elif status == "pending" or status == "running":
689
815
  # Only log occasionally for pending/running to avoid flooding logs
690
816
  if poll_count % 4 == 0:
691
817
  info(f"Evaluation '{eval_name}' status: {status}")
692
-
818
+
693
819
  # Wait before checking again
694
820
  await asyncio.sleep(poll_interval_seconds)
695
-
821
+
696
822
  except Exception as e:
697
823
  if isinstance(e, JudgmentAPIError):
698
824
  raise
699
-
825
+
700
826
  # For other exceptions, log and continue polling
701
827
  error(f"Error checking evaluation status: {str(e)}")
702
828
  if poll_count > 20: # Only raise exception after many failed attempts
703
- raise JudgmentAPIError(f"Error checking evaluation status after {poll_count} attempts: {str(e)}")
704
-
829
+ raise JudgmentAPIError(
830
+ f"Error checking evaluation status after {poll_count} attempts: {str(e)}"
831
+ )
832
+
705
833
  # Continue polling after a delay
706
834
  await asyncio.sleep(poll_interval_seconds)
707
835
 
836
+
708
837
  async def await_with_spinner(task, message: str = "Awaiting async task: "):
709
838
  """
710
839
  Display a spinner while awaiting an async task.
711
-
840
+
712
841
  Args:
713
842
  task: The asyncio task to await
714
843
  message (str): Message to display with the spinner
715
-
844
+
716
845
  Returns:
717
846
  Any: The result of the awaited task
718
847
  """
719
- spinner = itertools.cycle(['|', '/', '-', '\\'])
720
-
848
+ spinner = itertools.cycle(["|", "/", "-", "\\"])
849
+
721
850
  # Create an event to signal when to stop the spinner
722
851
  stop_spinner_event = asyncio.Event()
723
-
852
+
724
853
  async def display_spinner():
725
854
  while not stop_spinner_event.is_set():
726
- sys.stdout.write(f'\r{message}{next(spinner)}')
855
+ sys.stdout.write(f"\r{message}{next(spinner)}")
727
856
  sys.stdout.flush()
728
857
  await asyncio.sleep(0.1)
729
-
858
+
730
859
  # Start the spinner in a separate task
731
860
  spinner_task = asyncio.create_task(display_spinner())
732
-
861
+
733
862
  try:
734
863
  # Await the actual task
735
864
  result = await task
@@ -737,66 +866,73 @@ async def await_with_spinner(task, message: str = "Awaiting async task: "):
737
866
  # Signal the spinner to stop and wait for it to finish
738
867
  stop_spinner_event.set()
739
868
  await spinner_task
740
-
869
+
741
870
  # Clear the spinner line
742
- sys.stdout.write('\r' + ' ' * (len(message) + 1) + '\r')
871
+ sys.stdout.write("\r" + " " * (len(message) + 1) + "\r")
743
872
  sys.stdout.flush()
744
-
873
+
745
874
  return result
746
875
 
876
+
747
877
  class SpinnerWrappedTask:
748
878
  """
749
879
  A wrapper for an asyncio task that displays a spinner when awaited.
750
880
  """
881
+
751
882
  def __init__(self, task, message: str):
752
883
  self.task = task
753
884
  self.message = message
754
-
885
+
755
886
  def __await__(self):
756
887
  async def _spin_and_await():
757
888
  # self.task resolves to (scoring_results, pretty_str_to_print)
758
889
  task_result_tuple = await await_with_spinner(self.task, self.message)
759
-
890
+
760
891
  # Unpack the tuple
761
892
  scoring_results, pretty_str_to_print = task_result_tuple
762
-
893
+
763
894
  # Print the pretty string if it exists, after spinner is cleared
764
895
  if pretty_str_to_print:
765
896
  rprint(pretty_str_to_print)
766
-
897
+
767
898
  # Return only the scoring_results to the original awaiter
768
899
  return scoring_results
900
+
769
901
  return _spin_and_await().__await__()
770
-
902
+
771
903
  # Proxy all Task attributes and methods to the underlying task
772
904
  def __getattr__(self, name):
773
905
  return getattr(self.task, name)
774
906
 
775
- def run_eval(evaluation_run: EvaluationRun, override: bool = False, ignore_errors: bool = True, async_execution: bool = False) -> Union[List[ScoringResult], asyncio.Task]:
907
+
908
+ def run_eval(
909
+ evaluation_run: EvaluationRun,
910
+ override: bool = False,
911
+ async_execution: bool = False,
912
+ ) -> Union[List[ScoringResult], asyncio.Task, SpinnerWrappedTask]:
776
913
  """
777
914
  Executes an evaluation of `Example`s using one or more `Scorer`s
778
915
 
779
916
  Args:
780
917
  evaluation_run (EvaluationRun): Stores example and evaluation together for running
781
918
  override (bool, optional): Whether to override existing evaluation run with same name. Defaults to False.
782
- ignore_errors (bool, optional): Whether to ignore scorer errors during evaluation. Defaults to True.
783
919
  async_execution (bool, optional): Whether to execute the evaluation asynchronously. Defaults to False.
784
-
920
+
785
921
  Returns:
786
- Union[List[ScoringResult], Union[asyncio.Task, SpinnerWrappedTask]]:
922
+ Union[List[ScoringResult], Union[asyncio.Task, SpinnerWrappedTask]]:
787
923
  - If async_execution is False, returns a list of ScoringResult objects
788
924
  - If async_execution is True, returns a Task that will resolve to a list of ScoringResult objects when awaited
789
925
  """
790
926
 
791
927
  # Call endpoint to check to see if eval run name exists (if we DON'T want to override and DO want to log results)
792
- if not override and evaluation_run.log_results and not evaluation_run.append:
928
+ if not override and not evaluation_run.append:
793
929
  check_eval_run_name_exists(
794
930
  evaluation_run.eval_name,
795
931
  evaluation_run.project_name,
796
932
  evaluation_run.judgment_api_key,
797
- evaluation_run.organization_id
933
+ evaluation_run.organization_id,
798
934
  )
799
-
935
+
800
936
  if evaluation_run.append:
801
937
  # Check that the current experiment, if one exists, has the same type (examples of traces)
802
938
  check_experiment_type(
@@ -804,15 +940,17 @@ def run_eval(evaluation_run: EvaluationRun, override: bool = False, ignore_error
804
940
  evaluation_run.project_name,
805
941
  evaluation_run.judgment_api_key,
806
942
  evaluation_run.organization_id,
807
- False
943
+ False,
808
944
  )
809
-
945
+
810
946
  # Set example IDs if not already set
811
947
  debug("Initializing examples with IDs and timestamps")
812
948
  for idx, example in enumerate(evaluation_run.examples):
813
949
  example.example_index = idx # Set numeric index
814
950
  with example_logging_context(example.created_at, example.example_id):
815
- debug(f"Initialized example {example.example_id} (index: {example.example_index})")
951
+ debug(
952
+ f"Initialized example {example.example_id} (index: {example.example_index})"
953
+ )
816
954
  debug(f"Input: {example.input}")
817
955
  debug(f"Actual output: {example.actual_output}")
818
956
  if example.expected_output:
@@ -827,9 +965,9 @@ def run_eval(evaluation_run: EvaluationRun, override: bool = False, ignore_error
827
965
  debug(f"Tools called: {example.tools_called}")
828
966
  if example.expected_tools:
829
967
  debug(f"Expected tools: {example.expected_tools}")
830
-
968
+
831
969
  debug(f"Starting evaluation run with {len(evaluation_run.examples)} examples")
832
-
970
+
833
971
  # Group APIJudgmentScorers and JudgevalScorers, then evaluate them in parallel
834
972
  debug("Grouping scorers by type")
835
973
  judgment_scorers: List[APIJudgmentScorer] = []
@@ -841,14 +979,16 @@ def run_eval(evaluation_run: EvaluationRun, override: bool = False, ignore_error
841
979
  else:
842
980
  local_scorers.append(scorer)
843
981
  debug(f"Added local scorer: {type(scorer).__name__}")
844
-
982
+
845
983
  custom_example_check = [scorer.custom_example for scorer in local_scorers]
846
984
  if any(custom_example_check) and not all(custom_example_check):
847
985
  error("All scorers must be custom scorers if using custom examples")
848
986
  raise ValueError("All scorers must be custom scorers if using custom examples")
849
-
850
- debug(f"Found {len(judgment_scorers)} judgment scorers and {len(local_scorers)} local scorers")
851
-
987
+
988
+ debug(
989
+ f"Found {len(judgment_scorers)} judgment scorers and {len(local_scorers)} local scorers"
990
+ )
991
+
852
992
  api_results: List[ScoringResult] = []
853
993
  local_results: List[ScoringResult] = []
854
994
 
@@ -856,14 +996,14 @@ def run_eval(evaluation_run: EvaluationRun, override: bool = False, ignore_error
856
996
  if len(local_scorers) > 0:
857
997
  error("Local scorers are not supported in async execution")
858
998
  raise ValueError("Local scorers are not supported in async execution")
859
-
999
+
860
1000
  check_examples(evaluation_run.examples, evaluation_run.scorers)
861
1001
  info("Starting async evaluation")
862
-
1002
+
863
1003
  async def _async_evaluation_workflow():
864
1004
  # Create a payload
865
1005
  payload = evaluation_run.model_dump(warnings=False)
866
-
1006
+
867
1007
  # Send the evaluation to the queue
868
1008
  response = await asyncio.to_thread(
869
1009
  requests.post,
@@ -871,50 +1011,52 @@ def run_eval(evaluation_run: EvaluationRun, override: bool = False, ignore_error
871
1011
  headers={
872
1012
  "Content-Type": "application/json",
873
1013
  "Authorization": f"Bearer {evaluation_run.judgment_api_key}",
874
- "X-Organization-Id": evaluation_run.organization_id
1014
+ "X-Organization-Id": evaluation_run.organization_id,
875
1015
  },
876
1016
  json=payload,
877
- verify=True
1017
+ verify=True,
878
1018
  )
879
-
1019
+
880
1020
  if not response.ok:
881
- error_message = response.json().get('detail', 'An unknown error occurred.')
1021
+ error_message = response.json().get(
1022
+ "detail", "An unknown error occurred."
1023
+ )
882
1024
  error(f"Error adding evaluation to queue: {error_message}")
883
1025
  raise JudgmentAPIError(error_message)
884
-
1026
+
885
1027
  info(f"Successfully added evaluation '{evaluation_run.eval_name}' to queue")
886
-
1028
+
887
1029
  # Poll until the evaluation is complete
888
1030
  results = await _poll_evaluation_until_complete(
889
1031
  eval_name=evaluation_run.eval_name,
890
1032
  project_name=evaluation_run.project_name,
891
1033
  judgment_api_key=evaluation_run.judgment_api_key,
892
1034
  organization_id=evaluation_run.organization_id,
893
- original_examples=evaluation_run.examples # Pass the original examples
1035
+ original_examples=evaluation_run.examples, # Pass the original examples
894
1036
  )
895
1037
 
896
1038
  pretty_str_to_print = None
897
- if evaluation_run.log_results and results: # Ensure results exist before logging
898
- send_results = [scoring_result.model_dump(warnings=False) for scoring_result in results]
1039
+ if results: # Ensure results exist before logging
1040
+ send_results = [
1041
+ scoring_result.model_dump(warnings=False)
1042
+ for scoring_result in results
1043
+ ]
899
1044
  try:
900
1045
  # Run the blocking log_evaluation_results in a separate thread
901
1046
  pretty_str_to_print = await asyncio.to_thread(
902
- log_evaluation_results,
903
- send_results,
904
- evaluation_run
1047
+ log_evaluation_results, send_results, evaluation_run
905
1048
  )
906
1049
  except Exception as e:
907
1050
  error(f"Error logging results after async evaluation: {str(e)}")
908
-
1051
+
909
1052
  return results, pretty_str_to_print
910
-
1053
+
911
1054
  # Create a regular task
912
1055
  task = asyncio.create_task(_async_evaluation_workflow())
913
-
1056
+
914
1057
  # Wrap it in our custom awaitable that will show a spinner only when awaited
915
1058
  return SpinnerWrappedTask(
916
- task,
917
- f"Processing evaluation '{evaluation_run.eval_name}': "
1059
+ task, f"Processing evaluation '{evaluation_run.eval_name}': "
918
1060
  )
919
1061
  else:
920
1062
  check_examples(evaluation_run.examples, evaluation_run.scorers)
@@ -929,25 +1071,31 @@ def run_eval(evaluation_run: EvaluationRun, override: bool = False, ignore_error
929
1071
  examples=evaluation_run.examples,
930
1072
  scorers=judgment_scorers,
931
1073
  model=evaluation_run.model,
932
- aggregator=evaluation_run.aggregator,
933
- metadata=evaluation_run.metadata,
934
1074
  judgment_api_key=evaluation_run.judgment_api_key,
935
1075
  organization_id=evaluation_run.organization_id,
936
- log_results=evaluation_run.log_results,
937
- rules=evaluation_run.rules
938
1076
  )
939
- debug("Sending request to Judgment API")
940
- response_data: List[Dict] = run_with_spinner("Running Evaluation: ", execute_api_eval, api_evaluation_run)
1077
+ debug("Sending request to Judgment API")
1078
+ response_data: Dict = run_with_spinner(
1079
+ "Running Evaluation: ", execute_api_eval, api_evaluation_run
1080
+ )
941
1081
  info(f"Received {len(response_data['results'])} results from API")
942
1082
  except JudgmentAPIError as e:
943
- error(f"An error occurred while executing the Judgment API request: {str(e)}")
944
- raise JudgmentAPIError(f"An error occurred while executing the Judgment API request: {str(e)}")
1083
+ error(
1084
+ f"An error occurred while executing the Judgment API request: {str(e)}"
1085
+ )
1086
+ raise JudgmentAPIError(
1087
+ f"An error occurred while executing the Judgment API request: {str(e)}"
1088
+ )
945
1089
  except ValueError as e:
946
- raise ValueError(f"Please check your EvaluationRun object, one or more fields are invalid: {str(e)}")
947
-
1090
+ raise ValueError(
1091
+ f"Please check your EvaluationRun object, one or more fields are invalid: {str(e)}"
1092
+ )
1093
+
948
1094
  # Convert the response data to `ScoringResult` objects
949
1095
  debug("Processing API results")
950
- api_results = [ScoringResult(**result) for result in response_data["results"]]
1096
+ api_results = [
1097
+ ScoringResult(**result) for result in response_data["results"]
1098
+ ]
951
1099
  # Run local evals
952
1100
  if local_scorers: # List[JudgevalScorer]
953
1101
  # We should be removing local scorers soon
@@ -955,13 +1103,12 @@ def run_eval(evaluation_run: EvaluationRun, override: bool = False, ignore_error
955
1103
  for example in evaluation_run.examples:
956
1104
  with example_logging_context(example.created_at, example.example_id):
957
1105
  debug(f"Processing example {example.example_id}: {example.input}")
958
-
959
- results: List[ScoringResult] = asyncio.run(
1106
+
1107
+ results: List[ScoringResult] = safe_run_async(
960
1108
  a_execute_scoring(
961
1109
  evaluation_run.examples,
962
1110
  local_scorers,
963
1111
  model=evaluation_run.model,
964
- ignore_errors=ignore_errors,
965
1112
  skip_on_missing_params=True,
966
1113
  show_indicator=True,
967
1114
  _use_bar_indicator=True,
@@ -981,22 +1128,34 @@ def run_eval(evaluation_run: EvaluationRun, override: bool = False, ignore_error
981
1128
  # Evaluate rules against local scoring results if rules exist (this cant be done just yet)
982
1129
  # if evaluation_run.rules and merged_results:
983
1130
  # run_rules(
984
- # local_results=merged_results,
985
- # rules=evaluation_run.rules,
1131
+ # local_results=merged_results,
1132
+ # rules=evaluation_run.rules,
986
1133
  # judgment_api_key=evaluation_run.judgment_api_key,
987
1134
  # organization_id=evaluation_run.organization_id
988
1135
  # )
989
1136
  # print(merged_results)
990
- if evaluation_run.log_results:
991
- send_results = [scoring_result.model_dump(warnings=False) for scoring_result in merged_results]
992
- pretty_str = run_with_spinner("Logging Results: ", log_evaluation_results, send_results, evaluation_run)
993
- rprint(pretty_str)
1137
+ send_results = [
1138
+ scoring_result.model_dump(warnings=False)
1139
+ for scoring_result in merged_results
1140
+ ]
1141
+ pretty_str = run_with_spinner(
1142
+ "Logging Results: ",
1143
+ log_evaluation_results,
1144
+ send_results,
1145
+ evaluation_run,
1146
+ )
1147
+ rprint(pretty_str)
994
1148
 
995
1149
  for i, result in enumerate(merged_results):
996
- if not result.scorers_data: # none of the scorers could be executed on this example
997
- info(f"None of the scorers could be executed on example {i}. This is usually because the Example is missing the fields needed by the scorers. Try checking that the Example has the necessary fields for your scorers.")
1150
+ if (
1151
+ not result.scorers_data
1152
+ ): # none of the scorers could be executed on this example
1153
+ info(
1154
+ f"None of the scorers could be executed on example {i}. This is usually because the Example is missing the fields needed by the scorers. Try checking that the Example has the necessary fields for your scorers."
1155
+ )
998
1156
  return merged_results
999
1157
 
1158
+
1000
1159
  def assert_test(scoring_results: List[ScoringResult]) -> None:
1001
1160
  """
1002
1161
  Collects all failed scorers from the scoring results.
@@ -1011,11 +1170,8 @@ def assert_test(scoring_results: List[ScoringResult]) -> None:
1011
1170
 
1012
1171
  for result in scoring_results:
1013
1172
  if not result.success:
1014
-
1015
1173
  # Create a test case context with all relevant fields
1016
- test_case = {
1017
- "failed_scorers": []
1018
- }
1174
+ test_case: Dict = {"failed_scorers": []}
1019
1175
  if result.scorers_data:
1020
1176
  # If the result was not successful, check each scorer_data
1021
1177
  for scorer_data in result.scorers_data:
@@ -1024,12 +1180,11 @@ def assert_test(scoring_results: List[ScoringResult]) -> None:
1024
1180
  # Remove threshold, evaluation model for Tool Order scorer
1025
1181
  scorer_data.threshold = None
1026
1182
  scorer_data.evaluation_model = None
1027
- test_case['failed_scorers'].append(scorer_data)
1183
+ test_case["failed_scorers"].append(scorer_data)
1028
1184
  failed_cases.append(test_case)
1029
1185
 
1030
1186
  if failed_cases:
1031
-
1032
- error_msg = f"The following test cases failed: \n"
1187
+ error_msg = "The following test cases failed: \n"
1033
1188
  for fail_case in failed_cases:
1034
1189
  # error_msg += f"\nInput: {fail_case['input']}\n"
1035
1190
  # error_msg += f"Actual Output: {fail_case['actual_output']}\n"
@@ -1039,13 +1194,12 @@ def assert_test(scoring_results: List[ScoringResult]) -> None:
1039
1194
  # error_msg += f"Additional Metadata: {fail_case['additional_metadata']}\n"
1040
1195
  # error_msg += f"Tools Called: {fail_case['tools_called']}\n"
1041
1196
  # error_msg += f"Expected Tools: {fail_case['expected_tools']}\n"
1042
-
1043
- for fail_scorer in fail_case['failed_scorers']:
1044
1197
 
1198
+ for fail_scorer in fail_case["failed_scorers"]:
1045
1199
  error_msg += (
1046
1200
  f"\nScorer Name: {fail_scorer.name}\n"
1047
1201
  f"Threshold: {fail_scorer.threshold}\n"
1048
- f"Success: {fail_scorer.success}\n"
1202
+ f"Success: {fail_scorer.success}\n"
1049
1203
  f"Score: {fail_scorer.score}\n"
1050
1204
  f"Reason: {fail_scorer.reason}\n"
1051
1205
  f"Strict Mode: {fail_scorer.strict_mode}\n"
@@ -1055,19 +1209,23 @@ def assert_test(scoring_results: List[ScoringResult]) -> None:
1055
1209
  f"Verbose Logs: {fail_scorer.verbose_logs}\n"
1056
1210
  f"Additional Metadata: {fail_scorer.additional_metadata}\n"
1057
1211
  )
1058
- error_msg += "-"*100
1212
+ error_msg += "-" * 100
1059
1213
 
1060
1214
  total_tests = len(scoring_results)
1061
1215
  failed_tests = len(failed_cases)
1062
1216
  passed_tests = total_tests - failed_tests
1063
1217
 
1064
1218
  # Print summary with colors
1065
- rprint("\n" + "="*80)
1219
+ rprint("\n" + "=" * 80)
1066
1220
  if failed_tests == 0:
1067
- rprint(f"[bold green]🎉 ALL TESTS PASSED! {passed_tests}/{total_tests} tests successful[/bold green]")
1221
+ rprint(
1222
+ f"[bold green]🎉 ALL TESTS PASSED! {passed_tests}/{total_tests} tests successful[/bold green]"
1223
+ )
1068
1224
  else:
1069
- rprint(f"[bold red]⚠️ TEST RESULTS: {passed_tests}/{total_tests} passed ({failed_tests} failed)[/bold red]")
1070
- rprint("="*80 + "\n")
1225
+ rprint(
1226
+ f"[bold red]⚠️ TEST RESULTS: {passed_tests}/{total_tests} passed ({failed_tests} failed)[/bold red]"
1227
+ )
1228
+ rprint("=" * 80 + "\n")
1071
1229
 
1072
1230
  # Print individual test cases
1073
1231
  for i, result in enumerate(scoring_results):
@@ -1084,9 +1242,8 @@ def assert_test(scoring_results: List[ScoringResult]) -> None:
1084
1242
  rprint(f" [red] Reason: {scorer_data.reason}[/red]")
1085
1243
  if scorer_data.error:
1086
1244
  rprint(f" [red] Error: {scorer_data.error}[/red]")
1087
- rprint(" " + "-"*40)
1245
+ rprint(" " + "-" * 40)
1088
1246
 
1089
- rprint("\n" + "="*80)
1247
+ rprint("\n" + "=" * 80)
1090
1248
  if failed_tests > 0:
1091
1249
  raise AssertionError(failed_cases)
1092
-
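The run_evaluation.py diff above centers on the new module-level helper safe_run_async, which run_with_spinner and the local-scorer path of run_eval now call instead of asyncio.run() directly. A minimal usage sketch, assuming judgeval 0.0.46 is installed (the toy coroutine is illustrative only):

    import asyncio

    from judgeval.run_evaluation import safe_run_async


    async def toy_task():
        # Stand-in for an async evaluation step.
        await asyncio.sleep(0.1)
        return "done"


    # With no event loop running, the helper falls through to asyncio.run().
    print(safe_run_async(toy_task()))


    # Inside an already-running loop (e.g. a notebook cell), it instead submits
    # asyncio.run() to a worker thread, avoiding the usual RuntimeError.
    async def caller():
        return safe_run_async(toy_task())


    print(asyncio.run(caller()))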