judgeval 0.0.55__py3-none-any.whl → 0.2.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (43)
  1. judgeval/common/api/__init__.py +3 -0
  2. judgeval/common/api/api.py +352 -0
  3. judgeval/common/api/constants.py +165 -0
  4. judgeval/common/storage/__init__.py +6 -0
  5. judgeval/common/tracer/__init__.py +31 -0
  6. judgeval/common/tracer/constants.py +22 -0
  7. judgeval/common/tracer/core.py +1916 -0
  8. judgeval/common/tracer/otel_exporter.py +108 -0
  9. judgeval/common/tracer/otel_span_processor.py +234 -0
  10. judgeval/common/tracer/span_processor.py +37 -0
  11. judgeval/common/tracer/span_transformer.py +211 -0
  12. judgeval/common/tracer/trace_manager.py +92 -0
  13. judgeval/common/utils.py +2 -2
  14. judgeval/constants.py +3 -30
  15. judgeval/data/datasets/eval_dataset_client.py +29 -156
  16. judgeval/data/judgment_types.py +4 -12
  17. judgeval/data/result.py +1 -1
  18. judgeval/data/scorer_data.py +2 -2
  19. judgeval/data/scripts/openapi_transform.py +1 -1
  20. judgeval/data/trace.py +66 -1
  21. judgeval/data/trace_run.py +0 -3
  22. judgeval/evaluation_run.py +0 -2
  23. judgeval/integrations/langgraph.py +43 -164
  24. judgeval/judgment_client.py +17 -211
  25. judgeval/run_evaluation.py +216 -611
  26. judgeval/scorers/__init__.py +2 -6
  27. judgeval/scorers/base_scorer.py +4 -23
  28. judgeval/scorers/judgeval_scorers/api_scorers/__init__.py +3 -3
  29. judgeval/scorers/judgeval_scorers/api_scorers/prompt_scorer.py +215 -0
  30. judgeval/scorers/score.py +2 -1
  31. judgeval/scorers/utils.py +1 -13
  32. judgeval/utils/requests.py +21 -0
  33. judgeval-0.2.0.dist-info/METADATA +202 -0
  34. {judgeval-0.0.55.dist-info → judgeval-0.2.0.dist-info}/RECORD +37 -29
  35. judgeval/common/tracer.py +0 -3215
  36. judgeval/scorers/judgeval_scorers/api_scorers/classifier_scorer.py +0 -73
  37. judgeval/scorers/judgeval_scorers/classifiers/__init__.py +0 -3
  38. judgeval/scorers/judgeval_scorers/classifiers/text2sql/__init__.py +0 -3
  39. judgeval/scorers/judgeval_scorers/classifiers/text2sql/text2sql_scorer.py +0 -53
  40. judgeval-0.0.55.dist-info/METADATA +0 -1384
  41. /judgeval/common/{s3_storage.py → storage/s3_storage.py} +0 -0
  42. {judgeval-0.0.55.dist-info → judgeval-0.2.0.dist-info}/WHEEL +0 -0
  43. {judgeval-0.0.55.dist-info → judgeval-0.2.0.dist-info}/licenses/LICENSE.md +0 -0
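The hunks below are from judgeval/run_evaluation.py (+216 −611), the largest change in this release. The recurring pattern is that inline `requests.post(...)` calls (hand-built headers, `verify=True`, ad-hoc `response.json()` error handling) are replaced by the new `JudgmentApiClient` from `judgeval/common/api`, with failures surfaced as typed `JudgmentAPIException`s. A minimal sketch of that pattern, using only calls that appear in this diff (the helper name `submit_eval` is illustrative, not part of the package):

```python
# Sketch only: mirrors the new execute_api_eval() flow shown in the diff below.
from judgeval.common.api import JudgmentApiClient
from judgeval.common.api.api import JudgmentAPIException
from judgeval.common.exceptions import JudgmentAPIError

def submit_eval(evaluation_run, judgment_api_key: str) -> dict:
    # The client bundles the API key and organization id that were previously
    # passed as Authorization / X-Organization-Id headers on every request.
    client = JudgmentApiClient(judgment_api_key, evaluation_run.organization_id)
    try:
        return client.run_evaluation(evaluation_run.model_dump())
    except JudgmentAPIException as e:
        # Error details now come from the typed exception instead of response.json().
        detail = e.response_json.get("detail", "No details provided")
        raise JudgmentAPIError(
            "An error occurred while executing the Judgment API request: " + detail
        )
```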
@@ -1,29 +1,21 @@
  import asyncio
  import concurrent.futures
- from requests import exceptions
- from judgeval.utils.requests import requests
  import time
  import json
  import sys
- import itertools
  import threading
- from typing import List, Dict, Any, Union, Optional, Callable
+ from typing import List, Dict, Union, Optional, Callable, Tuple, Any
  from rich import print as rprint

  from judgeval.data import ScorerData, ScoringResult, Example, Trace
  from judgeval.scorers import BaseScorer, APIScorerConfig
  from judgeval.scorers.score import a_execute_scoring
+ from judgeval.common.api import JudgmentApiClient
  from judgeval.constants import (
- ROOT_API,
- JUDGMENT_EVAL_API_URL,
- JUDGMENT_TRACE_EVAL_API_URL,
- JUDGMENT_EVAL_LOG_API_URL,
  MAX_CONCURRENT_EVALUATIONS,
- JUDGMENT_ADD_TO_RUN_EVAL_QUEUE_API_URL,
- JUDGMENT_GET_EVAL_STATUS_API_URL,
- JUDGMENT_EVAL_FETCH_API_URL,
  )
  from judgeval.common.exceptions import JudgmentAPIError
+ from judgeval.common.api.api import JudgmentAPIException
  from judgeval.common.logger import judgeval_logger
  from judgeval.evaluation_run import EvaluationRun
  from judgeval.data.trace_run import TraceRun
@@ -54,22 +46,20 @@ def safe_run_async(coro):
  return asyncio.run(coro)


- def send_to_rabbitmq(evaluation_run: EvaluationRun) -> None:
+ def send_to_rabbitmq(evaluation_run: EvaluationRun) -> Dict[str, Any]:
  """
  Sends an evaluation run to the RabbitMQ evaluation queue.
  """
- payload = evaluation_run.model_dump(warnings=False)
- response = requests.post(
- JUDGMENT_ADD_TO_RUN_EVAL_QUEUE_API_URL,
- headers={
- "Content-Type": "application/json",
- "Authorization": f"Bearer {evaluation_run.judgment_api_key}",
- "X-Organization-Id": evaluation_run.organization_id,
- },
- json=payload,
- verify=True,
+ if not evaluation_run.judgment_api_key or not evaluation_run.organization_id:
+ raise ValueError("API key and organization ID are required")
+ if not evaluation_run.eval_name or not evaluation_run.project_name:
+ raise ValueError("Eval name and project name are required")
+ api_client = JudgmentApiClient(
+ evaluation_run.judgment_api_key, evaluation_run.organization_id
+ )
+ return api_client.add_to_evaluation_queue(
+ evaluation_run.eval_name, evaluation_run.project_name
  )
- return response.json()


  def execute_api_eval(evaluation_run: EvaluationRun) -> Dict:
@@ -86,146 +76,46 @@ def execute_api_eval(evaluation_run: EvaluationRun) -> Dict:

  try:
  # submit API request to execute evals
- payload = evaluation_run.model_dump()
- response = requests.post(
- JUDGMENT_EVAL_API_URL,
- headers={
- "Content-Type": "application/json",
- "Authorization": f"Bearer {evaluation_run.judgment_api_key}",
- "X-Organization-Id": evaluation_run.organization_id,
- },
- json=payload,
- verify=True,
+ if not evaluation_run.judgment_api_key or not evaluation_run.organization_id:
+ raise ValueError("API key and organization ID are required")
+ api_client = JudgmentApiClient(
+ evaluation_run.judgment_api_key, evaluation_run.organization_id
  )
- response_data = response.json()
+ return api_client.run_evaluation(evaluation_run.model_dump())
  except Exception as e:
  judgeval_logger.error(f"Error: {e}")
- details = response.json().get("detail", "No details provided")
+
+ details = "No details provided"
+ if isinstance(e, JudgmentAPIException):
+ details = e.response_json.get("detail", "No details provided")
+
  raise JudgmentAPIError(
  "An error occurred while executing the Judgment API request: " + details
  )
- # Check if the response status code is not 2XX
- # Add check for the duplicate eval run name
- if not response.ok:
- error_message = response_data.get("detail", "An unknown error occurred.")
- judgeval_logger.error(f"Error: {error_message=}")
- raise JudgmentAPIError(error_message)
- return response_data


- def execute_api_trace_eval(trace_run: TraceRun) -> Dict:
+ def execute_api_trace_eval(trace_run: TraceRun, judgment_api_key: str) -> Dict:
  """
  Executes an evaluation of a list of `Trace`s using one or more `JudgmentScorer`s via the Judgment API.
  """

  try:
  # submit API request to execute evals
- payload = trace_run.model_dump(warnings=False)
- response = requests.post(
- JUDGMENT_TRACE_EVAL_API_URL,
- headers={
- "Content-Type": "application/json",
- "Authorization": f"Bearer {trace_run.judgment_api_key}",
- "X-Organization-Id": trace_run.organization_id,
- },
- json=payload,
- verify=True,
- )
- response_data = response.json()
+ if not judgment_api_key or not trace_run.organization_id:
+ raise ValueError("API key and organization ID are required")
+ api_client = JudgmentApiClient(judgment_api_key, trace_run.organization_id)
+ return api_client.run_trace_evaluation(trace_run.model_dump(warnings=False))
  except Exception as e:
  judgeval_logger.error(f"Error: {e}")
- details = response.json().get("detail", "No details provided")
- raise JudgmentAPIError(
- "An error occurred while executing the Judgment API request: " + details
- )
- # Check if the response status code is not 2XX
- # Add check for the duplicate eval run name
- if not response.ok:
- error_message = response_data.get("detail", "An unknown error occurred.")
- judgeval_logger.error(f"Error: {error_message=}")
- raise JudgmentAPIError(error_message)
- return response_data
-
-
- def merge_results(
- api_results: List[ScoringResult], local_results: List[ScoringResult]
- ) -> List[ScoringResult]:
- """
- When executing scorers that come from both the Judgment API and local scorers, we're left with
- results for each type of scorer. This function merges the results from the API and local evaluations,
- grouped by example. In particular, we merge the `scorers_data` field of each `ScoringResult` object.

- Args:
- api_results (List[ScoringResult]): The `ScoringResult`s from the API evaluation
- local_results (List[ScoringResult]): The `ScoringResult`s from the local evaluation
+ details = "An unknown error occurred."
+ if isinstance(e, JudgmentAPIException):
+ details = e.response_json.get("detail", "An unknown error occurred.")

- Returns:
- List[ScoringResult]: The merged `ScoringResult`s (updated `scorers_data` field)
- """
- # No merge required
- if not local_results and api_results:
- return [result.model_copy() for result in api_results]
- if not api_results and local_results:
- return [result.model_copy() for result in local_results]
-
- if len(api_results) != len(local_results):
- # Results should be of same length because each ScoringResult is a 1-1 mapping to an Example
- raise ValueError(
- f"The number of API and local results do not match: {len(api_results)} vs {len(local_results)}"
+ raise JudgmentAPIError(
+ "An error occurred while executing the Judgment API request: " + details
  )

- # Create a copy of api_results to avoid modifying the input
- merged_results = [result.model_copy() for result in api_results]
-
- # Each ScoringResult in api and local have all the same fields besides `scorers_data`
- for merged_result, local_result in zip(merged_results, local_results):
- if not (merged_result.data_object and local_result.data_object):
- raise ValueError("Data object is None in one of the results.")
- if merged_result.data_object.input != local_result.data_object.input:
- raise ValueError("The API and local results are not aligned.")
- if (
- merged_result.data_object.actual_output
- != local_result.data_object.actual_output
- ):
- raise ValueError("The API and local results are not aligned.")
- if (
- merged_result.data_object.expected_output
- != local_result.data_object.expected_output
- ):
- raise ValueError("The API and local results are not aligned.")
- if merged_result.data_object.context != local_result.data_object.context:
- raise ValueError("The API and local results are not aligned.")
- if (
- merged_result.data_object.retrieval_context
- != local_result.data_object.retrieval_context
- ):
- raise ValueError("The API and local results are not aligned.")
- if (
- merged_result.data_object.additional_metadata
- != local_result.data_object.additional_metadata
- ):
- raise ValueError("The API and local results are not aligned.")
- if (
- merged_result.data_object.tools_called
- != local_result.data_object.tools_called
- ):
- raise ValueError("The API and local results are not aligned.")
- if (
- merged_result.data_object.expected_tools
- != local_result.data_object.expected_tools
- ):
- raise ValueError("The API and local results are not aligned.")
-
- # Merge ScorerData from the API and local scorers together
- api_scorer_data = merged_result.scorers_data
- local_scorer_data = local_result.scorers_data
- if api_scorer_data is None and local_scorer_data is not None:
- merged_result.scorers_data = local_scorer_data
- elif api_scorer_data is not None and local_scorer_data is not None:
- merged_result.scorers_data = api_scorer_data + local_scorer_data
-
- return merged_results
-

  def check_missing_scorer_data(results: List[ScoringResult]) -> List[ScoringResult]:
  """
@@ -255,34 +145,17 @@ def check_experiment_type(
  """
  Checks if the current experiment, if one exists, has the same type (examples of traces)
  """
- try:
- response = requests.post(
- f"{ROOT_API}/check_experiment_type/",
- headers={
- "Content-Type": "application/json",
- "Authorization": f"Bearer {judgment_api_key}",
- "X-Organization-Id": organization_id,
- },
- json={
- "eval_name": eval_name,
- "project_name": project_name,
- "judgment_api_key": judgment_api_key,
- "is_trace": is_trace,
- },
- verify=True,
- )
-
- if response.status_code == 422:
- judgeval_logger.error(f"{response.json()}")
- raise ValueError(f"{response.json()}")
+ api_client = JudgmentApiClient(judgment_api_key, organization_id)

- if not response.ok:
- response_data = response.json()
- error_message = response_data.get("detail", "An unknown error occurred.")
- judgeval_logger.error(f"Error checking eval run name: {error_message}")
- raise JudgmentAPIError(error_message)
-
- except exceptions.RequestException as e:
+ try:
+ api_client.check_experiment_type(eval_name, project_name, is_trace)
+ except JudgmentAPIException as e:
+ if e.response.status_code == 422:
+ judgeval_logger.error(f"{e.response_json}")
+ raise ValueError(f"{e.response_json}")
+ else:
+ raise e
+ except Exception as e:
  judgeval_logger.error(f"Failed to check if experiment type exists: {str(e)}")
  raise JudgmentAPIError(f"Failed to check if experiment type exists: {str(e)}")

@@ -302,125 +175,56 @@ def check_eval_run_name_exists(
  ValueError: If the evaluation run name already exists
  JudgmentAPIError: If there's an API error during the check
  """
+ api_client = JudgmentApiClient(judgment_api_key, organization_id)
  try:
- response = requests.post(
- f"{ROOT_API}/eval-run-name-exists/",
- headers={
- "Content-Type": "application/json",
- "Authorization": f"Bearer {judgment_api_key}",
- "X-Organization-Id": organization_id,
- },
- json={
- "eval_name": eval_name,
- "project_name": project_name,
- "judgment_api_key": judgment_api_key,
- },
- verify=True,
- )
-
- if response.status_code == 409:
- judgeval_logger.error(
- f"Eval run name '{eval_name}' already exists for this project. Please choose a different name, set the `override` flag to true, or set the `append` flag to true."
- )
- raise ValueError(
- f"Eval run name '{eval_name}' already exists for this project. Please choose a different name, set the `override` flag to true, or set the `append` flag to true."
- )
-
- if not response.ok:
- response_data = response.json()
- error_message = response_data.get("detail", "An unknown error occurred.")
- judgeval_logger.error(f"Error checking eval run name: {error_message}")
- raise JudgmentAPIError(error_message)
+ api_client.check_eval_run_name_exists(eval_name, project_name)
+ except JudgmentAPIException as e:
+ if e.response.status_code == 409:
+ error_str = f"Eval run name '{eval_name}' already exists for this project. Please choose a different name, set the `override` flag to true, or set the `append` flag to true. See https://docs.judgmentlabs.ai/sdk-reference/judgment-client#override for more information."
+ judgeval_logger.error(error_str)
+ raise ValueError(error_str)
+ else:
+ raise e

- except exceptions.RequestException as e:
+ except Exception as e:
  judgeval_logger.error(f"Failed to check if eval run name exists: {str(e)}")
  raise JudgmentAPIError(f"Failed to check if eval run name exists: {str(e)}")


  def log_evaluation_results(
- scoring_results: List[ScoringResult], run: Union[EvaluationRun, TraceRun]
- ) -> str | None:
+ scoring_results: List[ScoringResult],
+ run: Union[EvaluationRun, TraceRun],
+ judgment_api_key: str,
+ ) -> str:
  """
  Logs evaluation results to the Judgment API database.

  Args:
  merged_results (List[ScoringResult]): The results to log
  evaluation_run (EvaluationRun): The evaluation run containing project info and API key
+ judgment_api_key (str): The API key for the Judgment API

  Raises:
  JudgmentAPIError: If there's an API error during logging
  ValueError: If there's a validation error with the results
  """
  try:
- res = requests.post(
- JUDGMENT_EVAL_LOG_API_URL,
- headers={
- "Content-Type": "application/json",
- "Authorization": f"Bearer {run.judgment_api_key}",
- "X-Organization-Id": run.organization_id,
- },
- json={"results": scoring_results, "run": run.model_dump(warnings=False)},
- verify=True,
- )
+ if not judgment_api_key or not run.organization_id:
+ raise ValueError("API key and organization ID are required")

- if not res.ok:
- response_data = res.json()
- error_message = response_data.get("detail", "An unknown error occurred.")
- judgeval_logger.error(f"Error {res.status_code}: {error_message}")
- raise JudgmentAPIError(error_message)
-
- if "ui_results_url" in res.json():
- url = res.json()["ui_results_url"]
- pretty_str = f"\n🔍 You can view your evaluation results here: [rgb(106,0,255)][link={url}]View Results[/link]\n"
- return pretty_str
-
- return None
-
- except exceptions.RequestException as e:
- judgeval_logger.error(
- f"Request failed while saving evaluation results to DB: {str(e)}"
+ api_client = JudgmentApiClient(judgment_api_key, run.organization_id)
+ response = api_client.log_evaluation_results(
+ scoring_results,
+ run.model_dump(warnings=False),
  )
+ url = response.get("ui_results_url")
+ return url
+
+ except Exception as e:
+ judgeval_logger.error(f"Failed to save evaluation results to DB: {str(e)}")
  raise JudgmentAPIError(
  f"Request failed while saving evaluation results to DB: {str(e)}"
  )
- except Exception as e:
- judgeval_logger.error(f"Failed to save evaluation results to DB: {str(e)}")
- raise ValueError(f"Failed to save evaluation results to DB: {str(e)}")
-
-
- def run_with_spinner(message: str, func, *args, **kwargs) -> Any:
- """Run a function with a spinner in the terminal."""
- spinner = itertools.cycle(["|", "/", "-", "\\"])
-
- def display_spinner():
- while not stop_spinner_event.is_set():
- sys.stdout.write(f"\r{message}{next(spinner)}")
- sys.stdout.flush()
- time.sleep(0.1)
-
- stop_spinner_event = threading.Event()
- spinner_thread = threading.Thread(target=display_spinner)
- spinner_thread.start()
-
- try:
- if asyncio.iscoroutinefunction(func):
- coro = func(*args, **kwargs)
- result = safe_run_async(coro)
- else:
- result = func(*args, **kwargs)
- except Exception as e:
- judgeval_logger.error(f"An error occurred: {str(e)}")
- stop_spinner_event.set()
- spinner_thread.join()
- raise e
- finally:
- stop_spinner_event.set()
- spinner_thread.join()
-
- sys.stdout.write("\r" + " " * (len(message) + 1) + "\r")
- sys.stdout.flush()
-
- return result


  def check_examples(
@@ -455,6 +259,7 @@ def check_examples(

  def run_trace_eval(
  trace_run: TraceRun,
+ judgment_api_key: str,
  override: bool = False,
  function: Optional[Callable] = None,
  tracer: Optional[Union[Tracer, BaseCallbackHandler]] = None,
@@ -465,7 +270,7 @@ def run_trace_eval(
  check_eval_run_name_exists(
  trace_run.eval_name,
  trace_run.project_name,
- trace_run.judgment_api_key,
+ judgment_api_key,
  trace_run.organization_id,
  )

@@ -474,7 +279,7 @@ def run_trace_eval(
  check_experiment_type(
  trace_run.eval_name,
  trace_run.project_name,
- trace_run.judgment_api_key,
+ judgment_api_key,
  trace_run.organization_id,
  True,
  )
@@ -487,24 +292,28 @@ def run_trace_eval(
  # This is a callback handler, get the underlying tracer
  actual_tracer = tracer.tracer

+ if trace_run.project_name != actual_tracer.project_name:
+ raise ValueError(
+ f"Project name mismatch between run_trace_eval and tracer. "
+ f"Trace run: {trace_run.project_name}, "
+ f"Tracer: {actual_tracer.project_name}"
+ )
+
  actual_tracer.offline_mode = True
  actual_tracer.traces = []
+ judgeval_logger.info("Running agent function: ")
  for example in examples:
  if example.input:
  if isinstance(example.input, str):
- run_with_spinner(
- "Running agent function: ", function, example.input
- )
+ function(example.input)
  elif isinstance(example.input, dict):
- run_with_spinner(
- "Running agent function: ", function, **example.input
- )
+ function(**example.input)
  else:
  raise ValueError(
  f"Input must be string or dict, got {type(example.input)}"
  )
  else:
- run_with_spinner("Running agent function: ", function)
+ function()

  for i, trace in enumerate(actual_tracer.traces):
  # We set the root-level trace span with the expected tools of the Trace
@@ -516,9 +325,8 @@ def run_trace_eval(

  # Execute evaluation using Judgment API
  try: # execute an EvaluationRun with just JudgmentScorers
- response_data: Dict = run_with_spinner(
- "Running Trace Evaluation: ", execute_api_trace_eval, trace_run
- )
+ judgeval_logger.info("Executing Trace Evaluation... ")
+ response_data: Dict = execute_api_trace_eval(trace_run, judgment_api_key)
  scoring_results = [
  ScoringResult(**result) for result in response_data["results"]
  ]
@@ -534,14 +342,12 @@ def run_trace_eval(
  # Convert the response data to `ScoringResult` objects
  # TODO: allow for custom scorer on traces

- pretty_str = run_with_spinner(
- "Logging Results: ",
- log_evaluation_results,
- response_data["agent_results"],
- trace_run,
+ url = log_evaluation_results(
+ response_data["agent_results"], trace_run, judgment_api_key
+ )
+ rprint(
+ f"\n🔍 You can view your evaluation results here: [rgb(106,0,255)][link={url}]View Results[/link]\n"
  )
- rprint(pretty_str)
-
  return scoring_results


@@ -563,41 +369,33 @@ async def get_evaluation_status(
  - results: List of ScoringResult objects if completed
  - error: Error message if failed
  """
+ api_client = JudgmentApiClient(judgment_api_key, organization_id)
  try:
- response = requests.get(
- JUDGMENT_GET_EVAL_STATUS_API_URL,
- headers={
- "Content-Type": "application/json",
- "Authorization": f"Bearer {judgment_api_key}",
- "X-Organization-Id": organization_id,
- },
- params={
- "eval_name": eval_name,
- "project_name": project_name,
- },
- verify=True,
+ return api_client.get_evaluation_status(eval_name, project_name)
+ except Exception as e:
+ raise JudgmentAPIError(
+ f"An error occurred while checking evaluation status: {str(e)}"
  )

- if not response.ok:
- error_message = response.json().get("detail", "An unknown error occurred.")
- judgeval_logger.error(f"Error checking evaluation status: {error_message}")
- raise JudgmentAPIError(error_message)

- return response.json()
- except exceptions.RequestException as e:
- judgeval_logger.error(f"Failed to check evaluation status: {str(e)}")
- raise JudgmentAPIError(f"Failed to check evaluation status: {str(e)}")
+ def retrieve_counts(result: Dict):
+ scorer_data_count = 0
+ for example in result.get("examples", []):
+ for scorer in example.get("scorer_data", []):
+ scorer_data_count += 1
+ return scorer_data_count


- async def _poll_evaluation_until_complete(
+ def _poll_evaluation_until_complete(
  eval_name: str,
  project_name: str,
  judgment_api_key: str,
  organization_id: str,
- expected_scorer_count: int,
- original_examples: List[Example],
- poll_interval_seconds: int = 5,
- ) -> List[ScoringResult]:
+ expected_scorer_data_count: int,
+ poll_interval_seconds: float = 5,
+ max_failures: int = 5,
+ max_poll_count: int = 24, # This should be equivalent to 120 seconds
+ ) -> Tuple[List[ScoringResult], str]:
  """
  Polls until the evaluation is complete and returns the results.

@@ -614,210 +412,93 @@ async def _poll_evaluation_until_complete(
  List[ScoringResult]: The evaluation results
  """
  poll_count = 0
-
- while True:
+ exception_count = 0
+ api_client = JudgmentApiClient(judgment_api_key, organization_id)
+ while poll_count < max_poll_count:
  poll_count += 1
  try:
  # Check status
- response = await asyncio.to_thread(
- requests.get,
- JUDGMENT_GET_EVAL_STATUS_API_URL,
- headers={
- "Content-Type": "application/json",
- "Authorization": f"Bearer {judgment_api_key}",
- "X-Organization-Id": organization_id,
- },
- params={"eval_name": eval_name, "project_name": project_name},
- verify=True,
- )
+ status_response = api_client.get_evaluation_status(eval_name, project_name)

- if not response.ok:
- error_message = response.json().get(
- "detail", "An unknown error occurred."
- )
- judgeval_logger.error(
- f"Error checking evaluation status: {error_message}"
- )
- # Don't raise exception immediately, just log and continue polling
- await asyncio.sleep(poll_interval_seconds)
+ if status_response.get("status") != "completed":
+ time.sleep(poll_interval_seconds)
  continue

- status_data = response.json()
- status = status_data.get("status")
-
- # If complete, get results and return
- if status == "completed" or status == "complete":
- results_response = await asyncio.to_thread(
- requests.post,
- JUDGMENT_EVAL_FETCH_API_URL,
- headers={
- "Content-Type": "application/json",
- "Authorization": f"Bearer {judgment_api_key}",
- "X-Organization-Id": organization_id,
- },
- json={"project_name": project_name, "eval_name": eval_name},
- verify=True,
- )
-
- if not results_response.ok:
- error_message = results_response.json().get(
- "detail", "An unknown error occurred."
- )
- judgeval_logger.error(
- f"Error fetching evaluation results: {error_message}"
- )
- raise JudgmentAPIError(error_message)
-
- result_data = results_response.json()
-
- if result_data.get("examples") is None:
- continue
-
- examples_data = result_data.get("examples", [])
- scoring_results = []
-
- for example_data in examples_data:
- # Create ScorerData objects
- scorer_data_list = []
- for raw_scorer_data in example_data.get("scorer_data", []):
- scorer_data_list.append(ScorerData(**raw_scorer_data))
+ results_response = api_client.fetch_evaluation_results(
+ project_name, eval_name
+ )
+ url = results_response.get("ui_results_url")

- if len(scorer_data_list) != expected_scorer_count:
- # This means that not all scorers were loading for a specific example
- continue
+ if results_response.get("examples") is None:
+ time.sleep(poll_interval_seconds)
+ continue

- example = Example(**example_data)
+ examples_data = results_response.get("examples", [])
+ scoring_results = []
+ scorer_data_count = 0

- # Calculate success based on whether all scorer_data entries were successful
- success = all(
- scorer_data.success for scorer_data in scorer_data_list
- )
- scoring_result = ScoringResult(
- success=success, # Set based on all scorer data success values
- scorers_data=scorer_data_list,
- data_object=example,
- )
- scoring_results.append(scoring_result)
+ for example_data in examples_data:
+ scorer_data_list = []
+ for raw_scorer_data in example_data.get("scorer_data", []):
+ scorer_data = ScorerData(**raw_scorer_data)
+ scorer_data_list.append(scorer_data)
+ scorer_data_count += 1

- if len(scoring_results) != len(original_examples):
- # This means that not all examples were evaluated
- continue
+ example = Example(**example_data)

- return scoring_results
- elif status == "failed":
- # Evaluation failed
- error_message = status_data.get("error", "Unknown error")
- judgeval_logger.error(
- f"Evaluation '{eval_name}' failed: {error_message}"
+ success = all(scorer_data.success for scorer_data in scorer_data_list)
+ scoring_result = ScoringResult(
+ success=success,
+ scorers_data=scorer_data_list,
+ data_object=example,
  )
- raise JudgmentAPIError(f"Evaluation failed: {error_message}")
+ scoring_results.append(scoring_result)

- # Wait before checking again
- await asyncio.sleep(poll_interval_seconds)
+ if scorer_data_count != expected_scorer_data_count:
+ time.sleep(poll_interval_seconds)
+ continue

+ return scoring_results, url
  except Exception as e:
+ exception_count += 1
  if isinstance(e, JudgmentAPIError):
  raise

- # For other exceptions, log and continue polling
  judgeval_logger.error(f"Error checking evaluation status: {str(e)}")
- if poll_count > 20: # Only raise exception after many failed attempts
+ if exception_count > max_failures:
  raise JudgmentAPIError(
  f"Error checking evaluation status after {poll_count} attempts: {str(e)}"
  )

- # Continue polling after a delay
- await asyncio.sleep(poll_interval_seconds)
-
-
- async def await_with_spinner(task, message: str = "Awaiting async task: "):
- """
- Display a spinner while awaiting an async task.
-
- Args:
- task: The asyncio task to await
- message (str): Message to display with the spinner
-
- Returns:
- Any: The result of the awaited task
- """
- spinner = itertools.cycle(["|", "/", "-", "\\"])
-
- # Create an event to signal when to stop the spinner
- stop_spinner_event = asyncio.Event()
-
- async def display_spinner():
- while not stop_spinner_event.is_set():
- sys.stdout.write(f"\r{message}{next(spinner)}")
- sys.stdout.flush()
- await asyncio.sleep(0.1)
-
- # Start the spinner in a separate task
- spinner_task = asyncio.create_task(display_spinner())
-
- try:
- # Await the actual task
- result = await task
- finally:
- # Signal the spinner to stop and wait for it to finish
- stop_spinner_event.set()
- await spinner_task
-
- # Clear the spinner line
- sys.stdout.write("\r" + " " * (len(message) + 1) + "\r")
- sys.stdout.flush()
-
- return result
-
-
- class SpinnerWrappedTask:
- """
- A wrapper for an asyncio task that displays a spinner when awaited.
- """
-
- def __init__(self, task, message: str):
- self.task = task
- self.message = message
-
- def __await__(self):
- async def _spin_and_await():
- # self.task resolves to (scoring_results, pretty_str_to_print)
- task_result_tuple = await await_with_spinner(self.task, self.message)
-
- # Unpack the tuple
- scoring_results, pretty_str_to_print = task_result_tuple
+ time.sleep(poll_interval_seconds)

- # Print the pretty string if it exists, after spinner is cleared
- if pretty_str_to_print:
- rprint(pretty_str_to_print)
-
- # Return only the scoring_results to the original awaiter
- return scoring_results
+ raise JudgmentAPIError(
+ f"Error checking evaluation status after {poll_count} attempts"
+ )

- return _spin_and_await().__await__()

- # Proxy all Task attributes and methods to the underlying task
- def __getattr__(self, name):
- return getattr(self.task, name)
+ def progress_logger(stop_event, msg="Working...", interval=5):
+ start = time.time()
+ while not stop_event.is_set():
+ elapsed = int(time.time() - start)
+ judgeval_logger.info(f"{msg} ({elapsed} sec)")
+ stop_event.wait(interval)


  def run_eval(
  evaluation_run: EvaluationRun,
+ judgment_api_key: str,
  override: bool = False,
- async_execution: bool = False,
- ) -> Union[List[ScoringResult], asyncio.Task, SpinnerWrappedTask]:
+ ) -> List[ScoringResult]:
  """
  Executes an evaluation of `Example`s using one or more `Scorer`s

  Args:
  evaluation_run (EvaluationRun): Stores example and evaluation together for running
  override (bool, optional): Whether to override existing evaluation run with same name. Defaults to False.
- async_execution (bool, optional): Whether to execute the evaluation asynchronously. Defaults to False.

  Returns:
- Union[List[ScoringResult], Union[asyncio.Task, SpinnerWrappedTask]]:
- - If async_execution is False, returns a list of ScoringResult objects
- - If async_execution is True, returns a Task that will resolve to a list of ScoringResult objects when awaited
+ List[ScoringResult]: A list of ScoringResult objects
  """

  # Call endpoint to check to see if eval run name exists (if we DON'T want to override and DO want to log results)
@@ -825,7 +506,7 @@ def run_eval(
  check_eval_run_name_exists(
  evaluation_run.eval_name,
  evaluation_run.project_name,
- evaluation_run.judgment_api_key,
+ judgment_api_key,
  evaluation_run.organization_id,
  )

@@ -834,7 +515,7 @@ def run_eval(
  check_experiment_type(
  evaluation_run.eval_name,
  evaluation_run.project_name,
- evaluation_run.judgment_api_key,
+ judgment_api_key,
  evaluation_run.organization_id,
  False,
  )
@@ -851,148 +532,81 @@ def run_eval(
  else:
  local_scorers.append(scorer)

- api_results: List[ScoringResult] = []
- local_results: List[ScoringResult] = []
-
- if async_execution:
- if len(local_scorers) > 0:
- judgeval_logger.error("Local scorers are not supported in async execution")
- raise ValueError("Local scorers are not supported in async execution")
-
- check_examples(evaluation_run.examples, evaluation_run.scorers)
-
- async def _async_evaluation_workflow():
- # Create a payload
- payload = evaluation_run.model_dump(warnings=False)
-
- # Send the evaluation to the queue
- response = await asyncio.to_thread(
- requests.post,
- JUDGMENT_ADD_TO_RUN_EVAL_QUEUE_API_URL,
- headers={
- "Content-Type": "application/json",
- "Authorization": f"Bearer {evaluation_run.judgment_api_key}",
- "X-Organization-Id": evaluation_run.organization_id,
- },
- json=payload,
- verify=True,
+ results: List[ScoringResult] = []
+ url = ""
+
+ if len(local_scorers) > 0 and len(judgment_scorers) > 0:
+ error_msg = "We currently do not support running both local and Judgment API scorers at the same time. Please run your evaluation with either local scorers or Judgment API scorers, but not both."
+ judgeval_logger.error(error_msg)
+ raise ValueError(error_msg)
+
+ if len(judgment_scorers) > 0:
+ check_examples(evaluation_run.examples, judgment_scorers)
+ stop_event = threading.Event()
+ t = threading.Thread(
+ target=progress_logger, args=(stop_event, "Running evaluation...")
+ )
+ t.start()
+ try:
+ api_client = JudgmentApiClient(
+ judgment_api_key, evaluation_run.organization_id
+ )
+ response = api_client.add_to_evaluation_queue(
+ evaluation_run.model_dump(warnings=False)
  )

- if not response.ok:
- error_message = response.json().get(
- "detail", "An unknown error occurred."
- )
+ if not response.get("success", False):
+ error_message = response.error
  judgeval_logger.error(
  f"Error adding evaluation to queue: {error_message}"
  )
  raise JudgmentAPIError(error_message)

- # Poll until the evaluation is complete
- results = await _poll_evaluation_until_complete(
+ old_scorer_data_count = 0
+ if evaluation_run.append:
+ try:
+ results_response = api_client.fetch_evaluation_results(
+ evaluation_run.project_name, evaluation_run.eval_name
+ )
+ old_scorer_data_count = retrieve_counts(results_response)
+ except Exception:
+ # This usually means the user did append = True but the eval run name doesn't exist yet
+ pass
+
+ results, url = _poll_evaluation_until_complete(
  eval_name=evaluation_run.eval_name,
  project_name=evaluation_run.project_name,
- judgment_api_key=evaluation_run.judgment_api_key,
+ judgment_api_key=judgment_api_key,
  organization_id=evaluation_run.organization_id,
- original_examples=evaluation_run.examples, # Pass the original examples
- expected_scorer_count=len(evaluation_run.scorers),
+ expected_scorer_data_count=(
+ len(evaluation_run.scorers) * len(evaluation_run.examples)
+ )
+ + old_scorer_data_count,
+ )
+ finally:
+ stop_event.set()
+ t.join()
+
+ if len(local_scorers) > 0:
+ results = safe_run_async(
+ a_execute_scoring(
+ evaluation_run.examples,
+ local_scorers,
+ model=evaluation_run.model,
+ throttle_value=0,
+ max_concurrent=MAX_CONCURRENT_EVALUATIONS,
  )
-
- pretty_str_to_print = None
- if results: # Ensure results exist before logging
- send_results = [
- scoring_result.model_dump(warnings=False)
- for scoring_result in results
- ]
- try:
- # Run the blocking log_evaluation_results in a separate thread
- pretty_str_to_print = await asyncio.to_thread(
- log_evaluation_results, send_results, evaluation_run
- )
- except Exception as e:
- judgeval_logger.error(
- f"Error logging results after async evaluation: {str(e)}"
- )
-
- return results, pretty_str_to_print
-
- # Create a regular task
- task = asyncio.create_task(_async_evaluation_workflow())
-
- # Wrap it in our custom awaitable that will show a spinner only when awaited
- return SpinnerWrappedTask(
- task, f"Processing evaluation '{evaluation_run.eval_name}': "
  )
- else:
- check_examples(evaluation_run.examples, evaluation_run.scorers)
- if judgment_scorers:
- # Execute evaluation using Judgment API
- try: # execute an EvaluationRun with just JudgmentScorers
- api_evaluation_run: EvaluationRun = EvaluationRun(
- eval_name=evaluation_run.eval_name,
- project_name=evaluation_run.project_name,
- examples=evaluation_run.examples,
- scorers=judgment_scorers,
- model=evaluation_run.model,
- judgment_api_key=evaluation_run.judgment_api_key,
- organization_id=evaluation_run.organization_id,
- )
- response_data: Dict = run_with_spinner(
- "Running Evaluation: ", execute_api_eval, api_evaluation_run
- )
- except JudgmentAPIError as e:
- judgeval_logger.error(
- f"An error occurred while executing the Judgment API request: {str(e)}"
- )
- raise JudgmentAPIError(
- f"An error occurred while executing the Judgment API request: {str(e)}"
- )
- except ValueError as e:
- raise ValueError(
- f"Please check your EvaluationRun object, one or more fields are invalid: {str(e)}"
- )

- # Convert the response data to `ScoringResult` objects
- api_results = [
- ScoringResult(**result) for result in response_data["results"]
- ]
- # Run local evals
- if local_scorers: # List[BaseScorer]
- results: List[ScoringResult] = safe_run_async(
- a_execute_scoring(
- evaluation_run.examples,
- local_scorers,
- model=evaluation_run.model,
- throttle_value=0,
- max_concurrent=MAX_CONCURRENT_EVALUATIONS,
- )
- )
- local_results = results
- # Aggregate the ScorerData from the API and local evaluations
- merged_results: List[ScoringResult] = merge_results(api_results, local_results)
- merged_results = check_missing_scorer_data(merged_results)
-
- # Evaluate rules against local scoring results if rules exist (this cant be done just yet)
- # if evaluation_run.rules and merged_results:
- # run_rules(
- # local_results=merged_results,
- # rules=evaluation_run.rules,
- # judgment_api_key=evaluation_run.judgment_api_key,
- # organization_id=evaluation_run.organization_id
- # )
- # print(merged_results)
  send_results = [
- scoring_result.model_dump(warnings=False)
- for scoring_result in merged_results
+ scoring_result.model_dump(warnings=False) for scoring_result in results
  ]
- pretty_str = run_with_spinner(
- "Logging Results: ",
- log_evaluation_results,
- send_results,
- evaluation_run,
- )
- rprint(pretty_str)

- return merged_results
+ url = log_evaluation_results(send_results, evaluation_run, judgment_api_key)
+ rprint(
+ f"\n🔍 You can view your evaluation results here: [rgb(106,0,255)][link={url}]View Results[/link]\n"
+ )
+ return results


  def assert_test(scoring_results: List[ScoringResult]) -> None:
@@ -1025,15 +639,6 @@ def assert_test(scoring_results: List[ScoringResult]) -> None:
  if failed_cases:
  error_msg = "The following test cases failed: \n"
  for fail_case in failed_cases:
- # error_msg += f"\nInput: {fail_case['input']}\n"
- # error_msg += f"Actual Output: {fail_case['actual_output']}\n"
- # error_msg += f"Expected Output: {fail_case['expected_output']}\n"
- # error_msg += f"Context: {fail_case['context']}\n"
- # error_msg += f"Retrieval Context: {fail_case['retrieval_context']}\n"
- # error_msg += f"Additional Metadata: {fail_case['additional_metadata']}\n"
- # error_msg += f"Tools Called: {fail_case['tools_called']}\n"
- # error_msg += f"Expected Tools: {fail_case['expected_tools']}\n"
-
  for fail_scorer in fail_case["failed_scorers"]:
  error_msg += (
  f"\nScorer Name: {fail_scorer.name}\n"