judgeval 0.0.55__py3-none-any.whl → 0.1.0__py3-none-any.whl

This diff compares publicly available package versions as they were published to their respective public registries. It is provided for informational purposes only.
Files changed (43)
  1. judgeval/common/api/__init__.py +3 -0
  2. judgeval/common/api/api.py +352 -0
  3. judgeval/common/api/constants.py +165 -0
  4. judgeval/common/storage/__init__.py +6 -0
  5. judgeval/common/tracer/__init__.py +31 -0
  6. judgeval/common/tracer/constants.py +22 -0
  7. judgeval/common/tracer/core.py +1916 -0
  8. judgeval/common/tracer/otel_exporter.py +108 -0
  9. judgeval/common/tracer/otel_span_processor.py +234 -0
  10. judgeval/common/tracer/span_processor.py +37 -0
  11. judgeval/common/tracer/span_transformer.py +211 -0
  12. judgeval/common/tracer/trace_manager.py +92 -0
  13. judgeval/common/utils.py +2 -2
  14. judgeval/constants.py +3 -30
  15. judgeval/data/datasets/eval_dataset_client.py +29 -156
  16. judgeval/data/judgment_types.py +4 -12
  17. judgeval/data/result.py +1 -1
  18. judgeval/data/scorer_data.py +2 -2
  19. judgeval/data/scripts/openapi_transform.py +1 -1
  20. judgeval/data/trace.py +66 -1
  21. judgeval/data/trace_run.py +0 -3
  22. judgeval/evaluation_run.py +0 -2
  23. judgeval/integrations/langgraph.py +43 -164
  24. judgeval/judgment_client.py +17 -211
  25. judgeval/run_evaluation.py +209 -611
  26. judgeval/scorers/__init__.py +2 -6
  27. judgeval/scorers/base_scorer.py +4 -23
  28. judgeval/scorers/judgeval_scorers/api_scorers/__init__.py +3 -3
  29. judgeval/scorers/judgeval_scorers/api_scorers/prompt_scorer.py +215 -0
  30. judgeval/scorers/score.py +2 -1
  31. judgeval/scorers/utils.py +1 -13
  32. judgeval/utils/requests.py +21 -0
  33. judgeval-0.1.0.dist-info/METADATA +202 -0
  34. {judgeval-0.0.55.dist-info → judgeval-0.1.0.dist-info}/RECORD +37 -29
  35. judgeval/common/tracer.py +0 -3215
  36. judgeval/scorers/judgeval_scorers/api_scorers/classifier_scorer.py +0 -73
  37. judgeval/scorers/judgeval_scorers/classifiers/__init__.py +0 -3
  38. judgeval/scorers/judgeval_scorers/classifiers/text2sql/__init__.py +0 -3
  39. judgeval/scorers/judgeval_scorers/classifiers/text2sql/text2sql_scorer.py +0 -53
  40. judgeval-0.0.55.dist-info/METADATA +0 -1384
  41. /judgeval/common/{s3_storage.py → storage/s3_storage.py} +0 -0
  42. {judgeval-0.0.55.dist-info → judgeval-0.1.0.dist-info}/WHEEL +0 -0
  43. {judgeval-0.0.55.dist-info → judgeval-0.1.0.dist-info}/licenses/LICENSE.md +0 -0
judgeval/run_evaluation.py
@@ -1,29 +1,21 @@
  import asyncio
  import concurrent.futures
- from requests import exceptions
- from judgeval.utils.requests import requests
  import time
  import json
  import sys
- import itertools
  import threading
- from typing import List, Dict, Any, Union, Optional, Callable
+ from typing import List, Dict, Union, Optional, Callable, Tuple, Any
  from rich import print as rprint

  from judgeval.data import ScorerData, ScoringResult, Example, Trace
  from judgeval.scorers import BaseScorer, APIScorerConfig
  from judgeval.scorers.score import a_execute_scoring
+ from judgeval.common.api import JudgmentApiClient
  from judgeval.constants import (
- ROOT_API,
- JUDGMENT_EVAL_API_URL,
- JUDGMENT_TRACE_EVAL_API_URL,
- JUDGMENT_EVAL_LOG_API_URL,
  MAX_CONCURRENT_EVALUATIONS,
- JUDGMENT_ADD_TO_RUN_EVAL_QUEUE_API_URL,
- JUDGMENT_GET_EVAL_STATUS_API_URL,
- JUDGMENT_EVAL_FETCH_API_URL,
  )
  from judgeval.common.exceptions import JudgmentAPIError
+ from judgeval.common.api.api import JudgmentAPIException
  from judgeval.common.logger import judgeval_logger
  from judgeval.evaluation_run import EvaluationRun
  from judgeval.data.trace_run import TraceRun
@@ -54,22 +46,20 @@ def safe_run_async(coro):
  return asyncio.run(coro)


- def send_to_rabbitmq(evaluation_run: EvaluationRun) -> None:
+ def send_to_rabbitmq(evaluation_run: EvaluationRun) -> Dict[str, Any]:
  """
  Sends an evaluation run to the RabbitMQ evaluation queue.
  """
- payload = evaluation_run.model_dump(warnings=False)
- response = requests.post(
- JUDGMENT_ADD_TO_RUN_EVAL_QUEUE_API_URL,
- headers={
- "Content-Type": "application/json",
- "Authorization": f"Bearer {evaluation_run.judgment_api_key}",
- "X-Organization-Id": evaluation_run.organization_id,
- },
- json=payload,
- verify=True,
+ if not evaluation_run.judgment_api_key or not evaluation_run.organization_id:
+ raise ValueError("API key and organization ID are required")
+ if not evaluation_run.eval_name or not evaluation_run.project_name:
+ raise ValueError("Eval name and project name are required")
+ api_client = JudgmentApiClient(
+ evaluation_run.judgment_api_key, evaluation_run.organization_id
+ )
+ return api_client.add_to_evaluation_queue(
+ evaluation_run.eval_name, evaluation_run.project_name
  )
- return response.json()


  def execute_api_eval(evaluation_run: EvaluationRun) -> Dict:
@@ -86,146 +76,46 @@ def execute_api_eval(evaluation_run: EvaluationRun) -> Dict:

  try:
  # submit API request to execute evals
- payload = evaluation_run.model_dump()
- response = requests.post(
- JUDGMENT_EVAL_API_URL,
- headers={
- "Content-Type": "application/json",
- "Authorization": f"Bearer {evaluation_run.judgment_api_key}",
- "X-Organization-Id": evaluation_run.organization_id,
- },
- json=payload,
- verify=True,
+ if not evaluation_run.judgment_api_key or not evaluation_run.organization_id:
+ raise ValueError("API key and organization ID are required")
+ api_client = JudgmentApiClient(
+ evaluation_run.judgment_api_key, evaluation_run.organization_id
  )
- response_data = response.json()
+ return api_client.run_evaluation(evaluation_run.model_dump())
  except Exception as e:
  judgeval_logger.error(f"Error: {e}")
- details = response.json().get("detail", "No details provided")
+
+ details = "No details provided"
+ if isinstance(e, JudgmentAPIException):
+ details = e.response_json.get("detail", "No details provided")
+
  raise JudgmentAPIError(
  "An error occurred while executing the Judgment API request: " + details
  )
- # Check if the response status code is not 2XX
- # Add check for the duplicate eval run name
- if not response.ok:
- error_message = response_data.get("detail", "An unknown error occurred.")
- judgeval_logger.error(f"Error: {error_message=}")
- raise JudgmentAPIError(error_message)
- return response_data


- def execute_api_trace_eval(trace_run: TraceRun) -> Dict:
+ def execute_api_trace_eval(trace_run: TraceRun, judgment_api_key: str) -> Dict:
  """
  Executes an evaluation of a list of `Trace`s using one or more `JudgmentScorer`s via the Judgment API.
  """

  try:
  # submit API request to execute evals
- payload = trace_run.model_dump(warnings=False)
- response = requests.post(
- JUDGMENT_TRACE_EVAL_API_URL,
- headers={
- "Content-Type": "application/json",
- "Authorization": f"Bearer {trace_run.judgment_api_key}",
- "X-Organization-Id": trace_run.organization_id,
- },
- json=payload,
- verify=True,
- )
- response_data = response.json()
+ if not judgment_api_key or not trace_run.organization_id:
+ raise ValueError("API key and organization ID are required")
+ api_client = JudgmentApiClient(judgment_api_key, trace_run.organization_id)
+ return api_client.run_trace_evaluation(trace_run.model_dump(warnings=False))
  except Exception as e:
  judgeval_logger.error(f"Error: {e}")
- details = response.json().get("detail", "No details provided")
- raise JudgmentAPIError(
- "An error occurred while executing the Judgment API request: " + details
- )
- # Check if the response status code is not 2XX
- # Add check for the duplicate eval run name
- if not response.ok:
- error_message = response_data.get("detail", "An unknown error occurred.")
- judgeval_logger.error(f"Error: {error_message=}")
- raise JudgmentAPIError(error_message)
- return response_data
-
-
- def merge_results(
- api_results: List[ScoringResult], local_results: List[ScoringResult]
- ) -> List[ScoringResult]:
- """
- When executing scorers that come from both the Judgment API and local scorers, we're left with
- results for each type of scorer. This function merges the results from the API and local evaluations,
- grouped by example. In particular, we merge the `scorers_data` field of each `ScoringResult` object.

- Args:
- api_results (List[ScoringResult]): The `ScoringResult`s from the API evaluation
- local_results (List[ScoringResult]): The `ScoringResult`s from the local evaluation
+ details = "An unknown error occurred."
+ if isinstance(e, JudgmentAPIException):
+ details = e.response_json.get("detail", "An unknown error occurred.")

- Returns:
- List[ScoringResult]: The merged `ScoringResult`s (updated `scorers_data` field)
- """
- # No merge required
- if not local_results and api_results:
- return [result.model_copy() for result in api_results]
- if not api_results and local_results:
- return [result.model_copy() for result in local_results]
-
- if len(api_results) != len(local_results):
- # Results should be of same length because each ScoringResult is a 1-1 mapping to an Example
- raise ValueError(
- f"The number of API and local results do not match: {len(api_results)} vs {len(local_results)}"
+ raise JudgmentAPIError(
+ "An error occurred while executing the Judgment API request: " + details
  )

- # Create a copy of api_results to avoid modifying the input
- merged_results = [result.model_copy() for result in api_results]
-
- # Each ScoringResult in api and local have all the same fields besides `scorers_data`
- for merged_result, local_result in zip(merged_results, local_results):
- if not (merged_result.data_object and local_result.data_object):
- raise ValueError("Data object is None in one of the results.")
- if merged_result.data_object.input != local_result.data_object.input:
- raise ValueError("The API and local results are not aligned.")
- if (
- merged_result.data_object.actual_output
- != local_result.data_object.actual_output
- ):
- raise ValueError("The API and local results are not aligned.")
- if (
- merged_result.data_object.expected_output
- != local_result.data_object.expected_output
- ):
- raise ValueError("The API and local results are not aligned.")
- if merged_result.data_object.context != local_result.data_object.context:
- raise ValueError("The API and local results are not aligned.")
- if (
- merged_result.data_object.retrieval_context
- != local_result.data_object.retrieval_context
- ):
- raise ValueError("The API and local results are not aligned.")
- if (
- merged_result.data_object.additional_metadata
- != local_result.data_object.additional_metadata
- ):
- raise ValueError("The API and local results are not aligned.")
- if (
- merged_result.data_object.tools_called
- != local_result.data_object.tools_called
- ):
- raise ValueError("The API and local results are not aligned.")
- if (
- merged_result.data_object.expected_tools
- != local_result.data_object.expected_tools
- ):
- raise ValueError("The API and local results are not aligned.")
-
- # Merge ScorerData from the API and local scorers together
- api_scorer_data = merged_result.scorers_data
- local_scorer_data = local_result.scorers_data
- if api_scorer_data is None and local_scorer_data is not None:
- merged_result.scorers_data = local_scorer_data
- elif api_scorer_data is not None and local_scorer_data is not None:
- merged_result.scorers_data = api_scorer_data + local_scorer_data
-
- return merged_results
-

  def check_missing_scorer_data(results: List[ScoringResult]) -> List[ScoringResult]:
  """
@@ -255,34 +145,17 @@ def check_experiment_type(
  """
  Checks if the current experiment, if one exists, has the same type (examples of traces)
  """
- try:
- response = requests.post(
- f"{ROOT_API}/check_experiment_type/",
- headers={
- "Content-Type": "application/json",
- "Authorization": f"Bearer {judgment_api_key}",
- "X-Organization-Id": organization_id,
- },
- json={
- "eval_name": eval_name,
- "project_name": project_name,
- "judgment_api_key": judgment_api_key,
- "is_trace": is_trace,
- },
- verify=True,
- )
-
- if response.status_code == 422:
- judgeval_logger.error(f"{response.json()}")
- raise ValueError(f"{response.json()}")
+ api_client = JudgmentApiClient(judgment_api_key, organization_id)

- if not response.ok:
- response_data = response.json()
- error_message = response_data.get("detail", "An unknown error occurred.")
- judgeval_logger.error(f"Error checking eval run name: {error_message}")
- raise JudgmentAPIError(error_message)
-
- except exceptions.RequestException as e:
+ try:
+ api_client.check_experiment_type(eval_name, project_name, is_trace)
+ except JudgmentAPIException as e:
+ if e.response.status_code == 422:
+ judgeval_logger.error(f"{e.response_json}")
+ raise ValueError(f"{e.response_json}")
+ else:
+ raise e
+ except Exception as e:
  judgeval_logger.error(f"Failed to check if experiment type exists: {str(e)}")
  raise JudgmentAPIError(f"Failed to check if experiment type exists: {str(e)}")

@@ -302,125 +175,56 @@ def check_eval_run_name_exists(
  ValueError: If the evaluation run name already exists
  JudgmentAPIError: If there's an API error during the check
  """
+ api_client = JudgmentApiClient(judgment_api_key, organization_id)
  try:
- response = requests.post(
- f"{ROOT_API}/eval-run-name-exists/",
- headers={
- "Content-Type": "application/json",
- "Authorization": f"Bearer {judgment_api_key}",
- "X-Organization-Id": organization_id,
- },
- json={
- "eval_name": eval_name,
- "project_name": project_name,
- "judgment_api_key": judgment_api_key,
- },
- verify=True,
- )
-
- if response.status_code == 409:
- judgeval_logger.error(
- f"Eval run name '{eval_name}' already exists for this project. Please choose a different name, set the `override` flag to true, or set the `append` flag to true."
- )
- raise ValueError(
- f"Eval run name '{eval_name}' already exists for this project. Please choose a different name, set the `override` flag to true, or set the `append` flag to true."
- )
-
- if not response.ok:
- response_data = response.json()
- error_message = response_data.get("detail", "An unknown error occurred.")
- judgeval_logger.error(f"Error checking eval run name: {error_message}")
- raise JudgmentAPIError(error_message)
+ api_client.check_eval_run_name_exists(eval_name, project_name)
+ except JudgmentAPIException as e:
+ if e.response.status_code == 409:
+ error_str = f"Eval run name '{eval_name}' already exists for this project. Please choose a different name, set the `override` flag to true, or set the `append` flag to true. See https://docs.judgmentlabs.ai/sdk-reference/judgment-client#override for more information."
+ judgeval_logger.error(error_str)
+ raise ValueError(error_str)
+ else:
+ raise e

- except exceptions.RequestException as e:
+ except Exception as e:
  judgeval_logger.error(f"Failed to check if eval run name exists: {str(e)}")
  raise JudgmentAPIError(f"Failed to check if eval run name exists: {str(e)}")


  def log_evaluation_results(
- scoring_results: List[ScoringResult], run: Union[EvaluationRun, TraceRun]
- ) -> str | None:
+ scoring_results: List[ScoringResult],
+ run: Union[EvaluationRun, TraceRun],
+ judgment_api_key: str,
+ ) -> str:
  """
  Logs evaluation results to the Judgment API database.

  Args:
  merged_results (List[ScoringResult]): The results to log
  evaluation_run (EvaluationRun): The evaluation run containing project info and API key
+ judgment_api_key (str): The API key for the Judgment API

  Raises:
  JudgmentAPIError: If there's an API error during logging
  ValueError: If there's a validation error with the results
  """
  try:
- res = requests.post(
- JUDGMENT_EVAL_LOG_API_URL,
- headers={
- "Content-Type": "application/json",
- "Authorization": f"Bearer {run.judgment_api_key}",
- "X-Organization-Id": run.organization_id,
- },
- json={"results": scoring_results, "run": run.model_dump(warnings=False)},
- verify=True,
- )
+ if not judgment_api_key or not run.organization_id:
+ raise ValueError("API key and organization ID are required")

- if not res.ok:
- response_data = res.json()
- error_message = response_data.get("detail", "An unknown error occurred.")
- judgeval_logger.error(f"Error {res.status_code}: {error_message}")
- raise JudgmentAPIError(error_message)
-
- if "ui_results_url" in res.json():
- url = res.json()["ui_results_url"]
- pretty_str = f"\n🔍 You can view your evaluation results here: [rgb(106,0,255)][link={url}]View Results[/link]\n"
- return pretty_str
-
- return None
-
- except exceptions.RequestException as e:
- judgeval_logger.error(
- f"Request failed while saving evaluation results to DB: {str(e)}"
+ api_client = JudgmentApiClient(judgment_api_key, run.organization_id)
+ response = api_client.log_evaluation_results(
+ scoring_results,
+ run.model_dump(warnings=False),
  )
+ url = response.get("ui_results_url")
+ return url
+
+ except Exception as e:
+ judgeval_logger.error(f"Failed to save evaluation results to DB: {str(e)}")
  raise JudgmentAPIError(
  f"Request failed while saving evaluation results to DB: {str(e)}"
  )
- except Exception as e:
- judgeval_logger.error(f"Failed to save evaluation results to DB: {str(e)}")
- raise ValueError(f"Failed to save evaluation results to DB: {str(e)}")
-
-
- def run_with_spinner(message: str, func, *args, **kwargs) -> Any:
- """Run a function with a spinner in the terminal."""
- spinner = itertools.cycle(["|", "/", "-", "\\"])
-
- def display_spinner():
- while not stop_spinner_event.is_set():
- sys.stdout.write(f"\r{message}{next(spinner)}")
- sys.stdout.flush()
- time.sleep(0.1)
-
- stop_spinner_event = threading.Event()
- spinner_thread = threading.Thread(target=display_spinner)
- spinner_thread.start()
-
- try:
- if asyncio.iscoroutinefunction(func):
- coro = func(*args, **kwargs)
- result = safe_run_async(coro)
- else:
- result = func(*args, **kwargs)
- except Exception as e:
- judgeval_logger.error(f"An error occurred: {str(e)}")
- stop_spinner_event.set()
- spinner_thread.join()
- raise e
- finally:
- stop_spinner_event.set()
- spinner_thread.join()
-
- sys.stdout.write("\r" + " " * (len(message) + 1) + "\r")
- sys.stdout.flush()
-
- return result


  def check_examples(
@@ -455,6 +259,7 @@ def check_examples(

  def run_trace_eval(
  trace_run: TraceRun,
+ judgment_api_key: str,
  override: bool = False,
  function: Optional[Callable] = None,
  tracer: Optional[Union[Tracer, BaseCallbackHandler]] = None,
@@ -465,7 +270,7 @@ def run_trace_eval(
  check_eval_run_name_exists(
  trace_run.eval_name,
  trace_run.project_name,
- trace_run.judgment_api_key,
+ judgment_api_key,
  trace_run.organization_id,
  )

@@ -474,7 +279,7 @@ def run_trace_eval(
  check_experiment_type(
  trace_run.eval_name,
  trace_run.project_name,
- trace_run.judgment_api_key,
+ judgment_api_key,
  trace_run.organization_id,
  True,
  )
@@ -489,22 +294,19 @@ def run_trace_eval(

  actual_tracer.offline_mode = True
  actual_tracer.traces = []
+ judgeval_logger.info("Running agent function: ")
  for example in examples:
  if example.input:
  if isinstance(example.input, str):
- run_with_spinner(
- "Running agent function: ", function, example.input
- )
+ function(example.input)
  elif isinstance(example.input, dict):
- run_with_spinner(
- "Running agent function: ", function, **example.input
- )
+ function(**example.input)
  else:
  raise ValueError(
  f"Input must be string or dict, got {type(example.input)}"
  )
  else:
- run_with_spinner("Running agent function: ", function)
+ function()

  for i, trace in enumerate(actual_tracer.traces):
  # We set the root-level trace span with the expected tools of the Trace
@@ -516,9 +318,8 @@ def run_trace_eval(

  # Execute evaluation using Judgment API
  try: # execute an EvaluationRun with just JudgmentScorers
- response_data: Dict = run_with_spinner(
- "Running Trace Evaluation: ", execute_api_trace_eval, trace_run
- )
+ judgeval_logger.info("Executing Trace Evaluation... ")
+ response_data: Dict = execute_api_trace_eval(trace_run, judgment_api_key)
  scoring_results = [
  ScoringResult(**result) for result in response_data["results"]
  ]
@@ -534,14 +335,12 @@ def run_trace_eval(
  # Convert the response data to `ScoringResult` objects
  # TODO: allow for custom scorer on traces

- pretty_str = run_with_spinner(
- "Logging Results: ",
- log_evaluation_results,
- response_data["agent_results"],
- trace_run,
+ url = log_evaluation_results(
+ response_data["agent_results"], trace_run, judgment_api_key
+ )
+ rprint(
+ f"\n🔍 You can view your evaluation results here: [rgb(106,0,255)][link={url}]View Results[/link]\n"
  )
- rprint(pretty_str)
-
  return scoring_results


@@ -563,41 +362,33 @@ async def get_evaluation_status(
  - results: List of ScoringResult objects if completed
  - error: Error message if failed
  """
+ api_client = JudgmentApiClient(judgment_api_key, organization_id)
  try:
- response = requests.get(
- JUDGMENT_GET_EVAL_STATUS_API_URL,
- headers={
- "Content-Type": "application/json",
- "Authorization": f"Bearer {judgment_api_key}",
- "X-Organization-Id": organization_id,
- },
- params={
- "eval_name": eval_name,
- "project_name": project_name,
- },
- verify=True,
+ return api_client.get_evaluation_status(eval_name, project_name)
+ except Exception as e:
+ raise JudgmentAPIError(
+ f"An error occurred while checking evaluation status: {str(e)}"
  )

- if not response.ok:
- error_message = response.json().get("detail", "An unknown error occurred.")
- judgeval_logger.error(f"Error checking evaluation status: {error_message}")
- raise JudgmentAPIError(error_message)

- return response.json()
- except exceptions.RequestException as e:
- judgeval_logger.error(f"Failed to check evaluation status: {str(e)}")
- raise JudgmentAPIError(f"Failed to check evaluation status: {str(e)}")
+ def retrieve_counts(result: Dict):
+ scorer_data_count = 0
+ for example in result.get("examples", []):
+ for scorer in example.get("scorer_data", []):
+ scorer_data_count += 1
+ return scorer_data_count


- async def _poll_evaluation_until_complete(
+ def _poll_evaluation_until_complete(
  eval_name: str,
  project_name: str,
  judgment_api_key: str,
  organization_id: str,
- expected_scorer_count: int,
- original_examples: List[Example],
- poll_interval_seconds: int = 5,
- ) -> List[ScoringResult]:
+ expected_scorer_data_count: int,
+ poll_interval_seconds: float = 5,
+ max_failures: int = 5,
+ max_poll_count: int = 24, # This should be equivalent to 120 seconds
+ ) -> Tuple[List[ScoringResult], str]:
  """
  Polls until the evaluation is complete and returns the results.

@@ -614,210 +405,93 @@ async def _poll_evaluation_until_complete(
  List[ScoringResult]: The evaluation results
  """
  poll_count = 0
-
- while True:
+ exception_count = 0
+ api_client = JudgmentApiClient(judgment_api_key, organization_id)
+ while poll_count < max_poll_count:
  poll_count += 1
  try:
  # Check status
- response = await asyncio.to_thread(
- requests.get,
- JUDGMENT_GET_EVAL_STATUS_API_URL,
- headers={
- "Content-Type": "application/json",
- "Authorization": f"Bearer {judgment_api_key}",
- "X-Organization-Id": organization_id,
- },
- params={"eval_name": eval_name, "project_name": project_name},
- verify=True,
- )
+ status_response = api_client.get_evaluation_status(eval_name, project_name)

- if not response.ok:
- error_message = response.json().get(
- "detail", "An unknown error occurred."
- )
- judgeval_logger.error(
- f"Error checking evaluation status: {error_message}"
- )
- # Don't raise exception immediately, just log and continue polling
- await asyncio.sleep(poll_interval_seconds)
+ if status_response.get("status") != "completed":
+ time.sleep(poll_interval_seconds)
  continue

- status_data = response.json()
- status = status_data.get("status")
-
- # If complete, get results and return
- if status == "completed" or status == "complete":
- results_response = await asyncio.to_thread(
- requests.post,
- JUDGMENT_EVAL_FETCH_API_URL,
- headers={
- "Content-Type": "application/json",
- "Authorization": f"Bearer {judgment_api_key}",
- "X-Organization-Id": organization_id,
- },
- json={"project_name": project_name, "eval_name": eval_name},
- verify=True,
- )
-
- if not results_response.ok:
- error_message = results_response.json().get(
- "detail", "An unknown error occurred."
- )
- judgeval_logger.error(
- f"Error fetching evaluation results: {error_message}"
- )
- raise JudgmentAPIError(error_message)
-
- result_data = results_response.json()
-
- if result_data.get("examples") is None:
- continue
-
- examples_data = result_data.get("examples", [])
- scoring_results = []
-
- for example_data in examples_data:
- # Create ScorerData objects
- scorer_data_list = []
- for raw_scorer_data in example_data.get("scorer_data", []):
- scorer_data_list.append(ScorerData(**raw_scorer_data))
+ results_response = api_client.fetch_evaluation_results(
+ project_name, eval_name
+ )
+ url = results_response.get("ui_results_url")

- if len(scorer_data_list) != expected_scorer_count:
- # This means that not all scorers were loading for a specific example
- continue
+ if results_response.get("examples") is None:
+ time.sleep(poll_interval_seconds)
+ continue

- example = Example(**example_data)
+ examples_data = results_response.get("examples", [])
+ scoring_results = []
+ scorer_data_count = 0

- # Calculate success based on whether all scorer_data entries were successful
- success = all(
- scorer_data.success for scorer_data in scorer_data_list
- )
- scoring_result = ScoringResult(
- success=success, # Set based on all scorer data success values
- scorers_data=scorer_data_list,
- data_object=example,
- )
- scoring_results.append(scoring_result)
+ for example_data in examples_data:
+ scorer_data_list = []
+ for raw_scorer_data in example_data.get("scorer_data", []):
+ scorer_data = ScorerData(**raw_scorer_data)
+ scorer_data_list.append(scorer_data)
+ scorer_data_count += 1

- if len(scoring_results) != len(original_examples):
- # This means that not all examples were evaluated
- continue
+ example = Example(**example_data)

- return scoring_results
- elif status == "failed":
- # Evaluation failed
- error_message = status_data.get("error", "Unknown error")
- judgeval_logger.error(
- f"Evaluation '{eval_name}' failed: {error_message}"
+ success = all(scorer_data.success for scorer_data in scorer_data_list)
+ scoring_result = ScoringResult(
+ success=success,
+ scorers_data=scorer_data_list,
+ data_object=example,
  )
- raise JudgmentAPIError(f"Evaluation failed: {error_message}")
+ scoring_results.append(scoring_result)

- # Wait before checking again
- await asyncio.sleep(poll_interval_seconds)
+ if scorer_data_count != expected_scorer_data_count:
+ time.sleep(poll_interval_seconds)
+ continue

+ return scoring_results, url
  except Exception as e:
+ exception_count += 1
  if isinstance(e, JudgmentAPIError):
  raise

- # For other exceptions, log and continue polling
  judgeval_logger.error(f"Error checking evaluation status: {str(e)}")
- if poll_count > 20: # Only raise exception after many failed attempts
+ if exception_count > max_failures:
  raise JudgmentAPIError(
  f"Error checking evaluation status after {poll_count} attempts: {str(e)}"
  )

- # Continue polling after a delay
- await asyncio.sleep(poll_interval_seconds)
-
-
- async def await_with_spinner(task, message: str = "Awaiting async task: "):
- """
- Display a spinner while awaiting an async task.
-
- Args:
- task: The asyncio task to await
- message (str): Message to display with the spinner
-
- Returns:
- Any: The result of the awaited task
- """
- spinner = itertools.cycle(["|", "/", "-", "\\"])
-
- # Create an event to signal when to stop the spinner
- stop_spinner_event = asyncio.Event()
-
- async def display_spinner():
- while not stop_spinner_event.is_set():
- sys.stdout.write(f"\r{message}{next(spinner)}")
- sys.stdout.flush()
- await asyncio.sleep(0.1)
-
- # Start the spinner in a separate task
- spinner_task = asyncio.create_task(display_spinner())
-
- try:
- # Await the actual task
- result = await task
- finally:
- # Signal the spinner to stop and wait for it to finish
- stop_spinner_event.set()
- await spinner_task
-
- # Clear the spinner line
- sys.stdout.write("\r" + " " * (len(message) + 1) + "\r")
- sys.stdout.flush()
-
- return result
-
-
- class SpinnerWrappedTask:
- """
- A wrapper for an asyncio task that displays a spinner when awaited.
- """
-
- def __init__(self, task, message: str):
- self.task = task
- self.message = message
-
- def __await__(self):
- async def _spin_and_await():
- # self.task resolves to (scoring_results, pretty_str_to_print)
- task_result_tuple = await await_with_spinner(self.task, self.message)
-
- # Unpack the tuple
- scoring_results, pretty_str_to_print = task_result_tuple
-
- # Print the pretty string if it exists, after spinner is cleared
- if pretty_str_to_print:
- rprint(pretty_str_to_print)
+ time.sleep(poll_interval_seconds)

- # Return only the scoring_results to the original awaiter
- return scoring_results
+ raise JudgmentAPIError(
+ f"Error checking evaluation status after {poll_count} attempts"
+ )

- return _spin_and_await().__await__()

- # Proxy all Task attributes and methods to the underlying task
- def __getattr__(self, name):
- return getattr(self.task, name)
+ def progress_logger(stop_event, msg="Working...", interval=5):
+ start = time.time()
+ while not stop_event.is_set():
+ elapsed = int(time.time() - start)
+ judgeval_logger.info(f"{msg} ({elapsed} sec)")
+ stop_event.wait(interval)


  def run_eval(
  evaluation_run: EvaluationRun,
+ judgment_api_key: str,
  override: bool = False,
- async_execution: bool = False,
- ) -> Union[List[ScoringResult], asyncio.Task, SpinnerWrappedTask]:
+ ) -> List[ScoringResult]:
  """
  Executes an evaluation of `Example`s using one or more `Scorer`s

  Args:
  evaluation_run (EvaluationRun): Stores example and evaluation together for running
  override (bool, optional): Whether to override existing evaluation run with same name. Defaults to False.
- async_execution (bool, optional): Whether to execute the evaluation asynchronously. Defaults to False.

  Returns:
- Union[List[ScoringResult], Union[asyncio.Task, SpinnerWrappedTask]]:
- - If async_execution is False, returns a list of ScoringResult objects
- - If async_execution is True, returns a Task that will resolve to a list of ScoringResult objects when awaited
+ List[ScoringResult]: A list of ScoringResult objects
  """

  # Call endpoint to check to see if eval run name exists (if we DON'T want to override and DO want to log results)
@@ -825,7 +499,7 @@ def run_eval(
  check_eval_run_name_exists(
  evaluation_run.eval_name,
  evaluation_run.project_name,
- evaluation_run.judgment_api_key,
+ judgment_api_key,
  evaluation_run.organization_id,
  )

@@ -834,7 +508,7 @@ def run_eval(
  check_experiment_type(
  evaluation_run.eval_name,
  evaluation_run.project_name,
- evaluation_run.judgment_api_key,
+ judgment_api_key,
  evaluation_run.organization_id,
  False,
  )
@@ -851,148 +525,81 @@ def run_eval(
  else:
  local_scorers.append(scorer)

- api_results: List[ScoringResult] = []
- local_results: List[ScoringResult] = []
-
- if async_execution:
- if len(local_scorers) > 0:
- judgeval_logger.error("Local scorers are not supported in async execution")
- raise ValueError("Local scorers are not supported in async execution")
-
- check_examples(evaluation_run.examples, evaluation_run.scorers)
-
- async def _async_evaluation_workflow():
- # Create a payload
- payload = evaluation_run.model_dump(warnings=False)
-
- # Send the evaluation to the queue
- response = await asyncio.to_thread(
- requests.post,
- JUDGMENT_ADD_TO_RUN_EVAL_QUEUE_API_URL,
- headers={
- "Content-Type": "application/json",
- "Authorization": f"Bearer {evaluation_run.judgment_api_key}",
- "X-Organization-Id": evaluation_run.organization_id,
- },
- json=payload,
- verify=True,
+ results: List[ScoringResult] = []
+ url = ""
+
+ if len(local_scorers) > 0 and len(judgment_scorers) > 0:
+ error_msg = "We currently do not support running both local and Judgment API scorers at the same time. Please run your evaluation with either local scorers or Judgment API scorers, but not both."
+ judgeval_logger.error(error_msg)
+ raise ValueError(error_msg)
+
+ if len(judgment_scorers) > 0:
+ check_examples(evaluation_run.examples, judgment_scorers)
+ stop_event = threading.Event()
+ t = threading.Thread(
+ target=progress_logger, args=(stop_event, "Running evaluation...")
+ )
+ t.start()
+ try:
+ api_client = JudgmentApiClient(
+ judgment_api_key, evaluation_run.organization_id
+ )
+ response = api_client.add_to_evaluation_queue(
+ evaluation_run.model_dump(warnings=False)
  )

- if not response.ok:
- error_message = response.json().get(
- "detail", "An unknown error occurred."
- )
+ if not response.get("success", False):
+ error_message = response.error
  judgeval_logger.error(
  f"Error adding evaluation to queue: {error_message}"
  )
  raise JudgmentAPIError(error_message)

- # Poll until the evaluation is complete
- results = await _poll_evaluation_until_complete(
+ old_scorer_data_count = 0
+ if evaluation_run.append:
+ try:
+ results_response = api_client.fetch_evaluation_results(
+ evaluation_run.project_name, evaluation_run.eval_name
+ )
+ old_scorer_data_count = retrieve_counts(results_response)
+ except Exception:
+ # This usually means the user did append = True but the eval run name doesn't exist yet
+ pass
+
+ results, url = _poll_evaluation_until_complete(
  eval_name=evaluation_run.eval_name,
  project_name=evaluation_run.project_name,
- judgment_api_key=evaluation_run.judgment_api_key,
+ judgment_api_key=judgment_api_key,
  organization_id=evaluation_run.organization_id,
- original_examples=evaluation_run.examples, # Pass the original examples
- expected_scorer_count=len(evaluation_run.scorers),
+ expected_scorer_data_count=(
+ len(evaluation_run.scorers) * len(evaluation_run.examples)
+ )
+ + old_scorer_data_count,
+ )
+ finally:
+ stop_event.set()
+ t.join()
+
+ if len(local_scorers) > 0:
+ results = safe_run_async(
+ a_execute_scoring(
+ evaluation_run.examples,
+ local_scorers,
+ model=evaluation_run.model,
+ throttle_value=0,
+ max_concurrent=MAX_CONCURRENT_EVALUATIONS,
  )
-
- pretty_str_to_print = None
- if results: # Ensure results exist before logging
- send_results = [
- scoring_result.model_dump(warnings=False)
- for scoring_result in results
- ]
- try:
- # Run the blocking log_evaluation_results in a separate thread
- pretty_str_to_print = await asyncio.to_thread(
- log_evaluation_results, send_results, evaluation_run
- )
- except Exception as e:
- judgeval_logger.error(
- f"Error logging results after async evaluation: {str(e)}"
- )
-
- return results, pretty_str_to_print
-
- # Create a regular task
- task = asyncio.create_task(_async_evaluation_workflow())
-
- # Wrap it in our custom awaitable that will show a spinner only when awaited
- return SpinnerWrappedTask(
- task, f"Processing evaluation '{evaluation_run.eval_name}': "
  )
- else:
- check_examples(evaluation_run.examples, evaluation_run.scorers)
- if judgment_scorers:
- # Execute evaluation using Judgment API
- try: # execute an EvaluationRun with just JudgmentScorers
- api_evaluation_run: EvaluationRun = EvaluationRun(
- eval_name=evaluation_run.eval_name,
- project_name=evaluation_run.project_name,
- examples=evaluation_run.examples,
- scorers=judgment_scorers,
- model=evaluation_run.model,
- judgment_api_key=evaluation_run.judgment_api_key,
- organization_id=evaluation_run.organization_id,
- )
- response_data: Dict = run_with_spinner(
- "Running Evaluation: ", execute_api_eval, api_evaluation_run
- )
- except JudgmentAPIError as e:
- judgeval_logger.error(
- f"An error occurred while executing the Judgment API request: {str(e)}"
- )
- raise JudgmentAPIError(
- f"An error occurred while executing the Judgment API request: {str(e)}"
- )
- except ValueError as e:
- raise ValueError(
- f"Please check your EvaluationRun object, one or more fields are invalid: {str(e)}"
- )

- # Convert the response data to `ScoringResult` objects
- api_results = [
- ScoringResult(**result) for result in response_data["results"]
- ]
- # Run local evals
- if local_scorers: # List[BaseScorer]
- results: List[ScoringResult] = safe_run_async(
- a_execute_scoring(
- evaluation_run.examples,
- local_scorers,
- model=evaluation_run.model,
- throttle_value=0,
- max_concurrent=MAX_CONCURRENT_EVALUATIONS,
- )
- )
- local_results = results
- # Aggregate the ScorerData from the API and local evaluations
- merged_results: List[ScoringResult] = merge_results(api_results, local_results)
- merged_results = check_missing_scorer_data(merged_results)
-
- # Evaluate rules against local scoring results if rules exist (this cant be done just yet)
- # if evaluation_run.rules and merged_results:
- # run_rules(
- # local_results=merged_results,
- # rules=evaluation_run.rules,
- # judgment_api_key=evaluation_run.judgment_api_key,
- # organization_id=evaluation_run.organization_id
- # )
- # print(merged_results)
  send_results = [
- scoring_result.model_dump(warnings=False)
- for scoring_result in merged_results
+ scoring_result.model_dump(warnings=False) for scoring_result in results
  ]
- pretty_str = run_with_spinner(
- "Logging Results: ",
- log_evaluation_results,
- send_results,
- evaluation_run,
- )
- rprint(pretty_str)

- return merged_results
+ url = log_evaluation_results(send_results, evaluation_run, judgment_api_key)
+ rprint(
+ f"\n🔍 You can view your evaluation results here: [rgb(106,0,255)][link={url}]View Results[/link]\n"
+ )
+ return results


  def assert_test(scoring_results: List[ScoringResult]) -> None:
@@ -1025,15 +632,6 @@ def assert_test(scoring_results: List[ScoringResult]) -> None:
  if failed_cases:
  error_msg = "The following test cases failed: \n"
  for fail_case in failed_cases:
- # error_msg += f"\nInput: {fail_case['input']}\n"
- # error_msg += f"Actual Output: {fail_case['actual_output']}\n"
- # error_msg += f"Expected Output: {fail_case['expected_output']}\n"
- # error_msg += f"Context: {fail_case['context']}\n"
- # error_msg += f"Retrieval Context: {fail_case['retrieval_context']}\n"
- # error_msg += f"Additional Metadata: {fail_case['additional_metadata']}\n"
- # error_msg += f"Tools Called: {fail_case['tools_called']}\n"
- # error_msg += f"Expected Tools: {fail_case['expected_tools']}\n"
-
  for fail_scorer in fail_case["failed_scorers"]:
  error_msg += (
  f"\nScorer Name: {fail_scorer.name}\n"