judgeval 0.0.37__py3-none-any.whl → 0.0.39__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -13,7 +13,6 @@ from judgeval.data import (
     ScoringResult,
     Example,
     CustomExample,
-    Sequence,
     Trace
 )
 from judgeval.scorers import (
@@ -25,21 +24,23 @@ from judgeval.scorers.score import a_execute_scoring
 from judgeval.constants import (
     ROOT_API,
     JUDGMENT_EVAL_API_URL,
-    JUDGMENT_SEQUENCE_EVAL_API_URL,
+    JUDGMENT_TRACE_EVAL_API_URL,
     JUDGMENT_EVAL_LOG_API_URL,
     MAX_CONCURRENT_EVALUATIONS,
     JUDGMENT_ADD_TO_RUN_EVAL_QUEUE_API_URL,
-    JUDGMENT_RETRIEVE_SEQUENCE_FROM_TRACE_API_URL
+    JUDGMENT_GET_EVAL_STATUS_API_URL,
+    JUDGMENT_EVAL_FETCH_API_URL
 )
 from judgeval.common.exceptions import JudgmentAPIError
 from judgeval.common.logger import (
     debug,
     info,
-    error,
+    error,
+    warning,
     example_logging_context
 )
 from judgeval.evaluation_run import EvaluationRun
-from judgeval.data.sequence_run import SequenceRun
+from judgeval.data.trace_run import TraceRun
 from judgeval.common.tracer import Tracer
 from langchain_core.callbacks import BaseCallbackHandler
 
@@ -98,20 +99,20 @@ def execute_api_eval(evaluation_run: EvaluationRun) -> List[Dict]:
         raise JudgmentAPIError(error_message)
     return response_data
 
-def execute_api_sequence_eval(sequence_run: SequenceRun) -> List[Dict]:
+def execute_api_trace_eval(trace_run: TraceRun) -> List[Dict]:
     """
     Executes an evaluation of a list of `Example`s using one or more `JudgmentScorer`s via the Judgment API.
     """
 
     try:
         # submit API request to execute evals
-        payload = sequence_run.model_dump(warnings=False)
+        payload = trace_run.model_dump(warnings=False)
         response = requests.post(
-            JUDGMENT_SEQUENCE_EVAL_API_URL,
+            JUDGMENT_TRACE_EVAL_API_URL,
             headers={
                 "Content-Type": "application/json",
-                "Authorization": f"Bearer {sequence_run.judgment_api_key}",
-                "X-Organization-Id": sequence_run.organization_id
+                "Authorization": f"Bearer {trace_run.judgment_api_key}",
+                "X-Organization-Id": trace_run.organization_id
             },
             json=payload,
             verify=True
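
For orientation, a hedged caller-side sketch of the renamed helper. The `judgeval.run_evaluation` module path is an assumption, and `TraceRun` construction is left to the caller because its full field list is not part of this diff; what the hunks do show is that `execute_api_trace_eval` serializes the run with `model_dump(warnings=False)`, POSTs it to `JUDGMENT_TRACE_EVAL_API_URL`, and returns a payload whose `"results"` entries later become `ScoringResult`s:

```python
# Hedged sketch, not part of the package: module path and usage are assumptions
# inferred from the surrounding hunks.
from typing import Dict, List

from judgeval.data.trace_run import TraceRun
from judgeval.run_evaluation import execute_api_trace_eval  # module path assumed


def score_trace_run(trace_run: TraceRun) -> List[Dict]:
    # execute_api_trace_eval POSTs trace_run.model_dump(warnings=False) to
    # JUDGMENT_TRACE_EVAL_API_URL, authenticating with the run's own
    # judgment_api_key and organization_id, and returns the parsed response.
    response_data = execute_api_trace_eval(trace_run)
    return response_data["results"]
```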
@@ -282,7 +283,7 @@ def check_eval_run_name_exists(eval_name: str, project_name: str, judgment_api_k
         raise JudgmentAPIError(f"Failed to check if eval run name exists: {str(e)}")
 
 
-def log_evaluation_results(scoring_results: List[ScoringResult], run: Union[EvaluationRun, SequenceRun]) -> str:
+def log_evaluation_results(scoring_results: List[ScoringResult], run: Union[EvaluationRun, TraceRun]) -> str:
     """
     Logs evaluation results to the Judgment API database.
 
@@ -327,51 +328,6 @@ def log_evaluation_results(scoring_results: List[ScoringResult], run: Union[Eval
         error(f"Failed to save evaluation results to DB: {str(e)}")
         raise ValueError(f"Failed to save evaluation results to DB: {str(e)}")
 
-def retrieve_sequence_from_trace(trace_id: str, parent_span: str, judgment_api_key: str, organization_id: str) -> Sequence:
-    """
-    Retrieves a sequence from a trace ID.
-    """
-    """
-    Logs evaluation results to the Judgment API database.
-
-    Args:
-        merged_results (List[ScoringResult]): The results to log
-        evaluation_run (EvaluationRun): The evaluation run containing project info and API key
-
-    Raises:
-        JudgmentAPIError: If there's an API error during logging
-        ValueError: If there's a validation error with the results
-    """
-    try:
-        res = requests.post(
-            JUDGMENT_RETRIEVE_SEQUENCE_FROM_TRACE_API_URL,
-            headers={
-                "Content-Type": "application/json",
-                "Authorization": f"Bearer {judgment_api_key}",
-                "X-Organization-Id": organization_id
-            },
-            json={
-                "trace_id": trace_id,
-                "trace_span_id": parent_span,
-            },
-            verify=True
-        )
-
-        if not res.ok:
-            response_data = res.json()
-            error_message = response_data.get('detail', 'An unknown error occurred.')
-            error(f"Error {res.status_code}: {error_message}")
-            raise JudgmentAPIError(error_message)
-
-        return Sequence(**res.json())
-    except requests.exceptions.RequestException as e:
-        error(f"Request failed while saving evaluation results to DB: {str(e)}")
-        raise JudgmentAPIError(f"Request failed while saving evaluation results to DB: {str(e)}")
-    except Exception as e:
-        error(f"Failed to save evaluation results to DB: {str(e)}")
-        raise ValueError(f"Failed to save evaluation results to DB: {str(e)}")
-
-
 def run_with_spinner(message: str, func, *args, **kwargs) -> Any:
     """Run a function with a spinner in the terminal."""
     spinner = itertools.cycle(['|', '/', '-', '\\'])
@@ -415,69 +371,366 @@ def check_examples(examples: List[Example], scorers: List[Union[APIJudgmentScore
             if missing_params:
                 print(f"WARNING: Example {example.example_id} is missing the following parameters: {missing_params} for scorer {scorer.score_type.value}")
 
-def run_sequence_eval(sequence_run: SequenceRun, override: bool = False, ignore_errors: bool = True, function: Optional[Callable] = None, tracer: Optional[Union[Tracer, BaseCallbackHandler]] = None, examples: Optional[List[Example]] = None) -> List[ScoringResult]:
+def run_trace_eval(trace_run: TraceRun, override: bool = False, ignore_errors: bool = True, function: Optional[Callable] = None, tracer: Optional[Union[Tracer, BaseCallbackHandler]] = None, examples: Optional[List[Example]] = None) -> List[ScoringResult]:
     # Call endpoint to check to see if eval run name exists (if we DON'T want to override and DO want to log results)
-    if not override and sequence_run.log_results and not sequence_run.append:
+    if not override and trace_run.log_results and not trace_run.append:
         check_eval_run_name_exists(
-            sequence_run.eval_name,
-            sequence_run.project_name,
-            sequence_run.judgment_api_key,
-            sequence_run.organization_id
+            trace_run.eval_name,
+            trace_run.project_name,
+            trace_run.judgment_api_key,
+            trace_run.organization_id
         )
 
-    if sequence_run.append:
+    if trace_run.append:
         # Check that the current experiment, if one exists, has the same type (examples of sequences)
         check_experiment_type(
-            sequence_run.eval_name,
-            sequence_run.project_name,
-            sequence_run.judgment_api_key,
-            sequence_run.organization_id,
+            trace_run.eval_name,
+            trace_run.project_name,
+            trace_run.judgment_api_key,
+            trace_run.organization_id,
             True
         )
 
     if function and tracer:
-        new_sequences: List[Sequence] = []
+        new_traces: List[Trace] = []
+        tracer.offline_mode = True
         for example in examples:
             if example.input:
                 result = run_with_spinner("Running agent function: ", function, **example.input)
             else:
                 result = run_with_spinner("Running agent function: ", function)
             for i, trace in enumerate(tracer.traces):
-                trace_id = trace['trace_id']
-                parent_span = trace['entries'][0]['span_id']
-                new_sequence = retrieve_sequence_from_trace(trace_id, parent_span, sequence_run.judgment_api_key, sequence_run.organization_id)
-                new_sequence.expected_tools = examples[i].expected_tools
-                new_sequences.append(new_sequence)
-            sequence_run.sequences = new_sequences
-
-        for sequence in sequence_run.sequences:
-            sequence.scorers = sequence_run.scorers
+                # We set the root-level trace span with the expected tools of the Trace
+                trace = Trace(**trace)
+                trace.entries[0].expected_tools = examples[i].expected_tools
+                new_traces.append(trace)
+            trace_run.traces = new_traces
 
     # Execute evaluation using Judgment API
     info("Starting API evaluation")
     try: # execute an EvaluationRun with just JudgmentScorers
         debug("Sending request to Judgment API")
-        response_data: List[Dict] = run_with_spinner("Running Sequence Evaluation: ", execute_api_sequence_eval, sequence_run)
+        response_data: List[Dict] = run_with_spinner("Running Trace Evaluation: ", execute_api_trace_eval, trace_run)
         scoring_results = [ScoringResult(**result) for result in response_data["results"]]
         info(f"Received {len(scoring_results)} results from API")
     except JudgmentAPIError as e:
         error(f"An error occurred while executing the Judgment API request: {str(e)}")
         raise JudgmentAPIError(f"An error occurred while executing the Judgment API request: {str(e)}")
     except ValueError as e:
-        raise ValueError(f"Please check your SequenceRun object, one or more fields are invalid: {str(e)}")
+        raise ValueError(f"Please check your TraceRun object, one or more fields are invalid: {str(e)}")
 
     # Convert the response data to `ScoringResult` objects
     debug("Processing API results")
-    # TODO: allow for custom scorer on sequences
-    if sequence_run.log_results:
-        pretty_str = run_with_spinner("Logging Results: ", log_evaluation_results, response_data["results"], sequence_run)
+    # TODO: allow for custom scorer on traces
+    if trace_run.log_results:
+        pretty_str = run_with_spinner("Logging Results: ", log_evaluation_results, response_data["results"], trace_run)
         rprint(pretty_str)
 
     return scoring_results
 
 
 
-def run_eval(evaluation_run: EvaluationRun, override: bool = False, ignore_errors: bool = True, async_execution: bool = False) -> List[ScoringResult]:
+async def get_evaluation_status(eval_name: str, project_name: str, judgment_api_key: str, organization_id: str) -> Dict:
+    """
+    Gets the status of an async evaluation run.
+
+    Args:
+        eval_name (str): Name of the evaluation run
+        project_name (str): Name of the project
+        judgment_api_key (str): API key for authentication
+        organization_id (str): Organization ID for the evaluation
+
+    Returns:
+        Dict: Status information including:
+            - status: 'pending', 'running', 'completed', or 'failed'
+            - results: List of ScoringResult objects if completed
+            - error: Error message if failed
+    """
+    try:
+        response = requests.get(
+            JUDGMENT_GET_EVAL_STATUS_API_URL,
+            headers={
+                "Content-Type": "application/json",
+                "Authorization": f"Bearer {judgment_api_key}",
+                "X-Organization-Id": organization_id
+            },
+            params={
+                "eval_name": eval_name,
+                "project_name": project_name,
+            },
+            verify=True
+        )
+
+        if not response.ok:
+            error_message = response.json().get('detail', 'An unknown error occurred.')
+            error(f"Error checking evaluation status: {error_message}")
+            raise JudgmentAPIError(error_message)
+
+        return response.json()
+    except requests.exceptions.RequestException as e:
+        error(f"Failed to check evaluation status: {str(e)}")
+        raise JudgmentAPIError(f"Failed to check evaluation status: {str(e)}")
+
+async def _poll_evaluation_until_complete(eval_name: str, project_name: str, judgment_api_key: str, organization_id: str, poll_interval_seconds: int = 5, original_examples: Optional[List[Example]] = None) -> List[ScoringResult]:
+    """
+    Polls until the evaluation is complete and returns the results.
+
+    Args:
+        eval_name (str): Name of the evaluation run
+        project_name (str): Name of the project
+        judgment_api_key (str): API key for authentication
+        organization_id (str): Organization ID for the evaluation
+        poll_interval_seconds (int, optional): Time between status checks in seconds. Defaults to 5.
+        original_examples (List[Example], optional): The original examples sent for evaluation.
+            If provided, will match results with original examples.
+
+    Returns:
+        List[ScoringResult]: The evaluation results
+    """
+    poll_count = 0
+    # Create example_id to Example mapping if original examples are provided
+    original_example_map = {}
+    if original_examples:
+        for example in original_examples:
+            original_example_map[example.example_id] = example
+
+    # Remove the expected scorer names extraction and checking
+    # We'll instead verify all examples have consistent scorer data
+    while True:
+        poll_count += 1
+        try:
+            # Log polling attempt
+            if poll_count % 4 == 0: # Log every 4th poll to avoid excess logging
+                info(f"Polling for evaluation '{eval_name}' in project '{project_name}' (attempt {poll_count})")
+
+            # Check status
+            response = requests.get(
+                JUDGMENT_GET_EVAL_STATUS_API_URL,
+                headers={
+                    "Content-Type": "application/json",
+                    "Authorization": f"Bearer {judgment_api_key}",
+                    "X-Organization-Id": organization_id
+                },
+                params={
+                    "eval_name": eval_name,
+                    "project_name": project_name
+                },
+                verify=True
+            )
+
+            if not response.ok:
+                error_message = response.json().get('detail', 'An unknown error occurred.')
+                error(f"Error checking evaluation status: {error_message}")
+                # Don't raise exception immediately, just log and continue polling
+                await asyncio.sleep(poll_interval_seconds)
+                continue
+
+            status_data = response.json()
+            status = status_data.get("status")
+
+            # If complete, get results and return
+            if status == "completed" or status == "complete":
+                info(f"Evaluation '{eval_name}' reported as completed, fetching and verifying results...")
+                results_response = requests.post(
+                    JUDGMENT_EVAL_FETCH_API_URL,
+                    headers={
+                        "Content-Type": "application/json",
+                        "Authorization": f"Bearer {judgment_api_key}",
+                        "X-Organization-Id": organization_id
+                    },
+                    json={
+                        "project_name": project_name,
+                        "eval_name": eval_name
+                    },
+                    verify=True
+                )
+
+                if not results_response.ok:
+                    error_message = results_response.json().get('detail', 'An unknown error occurred.')
+                    error(f"Error fetching evaluation results: {error_message}")
+                    raise JudgmentAPIError(error_message)
+
+                result_data = results_response.json()
+
+                if "examples" in result_data:
+                    examples_data = result_data.get("examples", [])
+
+
+                    info(f"Successfully fetched {len(examples_data)} results for evaluation '{eval_name}'")
+
+                    # Check for result validity if original examples are provided
+                    if original_example_map:
+                        # Verify all returned examples have matching original examples
+                        has_invalid_results = False
+                        for example_data in examples_data:
+                            example_id = example_data.get("example_id")
+
+                            if example_id not in original_example_map:
+                                warning(f"Server returned example with ID {example_id} not found in original examples. " +
+                                        f"This indicates stale or incorrect data. Continuing to poll...")
+                                has_invalid_results = True
+                                break
+
+                        # If any invalid examples found, continue polling
+                        if has_invalid_results:
+                            info("Detected stale data. Waiting before polling again...")
+                            await asyncio.sleep(poll_interval_seconds)
+                            continue
+
+                        # Check if we received the expected number of results
+                        if len(original_examples) != len(examples_data):
+                            warning(f"Expected {len(original_examples)} results but got {len(examples_data)} results. " +
+                                    f"This indicates incomplete data. Continuing to poll...")
+                            await asyncio.sleep(poll_interval_seconds)
+                            continue
+
+                        # Collect all example IDs from scorer data
+                        scorer_example_ids = set()
+                        for example_data in examples_data:
+                            scorer_data_list = example_data.get("scorer_data", [])
+                            for scorer_data in scorer_data_list:
+                                if "example_id" in scorer_data:
+                                    scorer_example_ids.add(scorer_data["example_id"])
+
+                        # Get the set of original example IDs
+                        original_example_ids = set(original_example_map.keys())
+
+                        # Check if the sets are equal
+                        missing_in_scorer = original_example_ids - scorer_example_ids
+                        extra_in_scorer = scorer_example_ids - original_example_ids
+
+                        if missing_in_scorer or extra_in_scorer:
+                            if missing_in_scorer:
+                                warning(f"Examples missing in scorer data: {missing_in_scorer}")
+                            if extra_in_scorer:
+                                warning(f"Extra examples in scorer data: {extra_in_scorer}")
+                            info("Detected mismatched example IDs in scorer data. Waiting before polling again...")
+                            await asyncio.sleep(poll_interval_seconds)
+                            continue
+
+                    # Create ScoringResult objects from the raw data
+                    scoring_results = []
+
+                    for example_data in examples_data:
+                        # Extract example_id from the server response
+                        example_id = example_data.get("example_id")
+
+                        # Create ScorerData objects
+                        scorer_data_list = []
+                        for raw_scorer_data in example_data.get("scorer_data", []):
+                            scorer_data_list.append(ScorerData(**raw_scorer_data))
+
+                        # Use the original Example object if we have it and the ID matches
+                        if original_example_map:
+                            example = original_example_map[example_id]
+                            debug(f"Matched result with original example {example_id}")
+                        else:
+                            # Create Example from example data (excluding scorer_data) if no original examples provided
+                            example_dict = {k: v for k, v in example_data.items() if k != "scorer_data"}
+                            example = Example(**example_dict)
+
+                        # Calculate success based on whether all scorer_data entries were successful
+                        success = all(scorer_data.success for scorer_data in scorer_data_list) if scorer_data_list else False
+
+                        # Create ScoringResult
+                        scoring_result = ScoringResult(
+                            success=success, # Set based on all scorer data success values
+                            scorers_data=scorer_data_list,
+                            data_object=example
+                        )
+                        scoring_results.append(scoring_result)
+
+                    # If we got here, all validation checks passed
+                    info(f"Verified complete results for all {len(scoring_results)} examples with all expected scorer data")
+                    return scoring_results
+                else:
+                    # No examples found
+                    info(f"No example results found for completed evaluation '{eval_name}'")
+                    return []
+
+            elif status == "failed":
+                # Evaluation failed
+                error_message = status_data.get("error", "Unknown error")
+                error(f"Evaluation '{eval_name}' failed: {error_message}")
+                raise JudgmentAPIError(f"Evaluation failed: {error_message}")
+
+            elif status == "pending" or status == "running":
+                # Only log occasionally for pending/running to avoid flooding logs
+                if poll_count % 4 == 0:
+                    info(f"Evaluation '{eval_name}' status: {status}")
+
+            # Wait before checking again
+            await asyncio.sleep(poll_interval_seconds)
+
+        except Exception as e:
+            if isinstance(e, JudgmentAPIError):
+                raise
+
+            # For other exceptions, log and continue polling
+            error(f"Error checking evaluation status: {str(e)}")
+            if poll_count > 20: # Only raise exception after many failed attempts
+                raise JudgmentAPIError(f"Error checking evaluation status after {poll_count} attempts: {str(e)}")
+
+            # Continue polling after a delay
+            await asyncio.sleep(poll_interval_seconds)
+
+async def await_with_spinner(task, message: str = "Awaiting async task: "):
+    """
+    Display a spinner while awaiting an async task.
+
+    Args:
+        task: The asyncio task to await
+        message (str): Message to display with the spinner
+
+    Returns:
+        Any: The result of the awaited task
+    """
+    spinner = itertools.cycle(['|', '/', '-', '\\'])
+
+    # Create an event to signal when to stop the spinner
+    stop_spinner_event = asyncio.Event()
+
+    async def display_spinner():
+        while not stop_spinner_event.is_set():
+            sys.stdout.write(f'\r{message}{next(spinner)}')
+            sys.stdout.flush()
+            await asyncio.sleep(0.1)
+
+    # Start the spinner in a separate task
+    spinner_task = asyncio.create_task(display_spinner())
+
+    try:
+        # Await the actual task
+        result = await task
+    finally:
+        # Signal the spinner to stop and wait for it to finish
+        stop_spinner_event.set()
+        await spinner_task
+
+        # Clear the spinner line
+        sys.stdout.write('\r' + ' ' * (len(message) + 1) + '\r')
+        sys.stdout.flush()
+
+    return result
+
+class SpinnerWrappedTask:
+    """
+    A wrapper for an asyncio task that displays a spinner when awaited.
+    """
+    def __init__(self, task, message: str):
+        self.task = task
+        self.message = message
+
+    def __await__(self):
+        async def _spin_and_await():
+            return await await_with_spinner(self.task, self.message)
+        return _spin_and_await().__await__()
+
+    # Proxy all Task attributes and methods to the underlying task
+    def __getattr__(self, name):
+        return getattr(self.task, name)
+
+def run_eval(evaluation_run: EvaluationRun, override: bool = False, ignore_errors: bool = True, async_execution: bool = False) -> Union[List[ScoringResult], asyncio.Task]:
     """
     Executes an evaluation of `Example`s using one or more `Scorer`s
 
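
The `SpinnerWrappedTask` added above defers the terminal spinner until the task is actually awaited, and proxies every other attribute (`cancel`, `done`, `result`, ...) through `__getattr__` to the wrapped `asyncio.Task`. A self-contained sketch of the same pattern, using a hypothetical `slow_job` coroutine instead of the Judgment API:

```python
# Standalone sketch of the spinner-on-await pattern; slow_job and SpinnerTask
# are illustrative names, not part of judgeval.
import asyncio
import itertools
import sys


async def slow_job() -> str:
    await asyncio.sleep(2)
    return "done"


async def spin_while_awaiting(task, message: str = "Working: "):
    spinner = itertools.cycle("|/-\\")
    stop = asyncio.Event()

    async def spin():
        while not stop.is_set():
            sys.stdout.write(f"\r{message}{next(spinner)}")
            sys.stdout.flush()
            await asyncio.sleep(0.1)

    spin_task = asyncio.create_task(spin())
    try:
        return await task
    finally:
        stop.set()
        await spin_task
        # Clear the spinner line once the task has finished
        sys.stdout.write("\r" + " " * (len(message) + 1) + "\r")


class SpinnerTask:
    def __init__(self, task, message: str):
        self.task, self.message = task, message

    def __await__(self):
        # The spinner starts only here, when the wrapper is awaited
        return spin_while_awaiting(self.task, self.message).__await__()

    def __getattr__(self, name):
        # Everything else (cancel, done, result, ...) proxies to the real Task
        return getattr(self.task, name)


async def main():
    wrapped = SpinnerTask(asyncio.create_task(slow_job()), "Processing: ")
    print(await wrapped)  # spinner shows while awaiting, then prints "done"


asyncio.run(main())
```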
@@ -485,21 +738,12 @@ def run_eval(evaluation_run: EvaluationRun, override: bool = False, ignore_error
         evaluation_run (EvaluationRun): Stores example and evaluation together for running
         override (bool, optional): Whether to override existing evaluation run with same name. Defaults to False.
         ignore_errors (bool, optional): Whether to ignore scorer errors during evaluation. Defaults to True.
+        async_execution (bool, optional): Whether to execute the evaluation asynchronously. Defaults to False.
 
-    Args:
-        project_name (str): The name of the project the evaluation results belong to
-        eval_name (str): The name of the evaluation run
-        examples (List[Example]): The examples to evaluate
-        scorers (List[Union[JudgmentScorer, JudgevalScorer]]): A list of scorers to use for evaluation
-        model (str): The model used as a judge when using LLM as a Judge
-        aggregator (Optional[str]): The aggregator to use for evaluation if using Mixture of Judges
-        metadata (Optional[Dict[str, Any]]): Additional metadata to include for this evaluation run, e.g. comments, dataset name, purpose, etc.
-        judgment_api_key (Optional[str]): The API key for running evaluations on the Judgment API
-        log_results (bool): Whether to log the results to the Judgment API
-        rules (Optional[List[Rule]]): Rules to evaluate against scoring results
-
     Returns:
-        List[ScoringResult]: The results of the evaluation. Each result is a dictionary containing the fields of a `ScoringResult` object.
+        Union[List[ScoringResult], Union[asyncio.Task, SpinnerWrappedTask]]:
+            - If async_execution is False, returns a list of ScoringResult objects
+            - If async_execution is True, returns a Task that will resolve to a list of ScoringResult objects when awaited
     """
 
     # Call endpoint to check to see if eval run name exists (if we DON'T want to override and DO want to log results)
@@ -570,21 +814,51 @@ def run_eval(evaluation_run: EvaluationRun, override: bool = False, ignore_error
     if async_execution:
         if len(local_scorers) > 0:
             error("Local scorers are not supported in async execution")
+            raise ValueError("Local scorers are not supported in async execution")
 
         check_examples(evaluation_run.examples, evaluation_run.scorers)
        info("Starting async evaluation")
-        payload = evaluation_run.model_dump(warnings=False)
-        requests.post(
-            JUDGMENT_ADD_TO_RUN_EVAL_QUEUE_API_URL,
-            headers={
-                "Content-Type": "application/json",
-                "Authorization": f"Bearer {evaluation_run.judgment_api_key}",
-                "X-Organization-Id": evaluation_run.organization_id
-            },
-            json=payload,
-            verify=True
+
+        async def _async_evaluation_workflow():
+            # Create a payload
+            payload = evaluation_run.model_dump(warnings=False)
+
+            # Send the evaluation to the queue
+            response = requests.post(
+                JUDGMENT_ADD_TO_RUN_EVAL_QUEUE_API_URL,
+                headers={
+                    "Content-Type": "application/json",
+                    "Authorization": f"Bearer {evaluation_run.judgment_api_key}",
+                    "X-Organization-Id": evaluation_run.organization_id
+                },
+                json=payload,
+                verify=True
+            )
+
+            if not response.ok:
+                error_message = response.json().get('detail', 'An unknown error occurred.')
+                error(f"Error adding evaluation to queue: {error_message}")
+                raise JudgmentAPIError(error_message)
+
+            info(f"Successfully added evaluation '{evaluation_run.eval_name}' to queue")
+
+            # Poll until the evaluation is complete
+            return await _poll_evaluation_until_complete(
+                eval_name=evaluation_run.eval_name,
+                project_name=evaluation_run.project_name,
+                judgment_api_key=evaluation_run.judgment_api_key,
+                organization_id=evaluation_run.organization_id,
+                original_examples=evaluation_run.examples # Pass the original examples
+            )
+
+        # Create a regular task
+        task = asyncio.create_task(_async_evaluation_workflow())
+
+        # Wrap it in our custom awaitable that will show a spinner only when awaited
+        return SpinnerWrappedTask(
+            task,
+            f"Processing evaluation '{evaluation_run.eval_name}': "
         )
-        print("Successfully added evaluation to queue")
     else:
         if judgment_scorers:
            # Execute evaluation using Judgment API
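
Taken together, `async_execution=True` no longer fires and forgets: the run is queued, and the returned `SpinnerWrappedTask` polls `JUDGMENT_GET_EVAL_STATUS_API_URL` and fetches verified results when awaited. A hedged sketch of the new calling convention; the `judgeval.run_evaluation` import path is assumed, and building the `EvaluationRun` is left to the caller since its fields are unchanged by this diff:

```python
# Hedged sketch, not part of the package.
import asyncio

from judgeval.evaluation_run import EvaluationRun
from judgeval.run_evaluation import run_eval  # module path assumed


def run_blocking(evaluation_run: EvaluationRun):
    # Synchronous path: unchanged, returns List[ScoringResult] directly.
    return run_eval(evaluation_run)


async def run_queued(evaluation_run: EvaluationRun):
    # Async path: run_eval calls asyncio.create_task internally, so it must be
    # invoked while an event loop is running. The returned SpinnerWrappedTask
    # only shows the spinner while it is being awaited.
    task = run_eval(evaluation_run, async_execution=True)
    return await task
```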
@@ -5,13 +5,15 @@
 # Internal imports
 from judgeval.scorers.api_scorer import APIJudgmentScorer
 from judgeval.constants import APIScorer
-
+from typing import Optional, Dict
 class ToolOrderScorer(APIJudgmentScorer):
-    def __init__(self, threshold: float=1.0):
+    kwargs: Optional[Dict] = None
+    def __init__(self, threshold: float=1.0, exact_match: bool=False):
         super().__init__(
             threshold=threshold,
             score_type=APIScorer.TOOL_ORDER,
         )
+        self.kwargs = {"exact_match": exact_match}
 
     @property
     def __name__(self):
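
A short, hedged usage sketch for the updated scorer; the `from judgeval.scorers import ToolOrderScorer` path is assumed since this diff shows only the class body. The new `exact_match` flag is not forwarded to `super().__init__`; it is stored in `self.kwargs`:

```python
# Sketch only: the import path is assumed, not confirmed by this diff.
from judgeval.scorers import ToolOrderScorer

lenient = ToolOrderScorer()                               # exact_match defaults to False
strict = ToolOrderScorer(threshold=1.0, exact_match=True)

print(lenient.kwargs)  # {'exact_match': False}
print(strict.kwargs)   # {'exact_match': True}
```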