judgeval 0.0.38__py3-none-any.whl → 0.0.40__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -28,12 +28,15 @@ from judgeval.constants import (
     JUDGMENT_EVAL_LOG_API_URL,
     MAX_CONCURRENT_EVALUATIONS,
     JUDGMENT_ADD_TO_RUN_EVAL_QUEUE_API_URL,
+    JUDGMENT_GET_EVAL_STATUS_API_URL,
+    JUDGMENT_EVAL_FETCH_API_URL
 )
 from judgeval.common.exceptions import JudgmentAPIError
 from judgeval.common.logger import (
     debug,
     info,
-    error,
+    error,
+    warning,
     example_logging_context
 )
 from judgeval.evaluation_run import EvaluationRun
@@ -201,9 +204,9 @@ def check_missing_scorer_data(results: List[ScoringResult]) -> List[ScoringResul
             )
     return results

-def check_experiment_type(eval_name: str, project_name: str, judgment_api_key: str, organization_id: str, is_sequence: bool) -> None:
+def check_experiment_type(eval_name: str, project_name: str, judgment_api_key: str, organization_id: str, is_trace: bool) -> None:
     """
-    Checks if the current experiment, if one exists, has the same type (examples of sequences)
+    Checks if the current experiment, if one exists, has the same type (examples of traces)
     """
     try:
         response = requests.post(
@@ -217,7 +220,7 @@ def check_experiment_type(eval_name: str, project_name: str, judgment_api_key: s
                 "eval_name": eval_name,
                 "project_name": project_name,
                 "judgment_api_key": judgment_api_key,
-                "is_sequence": is_sequence
+                "is_trace": is_trace
             },
             verify=True
         )
@@ -379,7 +382,7 @@ def run_trace_eval(trace_run: TraceRun, override: bool = False, ignore_errors: b
         )

     if trace_run.append:
-        # Check that the current experiment, if one exists, has the same type (examples of sequences)
+        # Check that the current experiment, if one exists, has the same type (examples or traces)
         check_experiment_type(
             trace_run.eval_name,
             trace_run.project_name,
@@ -387,13 +390,18 @@ def run_trace_eval(trace_run: TraceRun, override: bool = False, ignore_errors: b
             trace_run.organization_id,
             True
         )
-
     if function and tracer:
         new_traces: List[Trace] = []
         tracer.offline_mode = True
+        tracer.traces = []
         for example in examples:
             if example.input:
-                result = run_with_spinner("Running agent function: ", function, **example.input)
+                if isinstance(example.input, str):
+                    result = run_with_spinner("Running agent function: ", function, example.input)
+                elif isinstance(example.input, dict):
+                    result = run_with_spinner("Running agent function: ", function, **example.input)
+                else:
+                    raise ValueError(f"Input must be string or dict, got {type(example.input)}")
             else:
                 result = run_with_spinner("Running agent function: ", function)
         for i, trace in enumerate(tracer.traces):
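
With this change, `example.input` may be either a plain string (passed positionally to the agent function) or a dict (unpacked as keyword arguments); anything else now raises `ValueError`. A minimal sketch of the two accepted shapes, using a hypothetical agent function rather than anything from the package:

```python
# Hypothetical agent function; only its call signature matters here.
def my_agent(question: str, temperature: float = 0.0) -> str:
    return f"answer to {question!r} at temperature {temperature}"

# String input -> passed positionally, as run_trace_eval now does:
print(my_agent("What is 2 + 2?"))

# Dict input -> unpacked as keyword arguments:
print(my_agent(**{"question": "What is 2 + 2?", "temperature": 0.2}))

# Any other type (e.g. a list) would be rejected with ValueError.
```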
@@ -402,6 +410,7 @@ def run_trace_eval(trace_run: TraceRun, override: bool = False, ignore_errors: b
             trace.entries[0].expected_tools = examples[i].expected_tools
             new_traces.append(trace)
         trace_run.traces = new_traces
+        tracer.traces = []

     # Execute evaluation using Judgment API
     info("Starting API evaluation")
@@ -420,14 +429,327 @@ def run_trace_eval(trace_run: TraceRun, override: bool = False, ignore_errors: b
     debug("Processing API results")
     # TODO: allow for custom scorer on traces
     if trace_run.log_results:
-        pretty_str = run_with_spinner("Logging Results: ", log_evaluation_results, response_data["results"], trace_run)
+        pretty_str = run_with_spinner("Logging Results: ", log_evaluation_results, response_data["agent_results"], trace_run)
         rprint(pretty_str)

     return scoring_results



-def run_eval(evaluation_run: EvaluationRun, override: bool = False, ignore_errors: bool = True, async_execution: bool = False) -> List[ScoringResult]:
+async def get_evaluation_status(eval_name: str, project_name: str, judgment_api_key: str, organization_id: str) -> Dict:
+    """
+    Gets the status of an async evaluation run.
+
+    Args:
+        eval_name (str): Name of the evaluation run
+        project_name (str): Name of the project
+        judgment_api_key (str): API key for authentication
+        organization_id (str): Organization ID for the evaluation
+
+    Returns:
+        Dict: Status information including:
+            - status: 'pending', 'running', 'completed', or 'failed'
+            - results: List of ScoringResult objects if completed
+            - error: Error message if failed
+    """
+    try:
+        response = requests.get(
+            JUDGMENT_GET_EVAL_STATUS_API_URL,
+            headers={
+                "Content-Type": "application/json",
+                "Authorization": f"Bearer {judgment_api_key}",
+                "X-Organization-Id": organization_id
+            },
+            params={
+                "eval_name": eval_name,
+                "project_name": project_name,
+            },
+            verify=True
+        )
+
+        if not response.ok:
+            error_message = response.json().get('detail', 'An unknown error occurred.')
+            error(f"Error checking evaluation status: {error_message}")
+            raise JudgmentAPIError(error_message)
+
+        return response.json()
+    except requests.exceptions.RequestException as e:
+        error(f"Failed to check evaluation status: {str(e)}")
+        raise JudgmentAPIError(f"Failed to check evaluation status: {str(e)}")
+
+async def _poll_evaluation_until_complete(eval_name: str, project_name: str, judgment_api_key: str, organization_id: str, poll_interval_seconds: int = 5, original_examples: Optional[List[Example]] = None) -> List[ScoringResult]:
+    """
+    Polls until the evaluation is complete and returns the results.
+
+    Args:
+        eval_name (str): Name of the evaluation run
+        project_name (str): Name of the project
+        judgment_api_key (str): API key for authentication
+        organization_id (str): Organization ID for the evaluation
+        poll_interval_seconds (int, optional): Time between status checks in seconds. Defaults to 5.
+        original_examples (List[Example], optional): The original examples sent for evaluation.
+            If provided, will match results with original examples.
+
+    Returns:
+        List[ScoringResult]: The evaluation results
+    """
+    poll_count = 0
+    # Create example_id to Example mapping if original examples are provided
+    original_example_map = {}
+    if original_examples:
+        for example in original_examples:
+            original_example_map[example.example_id] = example
+
+    # Remove the expected scorer names extraction and checking
+    # We'll instead verify all examples have consistent scorer data
+    while True:
+        poll_count += 1
+        try:
+            # Log polling attempt
+            if poll_count % 4 == 0:  # Log every 4th poll to avoid excess logging
+                info(f"Polling for evaluation '{eval_name}' in project '{project_name}' (attempt {poll_count})")
+
+            # Check status
+            response = await asyncio.to_thread(
+                requests.get,
+                JUDGMENT_GET_EVAL_STATUS_API_URL,
+                headers={
+                    "Content-Type": "application/json",
+                    "Authorization": f"Bearer {judgment_api_key}",
+                    "X-Organization-Id": organization_id
+                },
+                params={
+                    "eval_name": eval_name,
+                    "project_name": project_name
+                },
+                verify=True
+            )
+
+            if not response.ok:
+                error_message = response.json().get('detail', 'An unknown error occurred.')
+                error(f"Error checking evaluation status: {error_message}")
+                # Don't raise exception immediately, just log and continue polling
+                await asyncio.sleep(poll_interval_seconds)
+                continue
+
+            status_data = response.json()
+            status = status_data.get("status")
+
+            # If complete, get results and return
+            if status == "completed" or status == "complete":
+                info(f"Evaluation '{eval_name}' reported as completed, fetching and verifying results...")
+                results_response = await asyncio.to_thread(
+                    requests.post,
+                    JUDGMENT_EVAL_FETCH_API_URL,
+                    headers={
+                        "Content-Type": "application/json",
+                        "Authorization": f"Bearer {judgment_api_key}",
+                        "X-Organization-Id": organization_id
+                    },
+                    json={
+                        "project_name": project_name,
+                        "eval_name": eval_name
+                    },
+                    verify=True
+                )
+
+                if not results_response.ok:
+                    error_message = results_response.json().get('detail', 'An unknown error occurred.')
+                    error(f"Error fetching evaluation results: {error_message}")
+                    raise JudgmentAPIError(error_message)
+
+                result_data = results_response.json()
+
+                if "examples" in result_data:
+                    examples_data = result_data.get("examples", [])
+
+
+                    info(f"Successfully fetched {len(examples_data)} results for evaluation '{eval_name}'")
+
+                    # Check for result validity if original examples are provided
+                    if original_example_map:
+                        # Verify all returned examples have matching original examples
+                        has_invalid_results = False
+                        for example_data in examples_data:
+                            example_id = example_data.get("example_id")
+
+                            if example_id not in original_example_map:
+                                warning(f"Server returned example with ID {example_id} not found in original examples. " +
+                                        f"This indicates stale or incorrect data. Continuing to poll...")
+                                has_invalid_results = True
+                                break
+
+                        # If any invalid examples found, continue polling
+                        if has_invalid_results:
+                            info("Detected stale data. Waiting before polling again...")
+                            await asyncio.sleep(poll_interval_seconds)
+                            continue
+
+                        # Check if we received the expected number of results
+                        if len(original_examples) != len(examples_data):
+                            warning(f"Expected {len(original_examples)} results but got {len(examples_data)} results. " +
+                                    f"This indicates incomplete data. Continuing to poll...")
+                            await asyncio.sleep(poll_interval_seconds)
+                            continue
+
+                        # Collect all example IDs from scorer data
+                        scorer_example_ids = set()
+                        for example_data in examples_data:
+                            scorer_data_list = example_data.get("scorer_data", [])
+                            for scorer_data in scorer_data_list:
+                                if "example_id" in scorer_data:
+                                    scorer_example_ids.add(scorer_data["example_id"])
+
+                        # Get the set of original example IDs
+                        original_example_ids = set(original_example_map.keys())
+
+                        # Check if the sets are equal
+                        missing_in_scorer = original_example_ids - scorer_example_ids
+                        extra_in_scorer = scorer_example_ids - original_example_ids
+
+                        if missing_in_scorer or extra_in_scorer:
+                            if missing_in_scorer:
+                                warning(f"Examples missing in scorer data: {missing_in_scorer}")
+                            if extra_in_scorer:
+                                warning(f"Extra examples in scorer data: {extra_in_scorer}")
+                            info("Detected mismatched example IDs in scorer data. Waiting before polling again...")
+                            await asyncio.sleep(poll_interval_seconds)
+                            continue
+
+                    # Create ScoringResult objects from the raw data
+                    scoring_results = []
+
+                    for example_data in examples_data:
+                        # Extract example_id from the server response
+                        example_id = example_data.get("example_id")
+
+                        # Create ScorerData objects
+                        scorer_data_list = []
+                        for raw_scorer_data in example_data.get("scorer_data", []):
+                            scorer_data_list.append(ScorerData(**raw_scorer_data))
+
+                        # Use the original Example object if we have it and the ID matches
+                        if original_example_map:
+                            example = original_example_map[example_id]
+                            debug(f"Matched result with original example {example_id}")
+                        else:
+                            # Create Example from example data (excluding scorer_data) if no original examples provided
+                            example_dict = {k: v for k, v in example_data.items() if k != "scorer_data"}
+                            example = Example(**example_dict)
+
+                        # Calculate success based on whether all scorer_data entries were successful
+                        success = all(scorer_data.success for scorer_data in scorer_data_list) if scorer_data_list else False
+
+                        # Create ScoringResult
+                        scoring_result = ScoringResult(
+                            success=success,  # Set based on all scorer data success values
+                            scorers_data=scorer_data_list,
+                            data_object=example
+                        )
+                        scoring_results.append(scoring_result)
+
+                    # If we got here, all validation checks passed
+                    info(f"Verified complete results for all {len(scoring_results)} examples with all expected scorer data")
+                    return scoring_results
+                else:
+                    # No examples found
+                    info(f"No example results found for completed evaluation '{eval_name}'")
+                    return []
+
+            elif status == "failed":
+                # Evaluation failed
+                error_message = status_data.get("error", "Unknown error")
+                error(f"Evaluation '{eval_name}' failed: {error_message}")
+                raise JudgmentAPIError(f"Evaluation failed: {error_message}")
+
+            elif status == "pending" or status == "running":
+                # Only log occasionally for pending/running to avoid flooding logs
+                if poll_count % 4 == 0:
+                    info(f"Evaluation '{eval_name}' status: {status}")
+
+            # Wait before checking again
+            await asyncio.sleep(poll_interval_seconds)
+
+        except Exception as e:
+            if isinstance(e, JudgmentAPIError):
+                raise
+
+            # For other exceptions, log and continue polling
+            error(f"Error checking evaluation status: {str(e)}")
+            if poll_count > 20:  # Only raise exception after many failed attempts
+                raise JudgmentAPIError(f"Error checking evaluation status after {poll_count} attempts: {str(e)}")
+
+            # Continue polling after a delay
+            await asyncio.sleep(poll_interval_seconds)
+
+async def await_with_spinner(task, message: str = "Awaiting async task: "):
+    """
+    Display a spinner while awaiting an async task.
+
+    Args:
+        task: The asyncio task to await
+        message (str): Message to display with the spinner
+
+    Returns:
+        Any: The result of the awaited task
+    """
+    spinner = itertools.cycle(['|', '/', '-', '\\'])
+
+    # Create an event to signal when to stop the spinner
+    stop_spinner_event = asyncio.Event()
+
+    async def display_spinner():
+        while not stop_spinner_event.is_set():
+            sys.stdout.write(f'\r{message}{next(spinner)}')
+            sys.stdout.flush()
+            await asyncio.sleep(0.1)
+
+    # Start the spinner in a separate task
+    spinner_task = asyncio.create_task(display_spinner())
+
+    try:
+        # Await the actual task
+        result = await task
+    finally:
+        # Signal the spinner to stop and wait for it to finish
+        stop_spinner_event.set()
+        await spinner_task
+
+        # Clear the spinner line
+        sys.stdout.write('\r' + ' ' * (len(message) + 1) + '\r')
+        sys.stdout.flush()
+
+    return result
+
+class SpinnerWrappedTask:
+    """
+    A wrapper for an asyncio task that displays a spinner when awaited.
+    """
+    def __init__(self, task, message: str):
+        self.task = task
+        self.message = message
+
+    def __await__(self):
+        async def _spin_and_await():
+            # self.task resolves to (scoring_results, pretty_str_to_print)
+            task_result_tuple = await await_with_spinner(self.task, self.message)
+
+            # Unpack the tuple
+            scoring_results, pretty_str_to_print = task_result_tuple
+
+            # Print the pretty string if it exists, after spinner is cleared
+            if pretty_str_to_print:
+                rprint(pretty_str_to_print)
+
+            # Return only the scoring_results to the original awaiter
+            return scoring_results
+        return _spin_and_await().__await__()
+
+    # Proxy all Task attributes and methods to the underlying task
+    def __getattr__(self, name):
+        return getattr(self.task, name)
+
+def run_eval(evaluation_run: EvaluationRun, override: bool = False, ignore_errors: bool = True, async_execution: bool = False) -> Union[List[ScoringResult], asyncio.Task]:
     """
     Executes an evaluation of `Example`s using one or more `Scorer`s

@@ -435,21 +757,12 @@ def run_eval(evaluation_run: EvaluationRun, override: bool = False, ignore_error
         evaluation_run (EvaluationRun): Stores example and evaluation together for running
         override (bool, optional): Whether to override existing evaluation run with same name. Defaults to False.
         ignore_errors (bool, optional): Whether to ignore scorer errors during evaluation. Defaults to True.
+        async_execution (bool, optional): Whether to execute the evaluation asynchronously. Defaults to False.

-    Args:
-        project_name (str): The name of the project the evaluation results belong to
-        eval_name (str): The name of the evaluation run
-        examples (List[Example]): The examples to evaluate
-        scorers (List[Union[JudgmentScorer, JudgevalScorer]]): A list of scorers to use for evaluation
-        model (str): The model used as a judge when using LLM as a Judge
-        aggregator (Optional[str]): The aggregator to use for evaluation if using Mixture of Judges
-        metadata (Optional[Dict[str, Any]]): Additional metadata to include for this evaluation run, e.g. comments, dataset name, purpose, etc.
-        judgment_api_key (Optional[str]): The API key for running evaluations on the Judgment API
-        log_results (bool): Whether to log the results to the Judgment API
-        rules (Optional[List[Rule]]): Rules to evaluate against scoring results
-
     Returns:
-        List[ScoringResult]: The results of the evaluation. Each result is a dictionary containing the fields of a `ScoringResult` object.
+        Union[List[ScoringResult], Union[asyncio.Task, SpinnerWrappedTask]]:
+            - If async_execution is False, returns a list of ScoringResult objects
+            - If async_execution is True, returns a Task that will resolve to a list of ScoringResult objects when awaited
     """

     # Call endpoint to check to see if eval run name exists (if we DON'T want to override and DO want to log results)
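
As the updated Returns section states, the call pattern now differs by mode: the synchronous path returns results directly, while the asynchronous path returns an awaitable task wrapper. A hedged usage sketch, assuming `run_eval` and an already-built `EvaluationRun` are in scope; the construction of `evaluation_run` is elided:

```python
# Synchronous: blocks until scoring finishes and returns List[ScoringResult].
results = run_eval(evaluation_run)

# Asynchronous: returns immediately with a SpinnerWrappedTask; awaiting it
# inside a running event loop yields the same List[ScoringResult] once the
# queued evaluation completes server-side.
async def main():
    pending = run_eval(evaluation_run, async_execution=True)
    results = await pending  # spinner is shown only while awaiting
```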
@@ -462,7 +775,7 @@ def run_eval(evaluation_run: EvaluationRun, override: bool = False, ignore_error
         )

     if evaluation_run.append:
-        # Check that the current experiment, if one exists, has the same type (examples of sequences)
+        # Check that the current experiment, if one exists, has the same type (examples of traces)
         check_experiment_type(
             evaluation_run.eval_name,
             evaluation_run.project_name,
@@ -475,8 +788,7 @@ def run_eval(evaluation_run: EvaluationRun, override: bool = False, ignore_error
     debug("Initializing examples with IDs and timestamps")
     for idx, example in enumerate(evaluation_run.examples):
         example.example_index = idx  # Set numeric index
-        example.timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
-        with example_logging_context(example.timestamp, example.example_id):
+        with example_logging_context(example.created_at, example.example_id):
             debug(f"Initialized example {example.example_id} (index: {example.example_index})")
             debug(f"Input: {example.input}")
             debug(f"Actual output: {example.actual_output}")
@@ -520,21 +832,67 @@ def run_eval(evaluation_run: EvaluationRun, override: bool = False, ignore_error
     if async_execution:
         if len(local_scorers) > 0:
             error("Local scorers are not supported in async execution")
+            raise ValueError("Local scorers are not supported in async execution")

         check_examples(evaluation_run.examples, evaluation_run.scorers)
         info("Starting async evaluation")
-        payload = evaluation_run.model_dump(warnings=False)
-        requests.post(
-            JUDGMENT_ADD_TO_RUN_EVAL_QUEUE_API_URL,
-            headers={
-                "Content-Type": "application/json",
-                "Authorization": f"Bearer {evaluation_run.judgment_api_key}",
-                "X-Organization-Id": evaluation_run.organization_id
-            },
-            json=payload,
-            verify=True
+
+        async def _async_evaluation_workflow():
+            # Create a payload
+            payload = evaluation_run.model_dump(warnings=False)
+
+            # Send the evaluation to the queue
+            response = await asyncio.to_thread(
+                requests.post,
+                JUDGMENT_ADD_TO_RUN_EVAL_QUEUE_API_URL,
+                headers={
+                    "Content-Type": "application/json",
+                    "Authorization": f"Bearer {evaluation_run.judgment_api_key}",
+                    "X-Organization-Id": evaluation_run.organization_id
+                },
+                json=payload,
+                verify=True
+            )
+
+            if not response.ok:
+                error_message = response.json().get('detail', 'An unknown error occurred.')
+                error(f"Error adding evaluation to queue: {error_message}")
+                raise JudgmentAPIError(error_message)
+
+            info(f"Successfully added evaluation '{evaluation_run.eval_name}' to queue")
+
+            # Poll until the evaluation is complete
+            results = await _poll_evaluation_until_complete(
+                eval_name=evaluation_run.eval_name,
+                project_name=evaluation_run.project_name,
+                judgment_api_key=evaluation_run.judgment_api_key,
+                organization_id=evaluation_run.organization_id,
+                original_examples=evaluation_run.examples  # Pass the original examples
+            )
+
+            pretty_str_to_print = None
+            if evaluation_run.log_results and results:  # Ensure results exist before logging
+                send_results = [scoring_result.model_dump(warnings=False) for scoring_result in results]
+                try:
+                    # Run the blocking log_evaluation_results in a separate thread
+                    pretty_str_to_print = await asyncio.to_thread(
+                        log_evaluation_results,
+                        send_results,
+                        evaluation_run
+                    )
+                except Exception as e:
+                    error(f"Error logging results after async evaluation: {str(e)}")
+
+            return results, pretty_str_to_print
+
+        # Create a regular task
+        task = asyncio.create_task(_async_evaluation_workflow())
+
+        # Wrap it in our custom awaitable that will show a spinner only when awaited
+        return SpinnerWrappedTask(
+            task,
+            f"Processing evaluation '{evaluation_run.eval_name}': "
         )
-        print("Successfully added evaluation to queue")
     else:
         if judgment_scorers:
             # Execute evaluation using Judgment API
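
Two patterns carry this branch: each blocking `requests` call is pushed onto a worker thread with `asyncio.to_thread` so the event loop stays responsive, and the resulting task is handed back inside a wrapper whose `__await__` adds side effects (spinner, result printing) only when the caller actually awaits it. A stripped-down, judgeval-independent sketch of the same shape, with all names below invented for illustration:

```python
import asyncio
import time

def blocking_call(payload: dict) -> dict:
    # Stand-in for requests.post(...): blocks the calling thread.
    time.sleep(0.2)
    return {"status": "queued", "payload": payload}

class UnwrappingTask:
    """Awaitable wrapper: awaiting it awaits the inner task, prints the side
    channel, and returns only the main result (mirrors SpinnerWrappedTask)."""
    def __init__(self, task: asyncio.Task):
        self.task = task

    def __await__(self):
        async def _await_and_unwrap():
            results, log_message = await self.task
            if log_message:
                print(log_message)
            return results
        return _await_and_unwrap().__await__()

    def __getattr__(self, name):
        # Proxy Task methods such as cancel() and done() to the inner task.
        return getattr(self.task, name)

def kick_off(payload: dict) -> UnwrappingTask:
    async def workflow():
        # Blocking I/O runs in a worker thread; the event loop keeps spinning.
        ack = await asyncio.to_thread(blocking_call, payload)
        return [1, 2, 3], f"server said: {ack['status']}"
    return UnwrappingTask(asyncio.create_task(workflow()))

async def main():
    pending = kick_off({"eval_name": "demo"})
    print(await pending)  # prints "server said: queued", then [1, 2, 3]

asyncio.run(main())
```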
@@ -571,7 +929,7 @@ def run_eval(evaluation_run: EvaluationRun, override: bool = False, ignore_error
         # We should be removing local scorers soon
         info("Starting local evaluation")
         for example in evaluation_run.examples:
-            with example_logging_context(example.timestamp, example.example_id):
+            with example_logging_context(example.created_at, example.example_id):
                 debug(f"Processing example {example.example_id}: {example.input}")

         results: List[ScoringResult] = asyncio.run(
@@ -1,6 +1,6 @@
 from judgeval.scorers.api_scorer import APIJudgmentScorer
 from judgeval.scorers.judgeval_scorer import JudgevalScorer
-from judgeval.scorers.prompt_scorer import PromptScorer, ClassifierScorer
+from judgeval.scorers.prompt_scorer import PromptScorer
 from judgeval.scorers.judgeval_scorers.api_scorers import (
     ExecutionOrderScorer,
     JSONCorrectnessScorer,
@@ -17,6 +17,8 @@ from judgeval.scorers.judgeval_scorers.api_scorers import (
     GroundednessScorer,
     DerailmentScorer,
     ToolOrderScorer,
+    ClassifierScorer,
+    ToolDependencyScorer,
 )
 from judgeval.scorers.judgeval_scorers.classifiers import (
     Text2SQLScorer,
@@ -43,4 +45,5 @@ __all__ = [
     "GroundednessScorer",
     "DerailmentScorer",
     "ToolOrderScorer",
+    "ToolDependencyScorer",
 ]
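
With `ClassifierScorer` now re-exported from the API scorers package and `ToolDependencyScorer` added to `__all__`, both should be importable from the public scorers namespace in 0.0.40. A hedged one-liner based only on the import changes shown in this diff:

```python
from judgeval.scorers import ClassifierScorer, ToolDependencyScorer
```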
@@ -39,6 +39,8 @@ class JudgevalScorer:
     evaluation_cost: Optional[float] = None  # The cost of running the scorer
     verbose_logs: Optional[str] = None  # The verbose logs of the scorer
     additional_metadata: Optional[Dict] = None  # Additional metadata for the scorer
+    error: Optional[str] = None
+    success: Optional[bool] = None

     def __init__(
         self,
@@ -145,3 +147,9 @@ class JudgevalScorer:
             "additional_metadata": self.additional_metadata,
         }
         return f"JudgevalScorer({attributes})"
+
+    def to_dict(self):
+        return {
+            "score_type": str(self.score_type),  # Convert enum to string for serialization
+            "threshold": self.threshold
+        }
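
The new `to_dict` reduces a scorer to the two fields needed for serialization, with the enum `score_type` cast to a string so the dict is JSON-safe. A hedged sketch of the resulting payload shape; the concrete score_type value below is made up for illustration:

```python
import json

# Shape produced by JudgevalScorer.to_dict(); "answer_relevancy" is illustrative.
scorer_payload = {"score_type": "answer_relevancy", "threshold": 0.7}
print(json.dumps(scorer_payload))  # -> {"score_type": "answer_relevancy", "threshold": 0.7}
```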
@@ -13,6 +13,8 @@ from judgeval.scorers.judgeval_scorers.api_scorers.instruction_adherence import
 from judgeval.scorers.judgeval_scorers.api_scorers.groundedness import GroundednessScorer
 from judgeval.scorers.judgeval_scorers.api_scorers.derailment_scorer import DerailmentScorer
 from judgeval.scorers.judgeval_scorers.api_scorers.tool_order import ToolOrderScorer
+from judgeval.scorers.judgeval_scorers.api_scorers.classifier_scorer import ClassifierScorer
+from judgeval.scorers.judgeval_scorers.api_scorers.tool_dependency import ToolDependencyScorer
 __all__ = [
     "ExecutionOrderScorer",
     "JSONCorrectnessScorer",
@@ -29,4 +31,6 @@ __all__ = [
     "GroundednessScorer",
     "DerailmentScorer",
     "ToolOrderScorer",
+    "ClassifierScorer",
+    "ToolDependencyScorer",
 ]