judgeval 0.0.52__py3-none-any.whl → 0.0.53__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (60)
  1. judgeval/common/logger.py +46 -199
  2. judgeval/common/s3_storage.py +2 -6
  3. judgeval/common/tracer.py +182 -262
  4. judgeval/common/utils.py +16 -36
  5. judgeval/constants.py +14 -20
  6. judgeval/data/__init__.py +0 -2
  7. judgeval/data/datasets/dataset.py +6 -10
  8. judgeval/data/datasets/eval_dataset_client.py +25 -27
  9. judgeval/data/example.py +5 -138
  10. judgeval/data/judgment_types.py +214 -0
  11. judgeval/data/result.py +7 -25
  12. judgeval/data/scorer_data.py +28 -40
  13. judgeval/data/scripts/fix_default_factory.py +23 -0
  14. judgeval/data/scripts/openapi_transform.py +123 -0
  15. judgeval/data/tool.py +3 -54
  16. judgeval/data/trace.py +31 -50
  17. judgeval/data/trace_run.py +3 -3
  18. judgeval/evaluation_run.py +16 -23
  19. judgeval/integrations/langgraph.py +11 -12
  20. judgeval/judges/litellm_judge.py +3 -6
  21. judgeval/judges/mixture_of_judges.py +8 -25
  22. judgeval/judges/together_judge.py +3 -6
  23. judgeval/judgment_client.py +22 -24
  24. judgeval/rules.py +7 -19
  25. judgeval/run_evaluation.py +79 -242
  26. judgeval/scorers/__init__.py +4 -20
  27. judgeval/scorers/agent_scorer.py +21 -0
  28. judgeval/scorers/api_scorer.py +28 -38
  29. judgeval/scorers/base_scorer.py +98 -0
  30. judgeval/scorers/example_scorer.py +19 -0
  31. judgeval/scorers/judgeval_scorers/api_scorers/__init__.py +0 -20
  32. judgeval/scorers/judgeval_scorers/api_scorers/answer_correctness.py +10 -17
  33. judgeval/scorers/judgeval_scorers/api_scorers/answer_relevancy.py +9 -24
  34. judgeval/scorers/judgeval_scorers/api_scorers/classifier_scorer.py +16 -68
  35. judgeval/scorers/judgeval_scorers/api_scorers/derailment_scorer.py +4 -12
  36. judgeval/scorers/judgeval_scorers/api_scorers/execution_order.py +4 -4
  37. judgeval/scorers/judgeval_scorers/api_scorers/faithfulness.py +10 -17
  38. judgeval/scorers/judgeval_scorers/api_scorers/hallucination.py +4 -4
  39. judgeval/scorers/judgeval_scorers/api_scorers/instruction_adherence.py +4 -4
  40. judgeval/scorers/judgeval_scorers/api_scorers/tool_dependency.py +4 -4
  41. judgeval/scorers/judgeval_scorers/api_scorers/tool_order.py +18 -14
  42. judgeval/scorers/score.py +45 -330
  43. judgeval/scorers/utils.py +6 -88
  44. judgeval/utils/file_utils.py +4 -6
  45. judgeval/version_check.py +3 -2
  46. {judgeval-0.0.52.dist-info → judgeval-0.0.53.dist-info}/METADATA +2 -1
  47. judgeval-0.0.53.dist-info/RECORD +65 -0
  48. judgeval/data/custom_example.py +0 -19
  49. judgeval/scorers/judgeval_scorer.py +0 -177
  50. judgeval/scorers/judgeval_scorers/api_scorers/comparison.py +0 -45
  51. judgeval/scorers/judgeval_scorers/api_scorers/contextual_precision.py +0 -29
  52. judgeval/scorers/judgeval_scorers/api_scorers/contextual_recall.py +0 -29
  53. judgeval/scorers/judgeval_scorers/api_scorers/contextual_relevancy.py +0 -32
  54. judgeval/scorers/judgeval_scorers/api_scorers/groundedness.py +0 -28
  55. judgeval/scorers/judgeval_scorers/api_scorers/json_correctness.py +0 -38
  56. judgeval/scorers/judgeval_scorers/api_scorers/summarization.py +0 -27
  57. judgeval/scorers/prompt_scorer.py +0 -296
  58. judgeval-0.0.52.dist-info/RECORD +0 -69
  59. {judgeval-0.0.52.dist-info → judgeval-0.0.53.dist-info}/WHEEL +0 -0
  60. {judgeval-0.0.52.dist-info → judgeval-0.0.53.dist-info}/licenses/LICENSE.md +0 -0
judgeval/run_evaluation.py
@@ -11,7 +11,7 @@ from typing import List, Dict, Any, Union, Optional, Callable
  from rich import print as rprint

  from judgeval.data import ScorerData, ScoringResult, Example, Trace
- from judgeval.scorers import JudgevalScorer, APIJudgmentScorer, ClassifierScorer
+ from judgeval.scorers import BaseScorer, APIScorerConfig
  from judgeval.scorers.score import a_execute_scoring
  from judgeval.constants import (
  ROOT_API,
@@ -24,7 +24,7 @@ from judgeval.constants import (
  JUDGMENT_EVAL_FETCH_API_URL,
  )
  from judgeval.common.exceptions import JudgmentAPIError
- from judgeval.common.logger import debug, info, error, warning, example_logging_context
+ from judgeval.common.logger import judgeval_logger
  from judgeval.evaluation_run import EvaluationRun
  from judgeval.data.trace_run import TraceRun
  from judgeval.common.tracer import Tracer
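Taken together, these two import hunks capture the headline refactor in 0.0.53: the `JudgevalScorer`/`APIJudgmentScorer` pair is replaced by `BaseScorer`/`APIScorerConfig`, and the module-level logging helpers (`debug`, `info`, `error`, `warning`, `example_logging_context`) collapse into a single `judgeval_logger`. A minimal migration sketch for downstream code, using only the names shown in these hunks (the log message itself is illustrative):

    # 0.0.52
    # from judgeval.scorers import JudgevalScorer, APIJudgmentScorer
    # from judgeval.common.logger import error

    # 0.0.53
    from judgeval.scorers import BaseScorer, APIScorerConfig
    from judgeval.common.logger import judgeval_logger

    judgeval_logger.error("evaluation failed")  # replaces the bare error(...) helper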
@@ -86,7 +86,7 @@ def execute_api_eval(evaluation_run: EvaluationRun) -> Dict:

  try:
  # submit API request to execute evals
- payload = evaluation_run.model_dump(warnings=False)
+ payload = evaluation_run.model_dump()
  response = requests.post(
  JUDGMENT_EVAL_API_URL,
  headers={
@@ -99,7 +99,7 @@ def execute_api_eval(evaluation_run: EvaluationRun) -> Dict:
  )
  response_data = response.json()
  except Exception as e:
- error(f"Error: {e}")
+ judgeval_logger.error(f"Error: {e}")
  details = response.json().get("detail", "No details provided")
  raise JudgmentAPIError(
  "An error occurred while executing the Judgment API request: " + details
@@ -108,7 +108,7 @@ def execute_api_eval(evaluation_run: EvaluationRun) -> Dict:
  # Add check for the duplicate eval run name
  if not response.ok:
  error_message = response_data.get("detail", "An unknown error occurred.")
- error(f"Error: {error_message=}")
+ judgeval_logger.error(f"Error: {error_message=}")
  raise JudgmentAPIError(error_message)
  return response_data

@@ -133,7 +133,7 @@ def execute_api_trace_eval(trace_run: TraceRun) -> Dict:
  )
  response_data = response.json()
  except Exception as e:
- error(f"Error: {e}")
+ judgeval_logger.error(f"Error: {e}")
  details = response.json().get("detail", "No details provided")
  raise JudgmentAPIError(
  "An error occurred while executing the Judgment API request: " + details
@@ -142,7 +142,7 @@ def execute_api_trace_eval(trace_run: TraceRun) -> Dict:
  # Add check for the duplicate eval run name
  if not response.ok:
  error_message = response_data.get("detail", "An unknown error occurred.")
- error(f"Error: {error_message=}")
+ judgeval_logger.error(f"Error: {error_message=}")
  raise JudgmentAPIError(error_message)
  return response_data

@@ -235,7 +235,7 @@ def check_missing_scorer_data(results: List[ScoringResult]) -> List[ScoringResul
  """
  for i, result in enumerate(results):
  if not result.scorers_data:
- error(
+ judgeval_logger.error(
  f"Scorer data is missing for example {i}. "
  "This is usually caused when the example does not contain "
  "the fields required by the scorer. "
@@ -273,17 +273,17 @@ def check_experiment_type(
  )

  if response.status_code == 422:
- error(f"{response.json()}")
+ judgeval_logger.error(f"{response.json()}")
  raise ValueError(f"{response.json()}")

  if not response.ok:
  response_data = response.json()
  error_message = response_data.get("detail", "An unknown error occurred.")
- error(f"Error checking eval run name: {error_message}")
+ judgeval_logger.error(f"Error checking eval run name: {error_message}")
  raise JudgmentAPIError(error_message)

  except exceptions.RequestException as e:
- error(f"Failed to check if experiment type exists: {str(e)}")
+ judgeval_logger.error(f"Failed to check if experiment type exists: {str(e)}")
  raise JudgmentAPIError(f"Failed to check if experiment type exists: {str(e)}")


@@ -319,7 +319,7 @@ def check_eval_run_name_exists(
  )

  if response.status_code == 409:
- error(
+ judgeval_logger.error(
  f"Eval run name '{eval_name}' already exists for this project. Please choose a different name, set the `override` flag to true, or set the `append` flag to true."
  )
  raise ValueError(
@@ -329,11 +329,11 @@ def check_eval_run_name_exists(
  if not response.ok:
  response_data = response.json()
  error_message = response_data.get("detail", "An unknown error occurred.")
- error(f"Error checking eval run name: {error_message}")
+ judgeval_logger.error(f"Error checking eval run name: {error_message}")
  raise JudgmentAPIError(error_message)

  except exceptions.RequestException as e:
- error(f"Failed to check if eval run name exists: {str(e)}")
+ judgeval_logger.error(f"Failed to check if eval run name exists: {str(e)}")
  raise JudgmentAPIError(f"Failed to check if eval run name exists: {str(e)}")


@@ -366,7 +366,7 @@ def log_evaluation_results(
  if not res.ok:
  response_data = res.json()
  error_message = response_data.get("detail", "An unknown error occurred.")
- error(f"Error {res.status_code}: {error_message}")
+ judgeval_logger.error(f"Error {res.status_code}: {error_message}")
  raise JudgmentAPIError(error_message)

  if "ui_results_url" in res.json():
@@ -377,12 +377,14 @@ def log_evaluation_results(
  return None

  except exceptions.RequestException as e:
- error(f"Request failed while saving evaluation results to DB: {str(e)}")
+ judgeval_logger.error(
+ f"Request failed while saving evaluation results to DB: {str(e)}"
+ )
  raise JudgmentAPIError(
  f"Request failed while saving evaluation results to DB: {str(e)}"
  )
  except Exception as e:
- error(f"Failed to save evaluation results to DB: {str(e)}")
+ judgeval_logger.error(f"Failed to save evaluation results to DB: {str(e)}")
  raise ValueError(f"Failed to save evaluation results to DB: {str(e)}")


@@ -407,7 +409,7 @@ def run_with_spinner(message: str, func, *args, **kwargs) -> Any:
  else:
  result = func(*args, **kwargs)
  except Exception as e:
- error(f"An error occurred: {str(e)}")
+ judgeval_logger.error(f"An error occurred: {str(e)}")
  stop_spinner_event.set()
  spinner_thread.join()
  raise e
@@ -422,7 +424,7 @@ def run_with_spinner(message: str, func, *args, **kwargs) -> Any:


  def check_examples(
- examples: List[Example], scorers: List[Union[APIJudgmentScorer, JudgevalScorer]]
+ examples: List[Example], scorers: List[Union[APIScorerConfig, BaseScorer]]
  ) -> None:
  """
  Checks if the example contains the necessary parameters for the scorer.
@@ -513,18 +515,14 @@ def run_trace_eval(
  actual_tracer.traces = []

  # Execute evaluation using Judgment API
- info("Starting API evaluation")
  try: # execute an EvaluationRun with just JudgmentScorers
- debug("Sending request to Judgment API")
  response_data: Dict = run_with_spinner(
  "Running Trace Evaluation: ", execute_api_trace_eval, trace_run
  )
  scoring_results = [
  ScoringResult(**result) for result in response_data["results"]
  ]
- info(f"Received {len(scoring_results)} results from API")
  except JudgmentAPIError as e:
- error(f"An error occurred while executing the Judgment API request: {str(e)}")
  raise JudgmentAPIError(
  f"An error occurred while executing the Judgment API request: {str(e)}"
  )
@@ -534,7 +532,6 @@ def run_trace_eval(
  )

  # Convert the response data to `ScoringResult` objects
- debug("Processing API results")
  # TODO: allow for custom scorer on traces

  pretty_str = run_with_spinner(
@@ -583,12 +580,12 @@ async def get_evaluation_status(

  if not response.ok:
  error_message = response.json().get("detail", "An unknown error occurred.")
- error(f"Error checking evaluation status: {error_message}")
+ judgeval_logger.error(f"Error checking evaluation status: {error_message}")
  raise JudgmentAPIError(error_message)

  return response.json()
  except exceptions.RequestException as e:
- error(f"Failed to check evaluation status: {str(e)}")
+ judgeval_logger.error(f"Failed to check evaluation status: {str(e)}")
  raise JudgmentAPIError(f"Failed to check evaluation status: {str(e)}")


@@ -597,8 +594,9 @@ async def _poll_evaluation_until_complete(
  project_name: str,
  judgment_api_key: str,
  organization_id: str,
+ expected_scorer_count: int,
+ original_examples: List[Example],
  poll_interval_seconds: int = 5,
- original_examples: Optional[List[Example]] = None,
  ) -> List[ScoringResult]:
  """
  Polls until the evaluation is complete and returns the results.
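The poller's signature tightens here: `original_examples` is now required (no longer `Optional`), and a new `expected_scorer_count` parameter tells the poller how many `ScorerData` entries each example must report before the run counts as complete. A sketch of a call site under that signature, with placeholder credentials and a hypothetical wrapper name:

    from typing import List

    from judgeval.data import Example, ScoringResult
    from judgeval.run_evaluation import _poll_evaluation_until_complete

    async def wait_for_results(
        examples: List[Example], scorer_count: int
    ) -> List[ScoringResult]:
        # Hypothetical wrapper mirroring how run_eval() now invokes the poller.
        return await _poll_evaluation_until_complete(
            eval_name="demo_eval",
            project_name="demo_project",
            judgment_api_key="<api key>",
            organization_id="<org id>",
            original_examples=examples,          # required in 0.0.53
            expected_scorer_count=scorer_count,  # new in 0.0.53
        )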
@@ -616,23 +614,10 @@ async def _poll_evaluation_until_complete(
  List[ScoringResult]: The evaluation results
  """
  poll_count = 0
- # Create example_id to Example mapping if original examples are provided
- original_example_map = {}
- if original_examples:
- for example in original_examples:
- original_example_map[example.example_id] = example
-
- # Remove the expected scorer names extraction and checking
- # We'll instead verify all examples have consistent scorer data
+
  while True:
  poll_count += 1
  try:
- # Log polling attempt
- if poll_count % 4 == 0: # Log every 4th poll to avoid excess logging
- info(
- f"Polling for evaluation '{eval_name}' in project '{project_name}' (attempt {poll_count})"
- )
-
  # Check status
  response = await asyncio.to_thread(
  requests.get,
@@ -650,7 +635,9 @@ async def _poll_evaluation_until_complete(
  error_message = response.json().get(
  "detail", "An unknown error occurred."
  )
- error(f"Error checking evaluation status: {error_message}")
+ judgeval_logger.error(
+ f"Error checking evaluation status: {error_message}"
+ )
  # Don't raise exception immediately, just log and continue polling
  await asyncio.sleep(poll_interval_seconds)
  continue
@@ -660,9 +647,6 @@ async def _poll_evaluation_until_complete(

  # If complete, get results and return
  if status == "completed" or status == "complete":
- info(
- f"Evaluation '{eval_name}' reported as completed, fetching and verifying results..."
- )
  results_response = await asyncio.to_thread(
  requests.post,
  JUDGMENT_EVAL_FETCH_API_URL,
@@ -679,143 +663,55 @@ async def _poll_evaluation_until_complete(
  error_message = results_response.json().get(
  "detail", "An unknown error occurred."
  )
- error(f"Error fetching evaluation results: {error_message}")
+ judgeval_logger.error(
+ f"Error fetching evaluation results: {error_message}"
+ )
  raise JudgmentAPIError(error_message)

  result_data = results_response.json()

- if "examples" in result_data:
- examples_data = result_data.get("examples", [])
+ if result_data.get("examples") is None:
+ continue

- info(
- f"Successfully fetched {len(examples_data)} results for evaluation '{eval_name}'"
- )
+ examples_data = result_data.get("examples", [])
+ scoring_results = []
+
+ for example_data in examples_data:
+ # Create ScorerData objects
+ scorer_data_list = []
+ for raw_scorer_data in example_data.get("scorer_data", []):
+ scorer_data_list.append(ScorerData(**raw_scorer_data))
+
+ if len(scorer_data_list) != expected_scorer_count:
+ # This means that not all scorers were loading for a specific example
+ continue

- # Check for result validity if original examples are provided
- if original_example_map:
- # Verify all returned examples have matching original examples
- has_invalid_results = False
- for example_data in examples_data:
- example_id = example_data.get("example_id")
-
- if example_id not in original_example_map:
- warning(
- f"Server returned example with ID {example_id} not found in original examples. "
- + "This indicates stale or incorrect data. Continuing to poll..."
- )
- has_invalid_results = True
- break
-
- # If any invalid examples found, continue polling
- if has_invalid_results:
- info("Detected stale data. Waiting before polling again...")
- await asyncio.sleep(poll_interval_seconds)
- continue
-
- # Check if we received the expected number of results
- if original_examples and len(original_examples) != len(
- examples_data
- ):
- warning(
- f"Expected {len(original_examples)} results but got {len(examples_data)} results. "
- + "This indicates incomplete data. Continuing to poll..."
- )
- await asyncio.sleep(poll_interval_seconds)
- continue
-
- # Collect all example IDs from scorer data
- scorer_example_ids = set()
- for example_data in examples_data:
- scorer_data_list = example_data.get("scorer_data", [])
- for scorer_data in scorer_data_list:
- if "example_id" in scorer_data:
- scorer_example_ids.add(scorer_data["example_id"])
-
- # Get the set of original example IDs
- original_example_ids = set(original_example_map.keys())
-
- # Check if the sets are equal
- missing_in_scorer = original_example_ids - scorer_example_ids
- extra_in_scorer = scorer_example_ids - original_example_ids
-
- if missing_in_scorer or extra_in_scorer:
- if missing_in_scorer:
- warning(
- f"Examples missing in scorer data: {missing_in_scorer}"
- )
- if extra_in_scorer:
- warning(
- f"Extra examples in scorer data: {extra_in_scorer}"
- )
- info(
- "Detected mismatched example IDs in scorer data. Waiting before polling again..."
- )
- await asyncio.sleep(poll_interval_seconds)
- continue
-
- # Create ScoringResult objects from the raw data
- scoring_results = []
-
- for example_data in examples_data:
- # Extract example_id from the server response
- example_id = example_data.get("example_id")
-
- # Create ScorerData objects
- scorer_data_list = []
- for raw_scorer_data in example_data.get("scorer_data", []):
- scorer_data_list.append(ScorerData(**raw_scorer_data))
-
- # Use the original Example object if we have it and the ID matches
- if original_example_map:
- example = original_example_map[example_id]
- debug(f"Matched result with original example {example_id}")
- else:
- # Create Example from example data (excluding scorer_data) if no original examples provided
- example_dict = {
- k: v
- for k, v in example_data.items()
- if k != "scorer_data"
- }
- example = Example(**example_dict)
-
- # Calculate success based on whether all scorer_data entries were successful
- success = (
- all(scorer_data.success for scorer_data in scorer_data_list)
- if scorer_data_list
- else False
- )
-
- # Create ScoringResult
- scoring_result = ScoringResult(
- success=success, # Set based on all scorer data success values
- scorers_data=scorer_data_list,
- data_object=example,
- )
- scoring_results.append(scoring_result)
-
- # If we got here, all validation checks passed
- info(
- f"Verified complete results for all {len(scoring_results)} examples with all expected scorer data"
+ example = Example(**example_data)
+
+ # Calculate success based on whether all scorer_data entries were successful
+ success = all(
+ scorer_data.success for scorer_data in scorer_data_list
  )
- return scoring_results
- else:
- # No examples found
- info(
- f"No example results found for completed evaluation '{eval_name}'"
+ scoring_result = ScoringResult(
+ success=success, # Set based on all scorer data success values
+ scorers_data=scorer_data_list,
+ data_object=example,
  )
- return []
+ scoring_results.append(scoring_result)
+
+ if len(scoring_results) != len(original_examples):
+ # This means that not all examples were evaluated
+ continue

+ return scoring_results
  elif status == "failed":
  # Evaluation failed
  error_message = status_data.get("error", "Unknown error")
- error(f"Evaluation '{eval_name}' failed: {error_message}")
+ judgeval_logger.error(
+ f"Evaluation '{eval_name}' failed: {error_message}"
+ )
  raise JudgmentAPIError(f"Evaluation failed: {error_message}")

- elif status == "pending" or status == "running":
- # Only log occasionally for pending/running to avoid flooding logs
- if poll_count % 4 == 0:
- info(f"Evaluation '{eval_name}' status: {status}")
-
  # Wait before checking again
  await asyncio.sleep(poll_interval_seconds)

@@ -824,7 +720,7 @@ async def _poll_evaluation_until_complete(
  raise

  # For other exceptions, log and continue polling
- error(f"Error checking evaluation status: {str(e)}")
+ judgeval_logger.error(f"Error checking evaluation status: {str(e)}")
  if poll_count > 20: # Only raise exception after many failed attempts
  raise JudgmentAPIError(
  f"Error checking evaluation status after {poll_count} attempts: {str(e)}"
@@ -944,61 +840,26 @@ def run_eval(
  )

  # Set example IDs if not already set
- debug("Initializing examples with IDs and timestamps")
  for idx, example in enumerate(evaluation_run.examples):
  example.example_index = idx # Set numeric index
- with example_logging_context(example.created_at, example.example_id):
- debug(
- f"Initialized example {example.example_id} (index: {example.example_index})"
- )
- debug(f"Input: {example.input}")
- debug(f"Actual output: {example.actual_output}")
- if example.expected_output:
- debug(f"Expected output: {example.expected_output}")
- if example.context:
- debug(f"Context: {example.context}")
- if example.retrieval_context:
- debug(f"Retrieval context: {example.retrieval_context}")
- if example.additional_metadata:
- debug(f"Additional metadata: {example.additional_metadata}")
- if example.tools_called:
- debug(f"Tools called: {example.tools_called}")
- if example.expected_tools:
- debug(f"Expected tools: {example.expected_tools}")
-
- debug(f"Starting evaluation run with {len(evaluation_run.examples)} examples")
-
- # Group APIJudgmentScorers and JudgevalScorers, then evaluate them in parallel
- debug("Grouping scorers by type")
- judgment_scorers: List[APIJudgmentScorer] = []
- local_scorers: List[JudgevalScorer] = []
+
+ judgment_scorers: List[APIScorerConfig] = []
+ local_scorers: List[BaseScorer] = []
  for scorer in evaluation_run.scorers:
- if isinstance(scorer, (APIJudgmentScorer, ClassifierScorer)):
+ if isinstance(scorer, APIScorerConfig):
  judgment_scorers.append(scorer)
- debug(f"Added judgment scorer: {type(scorer).__name__}")
  else:
  local_scorers.append(scorer)
- debug(f"Added local scorer: {type(scorer).__name__}")
-
- custom_example_check = [scorer.custom_example for scorer in local_scorers]
- if any(custom_example_check) and not all(custom_example_check):
- error("All scorers must be custom scorers if using custom examples")
- raise ValueError("All scorers must be custom scorers if using custom examples")
-
- debug(
- f"Found {len(judgment_scorers)} judgment scorers and {len(local_scorers)} local scorers"
- )

  api_results: List[ScoringResult] = []
  local_results: List[ScoringResult] = []

  if async_execution:
  if len(local_scorers) > 0:
- error("Local scorers are not supported in async execution")
+ judgeval_logger.error("Local scorers are not supported in async execution")
  raise ValueError("Local scorers are not supported in async execution")

  check_examples(evaluation_run.examples, evaluation_run.scorers)
- info("Starting async evaluation")

  async def _async_evaluation_workflow():
  # Create a payload
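Scorer routing in `run_eval` now hinges on a single `isinstance(scorer, APIScorerConfig)` check: API-hosted scorer configs go to the Judgment API, everything else (any `BaseScorer` subclass) runs locally, and the explicit `ClassifierScorer` case plus the old `custom_example` consistency check are folded away. A standalone sketch of that split (hypothetical helper name):

    from typing import List, Tuple, Union

    from judgeval.scorers import APIScorerConfig, BaseScorer

    def split_scorers(
        scorers: List[Union[APIScorerConfig, BaseScorer]],
    ) -> Tuple[List[APIScorerConfig], List[BaseScorer]]:
        # Mirrors the grouping loop above: one isinstance check decides the path.
        judgment_scorers = [s for s in scorers if isinstance(s, APIScorerConfig)]
        local_scorers = [s for s in scorers if not isinstance(s, APIScorerConfig)]
        return judgment_scorers, local_scorers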
@@ -1021,11 +882,11 @@ def run_eval(
  error_message = response.json().get(
  "detail", "An unknown error occurred."
  )
- error(f"Error adding evaluation to queue: {error_message}")
+ judgeval_logger.error(
+ f"Error adding evaluation to queue: {error_message}"
+ )
  raise JudgmentAPIError(error_message)

- info(f"Successfully added evaluation '{evaluation_run.eval_name}' to queue")
-
  # Poll until the evaluation is complete
  results = await _poll_evaluation_until_complete(
  eval_name=evaluation_run.eval_name,
@@ -1033,6 +894,7 @@ def run_eval(
  judgment_api_key=evaluation_run.judgment_api_key,
  organization_id=evaluation_run.organization_id,
  original_examples=evaluation_run.examples, # Pass the original examples
+ expected_scorer_count=len(evaluation_run.scorers),
  )

  pretty_str_to_print = None
@@ -1047,7 +909,9 @@ def run_eval(
  log_evaluation_results, send_results, evaluation_run
  )
  except Exception as e:
- error(f"Error logging results after async evaluation: {str(e)}")
+ judgeval_logger.error(
+ f"Error logging results after async evaluation: {str(e)}"
+ )

  return results, pretty_str_to_print

@@ -1062,8 +926,6 @@ def run_eval(
  check_examples(evaluation_run.examples, evaluation_run.scorers)
  if judgment_scorers:
  # Execute evaluation using Judgment API
- info("Starting API evaluation")
- debug(f"Creating API evaluation run with {len(judgment_scorers)} scorers")
  try: # execute an EvaluationRun with just JudgmentScorers
  api_evaluation_run: EvaluationRun = EvaluationRun(
  eval_name=evaluation_run.eval_name,
@@ -1074,13 +936,11 @@ def run_eval(
  judgment_api_key=evaluation_run.judgment_api_key,
  organization_id=evaluation_run.organization_id,
  )
- debug("Sending request to Judgment API")
  response_data: Dict = run_with_spinner(
  "Running Evaluation: ", execute_api_eval, api_evaluation_run
  )
- info(f"Received {len(response_data['results'])} results from API")
  except JudgmentAPIError as e:
- error(
+ judgeval_logger.error(
  f"An error occurred while executing the Judgment API request: {str(e)}"
  )
  raise JudgmentAPIError(
@@ -1092,39 +952,25 @@ def run_eval(
  )

  # Convert the response data to `ScoringResult` objects
- debug("Processing API results")
  api_results = [
  ScoringResult(**result) for result in response_data["results"]
  ]
  # Run local evals
- if local_scorers: # List[JudgevalScorer]
- # We should be removing local scorers soon
- info("Starting local evaluation")
- for example in evaluation_run.examples:
- with example_logging_context(example.created_at, example.example_id):
- debug(f"Processing example {example.example_id}: {example.input}")
-
+ if local_scorers: # List[BaseScorer]
  results: List[ScoringResult] = safe_run_async(
  a_execute_scoring(
  evaluation_run.examples,
  local_scorers,
  model=evaluation_run.model,
- skip_on_missing_params=True,
- show_indicator=True,
- _use_bar_indicator=True,
  throttle_value=0,
  max_concurrent=MAX_CONCURRENT_EVALUATIONS,
  )
  )
  local_results = results
- info(f"Local evaluation complete with {len(local_results)} results")
  # Aggregate the ScorerData from the API and local evaluations
- debug("Merging API and local results")
  merged_results: List[ScoringResult] = merge_results(api_results, local_results)
  merged_results = check_missing_scorer_data(merged_results)

- info(f"Successfully merged {len(merged_results)} results")
-
  # Evaluate rules against local scoring results if rules exist (this cant be done just yet)
  # if evaluation_run.rules and merged_results:
  # run_rules(
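On the local path, `a_execute_scoring` loses its `skip_on_missing_params`, `show_indicator`, and `_use_bar_indicator` keyword arguments; only the model, throttle, and concurrency settings remain. A sketch of the trimmed call, using `asyncio.run` as a stand-in for the package's own `safe_run_async` helper and assuming `MAX_CONCURRENT_EVALUATIONS` is still exported from `judgeval.constants` as it is used above:

    import asyncio
    from typing import List

    from judgeval.constants import MAX_CONCURRENT_EVALUATIONS  # assumed export
    from judgeval.data import Example, ScoringResult
    from judgeval.scorers import BaseScorer
    from judgeval.scorers.score import a_execute_scoring

    def run_local_scorers(
        examples: List[Example], local_scorers: List[BaseScorer], model: str
    ) -> List[ScoringResult]:
        # Hypothetical wrapper around the trimmed 0.0.53 call shown above.
        return asyncio.run(
            a_execute_scoring(
                examples,
                local_scorers,
                model=model,
                throttle_value=0,
                max_concurrent=MAX_CONCURRENT_EVALUATIONS,
            )
        )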
@@ -1146,13 +992,6 @@ def run_eval(
  )
  rprint(pretty_str)

- for i, result in enumerate(merged_results):
- if (
- not result.scorers_data
- ): # none of the scorers could be executed on this example
- info(
- f"None of the scorers could be executed on example {i}. This is usually because the Example is missing the fields needed by the scorers. Try checking that the Example has the necessary fields for your scorers."
- )
  return merged_results


@@ -1205,8 +1044,6 @@ def assert_test(scoring_results: List[ScoringResult]) -> None:
  f"Strict Mode: {fail_scorer.strict_mode}\n"
  f"Evaluation Model: {fail_scorer.evaluation_model}\n"
  f"Error: {fail_scorer.error}\n"
- f"Evaluation Cost: {fail_scorer.evaluation_cost}\n"
- f"Verbose Logs: {fail_scorer.verbose_logs}\n"
  f"Additional Metadata: {fail_scorer.additional_metadata}\n"
  )
  error_msg += "-" * 100
judgeval/scorers/__init__.py
@@ -1,20 +1,12 @@
- from judgeval.scorers.api_scorer import APIJudgmentScorer
- from judgeval.scorers.judgeval_scorer import JudgevalScorer
- from judgeval.scorers.prompt_scorer import PromptScorer
+ from judgeval.scorers.api_scorer import APIScorerConfig
+ from judgeval.scorers.base_scorer import BaseScorer
  from judgeval.scorers.judgeval_scorers.api_scorers import (
  ExecutionOrderScorer,
- JSONCorrectnessScorer,
- SummarizationScorer,
  HallucinationScorer,
  FaithfulnessScorer,
- ContextualRelevancyScorer,
- ContextualPrecisionScorer,
- ContextualRecallScorer,
  AnswerRelevancyScorer,
  AnswerCorrectnessScorer,
- ComparisonScorer,
  InstructionAdherenceScorer,
- GroundednessScorer,
  DerailmentScorer,
  ToolOrderScorer,
  ClassifierScorer,
@@ -25,24 +17,16 @@ from judgeval.scorers.judgeval_scorers.classifiers import (
  )

  __all__ = [
- "APIJudgmentScorer",
- "JudgevalScorer",
- "PromptScorer",
+ "APIScorerConfig",
+ "BaseScorer",
  "ClassifierScorer",
  "ExecutionOrderScorer",
- "JSONCorrectnessScorer",
- "SummarizationScorer",
  "HallucinationScorer",
  "FaithfulnessScorer",
- "ContextualRelevancyScorer",
- "ContextualPrecisionScorer",
- "ContextualRecallScorer",
  "AnswerRelevancyScorer",
  "AnswerCorrectnessScorer",
  "Text2SQLScorer",
- "ComparisonScorer",
  "InstructionAdherenceScorer",
- "GroundednessScorer",
  "DerailmentScorer",
  "ToolOrderScorer",
  "ToolDependencyScorer",