judgeval 0.0.52__py3-none-any.whl → 0.0.54__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- judgeval/common/logger.py +46 -199
- judgeval/common/s3_storage.py +2 -6
- judgeval/common/tracer.py +182 -262
- judgeval/common/utils.py +16 -36
- judgeval/constants.py +14 -20
- judgeval/data/__init__.py +0 -2
- judgeval/data/datasets/dataset.py +6 -10
- judgeval/data/datasets/eval_dataset_client.py +25 -27
- judgeval/data/example.py +5 -138
- judgeval/data/judgment_types.py +214 -0
- judgeval/data/result.py +7 -25
- judgeval/data/scorer_data.py +28 -40
- judgeval/data/scripts/fix_default_factory.py +23 -0
- judgeval/data/scripts/openapi_transform.py +123 -0
- judgeval/data/tool.py +3 -54
- judgeval/data/trace.py +31 -50
- judgeval/data/trace_run.py +3 -3
- judgeval/evaluation_run.py +16 -23
- judgeval/integrations/langgraph.py +11 -12
- judgeval/judges/litellm_judge.py +3 -6
- judgeval/judges/mixture_of_judges.py +8 -25
- judgeval/judges/together_judge.py +3 -6
- judgeval/judgment_client.py +22 -24
- judgeval/rules.py +7 -19
- judgeval/run_evaluation.py +79 -242
- judgeval/scorers/__init__.py +4 -20
- judgeval/scorers/agent_scorer.py +21 -0
- judgeval/scorers/api_scorer.py +28 -38
- judgeval/scorers/base_scorer.py +98 -0
- judgeval/scorers/example_scorer.py +19 -0
- judgeval/scorers/judgeval_scorers/api_scorers/__init__.py +0 -20
- judgeval/scorers/judgeval_scorers/api_scorers/answer_correctness.py +10 -17
- judgeval/scorers/judgeval_scorers/api_scorers/answer_relevancy.py +9 -24
- judgeval/scorers/judgeval_scorers/api_scorers/classifier_scorer.py +16 -68
- judgeval/scorers/judgeval_scorers/api_scorers/derailment_scorer.py +4 -12
- judgeval/scorers/judgeval_scorers/api_scorers/execution_order.py +4 -4
- judgeval/scorers/judgeval_scorers/api_scorers/faithfulness.py +10 -17
- judgeval/scorers/judgeval_scorers/api_scorers/hallucination.py +4 -4
- judgeval/scorers/judgeval_scorers/api_scorers/instruction_adherence.py +4 -4
- judgeval/scorers/judgeval_scorers/api_scorers/tool_dependency.py +4 -4
- judgeval/scorers/judgeval_scorers/api_scorers/tool_order.py +18 -14
- judgeval/scorers/score.py +45 -330
- judgeval/scorers/utils.py +6 -88
- judgeval/utils/file_utils.py +4 -6
- judgeval/version_check.py +3 -2
- {judgeval-0.0.52.dist-info → judgeval-0.0.54.dist-info}/METADATA +6 -5
- judgeval-0.0.54.dist-info/RECORD +65 -0
- judgeval/data/custom_example.py +0 -19
- judgeval/scorers/judgeval_scorer.py +0 -177
- judgeval/scorers/judgeval_scorers/api_scorers/comparison.py +0 -45
- judgeval/scorers/judgeval_scorers/api_scorers/contextual_precision.py +0 -29
- judgeval/scorers/judgeval_scorers/api_scorers/contextual_recall.py +0 -29
- judgeval/scorers/judgeval_scorers/api_scorers/contextual_relevancy.py +0 -32
- judgeval/scorers/judgeval_scorers/api_scorers/groundedness.py +0 -28
- judgeval/scorers/judgeval_scorers/api_scorers/json_correctness.py +0 -38
- judgeval/scorers/judgeval_scorers/api_scorers/summarization.py +0 -27
- judgeval/scorers/prompt_scorer.py +0 -296
- judgeval-0.0.52.dist-info/RECORD +0 -69
- {judgeval-0.0.52.dist-info → judgeval-0.0.54.dist-info}/WHEEL +0 -0
- {judgeval-0.0.52.dist-info → judgeval-0.0.54.dist-info}/licenses/LICENSE.md +0 -0
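Two API changes recur throughout the per-file diffs below: the scorer base classes (judgeval_scorer.py and prompt_scorer.py are deleted; base_scorer.py and a reworked api_scorer.py take their place) and logging (call sites switch from bare error()/info()/debug()/warning() helpers to a single judgeval_logger object). A minimal, hedged sketch of what a downstream caller would change, using only names that appear in the diffs on this page; the final log call and its message are illustrative, not part of the package:

# Old names visible in the removed lines below: APIJudgmentScorer, JudgevalScorer,
# PromptScorer, and bare logging helpers such as error() and debug().
# 0.0.54 equivalents, as imported by run_evaluation.py in this release:
from judgeval.scorers import BaseScorer, APIScorerConfig
from judgeval.common.logger import judgeval_logger

judgeval_logger.error("An error occurred while executing the Judgment API request")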
judgeval/run_evaluation.py
CHANGED
@@ -11,7 +11,7 @@ from typing import List, Dict, Any, Union, Optional, Callable
 from rich import print as rprint
 
 from judgeval.data import ScorerData, ScoringResult, Example, Trace
-from judgeval.scorers import
+from judgeval.scorers import BaseScorer, APIScorerConfig
 from judgeval.scorers.score import a_execute_scoring
 from judgeval.constants import (
     ROOT_API,
@@ -24,7 +24,7 @@ from judgeval.constants import (
     JUDGMENT_EVAL_FETCH_API_URL,
 )
 from judgeval.common.exceptions import JudgmentAPIError
-from judgeval.common.logger import
+from judgeval.common.logger import judgeval_logger
 from judgeval.evaluation_run import EvaluationRun
 from judgeval.data.trace_run import TraceRun
 from judgeval.common.tracer import Tracer
@@ -86,7 +86,7 @@ def execute_api_eval(evaluation_run: EvaluationRun) -> Dict:
 
     try:
         # submit API request to execute evals
-        payload = evaluation_run.model_dump(
+        payload = evaluation_run.model_dump()
         response = requests.post(
             JUDGMENT_EVAL_API_URL,
             headers={
@@ -99,7 +99,7 @@ def execute_api_eval(evaluation_run: EvaluationRun) -> Dict:
         )
         response_data = response.json()
     except Exception as e:
-        error(f"Error: {e}")
+        judgeval_logger.error(f"Error: {e}")
         details = response.json().get("detail", "No details provided")
         raise JudgmentAPIError(
             "An error occurred while executing the Judgment API request: " + details
@@ -108,7 +108,7 @@ def execute_api_eval(evaluation_run: EvaluationRun) -> Dict:
     # Add check for the duplicate eval run name
     if not response.ok:
         error_message = response_data.get("detail", "An unknown error occurred.")
-        error(f"Error: {error_message=}")
+        judgeval_logger.error(f"Error: {error_message=}")
         raise JudgmentAPIError(error_message)
     return response_data
 
@@ -133,7 +133,7 @@ def execute_api_trace_eval(trace_run: TraceRun) -> Dict:
         )
         response_data = response.json()
     except Exception as e:
-        error(f"Error: {e}")
+        judgeval_logger.error(f"Error: {e}")
         details = response.json().get("detail", "No details provided")
         raise JudgmentAPIError(
             "An error occurred while executing the Judgment API request: " + details
@@ -142,7 +142,7 @@ def execute_api_trace_eval(trace_run: TraceRun) -> Dict:
     # Add check for the duplicate eval run name
     if not response.ok:
         error_message = response_data.get("detail", "An unknown error occurred.")
-        error(f"Error: {error_message=}")
+        judgeval_logger.error(f"Error: {error_message=}")
         raise JudgmentAPIError(error_message)
     return response_data
 
@@ -235,7 +235,7 @@ def check_missing_scorer_data(results: List[ScoringResult]) -> List[ScoringResul
     """
     for i, result in enumerate(results):
         if not result.scorers_data:
-            error(
+            judgeval_logger.error(
                 f"Scorer data is missing for example {i}. "
                 "This is usually caused when the example does not contain "
                 "the fields required by the scorer. "
@@ -273,17 +273,17 @@ def check_experiment_type(
         )
 
         if response.status_code == 422:
-            error(f"{response.json()}")
+            judgeval_logger.error(f"{response.json()}")
             raise ValueError(f"{response.json()}")
 
         if not response.ok:
             response_data = response.json()
             error_message = response_data.get("detail", "An unknown error occurred.")
-            error(f"Error checking eval run name: {error_message}")
+            judgeval_logger.error(f"Error checking eval run name: {error_message}")
             raise JudgmentAPIError(error_message)
 
     except exceptions.RequestException as e:
-        error(f"Failed to check if experiment type exists: {str(e)}")
+        judgeval_logger.error(f"Failed to check if experiment type exists: {str(e)}")
         raise JudgmentAPIError(f"Failed to check if experiment type exists: {str(e)}")
 
 
@@ -319,7 +319,7 @@ def check_eval_run_name_exists(
         )
 
         if response.status_code == 409:
-            error(
+            judgeval_logger.error(
                 f"Eval run name '{eval_name}' already exists for this project. Please choose a different name, set the `override` flag to true, or set the `append` flag to true."
             )
             raise ValueError(
@@ -329,11 +329,11 @@ def check_eval_run_name_exists(
         if not response.ok:
             response_data = response.json()
             error_message = response_data.get("detail", "An unknown error occurred.")
-            error(f"Error checking eval run name: {error_message}")
+            judgeval_logger.error(f"Error checking eval run name: {error_message}")
             raise JudgmentAPIError(error_message)
 
     except exceptions.RequestException as e:
-        error(f"Failed to check if eval run name exists: {str(e)}")
+        judgeval_logger.error(f"Failed to check if eval run name exists: {str(e)}")
         raise JudgmentAPIError(f"Failed to check if eval run name exists: {str(e)}")
 
 
@@ -366,7 +366,7 @@ def log_evaluation_results(
         if not res.ok:
             response_data = res.json()
             error_message = response_data.get("detail", "An unknown error occurred.")
-            error(f"Error {res.status_code}: {error_message}")
+            judgeval_logger.error(f"Error {res.status_code}: {error_message}")
             raise JudgmentAPIError(error_message)
 
         if "ui_results_url" in res.json():
@@ -377,12 +377,14 @@ def log_evaluation_results(
         return None
 
     except exceptions.RequestException as e:
-        error(
+        judgeval_logger.error(
+            f"Request failed while saving evaluation results to DB: {str(e)}"
+        )
         raise JudgmentAPIError(
             f"Request failed while saving evaluation results to DB: {str(e)}"
         )
     except Exception as e:
-        error(f"Failed to save evaluation results to DB: {str(e)}")
+        judgeval_logger.error(f"Failed to save evaluation results to DB: {str(e)}")
         raise ValueError(f"Failed to save evaluation results to DB: {str(e)}")
 
 
@@ -407,7 +409,7 @@ def run_with_spinner(message: str, func, *args, **kwargs) -> Any:
         else:
             result = func(*args, **kwargs)
     except Exception as e:
-        error(f"An error occurred: {str(e)}")
+        judgeval_logger.error(f"An error occurred: {str(e)}")
         stop_spinner_event.set()
         spinner_thread.join()
         raise e
@@ -422,7 +424,7 @@ def run_with_spinner(message: str, func, *args, **kwargs) -> Any:
 
 
 def check_examples(
-    examples: List[Example], scorers: List[Union[
+    examples: List[Example], scorers: List[Union[APIScorerConfig, BaseScorer]]
 ) -> None:
     """
     Checks if the example contains the necessary parameters for the scorer.
@@ -513,18 +515,14 @@ def run_trace_eval(
         actual_tracer.traces = []
 
     # Execute evaluation using Judgment API
-    info("Starting API evaluation")
     try:  # execute an EvaluationRun with just JudgmentScorers
-        debug("Sending request to Judgment API")
         response_data: Dict = run_with_spinner(
             "Running Trace Evaluation: ", execute_api_trace_eval, trace_run
         )
         scoring_results = [
             ScoringResult(**result) for result in response_data["results"]
         ]
-        info(f"Received {len(scoring_results)} results from API")
     except JudgmentAPIError as e:
-        error(f"An error occurred while executing the Judgment API request: {str(e)}")
         raise JudgmentAPIError(
             f"An error occurred while executing the Judgment API request: {str(e)}"
         )
@@ -534,7 +532,6 @@ def run_trace_eval(
     )
 
     # Convert the response data to `ScoringResult` objects
-    debug("Processing API results")
     # TODO: allow for custom scorer on traces
 
     pretty_str = run_with_spinner(
@@ -583,12 +580,12 @@ async def get_evaluation_status(
 
         if not response.ok:
             error_message = response.json().get("detail", "An unknown error occurred.")
-            error(f"Error checking evaluation status: {error_message}")
+            judgeval_logger.error(f"Error checking evaluation status: {error_message}")
             raise JudgmentAPIError(error_message)
 
         return response.json()
     except exceptions.RequestException as e:
-        error(f"Failed to check evaluation status: {str(e)}")
+        judgeval_logger.error(f"Failed to check evaluation status: {str(e)}")
        raise JudgmentAPIError(f"Failed to check evaluation status: {str(e)}")
 
 
@@ -597,8 +594,9 @@ async def _poll_evaluation_until_complete(
     project_name: str,
     judgment_api_key: str,
     organization_id: str,
+    expected_scorer_count: int,
+    original_examples: List[Example],
     poll_interval_seconds: int = 5,
-    original_examples: Optional[List[Example]] = None,
 ) -> List[ScoringResult]:
     """
     Polls until the evaluation is complete and returns the results.
@@ -616,23 +614,10 @@ async def _poll_evaluation_until_complete(
         List[ScoringResult]: The evaluation results
     """
     poll_count = 0
-
-    original_example_map = {}
-    if original_examples:
-        for example in original_examples:
-            original_example_map[example.example_id] = example
-
-    # Remove the expected scorer names extraction and checking
-    # We'll instead verify all examples have consistent scorer data
+
     while True:
         poll_count += 1
         try:
-            # Log polling attempt
-            if poll_count % 4 == 0:  # Log every 4th poll to avoid excess logging
-                info(
-                    f"Polling for evaluation '{eval_name}' in project '{project_name}' (attempt {poll_count})"
-                )
-
             # Check status
             response = await asyncio.to_thread(
                 requests.get,
@@ -650,7 +635,9 @@ async def _poll_evaluation_until_complete(
                 error_message = response.json().get(
                     "detail", "An unknown error occurred."
                 )
-                error(
+                judgeval_logger.error(
+                    f"Error checking evaluation status: {error_message}"
+                )
                 # Don't raise exception immediately, just log and continue polling
                 await asyncio.sleep(poll_interval_seconds)
                 continue
@@ -660,9 +647,6 @@ async def _poll_evaluation_until_complete(
 
             # If complete, get results and return
            if status == "completed" or status == "complete":
-                info(
-                    f"Evaluation '{eval_name}' reported as completed, fetching and verifying results..."
-                )
                 results_response = await asyncio.to_thread(
                     requests.post,
                     JUDGMENT_EVAL_FETCH_API_URL,
@@ -679,143 +663,55 @@ async def _poll_evaluation_until_complete(
                     error_message = results_response.json().get(
                         "detail", "An unknown error occurred."
                     )
-                    error(
+                    judgeval_logger.error(
+                        f"Error fetching evaluation results: {error_message}"
+                    )
                     raise JudgmentAPIError(error_message)
 
                 result_data = results_response.json()
 
-                if "examples"
-
+                if result_data.get("examples") is None:
+                    continue
 
-
-
-
+                examples_data = result_data.get("examples", [])
+                scoring_results = []
+
+                for example_data in examples_data:
+                    # Create ScorerData objects
+                    scorer_data_list = []
+                    for raw_scorer_data in example_data.get("scorer_data", []):
+                        scorer_data_list.append(ScorerData(**raw_scorer_data))
+
+                    if len(scorer_data_list) != expected_scorer_count:
+                        # This means that not all scorers were loading for a specific example
+                        continue
 
-
-
-
-
-                for
-                    example_id = example_data.get("example_id")
-
-                    if example_id not in original_example_map:
-                        warning(
-                            f"Server returned example with ID {example_id} not found in original examples. "
-                            + "This indicates stale or incorrect data. Continuing to poll..."
-                        )
-                        has_invalid_results = True
-                        break
-
-                # If any invalid examples found, continue polling
-                if has_invalid_results:
-                    info("Detected stale data. Waiting before polling again...")
-                    await asyncio.sleep(poll_interval_seconds)
-                    continue
-
-                # Check if we received the expected number of results
-                if original_examples and len(original_examples) != len(
-                    examples_data
-                ):
-                    warning(
-                        f"Expected {len(original_examples)} results but got {len(examples_data)} results. "
-                        + "This indicates incomplete data. Continuing to poll..."
-                    )
-                    await asyncio.sleep(poll_interval_seconds)
-                    continue
-
-                # Collect all example IDs from scorer data
-                scorer_example_ids = set()
-                for example_data in examples_data:
-                    scorer_data_list = example_data.get("scorer_data", [])
-                    for scorer_data in scorer_data_list:
-                        if "example_id" in scorer_data:
-                            scorer_example_ids.add(scorer_data["example_id"])
-
-                # Get the set of original example IDs
-                original_example_ids = set(original_example_map.keys())
-
-                # Check if the sets are equal
-                missing_in_scorer = original_example_ids - scorer_example_ids
-                extra_in_scorer = scorer_example_ids - original_example_ids
-
-                if missing_in_scorer or extra_in_scorer:
-                    if missing_in_scorer:
-                        warning(
-                            f"Examples missing in scorer data: {missing_in_scorer}"
-                        )
-                    if extra_in_scorer:
-                        warning(
-                            f"Extra examples in scorer data: {extra_in_scorer}"
-                        )
-                    info(
-                        "Detected mismatched example IDs in scorer data. Waiting before polling again..."
-                    )
-                    await asyncio.sleep(poll_interval_seconds)
-                    continue
-
-                # Create ScoringResult objects from the raw data
-                scoring_results = []
-
-                for example_data in examples_data:
-                    # Extract example_id from the server response
-                    example_id = example_data.get("example_id")
-
-                    # Create ScorerData objects
-                    scorer_data_list = []
-                    for raw_scorer_data in example_data.get("scorer_data", []):
-                        scorer_data_list.append(ScorerData(**raw_scorer_data))
-
-                    # Use the original Example object if we have it and the ID matches
-                    if original_example_map:
-                        example = original_example_map[example_id]
-                        debug(f"Matched result with original example {example_id}")
-                    else:
-                        # Create Example from example data (excluding scorer_data) if no original examples provided
-                        example_dict = {
-                            k: v
-                            for k, v in example_data.items()
-                            if k != "scorer_data"
-                        }
-                        example = Example(**example_dict)
-
-                    # Calculate success based on whether all scorer_data entries were successful
-                    success = (
-                        all(scorer_data.success for scorer_data in scorer_data_list)
-                        if scorer_data_list
-                        else False
-                    )
-
-                    # Create ScoringResult
-                    scoring_result = ScoringResult(
-                        success=success,  # Set based on all scorer data success values
-                        scorers_data=scorer_data_list,
-                        data_object=example,
-                    )
-                    scoring_results.append(scoring_result)
-
-                # If we got here, all validation checks passed
-                info(
-                    f"Verified complete results for all {len(scoring_results)} examples with all expected scorer data"
+                    example = Example(**example_data)
+
+                    # Calculate success based on whether all scorer_data entries were successful
+                    success = all(
+                        scorer_data.success for scorer_data in scorer_data_list
                     )
-
-
-
-
-                        f"No example results found for completed evaluation '{eval_name}'"
+                    scoring_result = ScoringResult(
+                        success=success,  # Set based on all scorer data success values
+                        scorers_data=scorer_data_list,
+                        data_object=example,
                     )
-
+                    scoring_results.append(scoring_result)
+
+                if len(scoring_results) != len(original_examples):
+                    # This means that not all examples were evaluated
+                    continue
 
+                return scoring_results
             elif status == "failed":
                 # Evaluation failed
                 error_message = status_data.get("error", "Unknown error")
-                error(
+                judgeval_logger.error(
+                    f"Evaluation '{eval_name}' failed: {error_message}"
+                )
                 raise JudgmentAPIError(f"Evaluation failed: {error_message}")
 
-            elif status == "pending" or status == "running":
-                # Only log occasionally for pending/running to avoid flooding logs
-                if poll_count % 4 == 0:
-                    info(f"Evaluation '{eval_name}' status: {status}")
-
             # Wait before checking again
             await asyncio.sleep(poll_interval_seconds)
 
@@ -824,7 +720,7 @@ async def _poll_evaluation_until_complete(
                 raise
 
             # For other exceptions, log and continue polling
-            error(f"Error checking evaluation status: {str(e)}")
+            judgeval_logger.error(f"Error checking evaluation status: {str(e)}")
            if poll_count > 20:  # Only raise exception after many failed attempts
                 raise JudgmentAPIError(
                     f"Error checking evaluation status after {poll_count} attempts: {str(e)}"
@@ -944,61 +840,26 @@ def run_eval(
     )
 
     # Set example IDs if not already set
-    debug("Initializing examples with IDs and timestamps")
     for idx, example in enumerate(evaluation_run.examples):
         example.example_index = idx  # Set numeric index
-
-
-
-        )
-        debug(f"Input: {example.input}")
-        debug(f"Actual output: {example.actual_output}")
-        if example.expected_output:
-            debug(f"Expected output: {example.expected_output}")
-        if example.context:
-            debug(f"Context: {example.context}")
-        if example.retrieval_context:
-            debug(f"Retrieval context: {example.retrieval_context}")
-        if example.additional_metadata:
-            debug(f"Additional metadata: {example.additional_metadata}")
-        if example.tools_called:
-            debug(f"Tools called: {example.tools_called}")
-        if example.expected_tools:
-            debug(f"Expected tools: {example.expected_tools}")
-
-    debug(f"Starting evaluation run with {len(evaluation_run.examples)} examples")
-
-    # Group APIJudgmentScorers and JudgevalScorers, then evaluate them in parallel
-    debug("Grouping scorers by type")
-    judgment_scorers: List[APIJudgmentScorer] = []
-    local_scorers: List[JudgevalScorer] = []
+
+    judgment_scorers: List[APIScorerConfig] = []
+    local_scorers: List[BaseScorer] = []
     for scorer in evaluation_run.scorers:
-        if isinstance(scorer,
+        if isinstance(scorer, APIScorerConfig):
             judgment_scorers.append(scorer)
-            debug(f"Added judgment scorer: {type(scorer).__name__}")
         else:
             local_scorers.append(scorer)
-            debug(f"Added local scorer: {type(scorer).__name__}")
-
-    custom_example_check = [scorer.custom_example for scorer in local_scorers]
-    if any(custom_example_check) and not all(custom_example_check):
-        error("All scorers must be custom scorers if using custom examples")
-        raise ValueError("All scorers must be custom scorers if using custom examples")
-
-    debug(
-        f"Found {len(judgment_scorers)} judgment scorers and {len(local_scorers)} local scorers"
-    )
 
     api_results: List[ScoringResult] = []
     local_results: List[ScoringResult] = []
 
     if async_execution:
         if len(local_scorers) > 0:
-            error("Local scorers are not supported in async execution")
+            judgeval_logger.error("Local scorers are not supported in async execution")
             raise ValueError("Local scorers are not supported in async execution")
 
         check_examples(evaluation_run.examples, evaluation_run.scorers)
-        info("Starting async evaluation")
 
         async def _async_evaluation_workflow():
            # Create a payload
@@ -1021,11 +882,11 @@ def run_eval(
                 error_message = response.json().get(
                     "detail", "An unknown error occurred."
                 )
-                error(
+                judgeval_logger.error(
+                    f"Error adding evaluation to queue: {error_message}"
+                )
                 raise JudgmentAPIError(error_message)
 
-            info(f"Successfully added evaluation '{evaluation_run.eval_name}' to queue")
-
             # Poll until the evaluation is complete
             results = await _poll_evaluation_until_complete(
                 eval_name=evaluation_run.eval_name,
@@ -1033,6 +894,7 @@ def run_eval(
                 judgment_api_key=evaluation_run.judgment_api_key,
                 organization_id=evaluation_run.organization_id,
                 original_examples=evaluation_run.examples,  # Pass the original examples
+                expected_scorer_count=len(evaluation_run.scorers),
            )
 
            pretty_str_to_print = None
@@ -1047,7 +909,9 @@ def run_eval(
                         log_evaluation_results, send_results, evaluation_run
                     )
                 except Exception as e:
-                    error(
+                    judgeval_logger.error(
+                        f"Error logging results after async evaluation: {str(e)}"
+                    )
 
             return results, pretty_str_to_print
 
@@ -1062,8 +926,6 @@ def run_eval(
     check_examples(evaluation_run.examples, evaluation_run.scorers)
     if judgment_scorers:
         # Execute evaluation using Judgment API
-        info("Starting API evaluation")
-        debug(f"Creating API evaluation run with {len(judgment_scorers)} scorers")
        try:  # execute an EvaluationRun with just JudgmentScorers
            api_evaluation_run: EvaluationRun = EvaluationRun(
                eval_name=evaluation_run.eval_name,
@@ -1074,13 +936,11 @@ def run_eval(
                judgment_api_key=evaluation_run.judgment_api_key,
                organization_id=evaluation_run.organization_id,
            )
-            debug("Sending request to Judgment API")
            response_data: Dict = run_with_spinner(
                "Running Evaluation: ", execute_api_eval, api_evaluation_run
            )
-            info(f"Received {len(response_data['results'])} results from API")
        except JudgmentAPIError as e:
-            error(
+            judgeval_logger.error(
                f"An error occurred while executing the Judgment API request: {str(e)}"
            )
            raise JudgmentAPIError(
@@ -1092,39 +952,25 @@ def run_eval(
            )
 
        # Convert the response data to `ScoringResult` objects
-        debug("Processing API results")
        api_results = [
            ScoringResult(**result) for result in response_data["results"]
        ]
    # Run local evals
-    if local_scorers:  # List[
-        # We should be removing local scorers soon
-        info("Starting local evaluation")
-        for example in evaluation_run.examples:
-            with example_logging_context(example.created_at, example.example_id):
-                debug(f"Processing example {example.example_id}: {example.input}")
-
+    if local_scorers:  # List[BaseScorer]
        results: List[ScoringResult] = safe_run_async(
            a_execute_scoring(
                evaluation_run.examples,
                local_scorers,
                model=evaluation_run.model,
-                skip_on_missing_params=True,
-                show_indicator=True,
-                _use_bar_indicator=True,
                throttle_value=0,
                max_concurrent=MAX_CONCURRENT_EVALUATIONS,
            )
        )
        local_results = results
-        info(f"Local evaluation complete with {len(local_results)} results")
    # Aggregate the ScorerData from the API and local evaluations
-    debug("Merging API and local results")
    merged_results: List[ScoringResult] = merge_results(api_results, local_results)
    merged_results = check_missing_scorer_data(merged_results)
 
-    info(f"Successfully merged {len(merged_results)} results")
-
    # Evaluate rules against local scoring results if rules exist (this cant be done just yet)
    # if evaluation_run.rules and merged_results:
    #     run_rules(
@@ -1146,13 +992,6 @@ def run_eval(
    )
    rprint(pretty_str)
 
-    for i, result in enumerate(merged_results):
-        if (
-            not result.scorers_data
-        ):  # none of the scorers could be executed on this example
-            info(
-                f"None of the scorers could be executed on example {i}. This is usually because the Example is missing the fields needed by the scorers. Try checking that the Example has the necessary fields for your scorers."
-            )
    return merged_results
 
 
@@ -1205,8 +1044,6 @@ def assert_test(scoring_results: List[ScoringResult]) -> None:
                f"Strict Mode: {fail_scorer.strict_mode}\n"
                f"Evaluation Model: {fail_scorer.evaluation_model}\n"
                f"Error: {fail_scorer.error}\n"
-                f"Evaluation Cost: {fail_scorer.evaluation_cost}\n"
-                f"Verbose Logs: {fail_scorer.verbose_logs}\n"
                f"Additional Metadata: {fail_scorer.additional_metadata}\n"
            )
            error_msg += "-" * 100
judgeval/scorers/__init__.py
CHANGED
@@ -1,20 +1,12 @@
-from judgeval.scorers.api_scorer import
-from judgeval.scorers.
-from judgeval.scorers.prompt_scorer import PromptScorer
+from judgeval.scorers.api_scorer import APIScorerConfig
+from judgeval.scorers.base_scorer import BaseScorer
 from judgeval.scorers.judgeval_scorers.api_scorers import (
     ExecutionOrderScorer,
-    JSONCorrectnessScorer,
-    SummarizationScorer,
     HallucinationScorer,
     FaithfulnessScorer,
-    ContextualRelevancyScorer,
-    ContextualPrecisionScorer,
-    ContextualRecallScorer,
     AnswerRelevancyScorer,
     AnswerCorrectnessScorer,
-    ComparisonScorer,
     InstructionAdherenceScorer,
-    GroundednessScorer,
     DerailmentScorer,
     ToolOrderScorer,
     ClassifierScorer,
@@ -25,24 +17,16 @@ from judgeval.scorers.judgeval_scorers.classifiers import (
 )
 
 __all__ = [
-    "
-    "
-    "PromptScorer",
+    "APIScorerConfig",
+    "BaseScorer",
     "ClassifierScorer",
     "ExecutionOrderScorer",
-    "JSONCorrectnessScorer",
-    "SummarizationScorer",
     "HallucinationScorer",
     "FaithfulnessScorer",
-    "ContextualRelevancyScorer",
-    "ContextualPrecisionScorer",
-    "ContextualRecallScorer",
     "AnswerRelevancyScorer",
     "AnswerCorrectnessScorer",
     "Text2SQLScorer",
-    "ComparisonScorer",
     "InstructionAdherenceScorer",
-    "GroundednessScorer",
     "DerailmentScorer",
     "ToolOrderScorer",
     "ToolDependencyScorer",