judgeval 0.0.38__py3-none-any.whl → 0.0.40__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- judgeval/clients.py +6 -4
- judgeval/common/tracer.py +361 -236
- judgeval/constants.py +3 -0
- judgeval/data/__init__.py +2 -1
- judgeval/data/example.py +14 -13
- judgeval/data/tool.py +47 -0
- judgeval/data/trace.py +28 -39
- judgeval/data/trace_run.py +2 -1
- judgeval/evaluation_run.py +4 -7
- judgeval/judgment_client.py +27 -6
- judgeval/run_evaluation.py +395 -37
- judgeval/scorers/__init__.py +4 -1
- judgeval/scorers/judgeval_scorer.py +8 -0
- judgeval/scorers/judgeval_scorers/api_scorers/__init__.py +4 -0
- judgeval/scorers/judgeval_scorers/api_scorers/classifier_scorer.py +124 -0
- judgeval/scorers/judgeval_scorers/api_scorers/tool_dependency.py +20 -0
- judgeval/scorers/judgeval_scorers/classifiers/text2sql/text2sql_scorer.py +1 -1
- judgeval/scorers/prompt_scorer.py +5 -164
- judgeval/scorers/score.py +15 -15
- judgeval-0.0.40.dist-info/METADATA +1441 -0
- {judgeval-0.0.38.dist-info → judgeval-0.0.40.dist-info}/RECORD +23 -20
- judgeval-0.0.38.dist-info/METADATA +0 -247
- {judgeval-0.0.38.dist-info → judgeval-0.0.40.dist-info}/WHEEL +0 -0
- {judgeval-0.0.38.dist-info → judgeval-0.0.40.dist-info}/licenses/LICENSE.md +0 -0
judgeval/run_evaluation.py
CHANGED
@@ -28,12 +28,15 @@ from judgeval.constants import (
     JUDGMENT_EVAL_LOG_API_URL,
     MAX_CONCURRENT_EVALUATIONS,
     JUDGMENT_ADD_TO_RUN_EVAL_QUEUE_API_URL,
+    JUDGMENT_GET_EVAL_STATUS_API_URL,
+    JUDGMENT_EVAL_FETCH_API_URL
 )
 from judgeval.common.exceptions import JudgmentAPIError
 from judgeval.common.logger import (
     debug,
     info,
-    error,
+    error,
+    warning,
     example_logging_context
 )
 from judgeval.evaluation_run import EvaluationRun
@@ -201,9 +204,9 @@ def check_missing_scorer_data(results: List[ScoringResult]) -> List[ScoringResul
         )
     return results

-def check_experiment_type(eval_name: str, project_name: str, judgment_api_key: str, organization_id: str,
+def check_experiment_type(eval_name: str, project_name: str, judgment_api_key: str, organization_id: str, is_trace: bool) -> None:
     """
-    Checks if the current experiment, if one exists, has the same type (examples of
+    Checks if the current experiment, if one exists, has the same type (examples of traces)
     """
     try:
         response = requests.post(
@@ -217,7 +220,7 @@ def check_experiment_type(eval_name: str, project_name: str, judgment_api_key: s
                 "eval_name": eval_name,
                 "project_name": project_name,
                 "judgment_api_key": judgment_api_key,
-                "
+                "is_trace": is_trace
             },
             verify=True
         )
@@ -379,7 +382,7 @@ def run_trace_eval(trace_run: TraceRun, override: bool = False, ignore_errors: b
     )

     if trace_run.append:
-        # Check that the current experiment, if one exists, has the same type (examples
+        # Check that the current experiment, if one exists, has the same type (examples or traces)
         check_experiment_type(
             trace_run.eval_name,
             trace_run.project_name,
@@ -387,13 +390,18 @@ def run_trace_eval(trace_run: TraceRun, override: bool = False, ignore_errors: b
             trace_run.organization_id,
             True
         )
-
     if function and tracer:
         new_traces: List[Trace] = []
         tracer.offline_mode = True
+        tracer.traces = []
         for example in examples:
             if example.input:
-
+                if isinstance(example.input, str):
+                    result = run_with_spinner("Running agent function: ", function, example.input)
+                elif isinstance(example.input, dict):
+                    result = run_with_spinner("Running agent function: ", function, **example.input)
+                else:
+                    raise ValueError(f"Input must be string or dict, got {type(example.input)}")
             else:
                 result = run_with_spinner("Running agent function: ", function)
         for i, trace in enumerate(tracer.traces):
@@ -402,6 +410,7 @@ def run_trace_eval(trace_run: TraceRun, override: bool = False, ignore_errors: b
             trace.entries[0].expected_tools = examples[i].expected_tools
             new_traces.append(trace)
         trace_run.traces = new_traces
+        tracer.traces = []

     # Execute evaluation using Judgment API
     info("Starting API evaluation")
@@ -420,14 +429,327 @@ def run_trace_eval(trace_run: TraceRun, override: bool = False, ignore_errors: b
     debug("Processing API results")
     # TODO: allow for custom scorer on traces
     if trace_run.log_results:
-        pretty_str = run_with_spinner("Logging Results: ", log_evaluation_results, response_data["
+        pretty_str = run_with_spinner("Logging Results: ", log_evaluation_results, response_data["agent_results"], trace_run)
         rprint(pretty_str)

     return scoring_results



-def
+async def get_evaluation_status(eval_name: str, project_name: str, judgment_api_key: str, organization_id: str) -> Dict:
+    """
+    Gets the status of an async evaluation run.
+
+    Args:
+        eval_name (str): Name of the evaluation run
+        project_name (str): Name of the project
+        judgment_api_key (str): API key for authentication
+        organization_id (str): Organization ID for the evaluation
+
+    Returns:
+        Dict: Status information including:
+            - status: 'pending', 'running', 'completed', or 'failed'
+            - results: List of ScoringResult objects if completed
+            - error: Error message if failed
+    """
+    try:
+        response = requests.get(
+            JUDGMENT_GET_EVAL_STATUS_API_URL,
+            headers={
+                "Content-Type": "application/json",
+                "Authorization": f"Bearer {judgment_api_key}",
+                "X-Organization-Id": organization_id
+            },
+            params={
+                "eval_name": eval_name,
+                "project_name": project_name,
+            },
+            verify=True
+        )
+
+        if not response.ok:
+            error_message = response.json().get('detail', 'An unknown error occurred.')
+            error(f"Error checking evaluation status: {error_message}")
+            raise JudgmentAPIError(error_message)
+
+        return response.json()
+    except requests.exceptions.RequestException as e:
+        error(f"Failed to check evaluation status: {str(e)}")
+        raise JudgmentAPIError(f"Failed to check evaluation status: {str(e)}")
+
+async def _poll_evaluation_until_complete(eval_name: str, project_name: str, judgment_api_key: str, organization_id: str, poll_interval_seconds: int = 5, original_examples: Optional[List[Example]] = None) -> List[ScoringResult]:
+    """
+    Polls until the evaluation is complete and returns the results.
+
+    Args:
+        eval_name (str): Name of the evaluation run
+        project_name (str): Name of the project
+        judgment_api_key (str): API key for authentication
+        organization_id (str): Organization ID for the evaluation
+        poll_interval_seconds (int, optional): Time between status checks in seconds. Defaults to 5.
+        original_examples (List[Example], optional): The original examples sent for evaluation.
+            If provided, will match results with original examples.
+
+    Returns:
+        List[ScoringResult]: The evaluation results
+    """
+    poll_count = 0
+    # Create example_id to Example mapping if original examples are provided
+    original_example_map = {}
+    if original_examples:
+        for example in original_examples:
+            original_example_map[example.example_id] = example
+
+    # Remove the expected scorer names extraction and checking
+    # We'll instead verify all examples have consistent scorer data
+    while True:
+        poll_count += 1
+        try:
+            # Log polling attempt
+            if poll_count % 4 == 0:  # Log every 4th poll to avoid excess logging
+                info(f"Polling for evaluation '{eval_name}' in project '{project_name}' (attempt {poll_count})")
+
+            # Check status
+            response = await asyncio.to_thread(
+                requests.get,
+                JUDGMENT_GET_EVAL_STATUS_API_URL,
+                headers={
+                    "Content-Type": "application/json",
+                    "Authorization": f"Bearer {judgment_api_key}",
+                    "X-Organization-Id": organization_id
+                },
+                params={
+                    "eval_name": eval_name,
+                    "project_name": project_name
+                },
+                verify=True
+            )
+
+            if not response.ok:
+                error_message = response.json().get('detail', 'An unknown error occurred.')
+                error(f"Error checking evaluation status: {error_message}")
+                # Don't raise exception immediately, just log and continue polling
+                await asyncio.sleep(poll_interval_seconds)
+                continue
+
+            status_data = response.json()
+            status = status_data.get("status")
+
+            # If complete, get results and return
+            if status == "completed" or status == "complete":
+                info(f"Evaluation '{eval_name}' reported as completed, fetching and verifying results...")
+                results_response = await asyncio.to_thread(
+                    requests.post,
+                    JUDGMENT_EVAL_FETCH_API_URL,
+                    headers={
+                        "Content-Type": "application/json",
+                        "Authorization": f"Bearer {judgment_api_key}",
+                        "X-Organization-Id": organization_id
+                    },
+                    json={
+                        "project_name": project_name,
+                        "eval_name": eval_name
+                    },
+                    verify=True
+                )
+
+                if not results_response.ok:
+                    error_message = results_response.json().get('detail', 'An unknown error occurred.')
+                    error(f"Error fetching evaluation results: {error_message}")
+                    raise JudgmentAPIError(error_message)
+
+                result_data = results_response.json()
+
+                if "examples" in result_data:
+                    examples_data = result_data.get("examples", [])
+
+                    info(f"Successfully fetched {len(examples_data)} results for evaluation '{eval_name}'")
+
+                    # Check for result validity if original examples are provided
+                    if original_example_map:
+                        # Verify all returned examples have matching original examples
+                        has_invalid_results = False
+                        for example_data in examples_data:
+                            example_id = example_data.get("example_id")
+
+                            if example_id not in original_example_map:
+                                warning(f"Server returned example with ID {example_id} not found in original examples. " +
+                                        f"This indicates stale or incorrect data. Continuing to poll...")
+                                has_invalid_results = True
+                                break
+
+                        # If any invalid examples found, continue polling
+                        if has_invalid_results:
+                            info("Detected stale data. Waiting before polling again...")
+                            await asyncio.sleep(poll_interval_seconds)
+                            continue
+
+                        # Check if we received the expected number of results
+                        if len(original_examples) != len(examples_data):
+                            warning(f"Expected {len(original_examples)} results but got {len(examples_data)} results. " +
+                                    f"This indicates incomplete data. Continuing to poll...")
+                            await asyncio.sleep(poll_interval_seconds)
+                            continue
+
+                        # Collect all example IDs from scorer data
+                        scorer_example_ids = set()
+                        for example_data in examples_data:
+                            scorer_data_list = example_data.get("scorer_data", [])
+                            for scorer_data in scorer_data_list:
+                                if "example_id" in scorer_data:
+                                    scorer_example_ids.add(scorer_data["example_id"])
+
+                        # Get the set of original example IDs
+                        original_example_ids = set(original_example_map.keys())
+
+                        # Check if the sets are equal
+                        missing_in_scorer = original_example_ids - scorer_example_ids
+                        extra_in_scorer = scorer_example_ids - original_example_ids
+
+                        if missing_in_scorer or extra_in_scorer:
+                            if missing_in_scorer:
+                                warning(f"Examples missing in scorer data: {missing_in_scorer}")
+                            if extra_in_scorer:
+                                warning(f"Extra examples in scorer data: {extra_in_scorer}")
+                            info("Detected mismatched example IDs in scorer data. Waiting before polling again...")
+                            await asyncio.sleep(poll_interval_seconds)
+                            continue
+
+                    # Create ScoringResult objects from the raw data
+                    scoring_results = []
+
+                    for example_data in examples_data:
+                        # Extract example_id from the server response
+                        example_id = example_data.get("example_id")
+
+                        # Create ScorerData objects
+                        scorer_data_list = []
+                        for raw_scorer_data in example_data.get("scorer_data", []):
+                            scorer_data_list.append(ScorerData(**raw_scorer_data))
+
+                        # Use the original Example object if we have it and the ID matches
+                        if original_example_map:
+                            example = original_example_map[example_id]
+                            debug(f"Matched result with original example {example_id}")
+                        else:
+                            # Create Example from example data (excluding scorer_data) if no original examples provided
+                            example_dict = {k: v for k, v in example_data.items() if k != "scorer_data"}
+                            example = Example(**example_dict)
+
+                        # Calculate success based on whether all scorer_data entries were successful
+                        success = all(scorer_data.success for scorer_data in scorer_data_list) if scorer_data_list else False
+
+                        # Create ScoringResult
+                        scoring_result = ScoringResult(
+                            success=success,  # Set based on all scorer data success values
+                            scorers_data=scorer_data_list,
+                            data_object=example
+                        )
+                        scoring_results.append(scoring_result)
+
+                    # If we got here, all validation checks passed
+                    info(f"Verified complete results for all {len(scoring_results)} examples with all expected scorer data")
+                    return scoring_results
+                else:
+                    # No examples found
+                    info(f"No example results found for completed evaluation '{eval_name}'")
+                    return []
+
+            elif status == "failed":
+                # Evaluation failed
+                error_message = status_data.get("error", "Unknown error")
+                error(f"Evaluation '{eval_name}' failed: {error_message}")
+                raise JudgmentAPIError(f"Evaluation failed: {error_message}")
+
+            elif status == "pending" or status == "running":
+                # Only log occasionally for pending/running to avoid flooding logs
+                if poll_count % 4 == 0:
+                    info(f"Evaluation '{eval_name}' status: {status}")
+
+            # Wait before checking again
+            await asyncio.sleep(poll_interval_seconds)
+
+        except Exception as e:
+            if isinstance(e, JudgmentAPIError):
+                raise
+
+            # For other exceptions, log and continue polling
+            error(f"Error checking evaluation status: {str(e)}")
+            if poll_count > 20:  # Only raise exception after many failed attempts
+                raise JudgmentAPIError(f"Error checking evaluation status after {poll_count} attempts: {str(e)}")
+
+            # Continue polling after a delay
+            await asyncio.sleep(poll_interval_seconds)
+
+async def await_with_spinner(task, message: str = "Awaiting async task: "):
+    """
+    Display a spinner while awaiting an async task.
+
+    Args:
+        task: The asyncio task to await
+        message (str): Message to display with the spinner
+
+    Returns:
+        Any: The result of the awaited task
+    """
+    spinner = itertools.cycle(['|', '/', '-', '\\'])
+
+    # Create an event to signal when to stop the spinner
+    stop_spinner_event = asyncio.Event()
+
+    async def display_spinner():
+        while not stop_spinner_event.is_set():
+            sys.stdout.write(f'\r{message}{next(spinner)}')
+            sys.stdout.flush()
+            await asyncio.sleep(0.1)
+
+    # Start the spinner in a separate task
+    spinner_task = asyncio.create_task(display_spinner())
+
+    try:
+        # Await the actual task
+        result = await task
+    finally:
+        # Signal the spinner to stop and wait for it to finish
+        stop_spinner_event.set()
+        await spinner_task
+
+        # Clear the spinner line
+        sys.stdout.write('\r' + ' ' * (len(message) + 1) + '\r')
+        sys.stdout.flush()
+
+    return result
+
+class SpinnerWrappedTask:
+    """
+    A wrapper for an asyncio task that displays a spinner when awaited.
+    """
+    def __init__(self, task, message: str):
+        self.task = task
+        self.message = message
+
+    def __await__(self):
+        async def _spin_and_await():
+            # self.task resolves to (scoring_results, pretty_str_to_print)
+            task_result_tuple = await await_with_spinner(self.task, self.message)
+
+            # Unpack the tuple
+            scoring_results, pretty_str_to_print = task_result_tuple
+
+            # Print the pretty string if it exists, after spinner is cleared
+            if pretty_str_to_print:
+                rprint(pretty_str_to_print)
+
+            # Return only the scoring_results to the original awaiter
+            return scoring_results
+        return _spin_and_await().__await__()
+
+    # Proxy all Task attributes and methods to the underlying task
+    def __getattr__(self, name):
+        return getattr(self.task, name)
+
+def run_eval(evaluation_run: EvaluationRun, override: bool = False, ignore_errors: bool = True, async_execution: bool = False) -> Union[List[ScoringResult], asyncio.Task]:
     """
     Executes an evaluation of `Example`s using one or more `Scorer`s

@@ -435,21 +757,12 @@ def run_eval(evaluation_run: EvaluationRun, override: bool = False, ignore_error
         evaluation_run (EvaluationRun): Stores example and evaluation together for running
         override (bool, optional): Whether to override existing evaluation run with same name. Defaults to False.
         ignore_errors (bool, optional): Whether to ignore scorer errors during evaluation. Defaults to True.
+        async_execution (bool, optional): Whether to execute the evaluation asynchronously. Defaults to False.

-    Args:
-        project_name (str): The name of the project the evaluation results belong to
-        eval_name (str): The name of the evaluation run
-        examples (List[Example]): The examples to evaluate
-        scorers (List[Union[JudgmentScorer, JudgevalScorer]]): A list of scorers to use for evaluation
-        model (str): The model used as a judge when using LLM as a Judge
-        aggregator (Optional[str]): The aggregator to use for evaluation if using Mixture of Judges
-        metadata (Optional[Dict[str, Any]]): Additional metadata to include for this evaluation run, e.g. comments, dataset name, purpose, etc.
-        judgment_api_key (Optional[str]): The API key for running evaluations on the Judgment API
-        log_results (bool): Whether to log the results to the Judgment API
-        rules (Optional[List[Rule]]): Rules to evaluate against scoring results
-
     Returns:
-        List[ScoringResult]
+        Union[List[ScoringResult], Union[asyncio.Task, SpinnerWrappedTask]]:
+            - If async_execution is False, returns a list of ScoringResult objects
+            - If async_execution is True, returns a Task that will resolve to a list of ScoringResult objects when awaited
     """

     # Call endpoint to check to see if eval run name exists (if we DON'T want to override and DO want to log results)
@@ -462,7 +775,7 @@ def run_eval(evaluation_run: EvaluationRun, override: bool = False, ignore_error
     )

     if evaluation_run.append:
-        # Check that the current experiment, if one exists, has the same type (examples of
+        # Check that the current experiment, if one exists, has the same type (examples of traces)
         check_experiment_type(
             evaluation_run.eval_name,
             evaluation_run.project_name,
@@ -475,8 +788,7 @@ def run_eval(evaluation_run: EvaluationRun, override: bool = False, ignore_error
     debug("Initializing examples with IDs and timestamps")
     for idx, example in enumerate(evaluation_run.examples):
         example.example_index = idx  # Set numeric index
-        example.
-        with example_logging_context(example.timestamp, example.example_id):
+        with example_logging_context(example.created_at, example.example_id):
             debug(f"Initialized example {example.example_id} (index: {example.example_index})")
             debug(f"Input: {example.input}")
             debug(f"Actual output: {example.actual_output}")
@@ -520,21 +832,67 @@ def run_eval(evaluation_run: EvaluationRun, override: bool = False, ignore_error
     if async_execution:
         if len(local_scorers) > 0:
             error("Local scorers are not supported in async execution")
+            raise ValueError("Local scorers are not supported in async execution")

         check_examples(evaluation_run.examples, evaluation_run.scorers)
         info("Starting async evaluation")
-
-
-
-
-
-
-
-
-
-
+
+        async def _async_evaluation_workflow():
+            # Create a payload
+            payload = evaluation_run.model_dump(warnings=False)
+
+            # Send the evaluation to the queue
+            response = await asyncio.to_thread(
+                requests.post,
+                JUDGMENT_ADD_TO_RUN_EVAL_QUEUE_API_URL,
+                headers={
+                    "Content-Type": "application/json",
+                    "Authorization": f"Bearer {evaluation_run.judgment_api_key}",
+                    "X-Organization-Id": evaluation_run.organization_id
+                },
+                json=payload,
+                verify=True
+            )
+
+            if not response.ok:
+                error_message = response.json().get('detail', 'An unknown error occurred.')
+                error(f"Error adding evaluation to queue: {error_message}")
+                raise JudgmentAPIError(error_message)
+
+            info(f"Successfully added evaluation '{evaluation_run.eval_name}' to queue")
+
+            # Poll until the evaluation is complete
+            results = await _poll_evaluation_until_complete(
+                eval_name=evaluation_run.eval_name,
+                project_name=evaluation_run.project_name,
+                judgment_api_key=evaluation_run.judgment_api_key,
+                organization_id=evaluation_run.organization_id,
+                original_examples=evaluation_run.examples  # Pass the original examples
+            )
+
+            pretty_str_to_print = None
+            if evaluation_run.log_results and results:  # Ensure results exist before logging
+                send_results = [scoring_result.model_dump(warnings=False) for scoring_result in results]
+                try:
+                    # Run the blocking log_evaluation_results in a separate thread
+                    pretty_str_to_print = await asyncio.to_thread(
+                        log_evaluation_results,
+                        send_results,
+                        evaluation_run
+                    )
+                except Exception as e:
+                    error(f"Error logging results after async evaluation: {str(e)}")
+
+            return results, pretty_str_to_print
+
+        # Create a regular task
+        task = asyncio.create_task(_async_evaluation_workflow())
+
+        # Wrap it in our custom awaitable that will show a spinner only when awaited
+        return SpinnerWrappedTask(
+            task,
+            f"Processing evaluation '{evaluation_run.eval_name}': "
         )
-        print("Successfully added evaluation to queue")
     else:
         if judgment_scorers:
             # Execute evaluation using Judgment API
@@ -571,7 +929,7 @@ def run_eval(evaluation_run: EvaluationRun, override: bool = False, ignore_error
             # We should be removing local scorers soon
             info("Starting local evaluation")
             for example in evaluation_run.examples:
-                with example_logging_context(example.
+                with example_logging_context(example.created_at, example.example_id):
                     debug(f"Processing example {example.example_id}: {example.input}")

             results: List[ScoringResult] = asyncio.run(
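
The async path added above is visible end to end: run_eval(async_execution=True) enqueues the run, wraps the polling coroutine in a SpinnerWrappedTask, and only yields the List[ScoringResult] when that task is awaited. Below is a minimal caller sketch, not taken from this diff: the EvaluationRun field names are inferred from the attributes run_eval reads (eval_name, project_name, examples, scorers, log_results), and the Example fields and scorer choice are illustrative.

import asyncio

from judgeval.data import Example
from judgeval.evaluation_run import EvaluationRun
from judgeval.run_evaluation import run_eval
from judgeval.scorers import ToolOrderScorer

async def main():
    # Assumed constructor fields; credentials (judgment_api_key, organization_id)
    # are presumed to come from the environment or extra EvaluationRun fields.
    run = EvaluationRun(
        eval_name="demo-run",
        project_name="demo-project",
        examples=[Example(input="Which tools should the agent call?")],
        scorers=[ToolOrderScorer()],
        log_results=True,
    )
    # async_execution=True must be used inside a running event loop, because
    # run_eval calls asyncio.create_task() internally and returns a
    # SpinnerWrappedTask rather than results.
    pending = run_eval(run, async_execution=True)
    results = await pending  # spinner is shown only while awaiting
    print([r.success for r in results])

asyncio.run(main())

Because SpinnerWrappedTask proxies attribute access to the underlying asyncio.Task via __getattr__, a caller can also cancel or inspect the task without ever triggering the spinner.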
judgeval/scorers/__init__.py
CHANGED
@@ -1,6 +1,6 @@
 from judgeval.scorers.api_scorer import APIJudgmentScorer
 from judgeval.scorers.judgeval_scorer import JudgevalScorer
-from judgeval.scorers.prompt_scorer import PromptScorer
+from judgeval.scorers.prompt_scorer import PromptScorer
 from judgeval.scorers.judgeval_scorers.api_scorers import (
     ExecutionOrderScorer,
     JSONCorrectnessScorer,
@@ -17,6 +17,8 @@ from judgeval.scorers.judgeval_scorers.api_scorers import (
     GroundednessScorer,
     DerailmentScorer,
     ToolOrderScorer,
+    ClassifierScorer,
+    ToolDependencyScorer,
 )
 from judgeval.scorers.judgeval_scorers.classifiers import (
     Text2SQLScorer,
@@ -43,4 +45,5 @@ __all__ = [
     "GroundednessScorer",
     "DerailmentScorer",
     "ToolOrderScorer",
+    "ToolDependencyScorer",
 ]
judgeval/scorers/judgeval_scorer.py
CHANGED
@@ -39,6 +39,8 @@ class JudgevalScorer:
     evaluation_cost: Optional[float] = None  # The cost of running the scorer
     verbose_logs: Optional[str] = None  # The verbose logs of the scorer
     additional_metadata: Optional[Dict] = None  # Additional metadata for the scorer
+    error: Optional[str] = None
+    success: Optional[bool] = None

     def __init__(
         self,
@@ -145,3 +147,9 @@ class JudgevalScorer:
             "additional_metadata": self.additional_metadata,
         }
         return f"JudgevalScorer({attributes})"
+
+    def to_dict(self):
+        return {
+            "score_type": str(self.score_type),  # Convert enum to string for serialization
+            "threshold": self.threshold
+        }
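
The new error and success attributes plus to_dict() suggest that custom JudgevalScorer subclasses now carry their own outcome state and a serializable (score_type, threshold) summary. A rough sketch of such a subclass follows; the base-class constructor parameters and the score_example hook name are assumptions inferred from these hunks, not confirmed by this diff.

from judgeval.scorers import JudgevalScorer

class ExactMatchScorer(JudgevalScorer):
    def __init__(self, threshold: float = 1.0):
        # score_type/threshold are assumed constructor parameters, based on
        # the two fields that the new to_dict() serializes.
        super().__init__(score_type="Exact Match", threshold=threshold)

    def score_example(self, example, *args, **kwargs):  # hook name assumed
        try:
            self.score = 1.0 if example.actual_output == example.expected_output else 0.0
            self.success = self.score >= self.threshold  # new field in 0.0.40
        except Exception as e:
            self.error = str(e)                          # new field in 0.0.40
            self.success = False

scorer = ExactMatchScorer(threshold=1.0)
print(scorer.to_dict())  # e.g. {'score_type': 'Exact Match', 'threshold': 1.0}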
judgeval/scorers/judgeval_scorers/api_scorers/__init__.py
CHANGED
@@ -13,6 +13,8 @@ from judgeval.scorers.judgeval_scorers.api_scorers.instruction_adherence import
 from judgeval.scorers.judgeval_scorers.api_scorers.groundedness import GroundednessScorer
 from judgeval.scorers.judgeval_scorers.api_scorers.derailment_scorer import DerailmentScorer
 from judgeval.scorers.judgeval_scorers.api_scorers.tool_order import ToolOrderScorer
+from judgeval.scorers.judgeval_scorers.api_scorers.classifier_scorer import ClassifierScorer
+from judgeval.scorers.judgeval_scorers.api_scorers.tool_dependency import ToolDependencyScorer
 __all__ = [
     "ExecutionOrderScorer",
     "JSONCorrectnessScorer",
@@ -29,4 +31,6 @@ __all__ = [
     "GroundednessScorer",
     "DerailmentScorer",
     "ToolOrderScorer",
+    "ClassifierScorer",
+    "ToolDependencyScorer",
 ]
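
With these re-exports, both new scorers resolve from the api_scorers package and from the top-level judgeval.scorers namespace; note that the top-level __all__ in this diff gains "ToolDependencyScorer" but not "ClassifierScorer", even though ClassifierScorer is imported there. A quick import check is shown below; constructor arguments are omitted because they are not part of this diff (see api_scorers/classifier_scorer.py and api_scorers/tool_dependency.py in 0.0.40 for the actual parameters).

from judgeval.scorers import ClassifierScorer, ToolDependencyScorer

# Both names should resolve to the new api_scorers modules added in 0.0.40.
print(ClassifierScorer.__module__)     # expected: ...api_scorers.classifier_scorer
print(ToolDependencyScorer.__module__) # expected: ...api_scorers.tool_dependency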