judgeval 0.0.37__py3-none-any.whl → 0.0.39__py3-none-any.whl
This diff compares the contents of two publicly released versions of the package as they appear in their public registry. It is provided for informational purposes only.
- judgeval/common/tracer.py +132 -281
- judgeval/common/utils.py +1 -1
- judgeval/constants.py +2 -3
- judgeval/data/__init__.py +0 -2
- judgeval/data/datasets/dataset.py +2 -9
- judgeval/data/datasets/eval_dataset_client.py +1 -62
- judgeval/data/example.py +7 -7
- judgeval/data/result.py +3 -3
- judgeval/data/tool.py +19 -0
- judgeval/data/trace.py +5 -1
- judgeval/data/{sequence_run.py → trace_run.py} +4 -4
- judgeval/evaluation_run.py +1 -1
- judgeval/integrations/langgraph.py +187 -1768
- judgeval/judges/litellm_judge.py +1 -1
- judgeval/judges/mixture_of_judges.py +1 -1
- judgeval/judges/utils.py +1 -1
- judgeval/judgment_client.py +21 -25
- judgeval/run_evaluation.py +381 -107
- judgeval/scorers/judgeval_scorers/api_scorers/tool_order.py +4 -2
- judgeval-0.0.39.dist-info/METADATA +247 -0
- {judgeval-0.0.37.dist-info → judgeval-0.0.39.dist-info}/RECORD +23 -23
- judgeval/data/sequence.py +0 -50
- judgeval-0.0.37.dist-info/METADATA +0 -214
- {judgeval-0.0.37.dist-info → judgeval-0.0.39.dist-info}/WHEEL +0 -0
- {judgeval-0.0.37.dist-info → judgeval-0.0.39.dist-info}/licenses/LICENSE.md +0 -0
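Note: the rename judgeval/data/sequence_run.py → judgeval/data/trace_run.py runs through the run_evaluation.py diff below. As rough orientation (a sketch, not an excerpt from the package), downstream code would now import the run model from the new module path:

    # New import path, as added in the run_evaluation.py diff below
    from judgeval.data.trace_run import TraceRun

The diff below references TraceRun fields such as eval_name, project_name, traces, log_results, and append; the full constructor signature is not shown in this diff.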
judgeval/run_evaluation.py
CHANGED
@@ -13,7 +13,6 @@ from judgeval.data import (
     ScoringResult,
     Example,
     CustomExample,
-    Sequence,
     Trace
 )
 from judgeval.scorers import (
@@ -25,21 +24,23 @@ from judgeval.scorers.score import a_execute_scoring
 from judgeval.constants import (
     ROOT_API,
     JUDGMENT_EVAL_API_URL,
-
+    JUDGMENT_TRACE_EVAL_API_URL,
     JUDGMENT_EVAL_LOG_API_URL,
     MAX_CONCURRENT_EVALUATIONS,
     JUDGMENT_ADD_TO_RUN_EVAL_QUEUE_API_URL,
-
+    JUDGMENT_GET_EVAL_STATUS_API_URL,
+    JUDGMENT_EVAL_FETCH_API_URL
 )
 from judgeval.common.exceptions import JudgmentAPIError
 from judgeval.common.logger import (
     debug,
     info,
-    error,
+    error,
+    warning,
     example_logging_context
 )
 from judgeval.evaluation_run import EvaluationRun
-from judgeval.data.
+from judgeval.data.trace_run import TraceRun
 from judgeval.common.tracer import Tracer
 from langchain_core.callbacks import BaseCallbackHandler

@@ -98,20 +99,20 @@ def execute_api_eval(evaluation_run: EvaluationRun) -> List[Dict]:
         raise JudgmentAPIError(error_message)
     return response_data

-def 
+def execute_api_trace_eval(trace_run: TraceRun) -> List[Dict]:
     """
     Executes an evaluation of a list of `Example`s using one or more `JudgmentScorer`s via the Judgment API.
     """

     try:
         # submit API request to execute evals
-        payload = 
+        payload = trace_run.model_dump(warnings=False)
         response = requests.post(
-
+            JUDGMENT_TRACE_EVAL_API_URL,
             headers={
                 "Content-Type": "application/json",
-                "Authorization": f"Bearer {
-                "X-Organization-Id": 
+                "Authorization": f"Bearer {trace_run.judgment_api_key}",
+                "X-Organization-Id": trace_run.organization_id
             },
             json=payload,
             verify=True
@@ -282,7 +283,7 @@ def check_eval_run_name_exists(eval_name: str, project_name: str, judgment_api_k
         raise JudgmentAPIError(f"Failed to check if eval run name exists: {str(e)}")


-def log_evaluation_results(scoring_results: List[ScoringResult], run: Union[EvaluationRun,
+def log_evaluation_results(scoring_results: List[ScoringResult], run: Union[EvaluationRun, TraceRun]) -> str:
     """
     Logs evaluation results to the Judgment API database.

@@ -327,51 +328,6 @@ def log_evaluation_results(scoring_results: List[ScoringResult], run: Union[Eval
         error(f"Failed to save evaluation results to DB: {str(e)}")
         raise ValueError(f"Failed to save evaluation results to DB: {str(e)}")

-def retrieve_sequence_from_trace(trace_id: str, parent_span: str, judgment_api_key: str, organization_id: str) -> Sequence:
-    """
-    Retrieves a sequence from a trace ID.
-    """
-    """
-    Logs evaluation results to the Judgment API database.
-
-    Args:
-        merged_results (List[ScoringResult]): The results to log
-        evaluation_run (EvaluationRun): The evaluation run containing project info and API key
-
-    Raises:
-        JudgmentAPIError: If there's an API error during logging
-        ValueError: If there's a validation error with the results
-    """
-    try:
-        res = requests.post(
-            JUDGMENT_RETRIEVE_SEQUENCE_FROM_TRACE_API_URL,
-            headers={
-                "Content-Type": "application/json",
-                "Authorization": f"Bearer {judgment_api_key}",
-                "X-Organization-Id": organization_id
-            },
-            json={
-                "trace_id": trace_id,
-                "trace_span_id": parent_span,
-            },
-            verify=True
-        )
-
-        if not res.ok:
-            response_data = res.json()
-            error_message = response_data.get('detail', 'An unknown error occurred.')
-            error(f"Error {res.status_code}: {error_message}")
-            raise JudgmentAPIError(error_message)
-
-        return Sequence(**res.json())
-    except requests.exceptions.RequestException as e:
-        error(f"Request failed while saving evaluation results to DB: {str(e)}")
-        raise JudgmentAPIError(f"Request failed while saving evaluation results to DB: {str(e)}")
-    except Exception as e:
-        error(f"Failed to save evaluation results to DB: {str(e)}")
-        raise ValueError(f"Failed to save evaluation results to DB: {str(e)}")
-
-
 def run_with_spinner(message: str, func, *args, **kwargs) -> Any:
     """Run a function with a spinner in the terminal."""
     spinner = itertools.cycle(['|', '/', '-', '\\'])
@@ -415,69 +371,366 @@ def check_examples(examples: List[Example], scorers: List[Union[APIJudgmentScore
             if missing_params:
                 print(f"WARNING: Example {example.example_id} is missing the following parameters: {missing_params} for scorer {scorer.score_type.value}")

-def 
+def run_trace_eval(trace_run: TraceRun, override: bool = False, ignore_errors: bool = True, function: Optional[Callable] = None, tracer: Optional[Union[Tracer, BaseCallbackHandler]] = None, examples: Optional[List[Example]] = None) -> List[ScoringResult]:
     # Call endpoint to check to see if eval run name exists (if we DON'T want to override and DO want to log results)
-    if not override and 
+    if not override and trace_run.log_results and not trace_run.append:
         check_eval_run_name_exists(
-
-
-
-
+            trace_run.eval_name,
+            trace_run.project_name,
+            trace_run.judgment_api_key,
+            trace_run.organization_id
         )

-    if 
+    if trace_run.append:
         # Check that the current experiment, if one exists, has the same type (examples of sequences)
         check_experiment_type(
-
-
-
-
+            trace_run.eval_name,
+            trace_run.project_name,
+            trace_run.judgment_api_key,
+            trace_run.organization_id,
             True
         )

     if function and tracer:
-
+        new_traces: List[Trace] = []
+        tracer.offline_mode = True
         for example in examples:
             if example.input:
                 result = run_with_spinner("Running agent function: ", function, **example.input)
             else:
                 result = run_with_spinner("Running agent function: ", function)
         for i, trace in enumerate(tracer.traces):
-
-
-
-
-
-        sequence_run.sequences = new_sequences
-
-        for sequence in sequence_run.sequences:
-            sequence.scorers = sequence_run.scorers
+            # We set the root-level trace span with the expected tools of the Trace
+            trace = Trace(**trace)
+            trace.entries[0].expected_tools = examples[i].expected_tools
+            new_traces.append(trace)
+        trace_run.traces = new_traces

     # Execute evaluation using Judgment API
     info("Starting API evaluation")
     try: # execute an EvaluationRun with just JudgmentScorers
         debug("Sending request to Judgment API")
-        response_data: List[Dict] = run_with_spinner("Running 
+        response_data: List[Dict] = run_with_spinner("Running Trace Evaluation: ", execute_api_trace_eval, trace_run)
         scoring_results = [ScoringResult(**result) for result in response_data["results"]]
         info(f"Received {len(scoring_results)} results from API")
     except JudgmentAPIError as e:
         error(f"An error occurred while executing the Judgment API request: {str(e)}")
         raise JudgmentAPIError(f"An error occurred while executing the Judgment API request: {str(e)}")
     except ValueError as e:
-        raise ValueError(f"Please check your 
+        raise ValueError(f"Please check your TraceRun object, one or more fields are invalid: {str(e)}")

     # Convert the response data to `ScoringResult` objects
     debug("Processing API results")
-    # TODO: allow for custom scorer on 
-    if 
-        pretty_str = run_with_spinner("Logging Results: ", log_evaluation_results, response_data["results"], 
+    # TODO: allow for custom scorer on traces
+    if trace_run.log_results:
+        pretty_str = run_with_spinner("Logging Results: ", log_evaluation_results, response_data["results"], trace_run)
         rprint(pretty_str)

     return scoring_results



-def 
+async def get_evaluation_status(eval_name: str, project_name: str, judgment_api_key: str, organization_id: str) -> Dict:
+    """
+    Gets the status of an async evaluation run.
+
+    Args:
+        eval_name (str): Name of the evaluation run
+        project_name (str): Name of the project
+        judgment_api_key (str): API key for authentication
+        organization_id (str): Organization ID for the evaluation
+
+    Returns:
+        Dict: Status information including:
+            - status: 'pending', 'running', 'completed', or 'failed'
+            - results: List of ScoringResult objects if completed
+            - error: Error message if failed
+    """
+    try:
+        response = requests.get(
+            JUDGMENT_GET_EVAL_STATUS_API_URL,
+            headers={
+                "Content-Type": "application/json",
+                "Authorization": f"Bearer {judgment_api_key}",
+                "X-Organization-Id": organization_id
+            },
+            params={
+                "eval_name": eval_name,
+                "project_name": project_name,
+            },
+            verify=True
+        )
+
+        if not response.ok:
+            error_message = response.json().get('detail', 'An unknown error occurred.')
+            error(f"Error checking evaluation status: {error_message}")
+            raise JudgmentAPIError(error_message)
+
+        return response.json()
+    except requests.exceptions.RequestException as e:
+        error(f"Failed to check evaluation status: {str(e)}")
+        raise JudgmentAPIError(f"Failed to check evaluation status: {str(e)}")
+
+async def _poll_evaluation_until_complete(eval_name: str, project_name: str, judgment_api_key: str, organization_id: str, poll_interval_seconds: int = 5, original_examples: Optional[List[Example]] = None) -> List[ScoringResult]:
+    """
+    Polls until the evaluation is complete and returns the results.
+
+    Args:
+        eval_name (str): Name of the evaluation run
+        project_name (str): Name of the project
+        judgment_api_key (str): API key for authentication
+        organization_id (str): Organization ID for the evaluation
+        poll_interval_seconds (int, optional): Time between status checks in seconds. Defaults to 5.
+        original_examples (List[Example], optional): The original examples sent for evaluation.
+            If provided, will match results with original examples.
+
+    Returns:
+        List[ScoringResult]: The evaluation results
+    """
+    poll_count = 0
+    # Create example_id to Example mapping if original examples are provided
+    original_example_map = {}
+    if original_examples:
+        for example in original_examples:
+            original_example_map[example.example_id] = example
+
+    # Remove the expected scorer names extraction and checking
+    # We'll instead verify all examples have consistent scorer data
+    while True:
+        poll_count += 1
+        try:
+            # Log polling attempt
+            if poll_count % 4 == 0: # Log every 4th poll to avoid excess logging
+                info(f"Polling for evaluation '{eval_name}' in project '{project_name}' (attempt {poll_count})")
+
+            # Check status
+            response = requests.get(
+                JUDGMENT_GET_EVAL_STATUS_API_URL,
+                headers={
+                    "Content-Type": "application/json",
+                    "Authorization": f"Bearer {judgment_api_key}",
+                    "X-Organization-Id": organization_id
+                },
+                params={
+                    "eval_name": eval_name,
+                    "project_name": project_name
+                },
+                verify=True
+            )
+
+            if not response.ok:
+                error_message = response.json().get('detail', 'An unknown error occurred.')
+                error(f"Error checking evaluation status: {error_message}")
+                # Don't raise exception immediately, just log and continue polling
+                await asyncio.sleep(poll_interval_seconds)
+                continue
+
+            status_data = response.json()
+            status = status_data.get("status")
+
+            # If complete, get results and return
+            if status == "completed" or status == "complete":
+                info(f"Evaluation '{eval_name}' reported as completed, fetching and verifying results...")
+                results_response = requests.post(
+                    JUDGMENT_EVAL_FETCH_API_URL,
+                    headers={
+                        "Content-Type": "application/json",
+                        "Authorization": f"Bearer {judgment_api_key}",
+                        "X-Organization-Id": organization_id
+                    },
+                    json={
+                        "project_name": project_name,
+                        "eval_name": eval_name
+                    },
+                    verify=True
+                )
+
+                if not results_response.ok:
+                    error_message = results_response.json().get('detail', 'An unknown error occurred.')
+                    error(f"Error fetching evaluation results: {error_message}")
+                    raise JudgmentAPIError(error_message)
+
+                result_data = results_response.json()
+
+                if "examples" in result_data:
+                    examples_data = result_data.get("examples", [])
+
+
+                    info(f"Successfully fetched {len(examples_data)} results for evaluation '{eval_name}'")
+
+                    # Check for result validity if original examples are provided
+                    if original_example_map:
+                        # Verify all returned examples have matching original examples
+                        has_invalid_results = False
+                        for example_data in examples_data:
+                            example_id = example_data.get("example_id")
+
+                            if example_id not in original_example_map:
+                                warning(f"Server returned example with ID {example_id} not found in original examples. " +
+                                        f"This indicates stale or incorrect data. Continuing to poll...")
+                                has_invalid_results = True
+                                break
+
+                        # If any invalid examples found, continue polling
+                        if has_invalid_results:
+                            info("Detected stale data. Waiting before polling again...")
+                            await asyncio.sleep(poll_interval_seconds)
+                            continue
+
+                        # Check if we received the expected number of results
+                        if len(original_examples) != len(examples_data):
+                            warning(f"Expected {len(original_examples)} results but got {len(examples_data)} results. " +
+                                    f"This indicates incomplete data. Continuing to poll...")
+                            await asyncio.sleep(poll_interval_seconds)
+                            continue
+
+                        # Collect all example IDs from scorer data
+                        scorer_example_ids = set()
+                        for example_data in examples_data:
+                            scorer_data_list = example_data.get("scorer_data", [])
+                            for scorer_data in scorer_data_list:
+                                if "example_id" in scorer_data:
+                                    scorer_example_ids.add(scorer_data["example_id"])
+
+                        # Get the set of original example IDs
+                        original_example_ids = set(original_example_map.keys())
+
+                        # Check if the sets are equal
+                        missing_in_scorer = original_example_ids - scorer_example_ids
+                        extra_in_scorer = scorer_example_ids - original_example_ids
+
+                        if missing_in_scorer or extra_in_scorer:
+                            if missing_in_scorer:
+                                warning(f"Examples missing in scorer data: {missing_in_scorer}")
+                            if extra_in_scorer:
+                                warning(f"Extra examples in scorer data: {extra_in_scorer}")
+                            info("Detected mismatched example IDs in scorer data. Waiting before polling again...")
+                            await asyncio.sleep(poll_interval_seconds)
+                            continue
+
+                    # Create ScoringResult objects from the raw data
+                    scoring_results = []
+
+                    for example_data in examples_data:
+                        # Extract example_id from the server response
+                        example_id = example_data.get("example_id")
+
+                        # Create ScorerData objects
+                        scorer_data_list = []
+                        for raw_scorer_data in example_data.get("scorer_data", []):
+                            scorer_data_list.append(ScorerData(**raw_scorer_data))
+
+                        # Use the original Example object if we have it and the ID matches
+                        if original_example_map:
+                            example = original_example_map[example_id]
+                            debug(f"Matched result with original example {example_id}")
+                        else:
+                            # Create Example from example data (excluding scorer_data) if no original examples provided
+                            example_dict = {k: v for k, v in example_data.items() if k != "scorer_data"}
+                            example = Example(**example_dict)
+
+                        # Calculate success based on whether all scorer_data entries were successful
+                        success = all(scorer_data.success for scorer_data in scorer_data_list) if scorer_data_list else False
+
+                        # Create ScoringResult
+                        scoring_result = ScoringResult(
+                            success=success, # Set based on all scorer data success values
+                            scorers_data=scorer_data_list,
+                            data_object=example
+                        )
+                        scoring_results.append(scoring_result)
+
+                    # If we got here, all validation checks passed
+                    info(f"Verified complete results for all {len(scoring_results)} examples with all expected scorer data")
+                    return scoring_results
+                else:
+                    # No examples found
+                    info(f"No example results found for completed evaluation '{eval_name}'")
+                    return []
+
+            elif status == "failed":
+                # Evaluation failed
+                error_message = status_data.get("error", "Unknown error")
+                error(f"Evaluation '{eval_name}' failed: {error_message}")
+                raise JudgmentAPIError(f"Evaluation failed: {error_message}")
+
+            elif status == "pending" or status == "running":
+                # Only log occasionally for pending/running to avoid flooding logs
+                if poll_count % 4 == 0:
+                    info(f"Evaluation '{eval_name}' status: {status}")
+
+            # Wait before checking again
+            await asyncio.sleep(poll_interval_seconds)
+
+        except Exception as e:
+            if isinstance(e, JudgmentAPIError):
+                raise
+
+            # For other exceptions, log and continue polling
+            error(f"Error checking evaluation status: {str(e)}")
+            if poll_count > 20: # Only raise exception after many failed attempts
+                raise JudgmentAPIError(f"Error checking evaluation status after {poll_count} attempts: {str(e)}")
+
+            # Continue polling after a delay
+            await asyncio.sleep(poll_interval_seconds)
+
+async def await_with_spinner(task, message: str = "Awaiting async task: "):
+    """
+    Display a spinner while awaiting an async task.
+
+    Args:
+        task: The asyncio task to await
+        message (str): Message to display with the spinner
+
+    Returns:
+        Any: The result of the awaited task
+    """
+    spinner = itertools.cycle(['|', '/', '-', '\\'])
+
+    # Create an event to signal when to stop the spinner
+    stop_spinner_event = asyncio.Event()
+
+    async def display_spinner():
+        while not stop_spinner_event.is_set():
+            sys.stdout.write(f'\r{message}{next(spinner)}')
+            sys.stdout.flush()
+            await asyncio.sleep(0.1)
+
+    # Start the spinner in a separate task
+    spinner_task = asyncio.create_task(display_spinner())
+
+    try:
+        # Await the actual task
+        result = await task
+    finally:
+        # Signal the spinner to stop and wait for it to finish
+        stop_spinner_event.set()
+        await spinner_task
+
+        # Clear the spinner line
+        sys.stdout.write('\r' + ' ' * (len(message) + 1) + '\r')
+        sys.stdout.flush()
+
+    return result
+
+class SpinnerWrappedTask:
+    """
+    A wrapper for an asyncio task that displays a spinner when awaited.
+    """
+    def __init__(self, task, message: str):
+        self.task = task
+        self.message = message
+
+    def __await__(self):
+        async def _spin_and_await():
+            return await await_with_spinner(self.task, self.message)
+        return _spin_and_await().__await__()
+
+    # Proxy all Task attributes and methods to the underlying task
+    def __getattr__(self, name):
+        return getattr(self.task, name)
+
+def run_eval(evaluation_run: EvaluationRun, override: bool = False, ignore_errors: bool = True, async_execution: bool = False) -> Union[List[ScoringResult], asyncio.Task]:
     """
     Executes an evaluation of `Example`s using one or more `Scorer`s

@@ -485,21 +738,12 @@ def run_eval(evaluation_run: EvaluationRun, override: bool = False, ignore_error
         evaluation_run (EvaluationRun): Stores example and evaluation together for running
         override (bool, optional): Whether to override existing evaluation run with same name. Defaults to False.
         ignore_errors (bool, optional): Whether to ignore scorer errors during evaluation. Defaults to True.
+        async_execution (bool, optional): Whether to execute the evaluation asynchronously. Defaults to False.

-    Args:
-        project_name (str): The name of the project the evaluation results belong to
-        eval_name (str): The name of the evaluation run
-        examples (List[Example]): The examples to evaluate
-        scorers (List[Union[JudgmentScorer, JudgevalScorer]]): A list of scorers to use for evaluation
-        model (str): The model used as a judge when using LLM as a Judge
-        aggregator (Optional[str]): The aggregator to use for evaluation if using Mixture of Judges
-        metadata (Optional[Dict[str, Any]]): Additional metadata to include for this evaluation run, e.g. comments, dataset name, purpose, etc.
-        judgment_api_key (Optional[str]): The API key for running evaluations on the Judgment API
-        log_results (bool): Whether to log the results to the Judgment API
-        rules (Optional[List[Rule]]): Rules to evaluate against scoring results
-
     Returns:
-        List[ScoringResult]
+        Union[List[ScoringResult], Union[asyncio.Task, SpinnerWrappedTask]]:
+            - If async_execution is False, returns a list of ScoringResult objects
+            - If async_execution is True, returns a Task that will resolve to a list of ScoringResult objects when awaited
     """

     # Call endpoint to check to see if eval run name exists (if we DON'T want to override and DO want to log results)
@@ -570,21 +814,51 @@ def run_eval(evaluation_run: EvaluationRun, override: bool = False, ignore_error
     if async_execution:
         if len(local_scorers) > 0:
             error("Local scorers are not supported in async execution")
+            raise ValueError("Local scorers are not supported in async execution")

         check_examples(evaluation_run.examples, evaluation_run.scorers)
         info("Starting async evaluation")
-
-
-
-
-
-
-
-
-
-
+
+        async def _async_evaluation_workflow():
+            # Create a payload
+            payload = evaluation_run.model_dump(warnings=False)
+
+            # Send the evaluation to the queue
+            response = requests.post(
+                JUDGMENT_ADD_TO_RUN_EVAL_QUEUE_API_URL,
+                headers={
+                    "Content-Type": "application/json",
+                    "Authorization": f"Bearer {evaluation_run.judgment_api_key}",
+                    "X-Organization-Id": evaluation_run.organization_id
+                },
+                json=payload,
+                verify=True
+            )
+
+            if not response.ok:
+                error_message = response.json().get('detail', 'An unknown error occurred.')
+                error(f"Error adding evaluation to queue: {error_message}")
+                raise JudgmentAPIError(error_message)
+
+            info(f"Successfully added evaluation '{evaluation_run.eval_name}' to queue")
+
+            # Poll until the evaluation is complete
+            return await _poll_evaluation_until_complete(
+                eval_name=evaluation_run.eval_name,
+                project_name=evaluation_run.project_name,
+                judgment_api_key=evaluation_run.judgment_api_key,
+                organization_id=evaluation_run.organization_id,
+                original_examples=evaluation_run.examples # Pass the original examples
+            )
+
+        # Create a regular task
+        task = asyncio.create_task(_async_evaluation_workflow())
+
+        # Wrap it in our custom awaitable that will show a spinner only when awaited
+        return SpinnerWrappedTask(
+            task,
+            f"Processing evaluation '{evaluation_run.eval_name}': "
         )
-        print("Successfully added evaluation to queue")
     else:
         if judgment_scorers:
             # Execute evaluation using Judgment API
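Taken together, the new async path in run_eval enqueues the run server-side and returns a SpinnerWrappedTask that polls the status endpoint until results arrive. A minimal usage sketch (not from the package), assuming a pre-built EvaluationRun called my_evaluation_run and a running event loop, since asyncio.create_task requires one:

    import asyncio
    from judgeval.run_evaluation import run_eval

    async def main(my_evaluation_run):
        # async_execution=True queues the eval and returns a SpinnerWrappedTask;
        # the spinner renders only while the task is awaited.
        pending = run_eval(my_evaluation_run, async_execution=True)
        results = await pending  # List[ScoringResult] once polling completes
        for r in results:
            print(r.success, r.data_object)

    # asyncio.run(main(my_evaluation_run))

Only API scorers are supported on this path; per the hunk above, passing local scorers with async_execution=True now raises a ValueError instead of only logging an error.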
judgeval/scorers/judgeval_scorers/api_scorers/tool_order.py
CHANGED
@@ -5,13 +5,15 @@
 # Internal imports
 from judgeval.scorers.api_scorer import APIJudgmentScorer
 from judgeval.constants import APIScorer
-
+from typing import Optional, Dict
 class ToolOrderScorer(APIJudgmentScorer):
-
+    kwargs: Optional[Dict] = None
+    def __init__(self, threshold: float=1.0, exact_match: bool=False):
         super().__init__(
             threshold=threshold,
             score_type=APIScorer.TOOL_ORDER,
         )
+        self.kwargs = {"exact_match": exact_match}

     @property
     def __name__(self):