judgeval 0.0.36__py3-none-any.whl → 0.0.37__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- judgeval/common/tracer.py +565 -858
- judgeval/common/utils.py +18 -0
- judgeval/constants.py +3 -1
- judgeval/data/__init__.py +4 -0
- judgeval/data/datasets/dataset.py +0 -2
- judgeval/data/example.py +29 -7
- judgeval/data/sequence.py +5 -4
- judgeval/data/sequence_run.py +4 -3
- judgeval/data/trace.py +129 -0
- judgeval/evaluation_run.py +1 -1
- judgeval/integrations/langgraph.py +18 -17
- judgeval/judgment_client.py +77 -64
- judgeval/run_evaluation.py +126 -29
- judgeval/scorers/__init__.py +2 -0
- judgeval/scorers/judgeval_scorers/api_scorers/__init__.py +2 -0
- judgeval/scorers/judgeval_scorers/api_scorers/tool_order.py +18 -0
- judgeval/scorers/score.py +1 -1
- judgeval/utils/data_utils.py +57 -0
- judgeval-0.0.37.dist-info/METADATA +214 -0
- {judgeval-0.0.36.dist-info → judgeval-0.0.37.dist-info}/RECORD +22 -19
- judgeval-0.0.36.dist-info/METADATA +0 -169
- {judgeval-0.0.36.dist-info → judgeval-0.0.37.dist-info}/WHEEL +0 -0
- {judgeval-0.0.36.dist-info → judgeval-0.0.37.dist-info}/licenses/LICENSE.md +0 -0
judgeval/judgment_client.py
CHANGED
@@ -2,7 +2,8 @@
 Implements the JudgmentClient to interact with the Judgment API.
 """
 import os
-from
+from uuid import uuid4
+from typing import Optional, List, Dict, Any, Union, Callable
 import requests
 
 from judgeval.constants import ROOT_API
@@ -33,7 +34,11 @@ from judgeval.constants import (
     JUDGMENT_PROJECT_DELETE_API_URL,
     JUDGMENT_PROJECT_CREATE_API_URL
 )
+from judgeval.utils.data_utils import add_from_yaml
 from judgeval.common.exceptions import JudgmentAPIError
+from langchain_core.callbacks import BaseCallbackHandler
+from judgeval.common.tracer import Tracer
+from judgeval.common.utils import validate_api_key
 from pydantic import BaseModel
 from judgeval.rules import Rule
 
@@ -63,7 +68,7 @@ class JudgmentClient(metaclass=SingletonMeta):
         self.eval_dataset_client = EvalDatasetClient(judgment_api_key, organization_id)
 
         # Verify API key is valid
-        result, response =
+        result, response = validate_api_key(judgment_api_key)
         if not result:
             # May be bad to output their invalid API key...
             raise JudgmentAPIError(f"Issue with passed in Judgment API key: {response}")
@@ -74,7 +79,7 @@ class JudgmentClient(metaclass=SingletonMeta):
         self,
         examples: List[Example],
         scorers: List[Union[APIJudgmentScorer, JudgevalScorer]],
-        model: Union[str, List[str], JudgevalJudge],
+        model: Optional[Union[str, List[str], JudgevalJudge]] = "gpt-4.1",
         aggregator: Optional[str] = None,
         metadata: Optional[Dict[str, Any]] = None,
         log_results: bool = True,
@@ -102,9 +107,11 @@ class JudgmentClient(metaclass=SingletonMeta):
 
     def run_sequence_evaluation(
         self,
-        sequences: List[Sequence],
-        model: Union[str, List[str], JudgevalJudge],
         scorers: List[Union[APIJudgmentScorer, JudgevalScorer]],
+        model: Optional[Union[str, List[str], JudgevalJudge]] = "gpt-4.1",
+        sequences: Optional[List[Sequence]] = None,
+        examples: Optional[List[Example]] = None,
+        test_file: Optional[str] = None,
         aggregator: Optional[str] = None,
         project_name: str = "default_project",
         eval_run_name: str = "default_eval_sequence",
@@ -112,40 +119,40 @@ class JudgmentClient(metaclass=SingletonMeta):
         append: bool = False,
         override: bool = False,
         ignore_errors: bool = True,
-        rules: Optional[List[Rule]] = None
+        rules: Optional[List[Rule]] = None,
+        function: Optional[Callable] = None,
+        tracer: Optional[Union[Tracer, BaseCallbackHandler]] = None
     ) -> List[ScoringResult]:
-        try:
-
-
-
-
-
-
-
-
-
-
-
-
-
-
+        try:
+
+            if test_file:
+                try:
+                    examples = add_from_yaml(test_file)
+                except FileNotFoundError:
+                    raise FileNotFoundError(f"Test file not found: {test_file}")
+
+            if examples and not function:
+                raise ValueError("Cannot pass in examples without a function")
+
+            if sequences and function:
+                raise ValueError("Cannot pass in sequences and function")
+
+            if examples and sequences:
+                raise ValueError("Cannot pass in both examples and sequences")
 
-            flattened_sequences = flatten_sequence_list(sequences)
-            for sequence in flattened_sequences:
-                sequence.scorers = scorers
-
             sequence_run = SequenceRun(
                 project_name=project_name,
                 eval_name=eval_run_name,
                 sequences=sequences,
+                scorers=scorers,
                 model=model,
                 aggregator=aggregator,
                 log_results=log_results,
                 append=append,
                 judgment_api_key=self.judgment_api_key,
-                organization_id=self.organization_id
+                organization_id=self.organization_id,
             )
-            return run_sequence_eval(sequence_run, override, ignore_errors)
+            return run_sequence_eval(sequence_run, override, ignore_errors, function, tracer, examples)
         except ValueError as e:
             raise ValueError(f"Please check your SequenceRun object, one or more fields are invalid: \n{str(e)}")
         except Exception as e:
@@ -155,7 +162,7 @@ class JudgmentClient(metaclass=SingletonMeta):
         self,
         examples: Union[List[Example], List[CustomExample]],
         scorers: List[Union[APIJudgmentScorer, JudgevalScorer]],
-        model: Union[str, List[str], JudgevalJudge],
+        model: Optional[Union[str, List[str], JudgevalJudge]] = "gpt-4.1",
         aggregator: Optional[str] = None,
         metadata: Optional[Dict[str, Any]] = None,
         log_results: bool = True,
@@ -396,24 +403,6 @@ class JudgmentClient(metaclass=SingletonMeta):
             raise ValueError(f"Error deleting project: {response.json()}")
         return response.json()
 
-    def _validate_api_key(self):
-        """
-        Validates that the user api key is valid
-        """
-        response = requests.post(
-            f"{ROOT_API}/validate_api_key/",
-            headers={
-                "Content-Type": "application/json",
-                "Authorization": f"Bearer {self.judgment_api_key}",
-            },
-            json={}, # Empty body now
-            verify=True
-        )
-        if response.status_code == 200:
-            return True, response.json()
-        else:
-            return False, response.json().get("detail", "Error validating API key")
-
     def fetch_classifier_scorer(self, slug: str) -> ClassifierScorer:
         """
         Fetches a classifier scorer configuration from the Judgment API.
@@ -499,22 +488,26 @@ class JudgmentClient(metaclass=SingletonMeta):
 
     def assert_test(
         self,
-        examples: List[Example],
         scorers: List[Union[APIJudgmentScorer, JudgevalScorer]],
-
+        examples: Optional[List[Example]] = None,
+        model: Optional[Union[str, List[str], JudgevalJudge]] = "gpt-4.1",
+        test_file: Optional[str] = None,
         aggregator: Optional[str] = None,
         metadata: Optional[Dict[str, Any]] = None,
         log_results: bool = True,
-        project_name: str = "
-        eval_run_name: str =
+        project_name: str = "default_test",
+        eval_run_name: str = str(uuid4()),
         override: bool = False,
-        rules: Optional[List[Rule]] = None
+        rules: Optional[List[Rule]] = None,
+        function: Optional[Callable] = None,
+        tracer: Optional[Union[Tracer, BaseCallbackHandler]] = None
     ) -> None:
         """
         Asserts a test by running the evaluation and checking the results for success
 
         Args:
-            examples (List[Example]): The examples to evaluate
+            examples (Optional[List[Example]]): The examples to evaluate. Must be provided if test_file is not.
+            test_file (Optional[str]): Path to a YAML file containing test examples. Must be provided if examples is not.
             scorers (List[Union[APIJudgmentScorer, JudgevalScorer]]): A list of scorers to use for evaluation
             model (Union[str, List[str], JudgevalJudge]): The model used as a judge when using LLM as a Judge
             aggregator (Optional[str]): The aggregator to use for evaluation if using Mixture of Judges
@@ -525,17 +518,37 @@ class JudgmentClient(metaclass=SingletonMeta):
             override (bool): Whether to override an existing evaluation run with the same name
             rules (Optional[List[Rule]]): Rules to evaluate against scoring results
         """
-
-
-
-
-
-
-
-
-
-
-
-
+        # Validate that exactly one of examples or test_file is provided
+        if (examples is None and test_file is None) or (examples is not None and test_file is not None):
+            raise ValueError("Exactly one of 'examples' or 'test_file' must be provided, but not both")
+
+        if function:
+            results = self.run_sequence_evaluation(
+                examples=examples,
+                scorers=scorers,
+                model=model,
+                aggregator=aggregator,
+                log_results=log_results,
+                project_name=project_name,
+                eval_run_name=eval_run_name,
+                override=override,
+                rules=rules,
+                function=function,
+                tracer=tracer,
+                test_file=test_file
+            )
+        else:
+            results = self.run_evaluation(
+                examples=examples,
+                scorers=scorers,
+                model=model,
+                aggregator=aggregator,
+                metadata=metadata,
+                log_results=log_results,
+                project_name=project_name,
+                eval_run_name=eval_run_name,
+                override=override,
+                rules=rules
+            )
 
         assert_test(results)
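Taken together, the judgment_client.py changes let assert_test accept either in-memory examples or a YAML test file, and route agent runs through run_sequence_evaluation when a function and tracer are supplied. A minimal sketch of how a caller might use the new surface; the agent function my_agent, the tests.yaml path, and the Tracer construction are assumptions, not part of this diff:

```python
from judgeval.judgment_client import JudgmentClient
from judgeval.common.tracer import Tracer
from judgeval.scorers import ToolOrderScorer

client = JudgmentClient()                      # assumes API key/org are configured in the environment
tracer = Tracer(project_name="default_test")   # hypothetical Tracer setup

def my_agent(question: str) -> str:            # hypothetical agent entry point
    ...

# New in 0.0.37: exactly one of test_file/examples must be given, and passing a
# function plus tracer sends the run down the sequence-evaluation path.
client.assert_test(
    scorers=[ToolOrderScorer(threshold=1.0)],
    test_file="tests.yaml",                    # hypothetical path to a YAML test file
    function=my_agent,
    tracer=tracer,
)
```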
judgeval/run_evaluation.py
CHANGED
@@ -4,7 +4,7 @@ import time
 import sys
 import itertools
 import threading
-from typing import List, Dict, Any, Union
+from typing import List, Dict, Any, Union, Optional, Callable
 from datetime import datetime
 from rich import print as rprint
 
@@ -12,7 +12,9 @@ from judgeval.data import (
     ScorerData,
     ScoringResult,
     Example,
-    CustomExample
+    CustomExample,
+    Sequence,
+    Trace
 )
 from judgeval.scorers import (
     JudgevalScorer,
@@ -26,7 +28,8 @@ from judgeval.constants import (
     JUDGMENT_SEQUENCE_EVAL_API_URL,
     JUDGMENT_EVAL_LOG_API_URL,
     MAX_CONCURRENT_EVALUATIONS,
-    JUDGMENT_ADD_TO_RUN_EVAL_QUEUE_API_URL
+    JUDGMENT_ADD_TO_RUN_EVAL_QUEUE_API_URL,
+    JUDGMENT_RETRIEVE_SEQUENCE_FROM_TRACE_API_URL
 )
 from judgeval.common.exceptions import JudgmentAPIError
 from judgeval.common.logger import (
@@ -37,6 +40,8 @@ from judgeval.common.logger import (
 )
 from judgeval.evaluation_run import EvaluationRun
 from judgeval.data.sequence_run import SequenceRun
+from judgeval.common.tracer import Tracer
+from langchain_core.callbacks import BaseCallbackHandler
 
 def send_to_rabbitmq(evaluation_run: EvaluationRun) -> None:
     """
@@ -277,7 +282,7 @@ def check_eval_run_name_exists(eval_name: str, project_name: str, judgment_api_k
         raise JudgmentAPIError(f"Failed to check if eval run name exists: {str(e)}")
 
 
-def log_evaluation_results(
+def log_evaluation_results(scoring_results: List[ScoringResult], run: Union[EvaluationRun, SequenceRun]) -> str:
     """
     Logs evaluation results to the Judgment API database.
 
@@ -298,7 +303,7 @@ def log_evaluation_results(merged_results: List[ScoringResult], run: Union[Evalu
                 "X-Organization-Id": run.organization_id
             },
             json={
-                "results":
+                "results": scoring_results,
                 "run": run.model_dump(warnings=False)
             },
             verify=True
@@ -322,6 +327,51 @@ def log_evaluation_results(merged_results: List[ScoringResult], run: Union[Evalu
         error(f"Failed to save evaluation results to DB: {str(e)}")
         raise ValueError(f"Failed to save evaluation results to DB: {str(e)}")
 
+def retrieve_sequence_from_trace(trace_id: str, parent_span: str, judgment_api_key: str, organization_id: str) -> Sequence:
+    """
+    Retrieves a sequence from a trace ID.
+    """
+    """
+    Logs evaluation results to the Judgment API database.
+
+    Args:
+        merged_results (List[ScoringResult]): The results to log
+        evaluation_run (EvaluationRun): The evaluation run containing project info and API key
+
+    Raises:
+        JudgmentAPIError: If there's an API error during logging
+        ValueError: If there's a validation error with the results
+    """
+    try:
+        res = requests.post(
+            JUDGMENT_RETRIEVE_SEQUENCE_FROM_TRACE_API_URL,
+            headers={
+                "Content-Type": "application/json",
+                "Authorization": f"Bearer {judgment_api_key}",
+                "X-Organization-Id": organization_id
+            },
+            json={
+                "trace_id": trace_id,
+                "trace_span_id": parent_span,
+            },
+            verify=True
+        )
+
+        if not res.ok:
+            response_data = res.json()
+            error_message = response_data.get('detail', 'An unknown error occurred.')
+            error(f"Error {res.status_code}: {error_message}")
+            raise JudgmentAPIError(error_message)
+
+        return Sequence(**res.json())
+    except requests.exceptions.RequestException as e:
+        error(f"Request failed while saving evaluation results to DB: {str(e)}")
+        raise JudgmentAPIError(f"Request failed while saving evaluation results to DB: {str(e)}")
+    except Exception as e:
+        error(f"Failed to save evaluation results to DB: {str(e)}")
+        raise ValueError(f"Failed to save evaluation results to DB: {str(e)}")
+
+
 def run_with_spinner(message: str, func, *args, **kwargs) -> Any:
     """Run a function with a spinner in the terminal."""
     spinner = itertools.cycle(['|', '/', '-', '\\'])
@@ -365,7 +415,7 @@ def check_examples(examples: List[Example], scorers: List[Union[APIJudgmentScore
             if missing_params:
                 print(f"WARNING: Example {example.example_id} is missing the following parameters: {missing_params} for scorer {scorer.score_type.value}")
 
-def run_sequence_eval(sequence_run: SequenceRun, override: bool = False, ignore_errors: bool = True) -> List[ScoringResult]:
+def run_sequence_eval(sequence_run: SequenceRun, override: bool = False, ignore_errors: bool = True, function: Optional[Callable] = None, tracer: Optional[Union[Tracer, BaseCallbackHandler]] = None, examples: Optional[List[Example]] = None) -> List[ScoringResult]:
     # Call endpoint to check to see if eval run name exists (if we DON'T want to override and DO want to log results)
     if not override and sequence_run.log_results and not sequence_run.append:
         check_eval_run_name_exists(
@@ -384,15 +434,32 @@ def run_sequence_eval(sequence_run: SequenceRun, override: bool = False, ignore_
             sequence_run.organization_id,
             True
         )
-
 
+    if function and tracer:
+        new_sequences: List[Sequence] = []
+        for example in examples:
+            if example.input:
+                result = run_with_spinner("Running agent function: ", function, **example.input)
+            else:
+                result = run_with_spinner("Running agent function: ", function)
+        for i, trace in enumerate(tracer.traces):
+            trace_id = trace['trace_id']
+            parent_span = trace['entries'][0]['span_id']
+            new_sequence = retrieve_sequence_from_trace(trace_id, parent_span, sequence_run.judgment_api_key, sequence_run.organization_id)
+            new_sequence.expected_tools = examples[i].expected_tools
+            new_sequences.append(new_sequence)
+        sequence_run.sequences = new_sequences
+
+    for sequence in sequence_run.sequences:
+        sequence.scorers = sequence_run.scorers
+
     # Execute evaluation using Judgment API
     info("Starting API evaluation")
     try: # execute an EvaluationRun with just JudgmentScorers
         debug("Sending request to Judgment API")
         response_data: List[Dict] = run_with_spinner("Running Sequence Evaluation: ", execute_api_sequence_eval, sequence_run)
-
-        info(f"Received {len(
+        scoring_results = [ScoringResult(**result) for result in response_data["results"]]
+        info(f"Received {len(scoring_results)} results from API")
     except JudgmentAPIError as e:
         error(f"An error occurred while executing the Judgment API request: {str(e)}")
         raise JudgmentAPIError(f"An error occurred while executing the Judgment API request: {str(e)}")
@@ -405,6 +472,8 @@ def run_sequence_eval(sequence_run: SequenceRun, override: bool = False, ignore_
     if sequence_run.log_results:
         pretty_str = run_with_spinner("Logging Results: ", log_evaluation_results, response_data["results"], sequence_run)
         rprint(pretty_str)
+
+    return scoring_results
 
 
 
@@ -587,7 +656,7 @@ def run_eval(evaluation_run: EvaluationRun, override: bool = False, ignore_error
     # )
     # print(merged_results)
     if evaluation_run.log_results:
-        send_results = [
+        send_results = [scoring_result.model_dump(warnings=False) for scoring_result in merged_results]
         pretty_str = run_with_spinner("Logging Results: ", log_evaluation_results, send_results, evaluation_run)
         rprint(pretty_str)
 
@@ -613,34 +682,31 @@ def assert_test(scoring_results: List[ScoringResult]) -> None:
 
         # Create a test case context with all relevant fields
         test_case = {
-
-            'actual_output': result.data_object.actual_output,
-            'expected_output': result.data_object.expected_output,
-            'context': result.data_object.context,
-            'retrieval_context': result.data_object.retrieval_context,
-            'additional_metadata': result.data_object.additional_metadata,
-            'tools_called': result.data_object.tools_called,
-            'expected_tools': result.data_object.expected_tools,
-            'failed_scorers': []
+            "failed_scorers": []
         }
         if result.scorers_data:
            # If the result was not successful, check each scorer_data
            for scorer_data in result.scorers_data:
                if not scorer_data.success:
+                    if scorer_data.name == "Tool Order":
+                        # Remove threshold, evaluation model for Tool Order scorer
+                        scorer_data.threshold = None
+                        scorer_data.evaluation_model = None
                    test_case['failed_scorers'].append(scorer_data)
            failed_cases.append(test_case)
 
    if failed_cases:
+
        error_msg = f"The following test cases failed: \n"
        for fail_case in failed_cases:
-            error_msg += f"\nInput: {fail_case['input']}\n"
-            error_msg += f"Actual Output: {fail_case['actual_output']}\n"
-            error_msg += f"Expected Output: {fail_case['expected_output']}\n"
-            error_msg += f"Context: {fail_case['context']}\n"
-            error_msg += f"Retrieval Context: {fail_case['retrieval_context']}\n"
-            error_msg += f"Additional Metadata: {fail_case['additional_metadata']}\n"
-            error_msg += f"Tools Called: {fail_case['tools_called']}\n"
-            error_msg += f"Expected Tools: {fail_case['expected_tools']}\n"
+            # error_msg += f"\nInput: {fail_case['input']}\n"
+            # error_msg += f"Actual Output: {fail_case['actual_output']}\n"
+            # error_msg += f"Expected Output: {fail_case['expected_output']}\n"
+            # error_msg += f"Context: {fail_case['context']}\n"
+            # error_msg += f"Retrieval Context: {fail_case['retrieval_context']}\n"
+            # error_msg += f"Additional Metadata: {fail_case['additional_metadata']}\n"
+            # error_msg += f"Tools Called: {fail_case['tools_called']}\n"
+            # error_msg += f"Expected Tools: {fail_case['expected_tools']}\n"
 
            for fail_scorer in fail_case['failed_scorers']:
 
@@ -658,6 +724,37 @@ def assert_test(scoring_results: List[ScoringResult]) -> None:
                    f"Additional Metadata: {fail_scorer.additional_metadata}\n"
                )
                error_msg += "-"*100
-
-
+
+    total_tests = len(scoring_results)
+    failed_tests = len(failed_cases)
+    passed_tests = total_tests - failed_tests
+
+    # Print summary with colors
+    rprint("\n" + "="*80)
+    if failed_tests == 0:
+        rprint(f"[bold green]🎉 ALL TESTS PASSED! {passed_tests}/{total_tests} tests successful[/bold green]")
+    else:
+        rprint(f"[bold red]⚠️ TEST RESULTS: {passed_tests}/{total_tests} passed ({failed_tests} failed)[/bold red]")
+    rprint("="*80 + "\n")
+
+    # Print individual test cases
+    for i, result in enumerate(scoring_results):
+        test_num = i + 1
+        if result.success:
+            rprint(f"[green]✓ Test {test_num}: PASSED[/green]")
+        else:
+            rprint(f"[red]✗ Test {test_num}: FAILED[/red]")
+            if result.scorers_data:
+                for scorer_data in result.scorers_data:
+                    if not scorer_data.success:
+                        rprint(f"  [yellow]Scorer: {scorer_data.name}[/yellow]")
+                        rprint(f"  [red]  Score: {scorer_data.score}[/red]")
+                        rprint(f"  [red]  Reason: {scorer_data.reason}[/red]")
+                        if scorer_data.error:
+                            rprint(f"  [red]  Error: {scorer_data.error}[/red]")
+                        rprint("  " + "-"*40)
+
+    rprint("\n" + "="*80)
+    if failed_tests > 0:
+        raise AssertionError(failed_cases)
 
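The new function-and-tracer path in run_sequence_eval pairs each recorded trace with the example at the same index: the agent function is run once per example, every trace in tracer.traces is then resolved into a Sequence through the new retrieve-sequence-from-trace endpoint, and that sequence inherits the matching example's expected_tools before scoring. A simplified, standalone sketch of that pairing; retrieve_sequence_from_trace is stubbed out via the fetch_sequence parameter, and the argument types are loosened for illustration:

```python
from typing import Any, Callable, List

def build_sequences(examples: List[Any], tracer: Any, function: Callable,
                    fetch_sequence: Callable[[str, str], Any]) -> List[Any]:
    """Mirrors the 0.0.37 flow: run the agent per example, then map traces back to examples by index."""
    for example in examples:
        # example.input is treated as a dict of keyword arguments for the agent function
        function(**example.input) if example.input else function()

    sequences = []
    for i, trace in enumerate(tracer.traces):
        trace_id = trace["trace_id"]
        parent_span = trace["entries"][0]["span_id"]      # root span of the recorded trace
        sequence = fetch_sequence(trace_id, parent_span)  # wraps the retrieve-from-trace API call
        sequence.expected_tools = examples[i].expected_tools
        sequences.append(sequence)
    return sequences
```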
judgeval/scorers/__init__.py
CHANGED
@@ -16,6 +16,7 @@ from judgeval.scorers.judgeval_scorers.api_scorers import (
     InstructionAdherenceScorer,
     GroundednessScorer,
     DerailmentScorer,
+    ToolOrderScorer,
 )
 from judgeval.scorers.judgeval_scorers.classifiers import (
     Text2SQLScorer,
@@ -41,4 +42,5 @@ __all__ = [
     "InstructionAdherenceScorer",
     "GroundednessScorer",
     "DerailmentScorer",
+    "ToolOrderScorer",
 ]
judgeval/scorers/judgeval_scorers/api_scorers/__init__.py
CHANGED
@@ -12,6 +12,7 @@ from judgeval.scorers.judgeval_scorers.api_scorers.comparison import ComparisonS
 from judgeval.scorers.judgeval_scorers.api_scorers.instruction_adherence import InstructionAdherenceScorer
 from judgeval.scorers.judgeval_scorers.api_scorers.groundedness import GroundednessScorer
 from judgeval.scorers.judgeval_scorers.api_scorers.derailment_scorer import DerailmentScorer
+from judgeval.scorers.judgeval_scorers.api_scorers.tool_order import ToolOrderScorer
 __all__ = [
     "ExecutionOrderScorer",
     "JSONCorrectnessScorer",
@@ -27,4 +28,5 @@ __all__ = [
     "InstructionAdherenceScorer",
     "GroundednessScorer",
     "DerailmentScorer",
+    "ToolOrderScorer",
 ]
judgeval/scorers/judgeval_scorers/api_scorers/tool_order.py
ADDED
@@ -0,0 +1,18 @@
+"""
+`judgeval` tool order scorer
+"""
+
+# Internal imports
+from judgeval.scorers.api_scorer import APIJudgmentScorer
+from judgeval.constants import APIScorer
+
+class ToolOrderScorer(APIJudgmentScorer):
+    def __init__(self, threshold: float=1.0):
+        super().__init__(
+            threshold=threshold,
+            score_type=APIScorer.TOOL_ORDER,
+        )
+
+    @property
+    def __name__(self):
+        return "Tool Order"
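ToolOrderScorer is a thin wrapper around the server-side APIScorer.TOOL_ORDER score type, so the actual comparison of called versus expected tools happens in the Judgment API. A minimal sketch of attaching it to an example; the expected_tools shape follows the YAML format documented in data_utils.py below, while the tool names and the dict-shaped input are hypothetical:

```python
from judgeval.data import Example
from judgeval.scorers import ToolOrderScorer

# threshold defaults to 1.0 in the constructor shown in the diff above
scorer = ToolOrderScorer()

example = Example(
    input={"question": "What is the weather in Boston?"},   # assumed kwargs for an agent function
    expected_tools=[
        {"tool_name": "search_weather", "parameters": {"city": "Boston"}},  # hypothetical tools
        {"tool_name": "format_answer", "parameters": {}},
    ],
)
```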
judgeval/scorers/score.py
CHANGED
@@ -243,7 +243,7 @@ async def score_with_indicator(
 async def a_execute_scoring(
     examples: Union[List[Example], List[CustomExample]],
     scorers: List[JudgevalScorer],
-    model: Optional[Union[str, List[str], JudgevalJudge]] =
+    model: Optional[Union[str, List[str], JudgevalJudge]] = "gpt-4.1",
     ignore_errors: bool = True,
     skip_on_missing_params: bool = True,
     show_indicator: bool = True,
judgeval/utils/data_utils.py
ADDED
@@ -0,0 +1,57 @@
+import yaml
+from judgeval.common.logger import (
+    debug,
+    info,
+    error,
+    example_logging_context
+)
+
+from judgeval.data import Example
+
+
+def add_from_yaml(file_path: str) -> None:
+    debug(f"Loading dataset from YAML file: {file_path}")
+    """
+    Adds examples from a YAML file.
+
+    The format of the YAML file is expected to be a dictionary with one key: "examples".
+    The value of the key is a list of dictionaries, where each dictionary represents an example.
+
+    The YAML file is expected to have the following format:
+    examples:
+      - input: "test input"
+        actual_output: "test output"
+        expected_output: "expected output"
+        context:
+          - "context1"
+          - "context2"
+        retrieval_context:
+          - "retrieval1"
+        additional_metadata:
+          key: "value"
+        tools_called:
+          - "tool1"
+        expected_tools:
+          - {tool_name: "tool1", parameters: {"query": "test query 1"}}
+          - {tool_name: "tool2", parameters: {"query": "test query 2"}}
+        name: "test example"
+        example_id: null
+        timestamp: "20241230_160117"
+        trace_id: "123"
+    """
+    try:
+        with open(file_path, "r") as file:
+            payload = yaml.safe_load(file)
+            if payload is None:
+                raise ValueError("The YAML file is empty.")
+            examples = payload.get("examples", [])
+    except FileNotFoundError:
+        error(f"YAML file not found: {file_path}")
+        raise FileNotFoundError(f"The file {file_path} was not found.")
+    except yaml.YAMLError:
+        error(f"Invalid YAML file: {file_path}")
+        raise ValueError(f"The file {file_path} is not a valid YAML file.")
+
+    info(f"Added {len(examples)} examples from YAML")
+    new_examples = [Example(**e) for e in examples]
+    return new_examples