judgeval 0.0.36__py3-none-any.whl → 0.0.38__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- judgeval/common/tracer.py +663 -1105
- judgeval/common/utils.py +19 -1
- judgeval/constants.py +3 -3
- judgeval/data/__init__.py +4 -2
- judgeval/data/datasets/dataset.py +2 -11
- judgeval/data/datasets/eval_dataset_client.py +1 -62
- judgeval/data/example.py +29 -8
- judgeval/data/result.py +3 -3
- judgeval/data/trace.py +132 -0
- judgeval/data/{sequence_run.py → trace_run.py} +7 -6
- judgeval/evaluation_run.py +2 -2
- judgeval/integrations/langgraph.py +189 -1769
- judgeval/judges/litellm_judge.py +1 -1
- judgeval/judges/mixture_of_judges.py +1 -1
- judgeval/judges/utils.py +1 -1
- judgeval/judgment_client.py +85 -78
- judgeval/run_evaluation.py +98 -51
- judgeval/scorers/__init__.py +2 -0
- judgeval/scorers/judgeval_scorers/api_scorers/__init__.py +2 -0
- judgeval/scorers/judgeval_scorers/api_scorers/tool_order.py +20 -0
- judgeval/scorers/score.py +1 -1
- judgeval/utils/data_utils.py +57 -0
- judgeval-0.0.38.dist-info/METADATA +247 -0
- {judgeval-0.0.36.dist-info → judgeval-0.0.38.dist-info}/RECORD +26 -24
- judgeval/data/sequence.py +0 -49
- judgeval-0.0.36.dist-info/METADATA +0 -169
- {judgeval-0.0.36.dist-info → judgeval-0.0.38.dist-info}/WHEEL +0 -0
- {judgeval-0.0.36.dist-info → judgeval-0.0.38.dist-info}/licenses/LICENSE.md +0 -0
judgeval/judges/litellm_judge.py
CHANGED

@@ -12,7 +12,7 @@ BASE_CONVERSATION = [
 
 
 class LiteLLMJudge(JudgevalJudge):
-    def __init__(self, model: str = "gpt-…
+    def __init__(self, model: str = "gpt-4.1-mini", **kwargs):
         debug(f"Initializing LiteLLMJudge with model={model}")
         self.model = model
         self.kwargs = kwargs

judgeval/judges/mixture_of_judges.py
CHANGED

@@ -136,7 +136,7 @@ class MixtureOfJudges(JudgevalJudge):
     """
     def __init__(self,
                  models: List[str] = ['QWEN', 'LLAMA3_70B_INSTRUCT_TURBO', 'MISTRAL_8x22B_INSTRUCT'],
-                 aggregator: str = 'gpt-…
+                 aggregator: str = 'gpt-4.1',
                  **kwargs):
         """
         `models` are the individual judge models to be used for generating responses.
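Both judge classes now default to the GPT-4.1 family. A minimal sketch of what the new defaults mean at call sites (import paths follow the module layout in the file list above; only the defaults change in this diff, so the snippet assumes the constructors otherwise behave as before):

```python
from judgeval.judges.litellm_judge import LiteLLMJudge
from judgeval.judges.mixture_of_judges import MixtureOfJudges

# As of 0.0.38 these pick up the new defaults shown in the hunks above.
single_judge = LiteLLMJudge()    # model="gpt-4.1-mini"
mixture = MixtureOfJudges()      # aggregator='gpt-4.1', same default judge list
```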
judgeval/judges/utils.py
CHANGED

@@ -23,7 +23,7 @@ def create_judge(
     If no model is provided, uses GPT4o as the default judge.
     """
     if model is None:  # default option
-        return LiteLLMJudge(model="gpt-…
+        return LiteLLMJudge(model="gpt-4.1"), True
     if not isinstance(model, (str, list, JudgevalJudge)):
         raise InvalidJudgeModelError(f"Model must be a string, list of strings, or a judgeval judge object. Got: {type(model)} instead.")
     # If model is already a valid judge type, return it and mark native
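For the judge factory, the only observable change is the default model string handed to `LiteLLMJudge`. A short sketch, assuming `model` remains an optional parameter of `create_judge` (its full signature is not visible in this hunk):

```python
from judgeval.judges.utils import create_judge

# With no model supplied, 0.0.38 returns a LiteLLMJudge on "gpt-4.1"
# (the previous default is truncated in this diff view).
judge, is_native = create_judge()
print(type(judge).__name__, judge.model, is_native)   # LiteLLMJudge gpt-4.1 True
```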
judgeval/judgment_client.py
CHANGED

@@ -2,7 +2,8 @@
 Implements the JudgmentClient to interact with the Judgment API.
 """
 import os
-from …
+from uuid import uuid4
+from typing import Optional, List, Dict, Any, Union, Callable
 import requests
 
 from judgeval.constants import ROOT_API
@@ -11,7 +12,7 @@ from judgeval.data import (
     ScoringResult,
     Example,
     CustomExample,
-    …
+    Trace,
 )
 from judgeval.scorers import (
     APIJudgmentScorer,
@@ -22,9 +23,9 @@ from judgeval.evaluation_run import EvaluationRun
 from judgeval.run_evaluation import (
     run_eval,
     assert_test,
-    …
+    run_trace_eval
 )
-from judgeval.data.…
+from judgeval.data.trace_run import TraceRun
 from judgeval.judges import JudgevalJudge
 from judgeval.constants import (
     JUDGMENT_EVAL_FETCH_API_URL,
@@ -33,7 +34,11 @@ from judgeval.constants import (
     JUDGMENT_PROJECT_DELETE_API_URL,
     JUDGMENT_PROJECT_CREATE_API_URL
 )
+from judgeval.utils.data_utils import add_from_yaml
 from judgeval.common.exceptions import JudgmentAPIError
+from langchain_core.callbacks import BaseCallbackHandler
+from judgeval.common.tracer import Tracer
+from judgeval.common.utils import validate_api_key
 from pydantic import BaseModel
 from judgeval.rules import Rule
 
@@ -63,7 +68,7 @@ class JudgmentClient(metaclass=SingletonMeta):
         self.eval_dataset_client = EvalDatasetClient(judgment_api_key, organization_id)
 
         # Verify API key is valid
-        result, response = …
+        result, response = validate_api_key(judgment_api_key)
         if not result:
             # May be bad to output their invalid API key...
             raise JudgmentAPIError(f"Issue with passed in Judgment API key: {response}")
@@ -74,7 +79,7 @@ class JudgmentClient(metaclass=SingletonMeta):
         self,
         examples: List[Example],
         scorers: List[Union[APIJudgmentScorer, JudgevalScorer]],
-        model: Union[str, List[str], JudgevalJudge],
+        model: Optional[Union[str, List[str], JudgevalJudge]] = "gpt-4.1",
         aggregator: Optional[str] = None,
         metadata: Optional[Dict[str, Any]] = None,
         log_results: bool = True,
@@ -100,54 +105,56 @@ class JudgmentClient(metaclass=SingletonMeta):
             rules=rules
         )
 
-    def …
+    def run_trace_evaluation(
         self,
-        sequences: List[Sequence],
-        model: Union[str, List[str], JudgevalJudge],
         scorers: List[Union[APIJudgmentScorer, JudgevalScorer]],
+        model: Optional[Union[str, List[str], JudgevalJudge]] = "gpt-4.1",
+        traces: Optional[List[Trace]] = None,
+        examples: Optional[List[Example]] = None,
+        test_file: Optional[str] = None,
         aggregator: Optional[str] = None,
         project_name: str = "default_project",
-        eval_run_name: str = "…
+        eval_run_name: str = "default_eval_trace",
         log_results: bool = True,
         append: bool = False,
         override: bool = False,
         ignore_errors: bool = True,
-        rules: Optional[List[Rule]] = None
+        rules: Optional[List[Rule]] = None,
+        function: Optional[Callable] = None,
+        tracer: Optional[Union[Tracer, BaseCallbackHandler]] = None
     ) -> List[ScoringResult]:
-        try:
-            … (12 removed lines truncated in the source view)
-            flattened.extend(get_all_sequences(seq))
-            return flattened
+        try:
+
+            if test_file:
+                try:
+                    examples = add_from_yaml(test_file)
+                except FileNotFoundError:
+                    raise FileNotFoundError(f"Test file not found: {test_file}")
+
+            if examples and not function:
+                raise ValueError("Cannot pass in examples without a function")
+
+            if traces and function:
+                raise ValueError("Cannot pass in traces and function")
 
-            … (4 removed lines truncated in the source view)
-            sequence_run = SequenceRun(
+            if examples and traces:
+                raise ValueError("Cannot pass in both examples and traces")
+
+            trace_run = TraceRun(
                 project_name=project_name,
                 eval_name=eval_run_name,
-                …
+                traces=traces,
+                scorers=scorers,
                 model=model,
                 aggregator=aggregator,
                 log_results=log_results,
                 append=append,
                 judgment_api_key=self.judgment_api_key,
-                organization_id=self.organization_id
+                organization_id=self.organization_id,
             )
-            return …
+            return run_trace_eval(trace_run, override, ignore_errors, function, tracer, examples)
         except ValueError as e:
-            raise ValueError(f"Please check your …
+            raise ValueError(f"Please check your TraceRun object, one or more fields are invalid: \n{str(e)}")
         except Exception as e:
             raise Exception(f"An unexpected error occurred during evaluation: {str(e)}")
 
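`run_trace_evaluation` replaces the old sequence-based entry point and accepts either pre-captured `traces` or `examples` plus an agent `function` and a `tracer` (the validation above enforces the either/or). A hedged usage sketch based only on the signature shown: the agent, example fields, and tracer wiring are illustrative, credentials are assumed to be configured for `JudgmentClient`, and `example.input` is expanded into keyword arguments of the function (see `run_trace_eval` below).

```python
from judgeval.judgment_client import JudgmentClient
from judgeval.data import Example
from judgeval.scorers import ToolOrderScorer

client = JudgmentClient()   # assumes API key / organization are already configured

def my_agent(question: str) -> str:
    # Hypothetical agent under test; it should be instrumented with `tracer`
    # so that each call produces an entry in tracer.traces.
    ...

tracer = ...  # placeholder: a judgeval Tracer or LangChain BaseCallbackHandler wired into my_agent

examples = [
    Example(
        input={"question": "What is the weather in Paris?"},  # splatted as **example.input
        expected_tools=["search", "summarize"],               # assumed field usage; copied onto the root trace span
    )
]

results = client.run_trace_evaluation(
    scorers=[ToolOrderScorer()],
    examples=examples,
    function=my_agent,
    tracer=tracer,
    project_name="default_project",
    eval_run_name="default_eval_trace",
)
```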
@@ -155,7 +162,7 @@ class JudgmentClient(metaclass=SingletonMeta):
         self,
         examples: Union[List[Example], List[CustomExample]],
         scorers: List[Union[APIJudgmentScorer, JudgevalScorer]],
-        model: Union[str, List[str], JudgevalJudge],
+        model: Optional[Union[str, List[str], JudgevalJudge]] = "gpt-4.1",
         aggregator: Optional[str] = None,
         metadata: Optional[Dict[str, Any]] = None,
         log_results: bool = True,
@@ -238,12 +245,6 @@ class JudgmentClient(metaclass=SingletonMeta):
         """
         return self.eval_dataset_client.append_examples(alias, examples, project_name)
 
-    def append_sequence_dataset(self, alias: str, sequences: List[Sequence], project_name: str) -> bool:
-        """
-        Appends a `Sequence` to the Judgment platform for storage.
-        """
-        return self.eval_dataset_client.append_sequences(alias, sequences, project_name)
-
     def pull_dataset(self, alias: str, project_name: str) -> EvalDataset:
         """
         Retrieves a saved `EvalDataset` from the Judgment platform.
@@ -396,24 +397,6 @@ class JudgmentClient(metaclass=SingletonMeta):
             raise ValueError(f"Error deleting project: {response.json()}")
         return response.json()
 
-    def _validate_api_key(self):
-        """
-        Validates that the user api key is valid
-        """
-        response = requests.post(
-            f"{ROOT_API}/validate_api_key/",
-            headers={
-                "Content-Type": "application/json",
-                "Authorization": f"Bearer {self.judgment_api_key}",
-            },
-            json={},  # Empty body now
-            verify=True
-        )
-        if response.status_code == 200:
-            return True, response.json()
-        else:
-            return False, response.json().get("detail", "Error validating API key")
-
     def fetch_classifier_scorer(self, slug: str) -> ClassifierScorer:
         """
         Fetches a classifier scorer configuration from the Judgment API.
@@ -499,22 +482,26 @@ class JudgmentClient(metaclass=SingletonMeta):
 
     def assert_test(
         self,
-        examples: List[Example],
         scorers: List[Union[APIJudgmentScorer, JudgevalScorer]],
-        …
+        examples: Optional[List[Example]] = None,
+        model: Optional[Union[str, List[str], JudgevalJudge]] = "gpt-4.1",
+        test_file: Optional[str] = None,
         aggregator: Optional[str] = None,
         metadata: Optional[Dict[str, Any]] = None,
         log_results: bool = True,
-        project_name: str = "…
-        eval_run_name: str = …
+        project_name: str = "default_test",
+        eval_run_name: str = str(uuid4()),
         override: bool = False,
-        rules: Optional[List[Rule]] = None
+        rules: Optional[List[Rule]] = None,
+        function: Optional[Callable] = None,
+        tracer: Optional[Union[Tracer, BaseCallbackHandler]] = None
     ) -> None:
         """
         Asserts a test by running the evaluation and checking the results for success
 
         Args:
-            examples (List[Example]): The examples to evaluate
+            examples (Optional[List[Example]]): The examples to evaluate. Must be provided if test_file is not.
+            test_file (Optional[str]): Path to a YAML file containing test examples. Must be provided if examples is not.
             scorers (List[Union[APIJudgmentScorer, JudgevalScorer]]): A list of scorers to use for evaluation
             model (Union[str, List[str], JudgevalJudge]): The model used as a judge when using LLM as a Judge
             aggregator (Optional[str]): The aggregator to use for evaluation if using Mixture of Judges
@@ -525,17 +512,37 @@ class JudgmentClient(metaclass=SingletonMeta):
             override (bool): Whether to override an existing evaluation run with the same name
             rules (Optional[List[Rule]]): Rules to evaluate against scoring results
         """
-        … (12 removed lines truncated in the source view)
+        # Validate that exactly one of examples or test_file is provided
+        if (examples is None and test_file is None) or (examples is not None and test_file is not None):
+            raise ValueError("Exactly one of 'examples' or 'test_file' must be provided, but not both")
+
+        if function:
+            results = self.run_trace_evaluation(
+                examples=examples,
+                scorers=scorers,
+                model=model,
+                aggregator=aggregator,
+                log_results=log_results,
+                project_name=project_name,
+                eval_run_name=eval_run_name,
+                override=override,
+                rules=rules,
+                function=function,
+                tracer=tracer,
+                test_file=test_file
+            )
+        else:
+            results = self.run_evaluation(
+                examples=examples,
+                scorers=scorers,
+                model=model,
+                aggregator=aggregator,
+                metadata=metadata,
+                log_results=log_results,
+                project_name=project_name,
+                eval_run_name=eval_run_name,
+                override=override,
+                rules=rules
+            )
 
         assert_test(results)
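`assert_test` now routes to `run_trace_evaluation` whenever a `function` is supplied and otherwise to `run_evaluation`, and it requires exactly one of `examples` or `test_file`. A sketch of both paths; the YAML schema is defined by `add_from_yaml` in `judgeval/utils/data_utils.py` (not shown in this diff), so the file path, agent, tracer, examples, and scorers here are placeholders:

```python
from judgeval.judgment_client import JudgmentClient

client = JudgmentClient()

my_agent = ...     # instrumented agent function under test (hypothetical)
tracer = ...       # judgeval Tracer / LangChain callback handler used by my_agent
my_examples = ...  # a List[Example] built elsewhere
my_scorers = ...   # any list of APIJudgmentScorer / JudgevalScorer instances

# Path 1: examples loaded from a YAML test file, scored over captured traces.
client.assert_test(
    scorers=my_scorers,
    test_file="tests/agent_cases.yaml",   # hypothetical path
    function=my_agent,                    # presence of `function` routes through run_trace_evaluation
    tracer=tracer,
)

# Path 2: in-memory examples with no agent function, routed through run_evaluation.
client.assert_test(
    scorers=my_scorers,
    examples=my_examples,
)
```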
judgeval/run_evaluation.py
CHANGED

@@ -4,7 +4,7 @@ import time
 import sys
 import itertools
 import threading
-from typing import List, Dict, Any, Union
+from typing import List, Dict, Any, Union, Optional, Callable
 from datetime import datetime
 from rich import print as rprint
 
@@ -12,7 +12,8 @@ from judgeval.data import (
     ScorerData,
     ScoringResult,
     Example,
-    CustomExample
+    CustomExample,
+    Trace
 )
 from judgeval.scorers import (
     JudgevalScorer,
@@ -23,10 +24,10 @@ from judgeval.scorers.score import a_execute_scoring
 from judgeval.constants import (
     ROOT_API,
     JUDGMENT_EVAL_API_URL,
-    …
+    JUDGMENT_TRACE_EVAL_API_URL,
     JUDGMENT_EVAL_LOG_API_URL,
     MAX_CONCURRENT_EVALUATIONS,
-    JUDGMENT_ADD_TO_RUN_EVAL_QUEUE_API_URL
+    JUDGMENT_ADD_TO_RUN_EVAL_QUEUE_API_URL,
 )
 from judgeval.common.exceptions import JudgmentAPIError
 from judgeval.common.logger import (
@@ -36,7 +37,9 @@ from judgeval.common.logger import (
     example_logging_context
 )
 from judgeval.evaluation_run import EvaluationRun
-from judgeval.data.…
+from judgeval.data.trace_run import TraceRun
+from judgeval.common.tracer import Tracer
+from langchain_core.callbacks import BaseCallbackHandler
 
 def send_to_rabbitmq(evaluation_run: EvaluationRun) -> None:
     """
@@ -93,20 +96,20 @@ def execute_api_eval(evaluation_run: EvaluationRun) -> List[Dict]:
         raise JudgmentAPIError(error_message)
     return response_data
 
-def …
+def execute_api_trace_eval(trace_run: TraceRun) -> List[Dict]:
     """
     Executes an evaluation of a list of `Example`s using one or more `JudgmentScorer`s via the Judgment API.
     """
 
     try:
         # submit API request to execute evals
-        payload = …
+        payload = trace_run.model_dump(warnings=False)
         response = requests.post(
-            …
+            JUDGMENT_TRACE_EVAL_API_URL,
             headers={
                 "Content-Type": "application/json",
-                "Authorization": f"Bearer {…
-                "X-Organization-Id": …
+                "Authorization": f"Bearer {trace_run.judgment_api_key}",
+                "X-Organization-Id": trace_run.organization_id
            },
            json=payload,
            verify=True
@@ -277,7 +280,7 @@ def check_eval_run_name_exists(eval_name: str, project_name: str, judgment_api_k
         raise JudgmentAPIError(f"Failed to check if eval run name exists: {str(e)}")
 
 
-def log_evaluation_results(…
+def log_evaluation_results(scoring_results: List[ScoringResult], run: Union[EvaluationRun, TraceRun]) -> str:
     """
     Logs evaluation results to the Judgment API database.
 
@@ -298,7 +301,7 @@ def log_evaluation_results(merged_results: List[ScoringResult], run: Union[Evalu
                 "X-Organization-Id": run.organization_id
             },
             json={
-                "results": …
+                "results": scoring_results,
                 "run": run.model_dump(warnings=False)
             },
             verify=True
@@ -365,46 +368,62 @@ def check_examples(examples: List[Example], scorers: List[Union[APIJudgmentScore
             if missing_params:
                 print(f"WARNING: Example {example.example_id} is missing the following parameters: {missing_params} for scorer {scorer.score_type.value}")
 
-def …
+def run_trace_eval(trace_run: TraceRun, override: bool = False, ignore_errors: bool = True, function: Optional[Callable] = None, tracer: Optional[Union[Tracer, BaseCallbackHandler]] = None, examples: Optional[List[Example]] = None) -> List[ScoringResult]:
     # Call endpoint to check to see if eval run name exists (if we DON'T want to override and DO want to log results)
-    if not override and …
+    if not override and trace_run.log_results and not trace_run.append:
         check_eval_run_name_exists(
-            … (4 removed lines truncated in the source view)
+            trace_run.eval_name,
+            trace_run.project_name,
+            trace_run.judgment_api_key,
+            trace_run.organization_id
        )
 
-    if …
+    if trace_run.append:
         # Check that the current experiment, if one exists, has the same type (examples of sequences)
         check_experiment_type(
-            … (4 removed lines truncated in the source view)
+            trace_run.eval_name,
+            trace_run.project_name,
+            trace_run.judgment_api_key,
+            trace_run.organization_id,
             True
         )
-    …
 
+    if function and tracer:
+        new_traces: List[Trace] = []
+        tracer.offline_mode = True
+        for example in examples:
+            if example.input:
+                result = run_with_spinner("Running agent function: ", function, **example.input)
+            else:
+                result = run_with_spinner("Running agent function: ", function)
+        for i, trace in enumerate(tracer.traces):
+            # We set the root-level trace span with the expected tools of the Trace
+            trace = Trace(**trace)
+            trace.entries[0].expected_tools = examples[i].expected_tools
+            new_traces.append(trace)
+        trace_run.traces = new_traces
+
     # Execute evaluation using Judgment API
     info("Starting API evaluation")
     try:  # execute an EvaluationRun with just JudgmentScorers
         debug("Sending request to Judgment API")
-        response_data: List[Dict] = run_with_spinner("Running …
-        …
-        info(f"Received {len(…
+        response_data: List[Dict] = run_with_spinner("Running Trace Evaluation: ", execute_api_trace_eval, trace_run)
+        scoring_results = [ScoringResult(**result) for result in response_data["results"]]
+        info(f"Received {len(scoring_results)} results from API")
     except JudgmentAPIError as e:
         error(f"An error occurred while executing the Judgment API request: {str(e)}")
         raise JudgmentAPIError(f"An error occurred while executing the Judgment API request: {str(e)}")
     except ValueError as e:
-        raise ValueError(f"Please check your …
+        raise ValueError(f"Please check your TraceRun object, one or more fields are invalid: {str(e)}")
 
     # Convert the response data to `ScoringResult` objects
     debug("Processing API results")
-    # TODO: allow for custom scorer on …
-    if …
-        pretty_str = run_with_spinner("Logging Results: ", log_evaluation_results, response_data["results"], …
+    # TODO: allow for custom scorer on traces
+    if trace_run.log_results:
+        pretty_str = run_with_spinner("Logging Results: ", log_evaluation_results, response_data["results"], trace_run)
         rprint(pretty_str)
+
+    return scoring_results
 
 
 
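The `function and tracer` branch above is the piece that turns plain `Example`s into scorable `Trace`s: each example's `input` dict is splatted into the agent function, and the captured trace's root span inherits the example's `expected_tools`. A standalone sketch of just that pairing step, assuming `tracer.traces` holds dicts that `Trace(**raw)` can parse and that trace *i* corresponds to example *i*:

```python
from typing import Dict, List

from judgeval.data import Example, Trace


def attach_expected_tools(raw_traces: List[Dict], examples: List[Example]) -> List[Trace]:
    """Mirror of the pairing loop in run_trace_eval: re-hydrate each captured
    trace and copy the matching example's expected_tools onto its root span."""
    paired: List[Trace] = []
    for i, raw in enumerate(raw_traces):
        trace = Trace(**raw)
        trace.entries[0].expected_tools = examples[i].expected_tools
        paired.append(trace)
    return paired
```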
@@ -587,7 +606,7 @@ def run_eval(evaluation_run: EvaluationRun, override: bool = False, ignore_error
     # )
     # print(merged_results)
     if evaluation_run.log_results:
-        send_results = […
+        send_results = [scoring_result.model_dump(warnings=False) for scoring_result in merged_results]
         pretty_str = run_with_spinner("Logging Results: ", log_evaluation_results, send_results, evaluation_run)
         rprint(pretty_str)
 
@@ -613,34 +632,31 @@ def assert_test(scoring_results: List[ScoringResult]) -> None:
 
             # Create a test case context with all relevant fields
             test_case = {
-                …
-                'actual_output': result.data_object.actual_output,
-                'expected_output': result.data_object.expected_output,
-                'context': result.data_object.context,
-                'retrieval_context': result.data_object.retrieval_context,
-                'additional_metadata': result.data_object.additional_metadata,
-                'tools_called': result.data_object.tools_called,
-                'expected_tools': result.data_object.expected_tools,
-                'failed_scorers': []
+                "failed_scorers": []
             }
             if result.scorers_data:
                 # If the result was not successful, check each scorer_data
                 for scorer_data in result.scorers_data:
                     if not scorer_data.success:
+                        if scorer_data.name == "Tool Order":
+                            # Remove threshold, evaluation model for Tool Order scorer
+                            scorer_data.threshold = None
+                            scorer_data.evaluation_model = None
                         test_case['failed_scorers'].append(scorer_data)
             failed_cases.append(test_case)
 
     if failed_cases:
+
         error_msg = f"The following test cases failed: \n"
         for fail_case in failed_cases:
-            error_msg += f"\nInput: {fail_case['input']}\n"
-            error_msg += f"Actual Output: {fail_case['actual_output']}\n"
-            error_msg += f"Expected Output: {fail_case['expected_output']}\n"
-            error_msg += f"Context: {fail_case['context']}\n"
-            error_msg += f"Retrieval Context: {fail_case['retrieval_context']}\n"
-            error_msg += f"Additional Metadata: {fail_case['additional_metadata']}\n"
-            error_msg += f"Tools Called: {fail_case['tools_called']}\n"
-            error_msg += f"Expected Tools: {fail_case['expected_tools']}\n"
+            # error_msg += f"\nInput: {fail_case['input']}\n"
+            # error_msg += f"Actual Output: {fail_case['actual_output']}\n"
+            # error_msg += f"Expected Output: {fail_case['expected_output']}\n"
+            # error_msg += f"Context: {fail_case['context']}\n"
+            # error_msg += f"Retrieval Context: {fail_case['retrieval_context']}\n"
+            # error_msg += f"Additional Metadata: {fail_case['additional_metadata']}\n"
+            # error_msg += f"Tools Called: {fail_case['tools_called']}\n"
+            # error_msg += f"Expected Tools: {fail_case['expected_tools']}\n"
 
             for fail_scorer in fail_case['failed_scorers']:
 
@@ -658,6 +674,37 @@ def assert_test(scoring_results: List[ScoringResult]) -> None:
                     f"Additional Metadata: {fail_scorer.additional_metadata}\n"
                 )
                 error_msg += "-"*100
-    … (2 removed lines truncated in the source view)
+
+    total_tests = len(scoring_results)
+    failed_tests = len(failed_cases)
+    passed_tests = total_tests - failed_tests
+
+    # Print summary with colors
+    rprint("\n" + "="*80)
+    if failed_tests == 0:
+        rprint(f"[bold green]🎉 ALL TESTS PASSED! {passed_tests}/{total_tests} tests successful[/bold green]")
+    else:
+        rprint(f"[bold red]⚠️ TEST RESULTS: {passed_tests}/{total_tests} passed ({failed_tests} failed)[/bold red]")
+    rprint("="*80 + "\n")
+
+    # Print individual test cases
+    for i, result in enumerate(scoring_results):
+        test_num = i + 1
+        if result.success:
+            rprint(f"[green]✓ Test {test_num}: PASSED[/green]")
+        else:
+            rprint(f"[red]✗ Test {test_num}: FAILED[/red]")
+            if result.scorers_data:
+                for scorer_data in result.scorers_data:
+                    if not scorer_data.success:
+                        rprint(f"  [yellow]Scorer: {scorer_data.name}[/yellow]")
+                        rprint(f"  [red]  Score: {scorer_data.score}[/red]")
+                        rprint(f"  [red]  Reason: {scorer_data.reason}[/red]")
+                        if scorer_data.error:
+                            rprint(f"  [red]  Error: {scorer_data.error}[/red]")
+        rprint("  " + "-"*40)
+
+    rprint("\n" + "="*80)
+    if failed_tests > 0:
+        raise AssertionError(failed_cases)
 
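Downstream, the contract of `assert_test` is unchanged even though the failure report is now printed with rich rather than embedded in the exception message: it still raises `AssertionError` when any scorer fails, now carrying the list of failed test-case dicts. A consumption sketch (the `results` value would come from `run_eval` or `run_trace_eval` and is not constructed here):

```python
from judgeval.run_evaluation import assert_test

results = ...  # List[ScoringResult] returned by run_eval(...) or run_trace_eval(...)

try:
    assert_test(results)           # prints the colored pass/fail summary via rich
except AssertionError as exc:
    failed_cases = exc.args[0]     # the list of {"failed_scorers": [...]} dicts built above
    print(f"{len(failed_cases)} test case(s) failed")
```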
judgeval/scorers/__init__.py
CHANGED

@@ -16,6 +16,7 @@ from judgeval.scorers.judgeval_scorers.api_scorers import (
     InstructionAdherenceScorer,
     GroundednessScorer,
     DerailmentScorer,
+    ToolOrderScorer,
 )
 from judgeval.scorers.judgeval_scorers.classifiers import (
     Text2SQLScorer,
@@ -41,4 +42,5 @@ __all__ = [
     "InstructionAdherenceScorer",
     "GroundednessScorer",
     "DerailmentScorer",
+    "ToolOrderScorer",
 ]

judgeval/scorers/judgeval_scorers/api_scorers/__init__.py
CHANGED

@@ -12,6 +12,7 @@ from judgeval.scorers.judgeval_scorers.api_scorers.comparison import ComparisonS
 from judgeval.scorers.judgeval_scorers.api_scorers.instruction_adherence import InstructionAdherenceScorer
 from judgeval.scorers.judgeval_scorers.api_scorers.groundedness import GroundednessScorer
 from judgeval.scorers.judgeval_scorers.api_scorers.derailment_scorer import DerailmentScorer
+from judgeval.scorers.judgeval_scorers.api_scorers.tool_order import ToolOrderScorer
 __all__ = [
     "ExecutionOrderScorer",
     "JSONCorrectnessScorer",
@@ -27,4 +28,5 @@ __all__ = [
     "InstructionAdherenceScorer",
     "GroundednessScorer",
     "DerailmentScorer",
+    "ToolOrderScorer",
 ]

judgeval/scorers/judgeval_scorers/api_scorers/tool_order.py
ADDED

@@ -0,0 +1,20 @@
+"""
+`judgeval` tool order scorer
+"""
+
+# Internal imports
+from judgeval.scorers.api_scorer import APIJudgmentScorer
+from judgeval.constants import APIScorer
+from typing import Optional, Dict
+class ToolOrderScorer(APIJudgmentScorer):
+    kwargs: Optional[Dict] = None
+    def __init__(self, threshold: float=1.0, exact_match: bool=False):
+        super().__init__(
+            threshold=threshold,
+            score_type=APIScorer.TOOL_ORDER,
+        )
+        self.kwargs = {"exact_match": exact_match}
+
+    @property
+    def __name__(self):
+        return "Tool Order"
judgeval/scorers/score.py
CHANGED

@@ -243,7 +243,7 @@ async def score_with_indicator(
 async def a_execute_scoring(
     examples: Union[List[Example], List[CustomExample]],
     scorers: List[JudgevalScorer],
-    model: Optional[Union[str, List[str], JudgevalJudge]] = …
+    model: Optional[Union[str, List[str], JudgevalJudge]] = "gpt-4.1",
     ignore_errors: bool = True,
     skip_on_missing_params: bool = True,
     show_indicator: bool = True,