judgeval 0.0.35__py3-none-any.whl → 0.0.37__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- judgeval/common/tracer.py +869 -928
- judgeval/common/utils.py +18 -0
- judgeval/constants.py +6 -3
- judgeval/data/__init__.py +4 -0
- judgeval/data/datasets/dataset.py +3 -2
- judgeval/data/datasets/eval_dataset_client.py +63 -3
- judgeval/data/example.py +29 -7
- judgeval/data/sequence.py +5 -4
- judgeval/data/sequence_run.py +4 -3
- judgeval/data/trace.py +129 -0
- judgeval/evaluation_run.py +1 -1
- judgeval/integrations/langgraph.py +1962 -299
- judgeval/judgment_client.py +85 -66
- judgeval/run_evaluation.py +191 -45
- judgeval/scorers/__init__.py +2 -0
- judgeval/scorers/judgeval_scorers/api_scorers/__init__.py +2 -0
- judgeval/scorers/judgeval_scorers/api_scorers/tool_order.py +18 -0
- judgeval/scorers/score.py +2 -1
- judgeval/utils/data_utils.py +57 -0
- judgeval-0.0.37.dist-info/METADATA +214 -0
- {judgeval-0.0.35.dist-info → judgeval-0.0.37.dist-info}/RECORD +23 -20
- judgeval-0.0.35.dist-info/METADATA +0 -170
- {judgeval-0.0.35.dist-info → judgeval-0.0.37.dist-info}/WHEEL +0 -0
- {judgeval-0.0.35.dist-info → judgeval-0.0.37.dist-info}/licenses/LICENSE.md +0 -0
judgeval/judgment_client.py
CHANGED
@@ -2,7 +2,8 @@
 Implements the JudgmentClient to interact with the Judgment API.
 """
 import os
-from
+from uuid import uuid4
+from typing import Optional, List, Dict, Any, Union, Callable
 import requests

 from judgeval.constants import ROOT_API
@@ -33,7 +34,11 @@ from judgeval.constants import (
     JUDGMENT_PROJECT_DELETE_API_URL,
     JUDGMENT_PROJECT_CREATE_API_URL
 )
+from judgeval.utils.data_utils import add_from_yaml
 from judgeval.common.exceptions import JudgmentAPIError
+from langchain_core.callbacks import BaseCallbackHandler
+from judgeval.common.tracer import Tracer
+from judgeval.common.utils import validate_api_key
 from pydantic import BaseModel
 from judgeval.rules import Rule

@@ -63,7 +68,7 @@ class JudgmentClient(metaclass=SingletonMeta):
         self.eval_dataset_client = EvalDatasetClient(judgment_api_key, organization_id)

         # Verify API key is valid
-        result, response =
+        result, response = validate_api_key(judgment_api_key)
         if not result:
             # May be bad to output their invalid API key...
             raise JudgmentAPIError(f"Issue with passed in Judgment API key: {response}")
@@ -74,7 +79,7 @@ class JudgmentClient(metaclass=SingletonMeta):
         self,
         examples: List[Example],
         scorers: List[Union[APIJudgmentScorer, JudgevalScorer]],
-        model: Union[str, List[str], JudgevalJudge],
+        model: Optional[Union[str, List[str], JudgevalJudge]] = "gpt-4.1",
         aggregator: Optional[str] = None,
         metadata: Optional[Dict[str, Any]] = None,
         log_results: bool = True,
@@ -102,9 +107,11 @@ class JudgmentClient(metaclass=SingletonMeta):

     def run_sequence_evaluation(
         self,
-        sequences: List[Sequence],
-        model: Union[str, List[str], JudgevalJudge],
         scorers: List[Union[APIJudgmentScorer, JudgevalScorer]],
+        model: Optional[Union[str, List[str], JudgevalJudge]] = "gpt-4.1",
+        sequences: Optional[List[Sequence]] = None,
+        examples: Optional[List[Example]] = None,
+        test_file: Optional[str] = None,
         aggregator: Optional[str] = None,
         project_name: str = "default_project",
         eval_run_name: str = "default_eval_sequence",
@@ -112,40 +119,40 @@ class JudgmentClient(metaclass=SingletonMeta):
         append: bool = False,
         override: bool = False,
         ignore_errors: bool = True,
-        rules: Optional[List[Rule]] = None
+        rules: Optional[List[Rule]] = None,
+        function: Optional[Callable] = None,
+        tracer: Optional[Union[Tracer, BaseCallbackHandler]] = None
     ) -> List[ScoringResult]:
-        try:
-
-
-
-
-
-
-
-
-
-
-
-
-
-
+        try:
+
+            if test_file:
+                try:
+                    examples = add_from_yaml(test_file)
+                except FileNotFoundError:
+                    raise FileNotFoundError(f"Test file not found: {test_file}")
+
+            if examples and not function:
+                raise ValueError("Cannot pass in examples without a function")
+
+            if sequences and function:
+                raise ValueError("Cannot pass in sequences and function")
+
+            if examples and sequences:
+                raise ValueError("Cannot pass in both examples and sequences")

-            flattened_sequences = flatten_sequence_list(sequences)
-            for sequence in flattened_sequences:
-                sequence.scorers = scorers
-
             sequence_run = SequenceRun(
                 project_name=project_name,
                 eval_name=eval_run_name,
                 sequences=sequences,
+                scorers=scorers,
                 model=model,
                 aggregator=aggregator,
                 log_results=log_results,
                 append=append,
                 judgment_api_key=self.judgment_api_key,
-                organization_id=self.organization_id
+                organization_id=self.organization_id,
             )
-            return run_sequence_eval(sequence_run, override, ignore_errors)
+            return run_sequence_eval(sequence_run, override, ignore_errors, function, tracer, examples)
         except ValueError as e:
             raise ValueError(f"Please check your SequenceRun object, one or more fields are invalid: \n{str(e)}")
         except Exception as e:
@@ -155,7 +162,7 @@ class JudgmentClient(metaclass=SingletonMeta):
         self,
         examples: Union[List[Example], List[CustomExample]],
         scorers: List[Union[APIJudgmentScorer, JudgevalScorer]],
-        model: Union[str, List[str], JudgevalJudge],
+        model: Optional[Union[str, List[str], JudgevalJudge]] = "gpt-4.1",
         aggregator: Optional[str] = None,
         metadata: Optional[Dict[str, Any]] = None,
         log_results: bool = True,
@@ -232,11 +239,17 @@ class JudgmentClient(metaclass=SingletonMeta):
         dataset.judgment_api_key = self.judgment_api_key
         return self.eval_dataset_client.push(dataset, alias, project_name, overwrite)

-    def
+    def append_example_dataset(self, alias: str, examples: List[Example], project_name: str) -> bool:
         """
         Appends an `EvalDataset` to the Judgment platform for storage.
         """
-        return self.eval_dataset_client.
+        return self.eval_dataset_client.append_examples(alias, examples, project_name)
+
+    def append_sequence_dataset(self, alias: str, sequences: List[Sequence], project_name: str) -> bool:
+        """
+        Appends a `Sequence` to the Judgment platform for storage.
+        """
+        return self.eval_dataset_client.append_sequences(alias, sequences, project_name)

     def pull_dataset(self, alias: str, project_name: str) -> EvalDataset:
         """
@@ -390,24 +403,6 @@ class JudgmentClient(metaclass=SingletonMeta):
             raise ValueError(f"Error deleting project: {response.json()}")
         return response.json()

-    def _validate_api_key(self):
-        """
-        Validates that the user api key is valid
-        """
-        response = requests.post(
-            f"{ROOT_API}/validate_api_key/",
-            headers={
-                "Content-Type": "application/json",
-                "Authorization": f"Bearer {self.judgment_api_key}",
-            },
-            json={}, # Empty body now
-            verify=True
-        )
-        if response.status_code == 200:
-            return True, response.json()
-        else:
-            return False, response.json().get("detail", "Error validating API key")
-
     def fetch_classifier_scorer(self, slug: str) -> ClassifierScorer:
         """
         Fetches a classifier scorer configuration from the Judgment API.
@@ -493,22 +488,26 @@ class JudgmentClient(metaclass=SingletonMeta):

     def assert_test(
         self,
-        examples: List[Example],
         scorers: List[Union[APIJudgmentScorer, JudgevalScorer]],
-
+        examples: Optional[List[Example]] = None,
+        model: Optional[Union[str, List[str], JudgevalJudge]] = "gpt-4.1",
+        test_file: Optional[str] = None,
         aggregator: Optional[str] = None,
         metadata: Optional[Dict[str, Any]] = None,
         log_results: bool = True,
-        project_name: str = "
-        eval_run_name: str =
+        project_name: str = "default_test",
+        eval_run_name: str = str(uuid4()),
         override: bool = False,
-        rules: Optional[List[Rule]] = None
+        rules: Optional[List[Rule]] = None,
+        function: Optional[Callable] = None,
+        tracer: Optional[Union[Tracer, BaseCallbackHandler]] = None
     ) -> None:
         """
         Asserts a test by running the evaluation and checking the results for success

         Args:
-            examples (List[Example]): The examples to evaluate
+            examples (Optional[List[Example]]): The examples to evaluate. Must be provided if test_file is not.
+            test_file (Optional[str]): Path to a YAML file containing test examples. Must be provided if examples is not.
             scorers (List[Union[APIJudgmentScorer, JudgevalScorer]]): A list of scorers to use for evaluation
             model (Union[str, List[str], JudgevalJudge]): The model used as a judge when using LLM as a Judge
             aggregator (Optional[str]): The aggregator to use for evaluation if using Mixture of Judges
@@ -519,17 +518,37 @@ class JudgmentClient(metaclass=SingletonMeta):
             override (bool): Whether to override an existing evaluation run with the same name
             rules (Optional[List[Rule]]): Rules to evaluate against scoring results
         """
-
-
-
-
-
-
-
-
-
-
-
-
+        # Validate that exactly one of examples or test_file is provided
+        if (examples is None and test_file is None) or (examples is not None and test_file is not None):
+            raise ValueError("Exactly one of 'examples' or 'test_file' must be provided, but not both")
+
+        if function:
+            results = self.run_sequence_evaluation(
+                examples=examples,
+                scorers=scorers,
+                model=model,
+                aggregator=aggregator,
+                log_results=log_results,
+                project_name=project_name,
+                eval_run_name=eval_run_name,
+                override=override,
+                rules=rules,
+                function=function,
+                tracer=tracer,
+                test_file=test_file
+            )
+        else:
+            results = self.run_evaluation(
+                examples=examples,
+                scorers=scorers,
+                model=model,
+                aggregator=aggregator,
+                metadata=metadata,
+                log_results=log_results,
+                project_name=project_name,
+                eval_run_name=eval_run_name,
+                override=override,
+                rules=rules
+            )

         assert_test(results)
judgeval/run_evaluation.py
CHANGED
@@ -4,7 +4,7 @@ import time
 import sys
 import itertools
 import threading
-from typing import List, Dict, Any, Union
+from typing import List, Dict, Any, Union, Optional, Callable
 from datetime import datetime
 from rich import print as rprint

@@ -12,7 +12,9 @@ from judgeval.data import (
     ScorerData,
     ScoringResult,
     Example,
-    CustomExample
+    CustomExample,
+    Sequence,
+    Trace
 )
 from judgeval.scorers import (
     JudgevalScorer,
@@ -26,7 +28,8 @@ from judgeval.constants import (
     JUDGMENT_SEQUENCE_EVAL_API_URL,
     JUDGMENT_EVAL_LOG_API_URL,
     MAX_CONCURRENT_EVALUATIONS,
-    JUDGMENT_ADD_TO_RUN_EVAL_QUEUE_API_URL
+    JUDGMENT_ADD_TO_RUN_EVAL_QUEUE_API_URL,
+    JUDGMENT_RETRIEVE_SEQUENCE_FROM_TRACE_API_URL
 )
 from judgeval.common.exceptions import JudgmentAPIError
 from judgeval.common.logger import (
@@ -37,6 +40,8 @@ from judgeval.common.logger import (
 )
 from judgeval.evaluation_run import EvaluationRun
 from judgeval.data.sequence_run import SequenceRun
+from judgeval.common.tracer import Tracer
+from langchain_core.callbacks import BaseCallbackHandler

 def send_to_rabbitmq(evaluation_run: EvaluationRun) -> None:
     """
@@ -198,6 +203,40 @@ def check_missing_scorer_data(results: List[ScoringResult]) -> List[ScoringResul
     )
     return results

+def check_experiment_type(eval_name: str, project_name: str, judgment_api_key: str, organization_id: str, is_sequence: bool) -> None:
+    """
+    Checks if the current experiment, if one exists, has the same type (examples of sequences)
+    """
+    try:
+        response = requests.post(
+            f"{ROOT_API}/check_experiment_type/",
+            headers={
+                "Content-Type": "application/json",
+                "Authorization": f"Bearer {judgment_api_key}",
+                "X-Organization-Id": organization_id
+            },
+            json={
+                "eval_name": eval_name,
+                "project_name": project_name,
+                "judgment_api_key": judgment_api_key,
+                "is_sequence": is_sequence
+            },
+            verify=True
+        )
+
+        if response.status_code == 422:
+            error(f"{response.json()}")
+            raise ValueError(f"{response.json()}")
+
+        if not response.ok:
+            response_data = response.json()
+            error_message = response_data.get('detail', 'An unknown error occurred.')
+            error(f"Error checking eval run name: {error_message}")
+            raise JudgmentAPIError(error_message)
+
+    except requests.exceptions.RequestException as e:
+        error(f"Failed to check if experiment type exists: {str(e)}")
+        raise JudgmentAPIError(f"Failed to check if experiment type exists: {str(e)}")

 def check_eval_run_name_exists(eval_name: str, project_name: str, judgment_api_key: str, organization_id: str) -> None:
     """
@@ -243,7 +282,7 @@ def check_eval_run_name_exists(eval_name: str, project_name: str, judgment_api_k
         raise JudgmentAPIError(f"Failed to check if eval run name exists: {str(e)}")


-def log_evaluation_results(
+def log_evaluation_results(scoring_results: List[ScoringResult], run: Union[EvaluationRun, SequenceRun]) -> str:
     """
     Logs evaluation results to the Judgment API database.

@@ -264,7 +303,7 @@ def log_evaluation_results(merged_results: List[ScoringResult], run: Union[Evalu
                 "X-Organization-Id": run.organization_id
             },
             json={
-                "results":
+                "results": scoring_results,
                 "run": run.model_dump(warnings=False)
             },
             verify=True
@@ -288,6 +327,51 @@ def log_evaluation_results(merged_results: List[ScoringResult], run: Union[Evalu
         error(f"Failed to save evaluation results to DB: {str(e)}")
         raise ValueError(f"Failed to save evaluation results to DB: {str(e)}")

+def retrieve_sequence_from_trace(trace_id: str, parent_span: str, judgment_api_key: str, organization_id: str) -> Sequence:
+    """
+    Retrieves a sequence from a trace ID.
+    """
+    """
+    Logs evaluation results to the Judgment API database.
+
+    Args:
+        merged_results (List[ScoringResult]): The results to log
+        evaluation_run (EvaluationRun): The evaluation run containing project info and API key
+
+    Raises:
+        JudgmentAPIError: If there's an API error during logging
+        ValueError: If there's a validation error with the results
+    """
+    try:
+        res = requests.post(
+            JUDGMENT_RETRIEVE_SEQUENCE_FROM_TRACE_API_URL,
+            headers={
+                "Content-Type": "application/json",
+                "Authorization": f"Bearer {judgment_api_key}",
+                "X-Organization-Id": organization_id
+            },
+            json={
+                "trace_id": trace_id,
+                "trace_span_id": parent_span,
+            },
+            verify=True
+        )
+
+        if not res.ok:
+            response_data = res.json()
+            error_message = response_data.get('detail', 'An unknown error occurred.')
+            error(f"Error {res.status_code}: {error_message}")
+            raise JudgmentAPIError(error_message)
+
+        return Sequence(**res.json())
+    except requests.exceptions.RequestException as e:
+        error(f"Request failed while saving evaluation results to DB: {str(e)}")
+        raise JudgmentAPIError(f"Request failed while saving evaluation results to DB: {str(e)}")
+    except Exception as e:
+        error(f"Failed to save evaluation results to DB: {str(e)}")
+        raise ValueError(f"Failed to save evaluation results to DB: {str(e)}")
+
+
 def run_with_spinner(message: str, func, *args, **kwargs) -> Any:
     """Run a function with a spinner in the terminal."""
     spinner = itertools.cycle(['|', '/', '-', '\\'])
@@ -318,23 +402,20 @@ def run_with_spinner(message: str, func, *args, **kwargs) -> Any:

     return result

-def check_examples(examples: List[Example], scorers: List[APIJudgmentScorer]) -> None:
+def check_examples(examples: List[Example], scorers: List[Union[APIJudgmentScorer, JudgevalScorer]]) -> None:
     """
     Checks if the example contains the necessary parameters for the scorer.
     """
     for scorer in scorers:
-
-
-
-
-
-
-
-
-
-            print(f"WARNING: Example {example.example_id} is missing the following parameters: {missing_params} for scorer {scorer.score_type.value}")
-
-def run_sequence_eval(sequence_run: SequenceRun, override: bool = False, ignore_errors: bool = True) -> List[ScoringResult]:
+        for example in examples:
+            missing_params = []
+            for param in scorer.required_params:
+                if getattr(example, param.value) is None:
+                    missing_params.append(f"'{param.value}'")
+            if missing_params:
+                print(f"WARNING: Example {example.example_id} is missing the following parameters: {missing_params} for scorer {scorer.score_type.value}")
+
+def run_sequence_eval(sequence_run: SequenceRun, override: bool = False, ignore_errors: bool = True, function: Optional[Callable] = None, tracer: Optional[Union[Tracer, BaseCallbackHandler]] = None, examples: Optional[List[Example]] = None) -> List[ScoringResult]:
     # Call endpoint to check to see if eval run name exists (if we DON'T want to override and DO want to log results)
     if not override and sequence_run.log_results and not sequence_run.append:
         check_eval_run_name_exists(
@@ -344,13 +425,41 @@ def run_sequence_eval(sequence_run: SequenceRun, override: bool = False, ignore_
             sequence_run.organization_id
         )

+    if sequence_run.append:
+        # Check that the current experiment, if one exists, has the same type (examples of sequences)
+        check_experiment_type(
+            sequence_run.eval_name,
+            sequence_run.project_name,
+            sequence_run.judgment_api_key,
+            sequence_run.organization_id,
+            True
+        )
+
+    if function and tracer:
+        new_sequences: List[Sequence] = []
+        for example in examples:
+            if example.input:
+                result = run_with_spinner("Running agent function: ", function, **example.input)
+            else:
+                result = run_with_spinner("Running agent function: ", function)
+        for i, trace in enumerate(tracer.traces):
+            trace_id = trace['trace_id']
+            parent_span = trace['entries'][0]['span_id']
+            new_sequence = retrieve_sequence_from_trace(trace_id, parent_span, sequence_run.judgment_api_key, sequence_run.organization_id)
+            new_sequence.expected_tools = examples[i].expected_tools
+            new_sequences.append(new_sequence)
+        sequence_run.sequences = new_sequences
+
+    for sequence in sequence_run.sequences:
+        sequence.scorers = sequence_run.scorers
+
     # Execute evaluation using Judgment API
     info("Starting API evaluation")
     try: # execute an EvaluationRun with just JudgmentScorers
         debug("Sending request to Judgment API")
         response_data: List[Dict] = run_with_spinner("Running Sequence Evaluation: ", execute_api_sequence_eval, sequence_run)
-
-        info(f"Received {len(
+        scoring_results = [ScoringResult(**result) for result in response_data["results"]]
+        info(f"Received {len(scoring_results)} results from API")
     except JudgmentAPIError as e:
         error(f"An error occurred while executing the Judgment API request: {str(e)}")
         raise JudgmentAPIError(f"An error occurred while executing the Judgment API request: {str(e)}")
@@ -359,14 +468,12 @@ def run_sequence_eval(sequence_run: SequenceRun, override: bool = False, ignore_

     # Convert the response data to `ScoringResult` objects
     debug("Processing API results")
-    api_results = []
-    for result in response_data["results"]:
-        api_results.append(ScoringResult(**result))
-
     # TODO: allow for custom scorer on sequences
     if sequence_run.log_results:
-        pretty_str = run_with_spinner("Logging Results: ", log_evaluation_results,
+        pretty_str = run_with_spinner("Logging Results: ", log_evaluation_results, response_data["results"], sequence_run)
         rprint(pretty_str)
+
+    return scoring_results



@@ -404,6 +511,16 @@ def run_eval(evaluation_run: EvaluationRun, override: bool = False, ignore_error
             evaluation_run.organization_id
         )

+    if evaluation_run.append:
+        # Check that the current experiment, if one exists, has the same type (examples of sequences)
+        check_experiment_type(
+            evaluation_run.eval_name,
+            evaluation_run.project_name,
+            evaluation_run.judgment_api_key,
+            evaluation_run.organization_id,
+            False
+        )
+
     # Set example IDs if not already set
     debug("Initializing examples with IDs and timestamps")
     for idx, example in enumerate(evaluation_run.examples):
@@ -539,7 +656,8 @@ def run_eval(evaluation_run: EvaluationRun, override: bool = False, ignore_error
     # )
     # print(merged_results)
     if evaluation_run.log_results:
-
+        send_results = [scoring_result.model_dump(warnings=False) for scoring_result in merged_results]
+        pretty_str = run_with_spinner("Logging Results: ", log_evaluation_results, send_results, evaluation_run)
         rprint(pretty_str)

     for i, result in enumerate(merged_results):
@@ -564,34 +682,31 @@ def assert_test(scoring_results: List[ScoringResult]) -> None:

             # Create a test case context with all relevant fields
             test_case = {
-
-                'actual_output': result.data_object.actual_output,
-                'expected_output': result.data_object.expected_output,
-                'context': result.data_object.context,
-                'retrieval_context': result.data_object.retrieval_context,
-                'additional_metadata': result.data_object.additional_metadata,
-                'tools_called': result.data_object.tools_called,
-                'expected_tools': result.data_object.expected_tools,
-                'failed_scorers': []
+                "failed_scorers": []
             }
             if result.scorers_data:
                 # If the result was not successful, check each scorer_data
                 for scorer_data in result.scorers_data:
                     if not scorer_data.success:
+                        if scorer_data.name == "Tool Order":
+                            # Remove threshold, evaluation model for Tool Order scorer
+                            scorer_data.threshold = None
+                            scorer_data.evaluation_model = None
                         test_case['failed_scorers'].append(scorer_data)
             failed_cases.append(test_case)

     if failed_cases:
+
         error_msg = f"The following test cases failed: \n"
         for fail_case in failed_cases:
-            error_msg += f"\nInput: {fail_case['input']}\n"
-            error_msg += f"Actual Output: {fail_case['actual_output']}\n"
-            error_msg += f"Expected Output: {fail_case['expected_output']}\n"
-            error_msg += f"Context: {fail_case['context']}\n"
-            error_msg += f"Retrieval Context: {fail_case['retrieval_context']}\n"
-            error_msg += f"Additional Metadata: {fail_case['additional_metadata']}\n"
-            error_msg += f"Tools Called: {fail_case['tools_called']}\n"
-            error_msg += f"Expected Tools: {fail_case['expected_tools']}\n"
+            # error_msg += f"\nInput: {fail_case['input']}\n"
+            # error_msg += f"Actual Output: {fail_case['actual_output']}\n"
+            # error_msg += f"Expected Output: {fail_case['expected_output']}\n"
+            # error_msg += f"Context: {fail_case['context']}\n"
+            # error_msg += f"Retrieval Context: {fail_case['retrieval_context']}\n"
+            # error_msg += f"Additional Metadata: {fail_case['additional_metadata']}\n"
+            # error_msg += f"Tools Called: {fail_case['tools_called']}\n"
+            # error_msg += f"Expected Tools: {fail_case['expected_tools']}\n"

             for fail_scorer in fail_case['failed_scorers']:

@@ -609,6 +724,37 @@ def assert_test(scoring_results: List[ScoringResult]) -> None:
                     f"Additional Metadata: {fail_scorer.additional_metadata}\n"
                 )
             error_msg += "-"*100
-
-
+
+    total_tests = len(scoring_results)
+    failed_tests = len(failed_cases)
+    passed_tests = total_tests - failed_tests
+
+    # Print summary with colors
+    rprint("\n" + "="*80)
+    if failed_tests == 0:
+        rprint(f"[bold green]🎉 ALL TESTS PASSED! {passed_tests}/{total_tests} tests successful[/bold green]")
+    else:
+        rprint(f"[bold red]⚠️ TEST RESULTS: {passed_tests}/{total_tests} passed ({failed_tests} failed)[/bold red]")
+    rprint("="*80 + "\n")
+
+    # Print individual test cases
+    for i, result in enumerate(scoring_results):
+        test_num = i + 1
+        if result.success:
+            rprint(f"[green]✓ Test {test_num}: PASSED[/green]")
+        else:
+            rprint(f"[red]✗ Test {test_num}: FAILED[/red]")
+            if result.scorers_data:
+                for scorer_data in result.scorers_data:
+                    if not scorer_data.success:
+                        rprint(f" [yellow]Scorer: {scorer_data.name}[/yellow]")
+                        rprint(f" [red] Score: {scorer_data.score}[/red]")
+                        rprint(f" [red] Reason: {scorer_data.reason}[/red]")
+                        if scorer_data.error:
+                            rprint(f" [red] Error: {scorer_data.error}[/red]")
+                        rprint(" " + "-"*40)
+
+    rprint("\n" + "="*80)
+    if failed_tests > 0:
+        raise AssertionError(failed_cases)

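
Note: when both function and tracer are supplied, run_sequence_eval above rebuilds sequences from tracer.traces. A sketch of the minimal record shape that loop relies on; only the keys trace_id, entries, and span_id come from the diff, the values are placeholders.

example_trace = {
    "trace_id": "4f9d2c9e-0000-4000-8000-000000000000",  # looked up server-side by retrieve_sequence_from_trace
    "entries": [
        {"span_id": "root-span"},  # entries[0]["span_id"] is sent as trace_span_id
    ],
}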
judgeval/scorers/__init__.py
CHANGED
@@ -16,6 +16,7 @@ from judgeval.scorers.judgeval_scorers.api_scorers import (
     InstructionAdherenceScorer,
     GroundednessScorer,
     DerailmentScorer,
+    ToolOrderScorer,
 )
 from judgeval.scorers.judgeval_scorers.classifiers import (
     Text2SQLScorer,
@@ -41,4 +42,5 @@ __all__ = [
     "InstructionAdherenceScorer",
     "GroundednessScorer",
     "DerailmentScorer",
+    "ToolOrderScorer",
 ]

judgeval/scorers/judgeval_scorers/api_scorers/__init__.py
CHANGED
@@ -12,6 +12,7 @@ from judgeval.scorers.judgeval_scorers.api_scorers.comparison import ComparisonS
 from judgeval.scorers.judgeval_scorers.api_scorers.instruction_adherence import InstructionAdherenceScorer
 from judgeval.scorers.judgeval_scorers.api_scorers.groundedness import GroundednessScorer
 from judgeval.scorers.judgeval_scorers.api_scorers.derailment_scorer import DerailmentScorer
+from judgeval.scorers.judgeval_scorers.api_scorers.tool_order import ToolOrderScorer
 __all__ = [
     "ExecutionOrderScorer",
     "JSONCorrectnessScorer",
@@ -27,4 +28,5 @@ __all__ = [
     "InstructionAdherenceScorer",
     "GroundednessScorer",
     "DerailmentScorer",
+    "ToolOrderScorer",
 ]

judgeval/scorers/judgeval_scorers/api_scorers/tool_order.py
ADDED
@@ -0,0 +1,18 @@
+"""
+`judgeval` tool order scorer
+"""
+
+# Internal imports
+from judgeval.scorers.api_scorer import APIJudgmentScorer
+from judgeval.constants import APIScorer
+
+class ToolOrderScorer(APIJudgmentScorer):
+    def __init__(self, threshold: float=1.0):
+        super().__init__(
+            threshold=threshold,
+            score_type=APIScorer.TOOL_ORDER,
+        )
+
+    @property
+    def __name__(self):
+        return "Tool Order"