judgeval 0.0.35__py3-none-any.whl → 0.0.37__py3-none-any.whl

This diff shows the content of publicly available package versions as published to their public registries. It is provided for informational purposes only.
@@ -2,7 +2,8 @@
  Implements the JudgmentClient to interact with the Judgment API.
  """
  import os
- from typing import Optional, List, Dict, Any, Union
+ from uuid import uuid4
+ from typing import Optional, List, Dict, Any, Union, Callable
  import requests

  from judgeval.constants import ROOT_API
@@ -33,7 +34,11 @@ from judgeval.constants import (
  JUDGMENT_PROJECT_DELETE_API_URL,
  JUDGMENT_PROJECT_CREATE_API_URL
  )
+ from judgeval.utils.data_utils import add_from_yaml
  from judgeval.common.exceptions import JudgmentAPIError
+ from langchain_core.callbacks import BaseCallbackHandler
+ from judgeval.common.tracer import Tracer
+ from judgeval.common.utils import validate_api_key
  from pydantic import BaseModel
  from judgeval.rules import Rule

@@ -63,7 +68,7 @@ class JudgmentClient(metaclass=SingletonMeta):
  self.eval_dataset_client = EvalDatasetClient(judgment_api_key, organization_id)

  # Verify API key is valid
- result, response = self._validate_api_key()
+ result, response = validate_api_key(judgment_api_key)
  if not result:
  # May be bad to output their invalid API key...
  raise JudgmentAPIError(f"Issue with passed in Judgment API key: {response}")
@@ -74,7 +79,7 @@ class JudgmentClient(metaclass=SingletonMeta):
  self,
  examples: List[Example],
  scorers: List[Union[APIJudgmentScorer, JudgevalScorer]],
- model: Union[str, List[str], JudgevalJudge],
+ model: Optional[Union[str, List[str], JudgevalJudge]] = "gpt-4.1",
  aggregator: Optional[str] = None,
  metadata: Optional[Dict[str, Any]] = None,
  log_results: bool = True,
@@ -102,9 +107,11 @@ class JudgmentClient(metaclass=SingletonMeta):

  def run_sequence_evaluation(
  self,
- sequences: List[Sequence],
- model: Union[str, List[str], JudgevalJudge],
  scorers: List[Union[APIJudgmentScorer, JudgevalScorer]],
+ model: Optional[Union[str, List[str], JudgevalJudge]] = "gpt-4.1",
+ sequences: Optional[List[Sequence]] = None,
+ examples: Optional[List[Example]] = None,
+ test_file: Optional[str] = None,
  aggregator: Optional[str] = None,
  project_name: str = "default_project",
  eval_run_name: str = "default_eval_sequence",
@@ -112,40 +119,40 @@ class JudgmentClient(metaclass=SingletonMeta):
  append: bool = False,
  override: bool = False,
  ignore_errors: bool = True,
- rules: Optional[List[Rule]] = None
+ rules: Optional[List[Rule]] = None,
+ function: Optional[Callable] = None,
+ tracer: Optional[Union[Tracer, BaseCallbackHandler]] = None
  ) -> List[ScoringResult]:
- try:
- def get_all_sequences(root: Sequence) -> List[Sequence]:
- all_sequences = [root]
-
- for item in root.items:
- if isinstance(item, Sequence):
- all_sequences.extend(get_all_sequences(item))
-
- return all_sequences
-
- def flatten_sequence_list(sequences: List[Sequence]) -> List[Sequence]:
- flattened = []
- for seq in sequences:
- flattened.extend(get_all_sequences(seq))
- return flattened
+ try:
+
+ if test_file:
+ try:
+ examples = add_from_yaml(test_file)
+ except FileNotFoundError:
+ raise FileNotFoundError(f"Test file not found: {test_file}")
+
+ if examples and not function:
+ raise ValueError("Cannot pass in examples without a function")
+
+ if sequences and function:
+ raise ValueError("Cannot pass in sequences and function")
+
+ if examples and sequences:
+ raise ValueError("Cannot pass in both examples and sequences")

- flattened_sequences = flatten_sequence_list(sequences)
- for sequence in flattened_sequences:
- sequence.scorers = scorers
-
  sequence_run = SequenceRun(
  project_name=project_name,
  eval_name=eval_run_name,
  sequences=sequences,
+ scorers=scorers,
  model=model,
  aggregator=aggregator,
  log_results=log_results,
  append=append,
  judgment_api_key=self.judgment_api_key,
- organization_id=self.organization_id
+ organization_id=self.organization_id,
  )
- return run_sequence_eval(sequence_run, override, ignore_errors)
+ return run_sequence_eval(sequence_run, override, ignore_errors, function, tracer, examples)
  except ValueError as e:
  raise ValueError(f"Please check your SequenceRun object, one or more fields are invalid: \n{str(e)}")
  except Exception as e:
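
A minimal sketch (not part of the diff) of how the reworked run_sequence_evaluation might be driven through its new examples/function/tracer path. The agent function, project name, scorer choice, Tracer constructor arguments, and the dict-shaped Example.input are assumptions for illustration, not documented API guarantees.

from judgeval import JudgmentClient
from judgeval.common.tracer import Tracer
from judgeval.data import Example
from judgeval.scorers import ToolOrderScorer  # added later in this diff

client = JudgmentClient()
tracer = Tracer(project_name="my_project")  # assumed constructor arguments

def my_agent(question: str) -> str:
    # hypothetical traced agent; its trace is what gets turned back into a Sequence
    return "..."

results = client.run_sequence_evaluation(
    scorers=[ToolOrderScorer()],
    examples=[Example(input={"question": "What changed in 0.0.37?"})],  # assumed kwargs for my_agent, per function(**example.input)
    function=my_agent,
    tracer=tracer,
    project_name="my_project",
    eval_run_name="sequence_smoke_test",
)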
@@ -155,7 +162,7 @@ class JudgmentClient(metaclass=SingletonMeta):
  self,
  examples: Union[List[Example], List[CustomExample]],
  scorers: List[Union[APIJudgmentScorer, JudgevalScorer]],
- model: Union[str, List[str], JudgevalJudge],
+ model: Optional[Union[str, List[str], JudgevalJudge]] = "gpt-4.1",
  aggregator: Optional[str] = None,
  metadata: Optional[Dict[str, Any]] = None,
  log_results: bool = True,
@@ -232,11 +239,17 @@ class JudgmentClient(metaclass=SingletonMeta):
  dataset.judgment_api_key = self.judgment_api_key
  return self.eval_dataset_client.push(dataset, alias, project_name, overwrite)

- def append_dataset(self, alias: str, examples: List[Example], project_name: str) -> bool:
+ def append_example_dataset(self, alias: str, examples: List[Example], project_name: str) -> bool:
  """
  Appends an `EvalDataset` to the Judgment platform for storage.
  """
- return self.eval_dataset_client.append(alias, examples, project_name)
+ return self.eval_dataset_client.append_examples(alias, examples, project_name)
+
+ def append_sequence_dataset(self, alias: str, sequences: List[Sequence], project_name: str) -> bool:
+ """
+ Appends a `Sequence` to the Judgment platform for storage.
+ """
+ return self.eval_dataset_client.append_sequences(alias, sequences, project_name)

  def pull_dataset(self, alias: str, project_name: str) -> EvalDataset:
  """
@@ -390,24 +403,6 @@ class JudgmentClient(metaclass=SingletonMeta):
  raise ValueError(f"Error deleting project: {response.json()}")
  return response.json()

- def _validate_api_key(self):
- """
- Validates that the user api key is valid
- """
- response = requests.post(
- f"{ROOT_API}/validate_api_key/",
- headers={
- "Content-Type": "application/json",
- "Authorization": f"Bearer {self.judgment_api_key}",
- },
- json={}, # Empty body now
- verify=True
- )
- if response.status_code == 200:
- return True, response.json()
- else:
- return False, response.json().get("detail", "Error validating API key")
-
  def fetch_classifier_scorer(self, slug: str) -> ClassifierScorer:
  """
  Fetches a classifier scorer configuration from the Judgment API.
@@ -493,22 +488,26 @@ class JudgmentClient(metaclass=SingletonMeta):

  def assert_test(
  self,
- examples: List[Example],
  scorers: List[Union[APIJudgmentScorer, JudgevalScorer]],
- model: Union[str, List[str], JudgevalJudge],
+ examples: Optional[List[Example]] = None,
+ model: Optional[Union[str, List[str], JudgevalJudge]] = "gpt-4.1",
+ test_file: Optional[str] = None,
  aggregator: Optional[str] = None,
  metadata: Optional[Dict[str, Any]] = None,
  log_results: bool = True,
- project_name: str = "default_project",
- eval_run_name: str = "default_eval_run",
+ project_name: str = "default_test",
+ eval_run_name: str = str(uuid4()),
  override: bool = False,
- rules: Optional[List[Rule]] = None
+ rules: Optional[List[Rule]] = None,
+ function: Optional[Callable] = None,
+ tracer: Optional[Union[Tracer, BaseCallbackHandler]] = None
  ) -> None:
  """
  Asserts a test by running the evaluation and checking the results for success

  Args:
- examples (List[Example]): The examples to evaluate
+ examples (Optional[List[Example]]): The examples to evaluate. Must be provided if test_file is not.
+ test_file (Optional[str]): Path to a YAML file containing test examples. Must be provided if examples is not.
  scorers (List[Union[APIJudgmentScorer, JudgevalScorer]]): A list of scorers to use for evaluation
  model (Union[str, List[str], JudgevalJudge]): The model used as a judge when using LLM as a Judge
  aggregator (Optional[str]): The aggregator to use for evaluation if using Mixture of Judges
@@ -519,17 +518,37 @@ class JudgmentClient(metaclass=SingletonMeta):
  override (bool): Whether to override an existing evaluation run with the same name
  rules (Optional[List[Rule]]): Rules to evaluate against scoring results
  """
- results = self.run_evaluation(
- examples=examples,
- scorers=scorers,
- model=model,
- aggregator=aggregator,
- metadata=metadata,
- log_results=log_results,
- project_name=project_name,
- eval_run_name=eval_run_name,
- override=override,
- rules=rules
- )
+ # Validate that exactly one of examples or test_file is provided
+ if (examples is None and test_file is None) or (examples is not None and test_file is not None):
+ raise ValueError("Exactly one of 'examples' or 'test_file' must be provided, but not both")
+
+ if function:
+ results = self.run_sequence_evaluation(
+ examples=examples,
+ scorers=scorers,
+ model=model,
+ aggregator=aggregator,
+ log_results=log_results,
+ project_name=project_name,
+ eval_run_name=eval_run_name,
+ override=override,
+ rules=rules,
+ function=function,
+ tracer=tracer,
+ test_file=test_file
+ )
+ else:
+ results = self.run_evaluation(
+ examples=examples,
+ scorers=scorers,
+ model=model,
+ aggregator=aggregator,
+ metadata=metadata,
+ log_results=log_results,
+ project_name=project_name,
+ eval_run_name=eval_run_name,
+ override=override,
+ rules=rules
+ )

  assert_test(results)
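
Taken together, the assert_test changes let a CI-style check be driven from a YAML file and an agent function instead of pre-built examples. A minimal sketch, assuming a hypothetical tests.yaml (parsed via add_from_yaml) and the my_agent/tracer objects from the earlier sketch:

client = JudgmentClient()
client.assert_test(
    scorers=[ToolOrderScorer()],
    test_file="tests.yaml",   # hypothetical path; mutually exclusive with examples
    function=my_agent,
    tracer=tracer,
    # project_name now defaults to "default_test" and eval_run_name to a fresh uuid4() string
)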
@@ -4,7 +4,7 @@ import time
  import sys
  import itertools
  import threading
- from typing import List, Dict, Any, Union
+ from typing import List, Dict, Any, Union, Optional, Callable
  from datetime import datetime
  from rich import print as rprint

@@ -12,7 +12,9 @@ from judgeval.data import (
  ScorerData,
  ScoringResult,
  Example,
- CustomExample
+ CustomExample,
+ Sequence,
+ Trace
  )
  from judgeval.scorers import (
  JudgevalScorer,
@@ -26,7 +28,8 @@ from judgeval.constants import (
  JUDGMENT_SEQUENCE_EVAL_API_URL,
  JUDGMENT_EVAL_LOG_API_URL,
  MAX_CONCURRENT_EVALUATIONS,
- JUDGMENT_ADD_TO_RUN_EVAL_QUEUE_API_URL
+ JUDGMENT_ADD_TO_RUN_EVAL_QUEUE_API_URL,
+ JUDGMENT_RETRIEVE_SEQUENCE_FROM_TRACE_API_URL
  )
  from judgeval.common.exceptions import JudgmentAPIError
  from judgeval.common.logger import (
@@ -37,6 +40,8 @@ from judgeval.common.logger import (
  )
  from judgeval.evaluation_run import EvaluationRun
  from judgeval.data.sequence_run import SequenceRun
+ from judgeval.common.tracer import Tracer
+ from langchain_core.callbacks import BaseCallbackHandler

  def send_to_rabbitmq(evaluation_run: EvaluationRun) -> None:
  """
@@ -198,6 +203,40 @@ def check_missing_scorer_data(results: List[ScoringResult]) -> List[ScoringResul
  )
  return results

+ def check_experiment_type(eval_name: str, project_name: str, judgment_api_key: str, organization_id: str, is_sequence: bool) -> None:
+ """
+ Checks if the current experiment, if one exists, has the same type (examples of sequences)
+ """
+ try:
+ response = requests.post(
+ f"{ROOT_API}/check_experiment_type/",
+ headers={
+ "Content-Type": "application/json",
+ "Authorization": f"Bearer {judgment_api_key}",
+ "X-Organization-Id": organization_id
+ },
+ json={
+ "eval_name": eval_name,
+ "project_name": project_name,
+ "judgment_api_key": judgment_api_key,
+ "is_sequence": is_sequence
+ },
+ verify=True
+ )
+
+ if response.status_code == 422:
+ error(f"{response.json()}")
+ raise ValueError(f"{response.json()}")
+
+ if not response.ok:
+ response_data = response.json()
+ error_message = response_data.get('detail', 'An unknown error occurred.')
+ error(f"Error checking eval run name: {error_message}")
+ raise JudgmentAPIError(error_message)
+
+ except requests.exceptions.RequestException as e:
+ error(f"Failed to check if experiment type exists: {str(e)}")
+ raise JudgmentAPIError(f"Failed to check if experiment type exists: {str(e)}")

  def check_eval_run_name_exists(eval_name: str, project_name: str, judgment_api_key: str, organization_id: str) -> None:
  """
@@ -243,7 +282,7 @@ def check_eval_run_name_exists(eval_name: str, project_name: str, judgment_api_k
  raise JudgmentAPIError(f"Failed to check if eval run name exists: {str(e)}")


- def log_evaluation_results(merged_results: List[ScoringResult], run: Union[EvaluationRun, SequenceRun]) -> str:
+ def log_evaluation_results(scoring_results: List[ScoringResult], run: Union[EvaluationRun, SequenceRun]) -> str:
  """
  Logs evaluation results to the Judgment API database.

@@ -264,7 +303,7 @@ def log_evaluation_results(merged_results: List[ScoringResult], run: Union[Evalu
  "X-Organization-Id": run.organization_id
  },
  json={
- "results": [result.model_dump(warnings=False) for result in merged_results],
+ "results": scoring_results,
  "run": run.model_dump(warnings=False)
  },
  verify=True
@@ -288,6 +327,51 @@ def log_evaluation_results(merged_results: List[ScoringResult], run: Union[Evalu
  error(f"Failed to save evaluation results to DB: {str(e)}")
  raise ValueError(f"Failed to save evaluation results to DB: {str(e)}")

+ def retrieve_sequence_from_trace(trace_id: str, parent_span: str, judgment_api_key: str, organization_id: str) -> Sequence:
+ """
+ Retrieves a sequence from a trace ID.
+ """
+ """
+ Logs evaluation results to the Judgment API database.
+
+ Args:
+ merged_results (List[ScoringResult]): The results to log
+ evaluation_run (EvaluationRun): The evaluation run containing project info and API key
+
+ Raises:
+ JudgmentAPIError: If there's an API error during logging
+ ValueError: If there's a validation error with the results
+ """
+ try:
+ res = requests.post(
+ JUDGMENT_RETRIEVE_SEQUENCE_FROM_TRACE_API_URL,
+ headers={
+ "Content-Type": "application/json",
+ "Authorization": f"Bearer {judgment_api_key}",
+ "X-Organization-Id": organization_id
+ },
+ json={
+ "trace_id": trace_id,
+ "trace_span_id": parent_span,
+ },
+ verify=True
+ )
+
+ if not res.ok:
+ response_data = res.json()
+ error_message = response_data.get('detail', 'An unknown error occurred.')
+ error(f"Error {res.status_code}: {error_message}")
+ raise JudgmentAPIError(error_message)
+
+ return Sequence(**res.json())
+ except requests.exceptions.RequestException as e:
+ error(f"Request failed while saving evaluation results to DB: {str(e)}")
+ raise JudgmentAPIError(f"Request failed while saving evaluation results to DB: {str(e)}")
+ except Exception as e:
+ error(f"Failed to save evaluation results to DB: {str(e)}")
+ raise ValueError(f"Failed to save evaluation results to DB: {str(e)}")
+
+
  def run_with_spinner(message: str, func, *args, **kwargs) -> Any:
  """Run a function with a spinner in the terminal."""
  spinner = itertools.cycle(['|', '/', '-', '\\'])
@@ -318,23 +402,20 @@ def run_with_spinner(message: str, func, *args, **kwargs) -> Any:

  return result

- def check_examples(examples: List[Example], scorers: List[APIJudgmentScorer]) -> None:
+ def check_examples(examples: List[Example], scorers: List[Union[APIJudgmentScorer, JudgevalScorer]]) -> None:
  """
  Checks if the example contains the necessary parameters for the scorer.
  """
  for scorer in scorers:
- if isinstance(scorer, APIJudgmentScorer):
- for example in examples:
- missing_params = []
- for param in scorer.required_params:
- if getattr(example, param.value) is None:
- missing_params.append(f"'{param.value}'")
- if missing_params:
- # We do this because we want to inform users that an example is missing parameters for a scorer
- # Example ID (usually random UUID) does not provide any helpful information for the user but printing the entire example is overdoing it
- print(f"WARNING: Example {example.example_id} is missing the following parameters: {missing_params} for scorer {scorer.score_type.value}")
-
- def run_sequence_eval(sequence_run: SequenceRun, override: bool = False, ignore_errors: bool = True) -> List[ScoringResult]:
+ for example in examples:
+ missing_params = []
+ for param in scorer.required_params:
+ if getattr(example, param.value) is None:
+ missing_params.append(f"'{param.value}'")
+ if missing_params:
+ print(f"WARNING: Example {example.example_id} is missing the following parameters: {missing_params} for scorer {scorer.score_type.value}")
+
+ def run_sequence_eval(sequence_run: SequenceRun, override: bool = False, ignore_errors: bool = True, function: Optional[Callable] = None, tracer: Optional[Union[Tracer, BaseCallbackHandler]] = None, examples: Optional[List[Example]] = None) -> List[ScoringResult]:
  # Call endpoint to check to see if eval run name exists (if we DON'T want to override and DO want to log results)
  if not override and sequence_run.log_results and not sequence_run.append:
  check_eval_run_name_exists(
@@ -344,13 +425,41 @@ def run_sequence_eval(sequence_run: SequenceRun, override: bool = False, ignore_
  sequence_run.organization_id
  )

+ if sequence_run.append:
+ # Check that the current experiment, if one exists, has the same type (examples of sequences)
+ check_experiment_type(
+ sequence_run.eval_name,
+ sequence_run.project_name,
+ sequence_run.judgment_api_key,
+ sequence_run.organization_id,
+ True
+ )
+
+ if function and tracer:
+ new_sequences: List[Sequence] = []
+ for example in examples:
+ if example.input:
+ result = run_with_spinner("Running agent function: ", function, **example.input)
+ else:
+ result = run_with_spinner("Running agent function: ", function)
+ for i, trace in enumerate(tracer.traces):
+ trace_id = trace['trace_id']
+ parent_span = trace['entries'][0]['span_id']
+ new_sequence = retrieve_sequence_from_trace(trace_id, parent_span, sequence_run.judgment_api_key, sequence_run.organization_id)
+ new_sequence.expected_tools = examples[i].expected_tools
+ new_sequences.append(new_sequence)
+ sequence_run.sequences = new_sequences
+
+ for sequence in sequence_run.sequences:
+ sequence.scorers = sequence_run.scorers
+
  # Execute evaluation using Judgment API
  info("Starting API evaluation")
  try: # execute an EvaluationRun with just JudgmentScorers
  debug("Sending request to Judgment API")
  response_data: List[Dict] = run_with_spinner("Running Sequence Evaluation: ", execute_api_sequence_eval, sequence_run)
-
- info(f"Received {len(response_data['results'])} results from API")
+ scoring_results = [ScoringResult(**result) for result in response_data["results"]]
+ info(f"Received {len(scoring_results)} results from API")
  except JudgmentAPIError as e:
  error(f"An error occurred while executing the Judgment API request: {str(e)}")
  raise JudgmentAPIError(f"An error occurred while executing the Judgment API request: {str(e)}")
@@ -359,14 +468,12 @@ def run_sequence_eval(sequence_run: SequenceRun, override: bool = False, ignore_

  # Convert the response data to `ScoringResult` objects
  debug("Processing API results")
- api_results = []
- for result in response_data["results"]:
- api_results.append(ScoringResult(**result))
-
  # TODO: allow for custom scorer on sequences
  if sequence_run.log_results:
- pretty_str = run_with_spinner("Logging Results: ", log_evaluation_results, api_results, sequence_run)
+ pretty_str = run_with_spinner("Logging Results: ", log_evaluation_results, response_data["results"], sequence_run)
  rprint(pretty_str)
+
+ return scoring_results



@@ -404,6 +511,16 @@ def run_eval(evaluation_run: EvaluationRun, override: bool = False, ignore_error
  evaluation_run.organization_id
  )

+ if evaluation_run.append:
+ # Check that the current experiment, if one exists, has the same type (examples of sequences)
+ check_experiment_type(
+ evaluation_run.eval_name,
+ evaluation_run.project_name,
+ evaluation_run.judgment_api_key,
+ evaluation_run.organization_id,
+ False
+ )
+
  # Set example IDs if not already set
  debug("Initializing examples with IDs and timestamps")
  for idx, example in enumerate(evaluation_run.examples):
@@ -539,7 +656,8 @@ def run_eval(evaluation_run: EvaluationRun, override: bool = False, ignore_error
  # )
  # print(merged_results)
  if evaluation_run.log_results:
- pretty_str = run_with_spinner("Logging Results: ", log_evaluation_results, merged_results, evaluation_run)
+ send_results = [scoring_result.model_dump(warnings=False) for scoring_result in merged_results]
+ pretty_str = run_with_spinner("Logging Results: ", log_evaluation_results, send_results, evaluation_run)
  rprint(pretty_str)

  for i, result in enumerate(merged_results):
@@ -564,34 +682,31 @@ def assert_test(scoring_results: List[ScoringResult]) -> None:

  # Create a test case context with all relevant fields
  test_case = {
- 'input': result.data_object.input,
- 'actual_output': result.data_object.actual_output,
- 'expected_output': result.data_object.expected_output,
- 'context': result.data_object.context,
- 'retrieval_context': result.data_object.retrieval_context,
- 'additional_metadata': result.data_object.additional_metadata,
- 'tools_called': result.data_object.tools_called,
- 'expected_tools': result.data_object.expected_tools,
- 'failed_scorers': []
+ "failed_scorers": []
  }
  if result.scorers_data:
  # If the result was not successful, check each scorer_data
  for scorer_data in result.scorers_data:
  if not scorer_data.success:
+ if scorer_data.name == "Tool Order":
+ # Remove threshold, evaluation model for Tool Order scorer
+ scorer_data.threshold = None
+ scorer_data.evaluation_model = None
  test_case['failed_scorers'].append(scorer_data)
  failed_cases.append(test_case)

  if failed_cases:
+
  error_msg = f"The following test cases failed: \n"
  for fail_case in failed_cases:
- error_msg += f"\nInput: {fail_case['input']}\n"
- error_msg += f"Actual Output: {fail_case['actual_output']}\n"
- error_msg += f"Expected Output: {fail_case['expected_output']}\n"
- error_msg += f"Context: {fail_case['context']}\n"
- error_msg += f"Retrieval Context: {fail_case['retrieval_context']}\n"
- error_msg += f"Additional Metadata: {fail_case['additional_metadata']}\n"
- error_msg += f"Tools Called: {fail_case['tools_called']}\n"
- error_msg += f"Expected Tools: {fail_case['expected_tools']}\n"
+ # error_msg += f"\nInput: {fail_case['input']}\n"
+ # error_msg += f"Actual Output: {fail_case['actual_output']}\n"
+ # error_msg += f"Expected Output: {fail_case['expected_output']}\n"
+ # error_msg += f"Context: {fail_case['context']}\n"
+ # error_msg += f"Retrieval Context: {fail_case['retrieval_context']}\n"
+ # error_msg += f"Additional Metadata: {fail_case['additional_metadata']}\n"
+ # error_msg += f"Tools Called: {fail_case['tools_called']}\n"
+ # error_msg += f"Expected Tools: {fail_case['expected_tools']}\n"

  for fail_scorer in fail_case['failed_scorers']:

@@ -609,6 +724,37 @@ def assert_test(scoring_results: List[ScoringResult]) -> None:
  f"Additional Metadata: {fail_scorer.additional_metadata}\n"
  )
  error_msg += "-"*100
-
- raise AssertionError(error_msg)
+
+ total_tests = len(scoring_results)
+ failed_tests = len(failed_cases)
+ passed_tests = total_tests - failed_tests
+
+ # Print summary with colors
+ rprint("\n" + "="*80)
+ if failed_tests == 0:
+ rprint(f"[bold green]🎉 ALL TESTS PASSED! {passed_tests}/{total_tests} tests successful[/bold green]")
+ else:
+ rprint(f"[bold red]⚠️ TEST RESULTS: {passed_tests}/{total_tests} passed ({failed_tests} failed)[/bold red]")
+ rprint("="*80 + "\n")
+
+ # Print individual test cases
+ for i, result in enumerate(scoring_results):
+ test_num = i + 1
+ if result.success:
+ rprint(f"[green]✓ Test {test_num}: PASSED[/green]")
+ else:
+ rprint(f"[red]✗ Test {test_num}: FAILED[/red]")
+ if result.scorers_data:
+ for scorer_data in result.scorers_data:
+ if not scorer_data.success:
+ rprint(f" [yellow]Scorer: {scorer_data.name}[/yellow]")
+ rprint(f" [red] Score: {scorer_data.score}[/red]")
+ rprint(f" [red] Reason: {scorer_data.reason}[/red]")
+ if scorer_data.error:
+ rprint(f" [red] Error: {scorer_data.error}[/red]")
+ rprint(" " + "-"*40)
+
+ rprint("\n" + "="*80)
+ if failed_tests > 0:
+ raise AssertionError(failed_cases)

@@ -16,6 +16,7 @@ from judgeval.scorers.judgeval_scorers.api_scorers import (
  InstructionAdherenceScorer,
  GroundednessScorer,
  DerailmentScorer,
+ ToolOrderScorer,
  )
  from judgeval.scorers.judgeval_scorers.classifiers import (
  Text2SQLScorer,
@@ -41,4 +42,5 @@ __all__ = [
  "InstructionAdherenceScorer",
  "GroundednessScorer",
  "DerailmentScorer",
+ "ToolOrderScorer",
  ]
@@ -12,6 +12,7 @@ from judgeval.scorers.judgeval_scorers.api_scorers.comparison import ComparisonS
  from judgeval.scorers.judgeval_scorers.api_scorers.instruction_adherence import InstructionAdherenceScorer
  from judgeval.scorers.judgeval_scorers.api_scorers.groundedness import GroundednessScorer
  from judgeval.scorers.judgeval_scorers.api_scorers.derailment_scorer import DerailmentScorer
+ from judgeval.scorers.judgeval_scorers.api_scorers.tool_order import ToolOrderScorer
  __all__ = [
  "ExecutionOrderScorer",
  "JSONCorrectnessScorer",
@@ -27,4 +28,5 @@ __all__ = [
  "InstructionAdherenceScorer",
  "GroundednessScorer",
  "DerailmentScorer",
+ "ToolOrderScorer",
  ]
@@ -0,0 +1,18 @@
+ """
+ `judgeval` tool order scorer
+ """
+
+ # Internal imports
+ from judgeval.scorers.api_scorer import APIJudgmentScorer
+ from judgeval.constants import APIScorer
+
+ class ToolOrderScorer(APIJudgmentScorer):
+ def __init__(self, threshold: float=1.0):
+ super().__init__(
+ threshold=threshold,
+ score_type=APIScorer.TOOL_ORDER,
+ )
+
+ @property
+ def __name__(self):
+ return "Tool Order"
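
The new ToolOrderScorer is an API-backed scorer, so client code only supplies a threshold; its scoring runs server-side and the semantics beyond the 1.0 default are not spelled out in this diff. A short hedged sketch:

from judgeval.scorers import ToolOrderScorer

scorer = ToolOrderScorer()   # defaults to threshold=1.0
print(scorer.__name__)       # "Tool Order", matching the name the updated assert_test checks when it hides threshold/model fields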