judgeval 0.0.36__py3-none-any.whl → 0.0.38__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -12,7 +12,7 @@ BASE_CONVERSATION = [
 
 
 class LiteLLMJudge(JudgevalJudge):
-    def __init__(self, model: str = "gpt-4o-mini", **kwargs):
+    def __init__(self, model: str = "gpt-4.1-mini", **kwargs):
         debug(f"Initializing LiteLLMJudge with model={model}")
         self.model = model
         self.kwargs = kwargs
@@ -136,7 +136,7 @@ class MixtureOfJudges(JudgevalJudge):
     """
     def __init__(self,
                  models: List[str] = ['QWEN', 'LLAMA3_70B_INSTRUCT_TURBO', 'MISTRAL_8x22B_INSTRUCT'],
-                 aggregator: str = 'gpt-4o',
+                 aggregator: str = 'gpt-4.1',
                  **kwargs):
         """
         `models` are the individual judge models to be used for generating responses.
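
Note: both default judge models move off the gpt-4o family in this release: LiteLLMJudge now defaults to "gpt-4.1-mini" and MixtureOfJudges aggregates with "gpt-4.1". A minimal sketch of what the new defaults mean in practice; the import path is assumed from the module layout implied by this diff, not confirmed by it:

    # Hypothetical usage sketch -- the import path is an assumption
    from judgeval.judges import LiteLLMJudge, MixtureOfJudges

    judge = LiteLLMJudge()        # now defaults to model="gpt-4.1-mini"
    panel = MixtureOfJudges()     # judge models unchanged; aggregator now defaults to "gpt-4.1"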
judgeval/judges/utils.py CHANGED
@@ -23,7 +23,7 @@ def create_judge(
     If no model is provided, uses GPT4o as the default judge.
     """
     if model is None: # default option
-        return LiteLLMJudge(model="gpt-4o"), True
+        return LiteLLMJudge(model="gpt-4.1"), True
    if not isinstance(model, (str, list, JudgevalJudge)):
         raise InvalidJudgeModelError(f"Model must be a string, list of strings, or a judgeval judge object. Got: {type(model)} instead.")
     # If model is already a valid judge type, return it and mark native
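
Note: create_judge() keeps the same fallback logic, but when no model is passed it now constructs a gpt-4.1 judge instead of gpt-4o (the docstring above still reads "GPT4o"). A hedged sketch of that fallback, assuming the function lives in judgeval/judges/utils.py as the file header above indicates:

    # Illustrative only; create_judge's full signature is not shown in this diff
    from judgeval.judges.utils import create_judge

    judge, is_native = create_judge(model=None)   # falls back to LiteLLMJudge(model="gpt-4.1"), True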
@@ -2,7 +2,8 @@
 Implements the JudgmentClient to interact with the Judgment API.
 """
 import os
-from typing import Optional, List, Dict, Any, Union
+from uuid import uuid4
+from typing import Optional, List, Dict, Any, Union, Callable
 import requests
 
 from judgeval.constants import ROOT_API
@@ -11,7 +12,7 @@ from judgeval.data import (
     ScoringResult,
     Example,
     CustomExample,
-    Sequence,
+    Trace,
 )
 from judgeval.scorers import (
     APIJudgmentScorer,
@@ -22,9 +23,9 @@ from judgeval.evaluation_run import EvaluationRun
 from judgeval.run_evaluation import (
     run_eval,
     assert_test,
-    run_sequence_eval
+    run_trace_eval
 )
-from judgeval.data.sequence_run import SequenceRun
+from judgeval.data.trace_run import TraceRun
 from judgeval.judges import JudgevalJudge
 from judgeval.constants import (
     JUDGMENT_EVAL_FETCH_API_URL,
@@ -33,7 +34,11 @@ from judgeval.constants import (
     JUDGMENT_PROJECT_DELETE_API_URL,
     JUDGMENT_PROJECT_CREATE_API_URL
 )
+from judgeval.utils.data_utils import add_from_yaml
 from judgeval.common.exceptions import JudgmentAPIError
+from langchain_core.callbacks import BaseCallbackHandler
+from judgeval.common.tracer import Tracer
+from judgeval.common.utils import validate_api_key
 from pydantic import BaseModel
 from judgeval.rules import Rule
 
@@ -63,7 +68,7 @@ class JudgmentClient(metaclass=SingletonMeta):
         self.eval_dataset_client = EvalDatasetClient(judgment_api_key, organization_id)
 
         # Verify API key is valid
-        result, response = self._validate_api_key()
+        result, response = validate_api_key(judgment_api_key)
         if not result:
             # May be bad to output their invalid API key...
             raise JudgmentAPIError(f"Issue with passed in Judgment API key: {response}")
@@ -74,7 +79,7 @@
         self,
         examples: List[Example],
         scorers: List[Union[APIJudgmentScorer, JudgevalScorer]],
-        model: Union[str, List[str], JudgevalJudge],
+        model: Optional[Union[str, List[str], JudgevalJudge]] = "gpt-4.1",
         aggregator: Optional[str] = None,
         metadata: Optional[Dict[str, Any]] = None,
         log_results: bool = True,
@@ -100,54 +105,56 @@
             rules=rules
         )
 
-    def run_sequence_evaluation(
+    def run_trace_evaluation(
         self,
-        sequences: List[Sequence],
-        model: Union[str, List[str], JudgevalJudge],
         scorers: List[Union[APIJudgmentScorer, JudgevalScorer]],
+        model: Optional[Union[str, List[str], JudgevalJudge]] = "gpt-4.1",
+        traces: Optional[List[Trace]] = None,
+        examples: Optional[List[Example]] = None,
+        test_file: Optional[str] = None,
         aggregator: Optional[str] = None,
         project_name: str = "default_project",
-        eval_run_name: str = "default_eval_sequence",
+        eval_run_name: str = "default_eval_trace",
         log_results: bool = True,
         append: bool = False,
         override: bool = False,
         ignore_errors: bool = True,
-        rules: Optional[List[Rule]] = None
+        rules: Optional[List[Rule]] = None,
+        function: Optional[Callable] = None,
+        tracer: Optional[Union[Tracer, BaseCallbackHandler]] = None
     ) -> List[ScoringResult]:
-        try:
-            def get_all_sequences(root: Sequence) -> List[Sequence]:
-                all_sequences = [root]
-
-                for item in root.items:
-                    if isinstance(item, Sequence):
-                        all_sequences.extend(get_all_sequences(item))
-
-                return all_sequences
-
-            def flatten_sequence_list(sequences: List[Sequence]) -> List[Sequence]:
-                flattened = []
-                for seq in sequences:
-                    flattened.extend(get_all_sequences(seq))
-                return flattened
+        try:
+
+            if test_file:
+                try:
+                    examples = add_from_yaml(test_file)
+                except FileNotFoundError:
+                    raise FileNotFoundError(f"Test file not found: {test_file}")
+
+            if examples and not function:
+                raise ValueError("Cannot pass in examples without a function")
+
+            if traces and function:
+                raise ValueError("Cannot pass in traces and function")
 
-            flattened_sequences = flatten_sequence_list(sequences)
-            for sequence in flattened_sequences:
-                sequence.scorers = scorers
-
-            sequence_run = SequenceRun(
+            if examples and traces:
+                raise ValueError("Cannot pass in both examples and traces")
+
+            trace_run = TraceRun(
                 project_name=project_name,
                 eval_name=eval_run_name,
-                sequences=sequences,
+                traces=traces,
+                scorers=scorers,
                 model=model,
                 aggregator=aggregator,
                 log_results=log_results,
                 append=append,
                 judgment_api_key=self.judgment_api_key,
-                organization_id=self.organization_id
+                organization_id=self.organization_id,
             )
-            return run_sequence_eval(sequence_run, override, ignore_errors)
+            return run_trace_eval(trace_run, override, ignore_errors, function, tracer, examples)
         except ValueError as e:
-            raise ValueError(f"Please check your SequenceRun object, one or more fields are invalid: \n{str(e)}")
+            raise ValueError(f"Please check your TraceRun object, one or more fields are invalid: \n{str(e)}")
         except Exception as e:
             raise Exception(f"An unexpected error occurred during evaluation: {str(e)}")
 
@@ -155,7 +162,7 @@
         self,
         examples: Union[List[Example], List[CustomExample]],
         scorers: List[Union[APIJudgmentScorer, JudgevalScorer]],
-        model: Union[str, List[str], JudgevalJudge],
+        model: Optional[Union[str, List[str], JudgevalJudge]] = "gpt-4.1",
         aggregator: Optional[str] = None,
         metadata: Optional[Dict[str, Any]] = None,
         log_results: bool = True,
@@ -238,12 +245,6 @@
         """
         return self.eval_dataset_client.append_examples(alias, examples, project_name)
 
-    def append_sequence_dataset(self, alias: str, sequences: List[Sequence], project_name: str) -> bool:
-        """
-        Appends a `Sequence` to the Judgment platform for storage.
-        """
-        return self.eval_dataset_client.append_sequences(alias, sequences, project_name)
-
     def pull_dataset(self, alias: str, project_name: str) -> EvalDataset:
         """
         Retrieves a saved `EvalDataset` from the Judgment platform.
@@ -396,24 +397,6 @@
             raise ValueError(f"Error deleting project: {response.json()}")
         return response.json()
 
-    def _validate_api_key(self):
-        """
-        Validates that the user api key is valid
-        """
-        response = requests.post(
-            f"{ROOT_API}/validate_api_key/",
-            headers={
-                "Content-Type": "application/json",
-                "Authorization": f"Bearer {self.judgment_api_key}",
-            },
-            json={}, # Empty body now
-            verify=True
-        )
-        if response.status_code == 200:
-            return True, response.json()
-        else:
-            return False, response.json().get("detail", "Error validating API key")
-
     def fetch_classifier_scorer(self, slug: str) -> ClassifierScorer:
         """
         Fetches a classifier scorer configuration from the Judgment API.
@@ -499,22 +482,26 @@
 
     def assert_test(
         self,
-        examples: List[Example],
         scorers: List[Union[APIJudgmentScorer, JudgevalScorer]],
-        model: Union[str, List[str], JudgevalJudge],
+        examples: Optional[List[Example]] = None,
+        model: Optional[Union[str, List[str], JudgevalJudge]] = "gpt-4.1",
+        test_file: Optional[str] = None,
         aggregator: Optional[str] = None,
         metadata: Optional[Dict[str, Any]] = None,
         log_results: bool = True,
-        project_name: str = "default_project",
-        eval_run_name: str = "default_eval_run",
+        project_name: str = "default_test",
+        eval_run_name: str = str(uuid4()),
         override: bool = False,
-        rules: Optional[List[Rule]] = None
+        rules: Optional[List[Rule]] = None,
+        function: Optional[Callable] = None,
+        tracer: Optional[Union[Tracer, BaseCallbackHandler]] = None
     ) -> None:
         """
         Asserts a test by running the evaluation and checking the results for success
 
         Args:
-            examples (List[Example]): The examples to evaluate
+            examples (Optional[List[Example]]): The examples to evaluate. Must be provided if test_file is not.
+            test_file (Optional[str]): Path to a YAML file containing test examples. Must be provided if examples is not.
             scorers (List[Union[APIJudgmentScorer, JudgevalScorer]]): A list of scorers to use for evaluation
             model (Union[str, List[str], JudgevalJudge]): The model used as a judge when using LLM as a Judge
             aggregator (Optional[str]): The aggregator to use for evaluation if using Mixture of Judges
@@ -525,17 +512,37 @@
             override (bool): Whether to override an existing evaluation run with the same name
             rules (Optional[List[Rule]]): Rules to evaluate against scoring results
         """
-        results = self.run_evaluation(
-            examples=examples,
-            scorers=scorers,
-            model=model,
-            aggregator=aggregator,
-            metadata=metadata,
-            log_results=log_results,
-            project_name=project_name,
-            eval_run_name=eval_run_name,
-            override=override,
-            rules=rules
-        )
+        # Validate that exactly one of examples or test_file is provided
+        if (examples is None and test_file is None) or (examples is not None and test_file is not None):
+            raise ValueError("Exactly one of 'examples' or 'test_file' must be provided, but not both")
+
+        if function:
+            results = self.run_trace_evaluation(
+                examples=examples,
+                scorers=scorers,
+                model=model,
+                aggregator=aggregator,
+                log_results=log_results,
+                project_name=project_name,
+                eval_run_name=eval_run_name,
+                override=override,
+                rules=rules,
+                function=function,
+                tracer=tracer,
+                test_file=test_file
+            )
+        else:
+            results = self.run_evaluation(
+                examples=examples,
+                scorers=scorers,
+                model=model,
+                aggregator=aggregator,
+                metadata=metadata,
+                log_results=log_results,
+                project_name=project_name,
+                eval_run_name=eval_run_name,
+                override=override,
+                rules=rules
+            )
 
         assert_test(results)
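
Note: assert_test now requires exactly one of examples or test_file, routes through run_trace_evaluation whenever an agent function is supplied, and defaults eval_run_name to str(uuid4()). Because that default is evaluated once when the class body executes, repeated calls that omit eval_run_name share the same generated name within a process. A hedged sketch of the new test_file path; the file path, its YAML schema (defined by add_from_yaml, not shown here), and the Tracer constructor arguments are assumptions:

    # Hypothetical sketch of assert_test with a YAML test file
    from judgeval import JudgmentClient              # assumed top-level import
    from judgeval.common.tracer import Tracer
    from judgeval.scorers import ToolOrderScorer

    client = JudgmentClient()
    tracer = Tracer(project_name="agent_tests")      # constructor arguments are an assumption

    def my_agent(question: str) -> str:              # hypothetical agent under test
        return "placeholder answer"

    client.assert_test(
        scorers=[ToolOrderScorer(exact_match=True)],
        test_file="tests/agent_cases.yaml",          # hypothetical path
        function=my_agent,
        tracer=tracer,
    )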
@@ -4,7 +4,7 @@ import time
 import sys
 import itertools
 import threading
-from typing import List, Dict, Any, Union
+from typing import List, Dict, Any, Union, Optional, Callable
 from datetime import datetime
 from rich import print as rprint
 
@@ -12,7 +12,8 @@ from judgeval.data import (
     ScorerData,
     ScoringResult,
     Example,
-    CustomExample
+    CustomExample,
+    Trace
 )
 from judgeval.scorers import (
     JudgevalScorer,
@@ -23,10 +24,10 @@ from judgeval.scorers.score import a_execute_scoring
 from judgeval.constants import (
     ROOT_API,
     JUDGMENT_EVAL_API_URL,
-    JUDGMENT_SEQUENCE_EVAL_API_URL,
+    JUDGMENT_TRACE_EVAL_API_URL,
     JUDGMENT_EVAL_LOG_API_URL,
     MAX_CONCURRENT_EVALUATIONS,
-    JUDGMENT_ADD_TO_RUN_EVAL_QUEUE_API_URL
+    JUDGMENT_ADD_TO_RUN_EVAL_QUEUE_API_URL,
 )
 from judgeval.common.exceptions import JudgmentAPIError
 from judgeval.common.logger import (
@@ -36,7 +37,9 @@ from judgeval.common.logger import (
     example_logging_context
 )
 from judgeval.evaluation_run import EvaluationRun
-from judgeval.data.sequence_run import SequenceRun
+from judgeval.data.trace_run import TraceRun
+from judgeval.common.tracer import Tracer
+from langchain_core.callbacks import BaseCallbackHandler
 
 def send_to_rabbitmq(evaluation_run: EvaluationRun) -> None:
     """
@@ -93,20 +96,20 @@ def execute_api_eval(evaluation_run: EvaluationRun) -> List[Dict]:
         raise JudgmentAPIError(error_message)
     return response_data
 
-def execute_api_sequence_eval(sequence_run: SequenceRun) -> List[Dict]:
+def execute_api_trace_eval(trace_run: TraceRun) -> List[Dict]:
     """
     Executes an evaluation of a list of `Example`s using one or more `JudgmentScorer`s via the Judgment API.
     """
 
     try:
         # submit API request to execute evals
-        payload = sequence_run.model_dump(warnings=False)
+        payload = trace_run.model_dump(warnings=False)
         response = requests.post(
-            JUDGMENT_SEQUENCE_EVAL_API_URL,
+            JUDGMENT_TRACE_EVAL_API_URL,
             headers={
                 "Content-Type": "application/json",
-                "Authorization": f"Bearer {sequence_run.judgment_api_key}",
-                "X-Organization-Id": sequence_run.organization_id
+                "Authorization": f"Bearer {trace_run.judgment_api_key}",
+                "X-Organization-Id": trace_run.organization_id
             },
             json=payload,
             verify=True
@@ -277,7 +280,7 @@ check_eval_run_name_exists(eval_name: str, project_name: str, judgment_api_k
         raise JudgmentAPIError(f"Failed to check if eval run name exists: {str(e)}")
 
 
-def log_evaluation_results(merged_results: List[ScoringResult], run: Union[EvaluationRun, SequenceRun]) -> str:
+def log_evaluation_results(scoring_results: List[ScoringResult], run: Union[EvaluationRun, TraceRun]) -> str:
     """
     Logs evaluation results to the Judgment API database.
 
@@ -298,7 +301,7 @@
                 "X-Organization-Id": run.organization_id
             },
             json={
-                "results": merged_results,
+                "results": scoring_results,
                 "run": run.model_dump(warnings=False)
             },
             verify=True
@@ -365,46 +368,62 @@ def check_examples(examples: List[Example], scorers: List[Union[APIJudgmentScore
         if missing_params:
             print(f"WARNING: Example {example.example_id} is missing the following parameters: {missing_params} for scorer {scorer.score_type.value}")
 
-def run_sequence_eval(sequence_run: SequenceRun, override: bool = False, ignore_errors: bool = True) -> List[ScoringResult]:
+def run_trace_eval(trace_run: TraceRun, override: bool = False, ignore_errors: bool = True, function: Optional[Callable] = None, tracer: Optional[Union[Tracer, BaseCallbackHandler]] = None, examples: Optional[List[Example]] = None) -> List[ScoringResult]:
     # Call endpoint to check to see if eval run name exists (if we DON'T want to override and DO want to log results)
-    if not override and sequence_run.log_results and not sequence_run.append:
+    if not override and trace_run.log_results and not trace_run.append:
         check_eval_run_name_exists(
-            sequence_run.eval_name,
-            sequence_run.project_name,
-            sequence_run.judgment_api_key,
-            sequence_run.organization_id
+            trace_run.eval_name,
+            trace_run.project_name,
+            trace_run.judgment_api_key,
+            trace_run.organization_id
         )
 
-    if sequence_run.append:
+    if trace_run.append:
         # Check that the current experiment, if one exists, has the same type (examples of sequences)
         check_experiment_type(
-            sequence_run.eval_name,
-            sequence_run.project_name,
-            sequence_run.judgment_api_key,
-            sequence_run.organization_id,
+            trace_run.eval_name,
+            trace_run.project_name,
+            trace_run.judgment_api_key,
+            trace_run.organization_id,
             True
         )
-
 
+    if function and tracer:
+        new_traces: List[Trace] = []
+        tracer.offline_mode = True
+        for example in examples:
+            if example.input:
+                result = run_with_spinner("Running agent function: ", function, **example.input)
+            else:
+                result = run_with_spinner("Running agent function: ", function)
+        for i, trace in enumerate(tracer.traces):
+            # We set the root-level trace span with the expected tools of the Trace
+            trace = Trace(**trace)
+            trace.entries[0].expected_tools = examples[i].expected_tools
+            new_traces.append(trace)
+        trace_run.traces = new_traces
+
     # Execute evaluation using Judgment API
     info("Starting API evaluation")
     try: # execute an EvaluationRun with just JudgmentScorers
         debug("Sending request to Judgment API")
-        response_data: List[Dict] = run_with_spinner("Running Sequence Evaluation: ", execute_api_sequence_eval, sequence_run)
-
-        info(f"Received {len(response_data['results'])} results from API")
+        response_data: List[Dict] = run_with_spinner("Running Trace Evaluation: ", execute_api_trace_eval, trace_run)
+        scoring_results = [ScoringResult(**result) for result in response_data["results"]]
+        info(f"Received {len(scoring_results)} results from API")
     except JudgmentAPIError as e:
         error(f"An error occurred while executing the Judgment API request: {str(e)}")
         raise JudgmentAPIError(f"An error occurred while executing the Judgment API request: {str(e)}")
     except ValueError as e:
-        raise ValueError(f"Please check your SequenceRun object, one or more fields are invalid: {str(e)}")
+        raise ValueError(f"Please check your TraceRun object, one or more fields are invalid: {str(e)}")
 
     # Convert the response data to `ScoringResult` objects
     debug("Processing API results")
-    # TODO: allow for custom scorer on sequences
-    if sequence_run.log_results:
-        pretty_str = run_with_spinner("Logging Results: ", log_evaluation_results, response_data["results"], sequence_run)
+    # TODO: allow for custom scorer on traces
+    if trace_run.log_results:
+        pretty_str = run_with_spinner("Logging Results: ", log_evaluation_results, response_data["results"], trace_run)
         rprint(pretty_str)
+
+    return scoring_results
 
 
 
@@ -587,7 +606,7 @@ def run_eval(evaluation_run: EvaluationRun, override: bool = False, ignore_error
     # )
     # print(merged_results)
     if evaluation_run.log_results:
-        send_results = [result.model_dump(warnings=False) for result in merged_results]
+        send_results = [scoring_result.model_dump(warnings=False) for scoring_result in merged_results]
         pretty_str = run_with_spinner("Logging Results: ", log_evaluation_results, send_results, evaluation_run)
         rprint(pretty_str)
 
@@ -613,34 +632,31 @@ def assert_test(scoring_results: List[ScoringResult]) -> None:
 
             # Create a test case context with all relevant fields
             test_case = {
-                'input': result.data_object.input,
-                'actual_output': result.data_object.actual_output,
-                'expected_output': result.data_object.expected_output,
-                'context': result.data_object.context,
-                'retrieval_context': result.data_object.retrieval_context,
-                'additional_metadata': result.data_object.additional_metadata,
-                'tools_called': result.data_object.tools_called,
-                'expected_tools': result.data_object.expected_tools,
-                'failed_scorers': []
+                "failed_scorers": []
             }
             if result.scorers_data:
                 # If the result was not successful, check each scorer_data
                 for scorer_data in result.scorers_data:
                     if not scorer_data.success:
+                        if scorer_data.name == "Tool Order":
+                            # Remove threshold, evaluation model for Tool Order scorer
+                            scorer_data.threshold = None
+                            scorer_data.evaluation_model = None
                         test_case['failed_scorers'].append(scorer_data)
                 failed_cases.append(test_case)
 
     if failed_cases:
+
         error_msg = f"The following test cases failed: \n"
         for fail_case in failed_cases:
-            error_msg += f"\nInput: {fail_case['input']}\n"
-            error_msg += f"Actual Output: {fail_case['actual_output']}\n"
-            error_msg += f"Expected Output: {fail_case['expected_output']}\n"
-            error_msg += f"Context: {fail_case['context']}\n"
-            error_msg += f"Retrieval Context: {fail_case['retrieval_context']}\n"
-            error_msg += f"Additional Metadata: {fail_case['additional_metadata']}\n"
-            error_msg += f"Tools Called: {fail_case['tools_called']}\n"
-            error_msg += f"Expected Tools: {fail_case['expected_tools']}\n"
+            # error_msg += f"\nInput: {fail_case['input']}\n"
+            # error_msg += f"Actual Output: {fail_case['actual_output']}\n"
+            # error_msg += f"Expected Output: {fail_case['expected_output']}\n"
+            # error_msg += f"Context: {fail_case['context']}\n"
+            # error_msg += f"Retrieval Context: {fail_case['retrieval_context']}\n"
+            # error_msg += f"Additional Metadata: {fail_case['additional_metadata']}\n"
+            # error_msg += f"Tools Called: {fail_case['tools_called']}\n"
+            # error_msg += f"Expected Tools: {fail_case['expected_tools']}\n"
 
             for fail_scorer in fail_case['failed_scorers']:
 
@@ -658,6 +674,37 @@ def assert_test(scoring_results: List[ScoringResult]) -> None:
                     f"Additional Metadata: {fail_scorer.additional_metadata}\n"
                 )
             error_msg += "-"*100
-
-        raise AssertionError(error_msg)
+
+    total_tests = len(scoring_results)
+    failed_tests = len(failed_cases)
+    passed_tests = total_tests - failed_tests
+
+    # Print summary with colors
+    rprint("\n" + "="*80)
+    if failed_tests == 0:
+        rprint(f"[bold green]🎉 ALL TESTS PASSED! {passed_tests}/{total_tests} tests successful[/bold green]")
+    else:
+        rprint(f"[bold red]⚠️ TEST RESULTS: {passed_tests}/{total_tests} passed ({failed_tests} failed)[/bold red]")
+    rprint("="*80 + "\n")
+
+    # Print individual test cases
+    for i, result in enumerate(scoring_results):
+        test_num = i + 1
+        if result.success:
+            rprint(f"[green]✓ Test {test_num}: PASSED[/green]")
+        else:
+            rprint(f"[red]✗ Test {test_num}: FAILED[/red]")
+            if result.scorers_data:
+                for scorer_data in result.scorers_data:
+                    if not scorer_data.success:
+                        rprint(f" [yellow]Scorer: {scorer_data.name}[/yellow]")
+                        rprint(f" [red] Score: {scorer_data.score}[/red]")
+                        rprint(f" [red] Reason: {scorer_data.reason}[/red]")
+                        if scorer_data.error:
+                            rprint(f" [red] Error: {scorer_data.error}[/red]")
+            rprint(" " + "-"*40)
+
+    rprint("\n" + "="*80)
+    if failed_tests > 0:
+        raise AssertionError(failed_cases)
 
@@ -16,6 +16,7 @@ from judgeval.scorers.judgeval_scorers.api_scorers import (
     InstructionAdherenceScorer,
     GroundednessScorer,
     DerailmentScorer,
+    ToolOrderScorer,
 )
 from judgeval.scorers.judgeval_scorers.classifiers import (
     Text2SQLScorer,
@@ -41,4 +42,5 @@ __all__ = [
     "InstructionAdherenceScorer",
     "GroundednessScorer",
     "DerailmentScorer",
+    "ToolOrderScorer",
 ]
@@ -12,6 +12,7 @@ from judgeval.scorers.judgeval_scorers.api_scorers.comparison import ComparisonS
 from judgeval.scorers.judgeval_scorers.api_scorers.instruction_adherence import InstructionAdherenceScorer
 from judgeval.scorers.judgeval_scorers.api_scorers.groundedness import GroundednessScorer
 from judgeval.scorers.judgeval_scorers.api_scorers.derailment_scorer import DerailmentScorer
+from judgeval.scorers.judgeval_scorers.api_scorers.tool_order import ToolOrderScorer
 __all__ = [
     "ExecutionOrderScorer",
     "JSONCorrectnessScorer",
@@ -27,4 +28,5 @@ __all__ = [
     "InstructionAdherenceScorer",
     "GroundednessScorer",
     "DerailmentScorer",
+    "ToolOrderScorer",
 ]
@@ -0,0 +1,20 @@
+"""
+`judgeval` tool order scorer
+"""
+
+# Internal imports
+from judgeval.scorers.api_scorer import APIJudgmentScorer
+from judgeval.constants import APIScorer
+from typing import Optional, Dict
+class ToolOrderScorer(APIJudgmentScorer):
+    kwargs: Optional[Dict] = None
+    def __init__(self, threshold: float=1.0, exact_match: bool=False):
+        super().__init__(
+            threshold=threshold,
+            score_type=APIScorer.TOOL_ORDER,
+        )
+        self.kwargs = {"exact_match": exact_match}
+
+    @property
+    def __name__(self):
+        return "Tool Order"
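
Note: ToolOrderScorer is a new API-side scorer; the exact_match flag is not a regular scorer field but is tucked into a kwargs dict that travels with the API payload. A minimal usage sketch; the precise matching semantics of exact_match are an assumption inferred from the name and the default threshold of 1.0:

    from judgeval.scorers import ToolOrderScorer

    strict = ToolOrderScorer(exact_match=True)    # presumably requires the exact expected tool sequence
    lenient = ToolOrderScorer()                   # threshold=1.0, exact_match=False by default
    print(strict.__name__)                        # "Tool Order"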
judgeval/scorers/score.py CHANGED
@@ -243,7 +243,7 @@ async def score_with_indicator(
 async def a_execute_scoring(
     examples: Union[List[Example], List[CustomExample]],
     scorers: List[JudgevalScorer],
-    model: Optional[Union[str, List[str], JudgevalJudge]] = None,
+    model: Optional[Union[str, List[str], JudgevalJudge]] = "gpt-4.1",
     ignore_errors: bool = True,
     skip_on_missing_params: bool = True,
     show_indicator: bool = True,