judgeval 0.0.36__py3-none-any.whl → 0.0.37__py3-none-any.whl

This diff shows the contents of two publicly released versions of the package as they appear in their public registry. It is provided for informational purposes only and reflects the changes between those published versions.
@@ -2,7 +2,8 @@
  Implements the JudgmentClient to interact with the Judgment API.
  """
  import os
- from typing import Optional, List, Dict, Any, Union
+ from uuid import uuid4
+ from typing import Optional, List, Dict, Any, Union, Callable
  import requests

  from judgeval.constants import ROOT_API
@@ -33,7 +34,11 @@ from judgeval.constants import (
  JUDGMENT_PROJECT_DELETE_API_URL,
  JUDGMENT_PROJECT_CREATE_API_URL
  )
+ from judgeval.utils.data_utils import add_from_yaml
  from judgeval.common.exceptions import JudgmentAPIError
+ from langchain_core.callbacks import BaseCallbackHandler
+ from judgeval.common.tracer import Tracer
+ from judgeval.common.utils import validate_api_key
  from pydantic import BaseModel
  from judgeval.rules import Rule

@@ -63,7 +68,7 @@ class JudgmentClient(metaclass=SingletonMeta):
  self.eval_dataset_client = EvalDatasetClient(judgment_api_key, organization_id)

  # Verify API key is valid
- result, response = self._validate_api_key()
+ result, response = validate_api_key(judgment_api_key)
  if not result:
  # May be bad to output their invalid API key...
  raise JudgmentAPIError(f"Issue with passed in Judgment API key: {response}")
@@ -74,7 +79,7 @@ class JudgmentClient(metaclass=SingletonMeta):
  self,
  examples: List[Example],
  scorers: List[Union[APIJudgmentScorer, JudgevalScorer]],
- model: Union[str, List[str], JudgevalJudge],
+ model: Optional[Union[str, List[str], JudgevalJudge]] = "gpt-4.1",
  aggregator: Optional[str] = None,
  metadata: Optional[Dict[str, Any]] = None,
  log_results: bool = True,
@@ -102,9 +107,11 @@ class JudgmentClient(metaclass=SingletonMeta):

  def run_sequence_evaluation(
  self,
- sequences: List[Sequence],
- model: Union[str, List[str], JudgevalJudge],
  scorers: List[Union[APIJudgmentScorer, JudgevalScorer]],
+ model: Optional[Union[str, List[str], JudgevalJudge]] = "gpt-4.1",
+ sequences: Optional[List[Sequence]] = None,
+ examples: Optional[List[Example]] = None,
+ test_file: Optional[str] = None,
  aggregator: Optional[str] = None,
  project_name: str = "default_project",
  eval_run_name: str = "default_eval_sequence",
@@ -112,40 +119,40 @@ class JudgmentClient(metaclass=SingletonMeta):
  append: bool = False,
  override: bool = False,
  ignore_errors: bool = True,
- rules: Optional[List[Rule]] = None
+ rules: Optional[List[Rule]] = None,
+ function: Optional[Callable] = None,
+ tracer: Optional[Union[Tracer, BaseCallbackHandler]] = None
  ) -> List[ScoringResult]:
- try:
- def get_all_sequences(root: Sequence) -> List[Sequence]:
- all_sequences = [root]
-
- for item in root.items:
- if isinstance(item, Sequence):
- all_sequences.extend(get_all_sequences(item))
-
- return all_sequences
-
- def flatten_sequence_list(sequences: List[Sequence]) -> List[Sequence]:
- flattened = []
- for seq in sequences:
- flattened.extend(get_all_sequences(seq))
- return flattened
+ try:
+
+ if test_file:
+ try:
+ examples = add_from_yaml(test_file)
+ except FileNotFoundError:
+ raise FileNotFoundError(f"Test file not found: {test_file}")
+
+ if examples and not function:
+ raise ValueError("Cannot pass in examples without a function")
+
+ if sequences and function:
+ raise ValueError("Cannot pass in sequences and function")
+
+ if examples and sequences:
+ raise ValueError("Cannot pass in both examples and sequences")

- flattened_sequences = flatten_sequence_list(sequences)
- for sequence in flattened_sequences:
- sequence.scorers = scorers
-
  sequence_run = SequenceRun(
  project_name=project_name,
  eval_name=eval_run_name,
  sequences=sequences,
+ scorers=scorers,
  model=model,
  aggregator=aggregator,
  log_results=log_results,
  append=append,
  judgment_api_key=self.judgment_api_key,
- organization_id=self.organization_id
+ organization_id=self.organization_id,
  )
- return run_sequence_eval(sequence_run, override, ignore_errors)
+ return run_sequence_eval(sequence_run, override, ignore_errors, function, tracer, examples)
  except ValueError as e:
  raise ValueError(f"Please check your SequenceRun object, one or more fields are invalid: \n{str(e)}")
  except Exception as e:
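
Taken together, these changes let run_sequence_evaluation build sequences on the fly: instead of pre-built Sequence objects, the caller can pass plain examples plus an agent function and a tracer (or a YAML test_file), and run_sequence_eval assembles the sequences from the captured traces. A minimal sketch of the new call path, assuming API credentials are configured in the environment; the agent function, tracer wiring, and Tracer constructor arguments are illustrative, not taken from this diff:

    from judgeval import JudgmentClient           # assumed top-level export
    from judgeval.common.tracer import Tracer
    from judgeval.data import Example
    from judgeval.scorers import ToolOrderScorer  # assumed re-export path

    client = JudgmentClient()
    tracer = Tracer(project_name="demo")          # assumed constructor signature

    def book_trip(destination: str) -> str:
        # Hypothetical agent under test; assumes its tool calls are instrumented
        # so that they show up in tracer.traces.
        return f"Booked a trip to {destination}"

    # Inputs are dicts because run_sequence_eval unpacks example.input as keyword arguments.
    examples = [
        Example(
            input={"destination": "Paris"},
            expected_tools=[{"tool_name": "search_flights", "parameters": {"query": "Paris"}}],
        )
    ]

    results = client.run_sequence_evaluation(
        scorers=[ToolOrderScorer()],
        examples=examples,
        function=book_trip,
        tracer=tracer,
        project_name="demo",
        eval_run_name="sequence_smoke_test",
    )
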
@@ -155,7 +162,7 @@ class JudgmentClient(metaclass=SingletonMeta):
  self,
  examples: Union[List[Example], List[CustomExample]],
  scorers: List[Union[APIJudgmentScorer, JudgevalScorer]],
- model: Union[str, List[str], JudgevalJudge],
+ model: Optional[Union[str, List[str], JudgevalJudge]] = "gpt-4.1",
  aggregator: Optional[str] = None,
  metadata: Optional[Dict[str, Any]] = None,
  log_results: bool = True,
@@ -396,24 +403,6 @@ class JudgmentClient(metaclass=SingletonMeta):
  raise ValueError(f"Error deleting project: {response.json()}")
  return response.json()

- def _validate_api_key(self):
- """
- Validates that the user api key is valid
- """
- response = requests.post(
- f"{ROOT_API}/validate_api_key/",
- headers={
- "Content-Type": "application/json",
- "Authorization": f"Bearer {self.judgment_api_key}",
- },
- json={}, # Empty body now
- verify=True
- )
- if response.status_code == 200:
- return True, response.json()
- else:
- return False, response.json().get("detail", "Error validating API key")
-
  def fetch_classifier_scorer(self, slug: str) -> ClassifierScorer:
  """
  Fetches a classifier scorer configuration from the Judgment API.
@@ -499,22 +488,26 @@ class JudgmentClient(metaclass=SingletonMeta):

  def assert_test(
  self,
- examples: List[Example],
  scorers: List[Union[APIJudgmentScorer, JudgevalScorer]],
- model: Union[str, List[str], JudgevalJudge],
+ examples: Optional[List[Example]] = None,
+ model: Optional[Union[str, List[str], JudgevalJudge]] = "gpt-4.1",
+ test_file: Optional[str] = None,
  aggregator: Optional[str] = None,
  metadata: Optional[Dict[str, Any]] = None,
  log_results: bool = True,
- project_name: str = "default_project",
- eval_run_name: str = "default_eval_run",
+ project_name: str = "default_test",
+ eval_run_name: str = str(uuid4()),
  override: bool = False,
- rules: Optional[List[Rule]] = None
+ rules: Optional[List[Rule]] = None,
+ function: Optional[Callable] = None,
+ tracer: Optional[Union[Tracer, BaseCallbackHandler]] = None
  ) -> None:
  """
  Asserts a test by running the evaluation and checking the results for success

  Args:
- examples (List[Example]): The examples to evaluate
+ examples (Optional[List[Example]]): The examples to evaluate. Must be provided if test_file is not.
+ test_file (Optional[str]): Path to a YAML file containing test examples. Must be provided if examples is not.
  scorers (List[Union[APIJudgmentScorer, JudgevalScorer]]): A list of scorers to use for evaluation
  model (Union[str, List[str], JudgevalJudge]): The model used as a judge when using LLM as a Judge
  aggregator (Optional[str]): The aggregator to use for evaluation if using Mixture of Judges
@@ -525,17 +518,37 @@ class JudgmentClient(metaclass=SingletonMeta):
  override (bool): Whether to override an existing evaluation run with the same name
  rules (Optional[List[Rule]]): Rules to evaluate against scoring results
  """
- results = self.run_evaluation(
- examples=examples,
- scorers=scorers,
- model=model,
- aggregator=aggregator,
- metadata=metadata,
- log_results=log_results,
- project_name=project_name,
- eval_run_name=eval_run_name,
- override=override,
- rules=rules
- )
+ # Validate that exactly one of examples or test_file is provided
+ if (examples is None and test_file is None) or (examples is not None and test_file is not None):
+ raise ValueError("Exactly one of 'examples' or 'test_file' must be provided, but not both")
+
+ if function:
+ results = self.run_sequence_evaluation(
+ examples=examples,
+ scorers=scorers,
+ model=model,
+ aggregator=aggregator,
+ log_results=log_results,
+ project_name=project_name,
+ eval_run_name=eval_run_name,
+ override=override,
+ rules=rules,
+ function=function,
+ tracer=tracer,
+ test_file=test_file
+ )
+ else:
+ results = self.run_evaluation(
+ examples=examples,
+ scorers=scorers,
+ model=model,
+ aggregator=aggregator,
+ metadata=metadata,
+ log_results=log_results,
+ project_name=project_name,
+ eval_run_name=eval_run_name,
+ override=override,
+ rules=rules
+ )

  assert_test(results)
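
The net effect on assert_test: exactly one of examples or test_file must be supplied, and whenever a function is passed the call is routed through run_sequence_evaluation (otherwise it falls back to run_evaluation as before). A hedged sketch of the test-file path, with an illustrative file name, agent function, and tracer setup:

    from judgeval import JudgmentClient           # assumed top-level export
    from judgeval.common.tracer import Tracer
    from judgeval.scorers import ToolOrderScorer  # assumed re-export path

    client = JudgmentClient()
    tracer = Tracer(project_name="demo_tests")    # assumed constructor signature

    def book_trip(destination: str) -> str:
        # Hypothetical agent under test, instrumented so the tracer records its tool calls.
        return f"Booked a trip to {destination}"

    client.assert_test(
        scorers=[ToolOrderScorer()],
        test_file="tests.yaml",      # parsed into Example objects by add_from_yaml
        function=book_trip,
        tracer=tracer,
        project_name="demo_tests",   # the default project_name is now "default_test"
    )                                # eval_run_name now defaults to a generated UUID
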
@@ -4,7 +4,7 @@ import time
  import sys
  import itertools
  import threading
- from typing import List, Dict, Any, Union
+ from typing import List, Dict, Any, Union, Optional, Callable
  from datetime import datetime
  from rich import print as rprint

@@ -12,7 +12,9 @@ from judgeval.data import (
  ScorerData,
  ScoringResult,
  Example,
- CustomExample
+ CustomExample,
+ Sequence,
+ Trace
  )
  from judgeval.scorers import (
  JudgevalScorer,
@@ -26,7 +28,8 @@ from judgeval.constants import (
  JUDGMENT_SEQUENCE_EVAL_API_URL,
  JUDGMENT_EVAL_LOG_API_URL,
  MAX_CONCURRENT_EVALUATIONS,
- JUDGMENT_ADD_TO_RUN_EVAL_QUEUE_API_URL
+ JUDGMENT_ADD_TO_RUN_EVAL_QUEUE_API_URL,
+ JUDGMENT_RETRIEVE_SEQUENCE_FROM_TRACE_API_URL
  )
  from judgeval.common.exceptions import JudgmentAPIError
  from judgeval.common.logger import (
@@ -37,6 +40,8 @@ from judgeval.common.logger import (
  )
  from judgeval.evaluation_run import EvaluationRun
  from judgeval.data.sequence_run import SequenceRun
+ from judgeval.common.tracer import Tracer
+ from langchain_core.callbacks import BaseCallbackHandler

  def send_to_rabbitmq(evaluation_run: EvaluationRun) -> None:
  """
@@ -277,7 +282,7 @@ def check_eval_run_name_exists(eval_name: str, project_name: str, judgment_api_k
  raise JudgmentAPIError(f"Failed to check if eval run name exists: {str(e)}")


- def log_evaluation_results(merged_results: List[ScoringResult], run: Union[EvaluationRun, SequenceRun]) -> str:
+ def log_evaluation_results(scoring_results: List[ScoringResult], run: Union[EvaluationRun, SequenceRun]) -> str:
  """
  Logs evaluation results to the Judgment API database.

@@ -298,7 +303,7 @@ def log_evaluation_results(merged_results: List[ScoringResult], run: Union[Evalu
  "X-Organization-Id": run.organization_id
  },
  json={
- "results": merged_results,
+ "results": scoring_results,
  "run": run.model_dump(warnings=False)
  },
  verify=True
@@ -322,6 +327,51 @@ def log_evaluation_results(merged_results: List[ScoringResult], run: Union[Evalu
  error(f"Failed to save evaluation results to DB: {str(e)}")
  raise ValueError(f"Failed to save evaluation results to DB: {str(e)}")

+ def retrieve_sequence_from_trace(trace_id: str, parent_span: str, judgment_api_key: str, organization_id: str) -> Sequence:
+ """
+ Retrieves a sequence from a trace ID.
+ """
+ """
+ Logs evaluation results to the Judgment API database.
+
+ Args:
+ merged_results (List[ScoringResult]): The results to log
+ evaluation_run (EvaluationRun): The evaluation run containing project info and API key
+
+ Raises:
+ JudgmentAPIError: If there's an API error during logging
+ ValueError: If there's a validation error with the results
+ """
+ try:
+ res = requests.post(
+ JUDGMENT_RETRIEVE_SEQUENCE_FROM_TRACE_API_URL,
+ headers={
+ "Content-Type": "application/json",
+ "Authorization": f"Bearer {judgment_api_key}",
+ "X-Organization-Id": organization_id
+ },
+ json={
+ "trace_id": trace_id,
+ "trace_span_id": parent_span,
+ },
+ verify=True
+ )
+
+ if not res.ok:
+ response_data = res.json()
+ error_message = response_data.get('detail', 'An unknown error occurred.')
+ error(f"Error {res.status_code}: {error_message}")
+ raise JudgmentAPIError(error_message)
+
+ return Sequence(**res.json())
+ except requests.exceptions.RequestException as e:
+ error(f"Request failed while saving evaluation results to DB: {str(e)}")
+ raise JudgmentAPIError(f"Request failed while saving evaluation results to DB: {str(e)}")
+ except Exception as e:
+ error(f"Failed to save evaluation results to DB: {str(e)}")
+ raise ValueError(f"Failed to save evaluation results to DB: {str(e)}")
+
+
  def run_with_spinner(message: str, func, *args, **kwargs) -> Any:
  """Run a function with a spinner in the terminal."""
  spinner = itertools.cycle(['|', '/', '-', '\\'])
@@ -365,7 +415,7 @@ def check_examples(examples: List[Example], scorers: List[Union[APIJudgmentScore
  if missing_params:
  print(f"WARNING: Example {example.example_id} is missing the following parameters: {missing_params} for scorer {scorer.score_type.value}")

- def run_sequence_eval(sequence_run: SequenceRun, override: bool = False, ignore_errors: bool = True) -> List[ScoringResult]:
+ def run_sequence_eval(sequence_run: SequenceRun, override: bool = False, ignore_errors: bool = True, function: Optional[Callable] = None, tracer: Optional[Union[Tracer, BaseCallbackHandler]] = None, examples: Optional[List[Example]] = None) -> List[ScoringResult]:
  # Call endpoint to check to see if eval run name exists (if we DON'T want to override and DO want to log results)
  if not override and sequence_run.log_results and not sequence_run.append:
  check_eval_run_name_exists(
@@ -384,15 +434,32 @@ def run_sequence_eval(sequence_run: SequenceRun, override: bool = False, ignore_
  sequence_run.organization_id,
  True
  )
-

+ if function and tracer:
+ new_sequences: List[Sequence] = []
+ for example in examples:
+ if example.input:
+ result = run_with_spinner("Running agent function: ", function, **example.input)
+ else:
+ result = run_with_spinner("Running agent function: ", function)
+ for i, trace in enumerate(tracer.traces):
+ trace_id = trace['trace_id']
+ parent_span = trace['entries'][0]['span_id']
+ new_sequence = retrieve_sequence_from_trace(trace_id, parent_span, sequence_run.judgment_api_key, sequence_run.organization_id)
+ new_sequence.expected_tools = examples[i].expected_tools
+ new_sequences.append(new_sequence)
+ sequence_run.sequences = new_sequences
+
+ for sequence in sequence_run.sequences:
+ sequence.scorers = sequence_run.scorers
+
  # Execute evaluation using Judgment API
  info("Starting API evaluation")
  try: # execute an EvaluationRun with just JudgmentScorers
  debug("Sending request to Judgment API")
  response_data: List[Dict] = run_with_spinner("Running Sequence Evaluation: ", execute_api_sequence_eval, sequence_run)
-
- info(f"Received {len(response_data['results'])} results from API")
+ scoring_results = [ScoringResult(**result) for result in response_data["results"]]
+ info(f"Received {len(scoring_results)} results from API")
  except JudgmentAPIError as e:
  error(f"An error occurred while executing the Judgment API request: {str(e)}")
  raise JudgmentAPIError(f"An error occurred while executing the Judgment API request: {str(e)}")
@@ -405,6 +472,8 @@ def run_sequence_eval(sequence_run: SequenceRun, override: bool = False, ignore_
  if sequence_run.log_results:
  pretty_str = run_with_spinner("Logging Results: ", log_evaluation_results, response_data["results"], sequence_run)
  rprint(pretty_str)
+
+ return scoring_results



@@ -587,7 +656,7 @@ def run_eval(evaluation_run: EvaluationRun, override: bool = False, ignore_error
  # )
  # print(merged_results)
  if evaluation_run.log_results:
- send_results = [result.model_dump(warnings=False) for result in merged_results]
+ send_results = [scoring_result.model_dump(warnings=False) for scoring_result in merged_results]
  pretty_str = run_with_spinner("Logging Results: ", log_evaluation_results, send_results, evaluation_run)
  rprint(pretty_str)

@@ -613,34 +682,31 @@ def assert_test(scoring_results: List[ScoringResult]) -> None:

  # Create a test case context with all relevant fields
  test_case = {
- 'input': result.data_object.input,
- 'actual_output': result.data_object.actual_output,
- 'expected_output': result.data_object.expected_output,
- 'context': result.data_object.context,
- 'retrieval_context': result.data_object.retrieval_context,
- 'additional_metadata': result.data_object.additional_metadata,
- 'tools_called': result.data_object.tools_called,
- 'expected_tools': result.data_object.expected_tools,
- 'failed_scorers': []
+ "failed_scorers": []
  }
  if result.scorers_data:
  # If the result was not successful, check each scorer_data
  for scorer_data in result.scorers_data:
  if not scorer_data.success:
+ if scorer_data.name == "Tool Order":
+ # Remove threshold, evaluation model for Tool Order scorer
+ scorer_data.threshold = None
+ scorer_data.evaluation_model = None
  test_case['failed_scorers'].append(scorer_data)
  failed_cases.append(test_case)

  if failed_cases:
+
  error_msg = f"The following test cases failed: \n"
  for fail_case in failed_cases:
- error_msg += f"\nInput: {fail_case['input']}\n"
- error_msg += f"Actual Output: {fail_case['actual_output']}\n"
- error_msg += f"Expected Output: {fail_case['expected_output']}\n"
- error_msg += f"Context: {fail_case['context']}\n"
- error_msg += f"Retrieval Context: {fail_case['retrieval_context']}\n"
- error_msg += f"Additional Metadata: {fail_case['additional_metadata']}\n"
- error_msg += f"Tools Called: {fail_case['tools_called']}\n"
- error_msg += f"Expected Tools: {fail_case['expected_tools']}\n"
+ # error_msg += f"\nInput: {fail_case['input']}\n"
+ # error_msg += f"Actual Output: {fail_case['actual_output']}\n"
+ # error_msg += f"Expected Output: {fail_case['expected_output']}\n"
+ # error_msg += f"Context: {fail_case['context']}\n"
+ # error_msg += f"Retrieval Context: {fail_case['retrieval_context']}\n"
+ # error_msg += f"Additional Metadata: {fail_case['additional_metadata']}\n"
+ # error_msg += f"Tools Called: {fail_case['tools_called']}\n"
+ # error_msg += f"Expected Tools: {fail_case['expected_tools']}\n"

  for fail_scorer in fail_case['failed_scorers']:

@@ -658,6 +724,37 @@ def assert_test(scoring_results: List[ScoringResult]) -> None:
  f"Additional Metadata: {fail_scorer.additional_metadata}\n"
  )
  error_msg += "-"*100
-
- raise AssertionError(error_msg)
+
+ total_tests = len(scoring_results)
+ failed_tests = len(failed_cases)
+ passed_tests = total_tests - failed_tests
+
+ # Print summary with colors
+ rprint("\n" + "="*80)
+ if failed_tests == 0:
+ rprint(f"[bold green]🎉 ALL TESTS PASSED! {passed_tests}/{total_tests} tests successful[/bold green]")
+ else:
+ rprint(f"[bold red]⚠️ TEST RESULTS: {passed_tests}/{total_tests} passed ({failed_tests} failed)[/bold red]")
+ rprint("="*80 + "\n")
+
+ # Print individual test cases
+ for i, result in enumerate(scoring_results):
+ test_num = i + 1
+ if result.success:
+ rprint(f"[green]✓ Test {test_num}: PASSED[/green]")
+ else:
+ rprint(f"[red]✗ Test {test_num}: FAILED[/red]")
+ if result.scorers_data:
+ for scorer_data in result.scorers_data:
+ if not scorer_data.success:
+ rprint(f" [yellow]Scorer: {scorer_data.name}[/yellow]")
+ rprint(f" [red] Score: {scorer_data.score}[/red]")
+ rprint(f" [red] Reason: {scorer_data.reason}[/red]")
+ if scorer_data.error:
+ rprint(f" [red] Error: {scorer_data.error}[/red]")
+ rprint(" " + "-"*40)
+
+ rprint("\n" + "="*80)
+ if failed_tests > 0:
+ raise AssertionError(failed_cases)

@@ -16,6 +16,7 @@ from judgeval.scorers.judgeval_scorers.api_scorers import (
  InstructionAdherenceScorer,
  GroundednessScorer,
  DerailmentScorer,
+ ToolOrderScorer,
  )
  from judgeval.scorers.judgeval_scorers.classifiers import (
  Text2SQLScorer,
@@ -41,4 +42,5 @@ __all__ = [
  "InstructionAdherenceScorer",
  "GroundednessScorer",
  "DerailmentScorer",
+ "ToolOrderScorer",
  ]
@@ -12,6 +12,7 @@ from judgeval.scorers.judgeval_scorers.api_scorers.comparison import ComparisonS
  from judgeval.scorers.judgeval_scorers.api_scorers.instruction_adherence import InstructionAdherenceScorer
  from judgeval.scorers.judgeval_scorers.api_scorers.groundedness import GroundednessScorer
  from judgeval.scorers.judgeval_scorers.api_scorers.derailment_scorer import DerailmentScorer
+ from judgeval.scorers.judgeval_scorers.api_scorers.tool_order import ToolOrderScorer
  __all__ = [
  "ExecutionOrderScorer",
  "JSONCorrectnessScorer",
@@ -27,4 +28,5 @@ __all__ = [
  "InstructionAdherenceScorer",
  "GroundednessScorer",
  "DerailmentScorer",
+ "ToolOrderScorer",
  ]
@@ -0,0 +1,18 @@
+ """
+ `judgeval` tool order scorer
+ """
+
+ # Internal imports
+ from judgeval.scorers.api_scorer import APIJudgmentScorer
+ from judgeval.constants import APIScorer
+
+ class ToolOrderScorer(APIJudgmentScorer):
+     def __init__(self, threshold: float=1.0):
+         super().__init__(
+             threshold=threshold,
+             score_type=APIScorer.TOOL_ORDER,
+         )
+
+     @property
+     def __name__(self):
+         return "Tool Order"
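
ToolOrderScorer itself is a thin wrapper around APIJudgmentScorer: it defaults the threshold to 1.0, tags runs with the new APIScorer.TOOL_ORDER score type, and reports its name as "Tool Order", which the updated assert_test special-cases when formatting failures. A brief usage sketch (the scoring itself is executed server-side via the Judgment API):

    from judgeval.scorers import ToolOrderScorer  # assumed re-export path

    strict = ToolOrderScorer()               # threshold defaults to 1.0
    lenient = ToolOrderScorer(threshold=0.5)
    print(strict.__name__)                   # "Tool Order"
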
judgeval/scorers/score.py CHANGED
@@ -243,7 +243,7 @@ async def score_with_indicator(
  async def a_execute_scoring(
  examples: Union[List[Example], List[CustomExample]],
  scorers: List[JudgevalScorer],
- model: Optional[Union[str, List[str], JudgevalJudge]] = None,
+ model: Optional[Union[str, List[str], JudgevalJudge]] = "gpt-4.1",
  ignore_errors: bool = True,
  skip_on_missing_params: bool = True,
  show_indicator: bool = True,
@@ -0,0 +1,57 @@
+ import yaml
+ from judgeval.common.logger import (
+     debug,
+     info,
+     error,
+     example_logging_context
+ )
+
+ from judgeval.data import Example
+
+
+ def add_from_yaml(file_path: str) -> None:
+     debug(f"Loading dataset from YAML file: {file_path}")
+     """
+     Adds examples from a YAML file.
+
+     The format of the YAML file is expected to be a dictionary with one key: "examples".
+     The value of the key is a list of dictionaries, where each dictionary represents an example.
+
+     The YAML file is expected to have the following format:
+     examples:
+       - input: "test input"
+         actual_output: "test output"
+         expected_output: "expected output"
+         context:
+           - "context1"
+           - "context2"
+         retrieval_context:
+           - "retrieval1"
+         additional_metadata:
+           key: "value"
+         tools_called:
+           - "tool1"
+         expected_tools:
+           - {tool_name: "tool1", parameters: {"query": "test query 1"}}
+           - {tool_name: "tool2", parameters: {"query": "test query 2"}}
+         name: "test example"
+         example_id: null
+         timestamp: "20241230_160117"
+         trace_id: "123"
+     """
+     try:
+         with open(file_path, "r") as file:
+             payload = yaml.safe_load(file)
+             if payload is None:
+                 raise ValueError("The YAML file is empty.")
+             examples = payload.get("examples", [])
+     except FileNotFoundError:
+         error(f"YAML file not found: {file_path}")
+         raise FileNotFoundError(f"The file {file_path} was not found.")
+     except yaml.YAMLError:
+         error(f"Invalid YAML file: {file_path}")
+         raise ValueError(f"The file {file_path} is not a valid YAML file.")
+
+     info(f"Added {len(examples)} examples from YAML")
+     new_examples = [Example(**e) for e in examples]
+     return new_examples
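
For reference, a minimal test file in the documented format and a direct call to the loader might look like the sketch below (file name and contents are illustrative). A dict-valued input is used here because run_sequence_eval unpacks example.input as keyword arguments for the agent function; the docstring above also shows plain string inputs. Note that add_from_yaml returns the parsed Example list even though it is annotated -> None.

    # tests.yaml (illustrative)
    # examples:
    #   - input: {"destination": "Paris"}
    #     expected_tools:
    #       - {tool_name: "search_flights", parameters: {"query": "Paris"}}

    from judgeval.utils.data_utils import add_from_yaml

    examples = add_from_yaml("tests.yaml")
    print(f"Loaded {len(examples)} examples; first input: {examples[0].input}")
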