judgeval 0.0.36__py3-none-any.whl → 0.0.38__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- judgeval/common/tracer.py +663 -1105
- judgeval/common/utils.py +19 -1
- judgeval/constants.py +3 -3
- judgeval/data/__init__.py +4 -2
- judgeval/data/datasets/dataset.py +2 -11
- judgeval/data/datasets/eval_dataset_client.py +1 -62
- judgeval/data/example.py +29 -8
- judgeval/data/result.py +3 -3
- judgeval/data/trace.py +132 -0
- judgeval/data/{sequence_run.py → trace_run.py} +7 -6
- judgeval/evaluation_run.py +2 -2
- judgeval/integrations/langgraph.py +189 -1769
- judgeval/judges/litellm_judge.py +1 -1
- judgeval/judges/mixture_of_judges.py +1 -1
- judgeval/judges/utils.py +1 -1
- judgeval/judgment_client.py +85 -78
- judgeval/run_evaluation.py +98 -51
- judgeval/scorers/__init__.py +2 -0
- judgeval/scorers/judgeval_scorers/api_scorers/__init__.py +2 -0
- judgeval/scorers/judgeval_scorers/api_scorers/tool_order.py +20 -0
- judgeval/scorers/score.py +1 -1
- judgeval/utils/data_utils.py +57 -0
- judgeval-0.0.38.dist-info/METADATA +247 -0
- {judgeval-0.0.36.dist-info → judgeval-0.0.38.dist-info}/RECORD +26 -24
- judgeval/data/sequence.py +0 -49
- judgeval-0.0.36.dist-info/METADATA +0 -169
- {judgeval-0.0.36.dist-info → judgeval-0.0.38.dist-info}/WHEEL +0 -0
- {judgeval-0.0.36.dist-info → judgeval-0.0.38.dist-info}/licenses/LICENSE.md +0 -0
judgeval/judges/litellm_judge.py
CHANGED

@@ -12,7 +12,7 @@ BASE_CONVERSATION = [
 
 
 class LiteLLMJudge(JudgevalJudge):
-    def __init__(self, model: str = "gpt-…
+    def __init__(self, model: str = "gpt-4.1-mini", **kwargs):
         debug(f"Initializing LiteLLMJudge with model={model}")
         self.model = model
         self.kwargs = kwargs

judgeval/judges/mixture_of_judges.py
CHANGED

@@ -136,7 +136,7 @@ class MixtureOfJudges(JudgevalJudge):
     """
     def __init__(self,
                  models: List[str] = ['QWEN', 'LLAMA3_70B_INSTRUCT_TURBO', 'MISTRAL_8x22B_INSTRUCT'],
-                 aggregator: str = 'gpt-…
+                 aggregator: str = 'gpt-4.1',
                  **kwargs):
         """
         `models` are the individual judge models to be used for generating responses.
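Both judge classes now default to the GPT-4.1 family. A minimal sketch of what the new defaults mean at call sites (import paths follow the module layout in the file list above; only the defaults change in this diff, so the snippet assumes the constructors otherwise behave as before):

```python
from judgeval.judges.litellm_judge import LiteLLMJudge
from judgeval.judges.mixture_of_judges import MixtureOfJudges

# As of 0.0.38 these pick up the new defaults shown in the hunks above.
single_judge = LiteLLMJudge()    # model="gpt-4.1-mini"
mixture = MixtureOfJudges()      # aggregator='gpt-4.1', same default judge list
```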
judgeval/judges/utils.py
CHANGED

@@ -23,7 +23,7 @@ def create_judge(
     If no model is provided, uses GPT4o as the default judge.
     """
     if model is None:  # default option
-        return LiteLLMJudge(model="gpt-…
+        return LiteLLMJudge(model="gpt-4.1"), True
     if not isinstance(model, (str, list, JudgevalJudge)):
         raise InvalidJudgeModelError(f"Model must be a string, list of strings, or a judgeval judge object. Got: {type(model)} instead.")
     # If model is already a valid judge type, return it and mark native
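For the judge factory, the only observable change is the default model string handed to `LiteLLMJudge`. A short sketch, assuming `model` remains an optional parameter of `create_judge` (its full signature is not visible in this hunk):

```python
from judgeval.judges.utils import create_judge

# With no model supplied, 0.0.38 returns a LiteLLMJudge on "gpt-4.1"
# (the previous default is truncated in this diff view).
judge, is_native = create_judge()
print(type(judge).__name__, judge.model, is_native)   # LiteLLMJudge gpt-4.1 True
```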
judgeval/judgment_client.py
CHANGED

@@ -2,7 +2,8 @@
 Implements the JudgmentClient to interact with the Judgment API.
 """
 import os
-from …
+from uuid import uuid4
+from typing import Optional, List, Dict, Any, Union, Callable
 import requests
 
 from judgeval.constants import ROOT_API
@@ -11,7 +12,7 @@ from judgeval.data import (
     ScoringResult,
     Example,
     CustomExample,
-    …
+    Trace,
 )
 from judgeval.scorers import (
     APIJudgmentScorer,
@@ -22,9 +23,9 @@ from judgeval.evaluation_run import EvaluationRun
 from judgeval.run_evaluation import (
     run_eval,
     assert_test,
-    …
+    run_trace_eval
 )
-from judgeval.data.…
+from judgeval.data.trace_run import TraceRun
 from judgeval.judges import JudgevalJudge
 from judgeval.constants import (
     JUDGMENT_EVAL_FETCH_API_URL,
@@ -33,7 +34,11 @@ from judgeval.constants import (
     JUDGMENT_PROJECT_DELETE_API_URL,
     JUDGMENT_PROJECT_CREATE_API_URL
 )
+from judgeval.utils.data_utils import add_from_yaml
 from judgeval.common.exceptions import JudgmentAPIError
+from langchain_core.callbacks import BaseCallbackHandler
+from judgeval.common.tracer import Tracer
+from judgeval.common.utils import validate_api_key
 from pydantic import BaseModel
 from judgeval.rules import Rule
 
@@ -63,7 +68,7 @@ class JudgmentClient(metaclass=SingletonMeta):
         self.eval_dataset_client = EvalDatasetClient(judgment_api_key, organization_id)
 
         # Verify API key is valid
-        result, response = …
+        result, response = validate_api_key(judgment_api_key)
         if not result:
             # May be bad to output their invalid API key...
             raise JudgmentAPIError(f"Issue with passed in Judgment API key: {response}")
@@ -74,7 +79,7 @@ class JudgmentClient(metaclass=SingletonMeta):
         self,
         examples: List[Example],
         scorers: List[Union[APIJudgmentScorer, JudgevalScorer]],
-        model: Union[str, List[str], JudgevalJudge],
+        model: Optional[Union[str, List[str], JudgevalJudge]] = "gpt-4.1",
         aggregator: Optional[str] = None,
         metadata: Optional[Dict[str, Any]] = None,
         log_results: bool = True,
@@ -100,54 +105,56 @@ class JudgmentClient(metaclass=SingletonMeta):
             rules=rules
         )
 
-    def …
+    def run_trace_evaluation(
         self,
-        sequences: List[Sequence],
-        model: Union[str, List[str], JudgevalJudge],
         scorers: List[Union[APIJudgmentScorer, JudgevalScorer]],
+        model: Optional[Union[str, List[str], JudgevalJudge]] = "gpt-4.1",
+        traces: Optional[List[Trace]] = None,
+        examples: Optional[List[Example]] = None,
+        test_file: Optional[str] = None,
         aggregator: Optional[str] = None,
         project_name: str = "default_project",
-        eval_run_name: str = "…
+        eval_run_name: str = "default_eval_trace",
         log_results: bool = True,
         append: bool = False,
         override: bool = False,
         ignore_errors: bool = True,
-        rules: Optional[List[Rule]] = None
+        rules: Optional[List[Rule]] = None,
+        function: Optional[Callable] = None,
+        tracer: Optional[Union[Tracer, BaseCallbackHandler]] = None
     ) -> List[ScoringResult]:
-        try:
-            … (12 removed lines truncated in the source view)
-            flattened.extend(get_all_sequences(seq))
-            return flattened
+        try:
+
+            if test_file:
+                try:
+                    examples = add_from_yaml(test_file)
+                except FileNotFoundError:
+                    raise FileNotFoundError(f"Test file not found: {test_file}")
+
+            if examples and not function:
+                raise ValueError("Cannot pass in examples without a function")
+
+            if traces and function:
+                raise ValueError("Cannot pass in traces and function")
 
-            … (4 removed lines truncated in the source view)
-            sequence_run = SequenceRun(
+            if examples and traces:
+                raise ValueError("Cannot pass in both examples and traces")
+
+            trace_run = TraceRun(
                 project_name=project_name,
                 eval_name=eval_run_name,
-                …
+                traces=traces,
+                scorers=scorers,
                 model=model,
                 aggregator=aggregator,
                 log_results=log_results,
                 append=append,
                 judgment_api_key=self.judgment_api_key,
-                organization_id=self.organization_id
+                organization_id=self.organization_id,
             )
-            return …
+            return run_trace_eval(trace_run, override, ignore_errors, function, tracer, examples)
         except ValueError as e:
-            raise ValueError(f"Please check your …
+            raise ValueError(f"Please check your TraceRun object, one or more fields are invalid: \n{str(e)}")
         except Exception as e:
             raise Exception(f"An unexpected error occurred during evaluation: {str(e)}")
 
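`run_trace_evaluation` replaces the old sequence-based entry point and accepts either pre-captured `traces` or `examples` plus an agent `function` and a `tracer` (the validation above enforces the either/or). A hedged usage sketch based only on the signature shown: the agent, example fields, and tracer wiring are illustrative, credentials are assumed to be configured for `JudgmentClient`, and `example.input` is expanded into keyword arguments of the function (see `run_trace_eval` below).

```python
from judgeval.judgment_client import JudgmentClient
from judgeval.data import Example
from judgeval.scorers import ToolOrderScorer

client = JudgmentClient()   # assumes API key / organization are already configured

def my_agent(question: str) -> str:
    # Hypothetical agent under test; it should be instrumented with `tracer`
    # so that each call produces an entry in tracer.traces.
    ...

tracer = ...  # placeholder: a judgeval Tracer or LangChain BaseCallbackHandler wired into my_agent

examples = [
    Example(
        input={"question": "What is the weather in Paris?"},  # splatted as **example.input
        expected_tools=["search", "summarize"],               # assumed field usage; copied onto the root trace span
    )
]

results = client.run_trace_evaluation(
    scorers=[ToolOrderScorer()],
    examples=examples,
    function=my_agent,
    tracer=tracer,
    project_name="default_project",
    eval_run_name="default_eval_trace",
)
```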
@@ -155,7 +162,7 @@ class JudgmentClient(metaclass=SingletonMeta):
         self,
         examples: Union[List[Example], List[CustomExample]],
         scorers: List[Union[APIJudgmentScorer, JudgevalScorer]],
-        model: Union[str, List[str], JudgevalJudge],
+        model: Optional[Union[str, List[str], JudgevalJudge]] = "gpt-4.1",
         aggregator: Optional[str] = None,
         metadata: Optional[Dict[str, Any]] = None,
         log_results: bool = True,
@@ -238,12 +245,6 @@ class JudgmentClient(metaclass=SingletonMeta):
         """
         return self.eval_dataset_client.append_examples(alias, examples, project_name)
 
-    def append_sequence_dataset(self, alias: str, sequences: List[Sequence], project_name: str) -> bool:
-        """
-        Appends a `Sequence` to the Judgment platform for storage.
-        """
-        return self.eval_dataset_client.append_sequences(alias, sequences, project_name)
-
     def pull_dataset(self, alias: str, project_name: str) -> EvalDataset:
         """
         Retrieves a saved `EvalDataset` from the Judgment platform.
@@ -396,24 +397,6 @@ class JudgmentClient(metaclass=SingletonMeta):
             raise ValueError(f"Error deleting project: {response.json()}")
         return response.json()
 
-    def _validate_api_key(self):
-        """
-        Validates that the user api key is valid
-        """
-        response = requests.post(
-            f"{ROOT_API}/validate_api_key/",
-            headers={
-                "Content-Type": "application/json",
-                "Authorization": f"Bearer {self.judgment_api_key}",
-            },
-            json={},  # Empty body now
-            verify=True
-        )
-        if response.status_code == 200:
-            return True, response.json()
-        else:
-            return False, response.json().get("detail", "Error validating API key")
-
     def fetch_classifier_scorer(self, slug: str) -> ClassifierScorer:
         """
         Fetches a classifier scorer configuration from the Judgment API.
@@ -499,22 +482,26 @@ class JudgmentClient(metaclass=SingletonMeta):
 
     def assert_test(
         self,
-        examples: List[Example],
         scorers: List[Union[APIJudgmentScorer, JudgevalScorer]],
-        …
+        examples: Optional[List[Example]] = None,
+        model: Optional[Union[str, List[str], JudgevalJudge]] = "gpt-4.1",
+        test_file: Optional[str] = None,
         aggregator: Optional[str] = None,
         metadata: Optional[Dict[str, Any]] = None,
         log_results: bool = True,
-        project_name: str = "…
-        eval_run_name: str = …
+        project_name: str = "default_test",
+        eval_run_name: str = str(uuid4()),
         override: bool = False,
-        rules: Optional[List[Rule]] = None
+        rules: Optional[List[Rule]] = None,
+        function: Optional[Callable] = None,
+        tracer: Optional[Union[Tracer, BaseCallbackHandler]] = None
     ) -> None:
         """
         Asserts a test by running the evaluation and checking the results for success
 
         Args:
-            examples (List[Example]): The examples to evaluate
+            examples (Optional[List[Example]]): The examples to evaluate. Must be provided if test_file is not.
+            test_file (Optional[str]): Path to a YAML file containing test examples. Must be provided if examples is not.
             scorers (List[Union[APIJudgmentScorer, JudgevalScorer]]): A list of scorers to use for evaluation
             model (Union[str, List[str], JudgevalJudge]): The model used as a judge when using LLM as a Judge
             aggregator (Optional[str]): The aggregator to use for evaluation if using Mixture of Judges
@@ -525,17 +512,37 @@ class JudgmentClient(metaclass=SingletonMeta):
             override (bool): Whether to override an existing evaluation run with the same name
             rules (Optional[List[Rule]]): Rules to evaluate against scoring results
         """
-        … (12 removed lines truncated in the source view)
+        # Validate that exactly one of examples or test_file is provided
+        if (examples is None and test_file is None) or (examples is not None and test_file is not None):
+            raise ValueError("Exactly one of 'examples' or 'test_file' must be provided, but not both")
+
+        if function:
+            results = self.run_trace_evaluation(
+                examples=examples,
+                scorers=scorers,
+                model=model,
+                aggregator=aggregator,
+                log_results=log_results,
+                project_name=project_name,
+                eval_run_name=eval_run_name,
+                override=override,
+                rules=rules,
+                function=function,
+                tracer=tracer,
+                test_file=test_file
+            )
+        else:
+            results = self.run_evaluation(
+                examples=examples,
+                scorers=scorers,
+                model=model,
+                aggregator=aggregator,
+                metadata=metadata,
+                log_results=log_results,
+                project_name=project_name,
+                eval_run_name=eval_run_name,
+                override=override,
+                rules=rules
+            )
 
         assert_test(results)
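`assert_test` now routes to `run_trace_evaluation` whenever a `function` is supplied and otherwise to `run_evaluation`, and it requires exactly one of `examples` or `test_file`. A sketch of both paths; the YAML schema is defined by `add_from_yaml` in `judgeval/utils/data_utils.py` (not shown in this diff), so the file path, agent, tracer, examples, and scorers here are placeholders:

```python
from judgeval.judgment_client import JudgmentClient

client = JudgmentClient()

my_agent = ...     # instrumented agent function under test (hypothetical)
tracer = ...       # judgeval Tracer / LangChain callback handler used by my_agent
my_examples = ...  # a List[Example] built elsewhere
my_scorers = ...   # any list of APIJudgmentScorer / JudgevalScorer instances

# Path 1: examples loaded from a YAML test file, scored over captured traces.
client.assert_test(
    scorers=my_scorers,
    test_file="tests/agent_cases.yaml",   # hypothetical path
    function=my_agent,                    # presence of `function` routes through run_trace_evaluation
    tracer=tracer,
)

# Path 2: in-memory examples with no agent function, routed through run_evaluation.
client.assert_test(
    scorers=my_scorers,
    examples=my_examples,
)
```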
judgeval/run_evaluation.py
CHANGED

@@ -4,7 +4,7 @@ import time
 import sys
 import itertools
 import threading
-from typing import List, Dict, Any, Union
+from typing import List, Dict, Any, Union, Optional, Callable
 from datetime import datetime
 from rich import print as rprint
 
@@ -12,7 +12,8 @@ from judgeval.data import (
     ScorerData,
     ScoringResult,
     Example,
-    CustomExample
+    CustomExample,
+    Trace
 )
 from judgeval.scorers import (
     JudgevalScorer,
@@ -23,10 +24,10 @@ from judgeval.scorers.score import a_execute_scoring
 from judgeval.constants import (
     ROOT_API,
     JUDGMENT_EVAL_API_URL,
-    …
+    JUDGMENT_TRACE_EVAL_API_URL,
     JUDGMENT_EVAL_LOG_API_URL,
     MAX_CONCURRENT_EVALUATIONS,
-    JUDGMENT_ADD_TO_RUN_EVAL_QUEUE_API_URL
+    JUDGMENT_ADD_TO_RUN_EVAL_QUEUE_API_URL,
 )
 from judgeval.common.exceptions import JudgmentAPIError
 from judgeval.common.logger import (
@@ -36,7 +37,9 @@ from judgeval.common.logger import (
     example_logging_context
 )
 from judgeval.evaluation_run import EvaluationRun
-from judgeval.data.…
+from judgeval.data.trace_run import TraceRun
+from judgeval.common.tracer import Tracer
+from langchain_core.callbacks import BaseCallbackHandler
 
 def send_to_rabbitmq(evaluation_run: EvaluationRun) -> None:
     """
@@ -93,20 +96,20 @@ def execute_api_eval(evaluation_run: EvaluationRun) -> List[Dict]:
         raise JudgmentAPIError(error_message)
     return response_data
 
-def …
+def execute_api_trace_eval(trace_run: TraceRun) -> List[Dict]:
     """
     Executes an evaluation of a list of `Example`s using one or more `JudgmentScorer`s via the Judgment API.
     """
 
     try:
         # submit API request to execute evals
-        payload = …
+        payload = trace_run.model_dump(warnings=False)
         response = requests.post(
-            …
+            JUDGMENT_TRACE_EVAL_API_URL,
             headers={
                 "Content-Type": "application/json",
-                "Authorization": f"Bearer {…
-                "X-Organization-Id": …
+                "Authorization": f"Bearer {trace_run.judgment_api_key}",
+                "X-Organization-Id": trace_run.organization_id
            },
            json=payload,
            verify=True
@@ -277,7 +280,7 @@ def check_eval_run_name_exists(eval_name: str, project_name: str, judgment_api_k
         raise JudgmentAPIError(f"Failed to check if eval run name exists: {str(e)}")
 
 
-def log_evaluation_results(…
+def log_evaluation_results(scoring_results: List[ScoringResult], run: Union[EvaluationRun, TraceRun]) -> str:
     """
     Logs evaluation results to the Judgment API database.
 
@@ -298,7 +301,7 @@ def log_evaluation_results(merged_results: List[ScoringResult], run: Union[Evalu
                 "X-Organization-Id": run.organization_id
             },
             json={
-                "results": …
+                "results": scoring_results,
                 "run": run.model_dump(warnings=False)
             },
             verify=True
@@ -365,46 +368,62 @@ def check_examples(examples: List[Example], scorers: List[Union[APIJudgmentScore
             if missing_params:
                 print(f"WARNING: Example {example.example_id} is missing the following parameters: {missing_params} for scorer {scorer.score_type.value}")
 
-def …
+def run_trace_eval(trace_run: TraceRun, override: bool = False, ignore_errors: bool = True, function: Optional[Callable] = None, tracer: Optional[Union[Tracer, BaseCallbackHandler]] = None, examples: Optional[List[Example]] = None) -> List[ScoringResult]:
     # Call endpoint to check to see if eval run name exists (if we DON'T want to override and DO want to log results)
-    if not override and …
+    if not override and trace_run.log_results and not trace_run.append:
         check_eval_run_name_exists(
-            … (4 removed lines truncated in the source view)
+            trace_run.eval_name,
+            trace_run.project_name,
+            trace_run.judgment_api_key,
+            trace_run.organization_id
        )
 
-    if …
+    if trace_run.append:
         # Check that the current experiment, if one exists, has the same type (examples of sequences)
         check_experiment_type(
-            … (4 removed lines truncated in the source view)
+            trace_run.eval_name,
+            trace_run.project_name,
+            trace_run.judgment_api_key,
+            trace_run.organization_id,
             True
         )
-    …
 
+    if function and tracer:
+        new_traces: List[Trace] = []
+        tracer.offline_mode = True
+        for example in examples:
+            if example.input:
+                result = run_with_spinner("Running agent function: ", function, **example.input)
+            else:
+                result = run_with_spinner("Running agent function: ", function)
+        for i, trace in enumerate(tracer.traces):
+            # We set the root-level trace span with the expected tools of the Trace
+            trace = Trace(**trace)
+            trace.entries[0].expected_tools = examples[i].expected_tools
+            new_traces.append(trace)
+        trace_run.traces = new_traces
+
     # Execute evaluation using Judgment API
     info("Starting API evaluation")
     try:  # execute an EvaluationRun with just JudgmentScorers
         debug("Sending request to Judgment API")
-        response_data: List[Dict] = run_with_spinner("Running …
-        …
-        info(f"Received {len(…
+        response_data: List[Dict] = run_with_spinner("Running Trace Evaluation: ", execute_api_trace_eval, trace_run)
+        scoring_results = [ScoringResult(**result) for result in response_data["results"]]
+        info(f"Received {len(scoring_results)} results from API")
     except JudgmentAPIError as e:
         error(f"An error occurred while executing the Judgment API request: {str(e)}")
         raise JudgmentAPIError(f"An error occurred while executing the Judgment API request: {str(e)}")
     except ValueError as e:
-        raise ValueError(f"Please check your …
+        raise ValueError(f"Please check your TraceRun object, one or more fields are invalid: {str(e)}")
 
     # Convert the response data to `ScoringResult` objects
     debug("Processing API results")
-    # TODO: allow for custom scorer on …
-    if …
-        pretty_str = run_with_spinner("Logging Results: ", log_evaluation_results, response_data["results"], …
+    # TODO: allow for custom scorer on traces
+    if trace_run.log_results:
+        pretty_str = run_with_spinner("Logging Results: ", log_evaluation_results, response_data["results"], trace_run)
         rprint(pretty_str)
+
+    return scoring_results
 
 
 
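The `function and tracer` branch above is the piece that turns plain `Example`s into scorable `Trace`s: each example's `input` dict is splatted into the agent function, and the captured trace's root span inherits the example's `expected_tools`. A standalone sketch of just that pairing step, assuming `tracer.traces` holds dicts that `Trace(**raw)` can parse and that trace *i* corresponds to example *i*:

```python
from typing import Dict, List

from judgeval.data import Example, Trace


def attach_expected_tools(raw_traces: List[Dict], examples: List[Example]) -> List[Trace]:
    """Mirror of the pairing loop in run_trace_eval: re-hydrate each captured
    trace and copy the matching example's expected_tools onto its root span."""
    paired: List[Trace] = []
    for i, raw in enumerate(raw_traces):
        trace = Trace(**raw)
        trace.entries[0].expected_tools = examples[i].expected_tools
        paired.append(trace)
    return paired
```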
@@ -587,7 +606,7 @@ def run_eval(evaluation_run: EvaluationRun, override: bool = False, ignore_error
     # )
     # print(merged_results)
     if evaluation_run.log_results:
-        send_results = […
+        send_results = [scoring_result.model_dump(warnings=False) for scoring_result in merged_results]
         pretty_str = run_with_spinner("Logging Results: ", log_evaluation_results, send_results, evaluation_run)
         rprint(pretty_str)
 
@@ -613,34 +632,31 @@ def assert_test(scoring_results: List[ScoringResult]) -> None:
 
             # Create a test case context with all relevant fields
             test_case = {
-                …
-                'actual_output': result.data_object.actual_output,
-                'expected_output': result.data_object.expected_output,
-                'context': result.data_object.context,
-                'retrieval_context': result.data_object.retrieval_context,
-                'additional_metadata': result.data_object.additional_metadata,
-                'tools_called': result.data_object.tools_called,
-                'expected_tools': result.data_object.expected_tools,
-                'failed_scorers': []
+                "failed_scorers": []
             }
             if result.scorers_data:
                 # If the result was not successful, check each scorer_data
                 for scorer_data in result.scorers_data:
                     if not scorer_data.success:
+                        if scorer_data.name == "Tool Order":
+                            # Remove threshold, evaluation model for Tool Order scorer
+                            scorer_data.threshold = None
+                            scorer_data.evaluation_model = None
                         test_case['failed_scorers'].append(scorer_data)
             failed_cases.append(test_case)
 
     if failed_cases:
+
         error_msg = f"The following test cases failed: \n"
         for fail_case in failed_cases:
-            error_msg += f"\nInput: {fail_case['input']}\n"
-            error_msg += f"Actual Output: {fail_case['actual_output']}\n"
-            error_msg += f"Expected Output: {fail_case['expected_output']}\n"
-            error_msg += f"Context: {fail_case['context']}\n"
-            error_msg += f"Retrieval Context: {fail_case['retrieval_context']}\n"
-            error_msg += f"Additional Metadata: {fail_case['additional_metadata']}\n"
-            error_msg += f"Tools Called: {fail_case['tools_called']}\n"
-            error_msg += f"Expected Tools: {fail_case['expected_tools']}\n"
+            # error_msg += f"\nInput: {fail_case['input']}\n"
+            # error_msg += f"Actual Output: {fail_case['actual_output']}\n"
+            # error_msg += f"Expected Output: {fail_case['expected_output']}\n"
+            # error_msg += f"Context: {fail_case['context']}\n"
+            # error_msg += f"Retrieval Context: {fail_case['retrieval_context']}\n"
+            # error_msg += f"Additional Metadata: {fail_case['additional_metadata']}\n"
+            # error_msg += f"Tools Called: {fail_case['tools_called']}\n"
+            # error_msg += f"Expected Tools: {fail_case['expected_tools']}\n"
 
             for fail_scorer in fail_case['failed_scorers']:
 
@@ -658,6 +674,37 @@ def assert_test(scoring_results: List[ScoringResult]) -> None:
                     f"Additional Metadata: {fail_scorer.additional_metadata}\n"
                 )
                 error_msg += "-"*100
-    … (2 removed lines truncated in the source view)
+
+    total_tests = len(scoring_results)
+    failed_tests = len(failed_cases)
+    passed_tests = total_tests - failed_tests
+
+    # Print summary with colors
+    rprint("\n" + "="*80)
+    if failed_tests == 0:
+        rprint(f"[bold green]🎉 ALL TESTS PASSED! {passed_tests}/{total_tests} tests successful[/bold green]")
+    else:
+        rprint(f"[bold red]⚠️ TEST RESULTS: {passed_tests}/{total_tests} passed ({failed_tests} failed)[/bold red]")
+    rprint("="*80 + "\n")
+
+    # Print individual test cases
+    for i, result in enumerate(scoring_results):
+        test_num = i + 1
+        if result.success:
+            rprint(f"[green]✓ Test {test_num}: PASSED[/green]")
+        else:
+            rprint(f"[red]✗ Test {test_num}: FAILED[/red]")
+            if result.scorers_data:
+                for scorer_data in result.scorers_data:
+                    if not scorer_data.success:
+                        rprint(f"  [yellow]Scorer: {scorer_data.name}[/yellow]")
+                        rprint(f"  [red]  Score: {scorer_data.score}[/red]")
+                        rprint(f"  [red]  Reason: {scorer_data.reason}[/red]")
+                        if scorer_data.error:
+                            rprint(f"  [red]  Error: {scorer_data.error}[/red]")
+        rprint("  " + "-"*40)
+
+    rprint("\n" + "="*80)
+    if failed_tests > 0:
+        raise AssertionError(failed_cases)
 
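Downstream, the contract of `assert_test` is unchanged even though the failure report is now printed with rich rather than embedded in the exception message: it still raises `AssertionError` when any scorer fails, now carrying the list of failed test-case dicts. A consumption sketch (the `results` value would come from `run_eval` or `run_trace_eval` and is not constructed here):

```python
from judgeval.run_evaluation import assert_test

results = ...  # List[ScoringResult] returned by run_eval(...) or run_trace_eval(...)

try:
    assert_test(results)           # prints the colored pass/fail summary via rich
except AssertionError as exc:
    failed_cases = exc.args[0]     # the list of {"failed_scorers": [...]} dicts built above
    print(f"{len(failed_cases)} test case(s) failed")
```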
judgeval/scorers/__init__.py
CHANGED

@@ -16,6 +16,7 @@ from judgeval.scorers.judgeval_scorers.api_scorers import (
     InstructionAdherenceScorer,
     GroundednessScorer,
     DerailmentScorer,
+    ToolOrderScorer,
 )
 from judgeval.scorers.judgeval_scorers.classifiers import (
     Text2SQLScorer,
@@ -41,4 +42,5 @@ __all__ = [
     "InstructionAdherenceScorer",
     "GroundednessScorer",
     "DerailmentScorer",
+    "ToolOrderScorer",
 ]

judgeval/scorers/judgeval_scorers/api_scorers/__init__.py
CHANGED

@@ -12,6 +12,7 @@ from judgeval.scorers.judgeval_scorers.api_scorers.comparison import ComparisonS
 from judgeval.scorers.judgeval_scorers.api_scorers.instruction_adherence import InstructionAdherenceScorer
 from judgeval.scorers.judgeval_scorers.api_scorers.groundedness import GroundednessScorer
 from judgeval.scorers.judgeval_scorers.api_scorers.derailment_scorer import DerailmentScorer
+from judgeval.scorers.judgeval_scorers.api_scorers.tool_order import ToolOrderScorer
 __all__ = [
     "ExecutionOrderScorer",
     "JSONCorrectnessScorer",
@@ -27,4 +28,5 @@ __all__ = [
     "InstructionAdherenceScorer",
     "GroundednessScorer",
     "DerailmentScorer",
+    "ToolOrderScorer",
 ]

judgeval/scorers/judgeval_scorers/api_scorers/tool_order.py
ADDED

@@ -0,0 +1,20 @@
+"""
+`judgeval` tool order scorer
+"""
+
+# Internal imports
+from judgeval.scorers.api_scorer import APIJudgmentScorer
+from judgeval.constants import APIScorer
+from typing import Optional, Dict
+class ToolOrderScorer(APIJudgmentScorer):
+    kwargs: Optional[Dict] = None
+    def __init__(self, threshold: float=1.0, exact_match: bool=False):
+        super().__init__(
+            threshold=threshold,
+            score_type=APIScorer.TOOL_ORDER,
+        )
+        self.kwargs = {"exact_match": exact_match}
+
+    @property
+    def __name__(self):
+        return "Tool Order"
judgeval/scorers/score.py
CHANGED

@@ -243,7 +243,7 @@ async def score_with_indicator(
 async def a_execute_scoring(
     examples: Union[List[Example], List[CustomExample]],
     scorers: List[JudgevalScorer],
-    model: Optional[Union[str, List[str], JudgevalJudge]] = …
+    model: Optional[Union[str, List[str], JudgevalJudge]] = "gpt-4.1",
     ignore_errors: bool = True,
     skip_on_missing_params: bool = True,
     show_indicator: bool = True,