judgeval 0.0.35__py3-none-any.whl → 0.0.37__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- judgeval/common/tracer.py +869 -928
- judgeval/common/utils.py +18 -0
- judgeval/constants.py +6 -3
- judgeval/data/__init__.py +4 -0
- judgeval/data/datasets/dataset.py +3 -2
- judgeval/data/datasets/eval_dataset_client.py +63 -3
- judgeval/data/example.py +29 -7
- judgeval/data/sequence.py +5 -4
- judgeval/data/sequence_run.py +4 -3
- judgeval/data/trace.py +129 -0
- judgeval/evaluation_run.py +1 -1
- judgeval/integrations/langgraph.py +1962 -299
- judgeval/judgment_client.py +85 -66
- judgeval/run_evaluation.py +191 -45
- judgeval/scorers/__init__.py +2 -0
- judgeval/scorers/judgeval_scorers/api_scorers/__init__.py +2 -0
- judgeval/scorers/judgeval_scorers/api_scorers/tool_order.py +18 -0
- judgeval/scorers/score.py +2 -1
- judgeval/utils/data_utils.py +57 -0
- judgeval-0.0.37.dist-info/METADATA +214 -0
- {judgeval-0.0.35.dist-info → judgeval-0.0.37.dist-info}/RECORD +23 -20
- judgeval-0.0.35.dist-info/METADATA +0 -170
- {judgeval-0.0.35.dist-info → judgeval-0.0.37.dist-info}/WHEEL +0 -0
- {judgeval-0.0.35.dist-info → judgeval-0.0.37.dist-info}/licenses/LICENSE.md +0 -0
judgeval/judgment_client.py
CHANGED
@@ -2,7 +2,8 @@
 Implements the JudgmentClient to interact with the Judgment API.
 """
 import os
-from
+from uuid import uuid4
+from typing import Optional, List, Dict, Any, Union, Callable
 import requests

 from judgeval.constants import ROOT_API
@@ -33,7 +34,11 @@ from judgeval.constants import (
     JUDGMENT_PROJECT_DELETE_API_URL,
     JUDGMENT_PROJECT_CREATE_API_URL
 )
+from judgeval.utils.data_utils import add_from_yaml
 from judgeval.common.exceptions import JudgmentAPIError
+from langchain_core.callbacks import BaseCallbackHandler
+from judgeval.common.tracer import Tracer
+from judgeval.common.utils import validate_api_key
 from pydantic import BaseModel
 from judgeval.rules import Rule

@@ -63,7 +68,7 @@ class JudgmentClient(metaclass=SingletonMeta):
         self.eval_dataset_client = EvalDatasetClient(judgment_api_key, organization_id)

         # Verify API key is valid
-        result, response =
+        result, response = validate_api_key(judgment_api_key)
         if not result:
             # May be bad to output their invalid API key...
             raise JudgmentAPIError(f"Issue with passed in Judgment API key: {response}")
@@ -74,7 +79,7 @@ class JudgmentClient(metaclass=SingletonMeta):
         self,
         examples: List[Example],
         scorers: List[Union[APIJudgmentScorer, JudgevalScorer]],
-        model: Union[str, List[str], JudgevalJudge],
+        model: Optional[Union[str, List[str], JudgevalJudge]] = "gpt-4.1",
         aggregator: Optional[str] = None,
         metadata: Optional[Dict[str, Any]] = None,
         log_results: bool = True,
@@ -102,9 +107,11 @@ class JudgmentClient(metaclass=SingletonMeta):

     def run_sequence_evaluation(
         self,
-        sequences: List[Sequence],
-        model: Union[str, List[str], JudgevalJudge],
         scorers: List[Union[APIJudgmentScorer, JudgevalScorer]],
+        model: Optional[Union[str, List[str], JudgevalJudge]] = "gpt-4.1",
+        sequences: Optional[List[Sequence]] = None,
+        examples: Optional[List[Example]] = None,
+        test_file: Optional[str] = None,
         aggregator: Optional[str] = None,
         project_name: str = "default_project",
         eval_run_name: str = "default_eval_sequence",
@@ -112,40 +119,40 @@ class JudgmentClient(metaclass=SingletonMeta):
         append: bool = False,
         override: bool = False,
         ignore_errors: bool = True,
-        rules: Optional[List[Rule]] = None
+        rules: Optional[List[Rule]] = None,
+        function: Optional[Callable] = None,
+        tracer: Optional[Union[Tracer, BaseCallbackHandler]] = None
     ) -> List[ScoringResult]:
-        try:
-
-
-
-
-
-
-
-
-
-
-
-
-
-
+        try:
+
+            if test_file:
+                try:
+                    examples = add_from_yaml(test_file)
+                except FileNotFoundError:
+                    raise FileNotFoundError(f"Test file not found: {test_file}")
+
+            if examples and not function:
+                raise ValueError("Cannot pass in examples without a function")
+
+            if sequences and function:
+                raise ValueError("Cannot pass in sequences and function")
+
+            if examples and sequences:
+                raise ValueError("Cannot pass in both examples and sequences")

-            flattened_sequences = flatten_sequence_list(sequences)
-            for sequence in flattened_sequences:
-                sequence.scorers = scorers
-
             sequence_run = SequenceRun(
                 project_name=project_name,
                 eval_name=eval_run_name,
                 sequences=sequences,
+                scorers=scorers,
                 model=model,
                 aggregator=aggregator,
                 log_results=log_results,
                 append=append,
                 judgment_api_key=self.judgment_api_key,
-                organization_id=self.organization_id
+                organization_id=self.organization_id,
             )
-            return run_sequence_eval(sequence_run, override, ignore_errors)
+            return run_sequence_eval(sequence_run, override, ignore_errors, function, tracer, examples)
         except ValueError as e:
             raise ValueError(f"Please check your SequenceRun object, one or more fields are invalid: \n{str(e)}")
         except Exception as e:
@@ -155,7 +162,7 @@ class JudgmentClient(metaclass=SingletonMeta):
         self,
         examples: Union[List[Example], List[CustomExample]],
         scorers: List[Union[APIJudgmentScorer, JudgevalScorer]],
-        model: Union[str, List[str], JudgevalJudge],
+        model: Optional[Union[str, List[str], JudgevalJudge]] = "gpt-4.1",
         aggregator: Optional[str] = None,
         metadata: Optional[Dict[str, Any]] = None,
         log_results: bool = True,
@@ -232,11 +239,17 @@ class JudgmentClient(metaclass=SingletonMeta):
         dataset.judgment_api_key = self.judgment_api_key
         return self.eval_dataset_client.push(dataset, alias, project_name, overwrite)

-    def
+    def append_example_dataset(self, alias: str, examples: List[Example], project_name: str) -> bool:
         """
         Appends an `EvalDataset` to the Judgment platform for storage.
         """
-        return self.eval_dataset_client.
+        return self.eval_dataset_client.append_examples(alias, examples, project_name)
+
+    def append_sequence_dataset(self, alias: str, sequences: List[Sequence], project_name: str) -> bool:
+        """
+        Appends a `Sequence` to the Judgment platform for storage.
+        """
+        return self.eval_dataset_client.append_sequences(alias, sequences, project_name)

     def pull_dataset(self, alias: str, project_name: str) -> EvalDataset:
         """
@@ -390,24 +403,6 @@ class JudgmentClient(metaclass=SingletonMeta):
             raise ValueError(f"Error deleting project: {response.json()}")
         return response.json()

-    def _validate_api_key(self):
-        """
-        Validates that the user api key is valid
-        """
-        response = requests.post(
-            f"{ROOT_API}/validate_api_key/",
-            headers={
-                "Content-Type": "application/json",
-                "Authorization": f"Bearer {self.judgment_api_key}",
-            },
-            json={}, # Empty body now
-            verify=True
-        )
-        if response.status_code == 200:
-            return True, response.json()
-        else:
-            return False, response.json().get("detail", "Error validating API key")
-
     def fetch_classifier_scorer(self, slug: str) -> ClassifierScorer:
         """
         Fetches a classifier scorer configuration from the Judgment API.
@@ -493,22 +488,26 @@ class JudgmentClient(metaclass=SingletonMeta):

     def assert_test(
         self,
-        examples: List[Example],
         scorers: List[Union[APIJudgmentScorer, JudgevalScorer]],
-
+        examples: Optional[List[Example]] = None,
+        model: Optional[Union[str, List[str], JudgevalJudge]] = "gpt-4.1",
+        test_file: Optional[str] = None,
         aggregator: Optional[str] = None,
         metadata: Optional[Dict[str, Any]] = None,
         log_results: bool = True,
-        project_name: str = "
-        eval_run_name: str =
+        project_name: str = "default_test",
+        eval_run_name: str = str(uuid4()),
         override: bool = False,
-        rules: Optional[List[Rule]] = None
+        rules: Optional[List[Rule]] = None,
+        function: Optional[Callable] = None,
+        tracer: Optional[Union[Tracer, BaseCallbackHandler]] = None
     ) -> None:
         """
         Asserts a test by running the evaluation and checking the results for success

         Args:
-            examples (List[Example]): The examples to evaluate
+            examples (Optional[List[Example]]): The examples to evaluate. Must be provided if test_file is not.
+            test_file (Optional[str]): Path to a YAML file containing test examples. Must be provided if examples is not.
             scorers (List[Union[APIJudgmentScorer, JudgevalScorer]]): A list of scorers to use for evaluation
             model (Union[str, List[str], JudgevalJudge]): The model used as a judge when using LLM as a Judge
             aggregator (Optional[str]): The aggregator to use for evaluation if using Mixture of Judges
@@ -519,17 +518,37 @@ class JudgmentClient(metaclass=SingletonMeta):
             override (bool): Whether to override an existing evaluation run with the same name
             rules (Optional[List[Rule]]): Rules to evaluate against scoring results
         """
-
-
-
-
-
-
-
-
-
-
-
-
+        # Validate that exactly one of examples or test_file is provided
+        if (examples is None and test_file is None) or (examples is not None and test_file is not None):
+            raise ValueError("Exactly one of 'examples' or 'test_file' must be provided, but not both")
+
+        if function:
+            results = self.run_sequence_evaluation(
+                examples=examples,
+                scorers=scorers,
+                model=model,
+                aggregator=aggregator,
+                log_results=log_results,
+                project_name=project_name,
+                eval_run_name=eval_run_name,
+                override=override,
+                rules=rules,
+                function=function,
+                tracer=tracer,
+                test_file=test_file
+            )
+        else:
+            results = self.run_evaluation(
+                examples=examples,
+                scorers=scorers,
+                model=model,
+                aggregator=aggregator,
+                metadata=metadata,
+                log_results=log_results,
+                project_name=project_name,
+                eval_run_name=eval_run_name,
+                override=override,
+                rules=rules
+            )

         assert_test(results)
judgeval/run_evaluation.py
CHANGED
@@ -4,7 +4,7 @@ import time
 import sys
 import itertools
 import threading
-from typing import List, Dict, Any, Union
+from typing import List, Dict, Any, Union, Optional, Callable
 from datetime import datetime
 from rich import print as rprint

@@ -12,7 +12,9 @@ from judgeval.data import (
     ScorerData,
     ScoringResult,
     Example,
-    CustomExample
+    CustomExample,
+    Sequence,
+    Trace
 )
 from judgeval.scorers import (
     JudgevalScorer,
@@ -26,7 +28,8 @@ from judgeval.constants import (
     JUDGMENT_SEQUENCE_EVAL_API_URL,
     JUDGMENT_EVAL_LOG_API_URL,
     MAX_CONCURRENT_EVALUATIONS,
-    JUDGMENT_ADD_TO_RUN_EVAL_QUEUE_API_URL
+    JUDGMENT_ADD_TO_RUN_EVAL_QUEUE_API_URL,
+    JUDGMENT_RETRIEVE_SEQUENCE_FROM_TRACE_API_URL
 )
 from judgeval.common.exceptions import JudgmentAPIError
 from judgeval.common.logger import (
@@ -37,6 +40,8 @@ from judgeval.common.logger import (
 )
 from judgeval.evaluation_run import EvaluationRun
 from judgeval.data.sequence_run import SequenceRun
+from judgeval.common.tracer import Tracer
+from langchain_core.callbacks import BaseCallbackHandler

 def send_to_rabbitmq(evaluation_run: EvaluationRun) -> None:
     """
@@ -198,6 +203,40 @@ def check_missing_scorer_data(results: List[ScoringResult]) -> List[ScoringResul
     )
     return results

+def check_experiment_type(eval_name: str, project_name: str, judgment_api_key: str, organization_id: str, is_sequence: bool) -> None:
+    """
+    Checks if the current experiment, if one exists, has the same type (examples of sequences)
+    """
+    try:
+        response = requests.post(
+            f"{ROOT_API}/check_experiment_type/",
+            headers={
+                "Content-Type": "application/json",
+                "Authorization": f"Bearer {judgment_api_key}",
+                "X-Organization-Id": organization_id
+            },
+            json={
+                "eval_name": eval_name,
+                "project_name": project_name,
+                "judgment_api_key": judgment_api_key,
+                "is_sequence": is_sequence
+            },
+            verify=True
+        )
+
+        if response.status_code == 422:
+            error(f"{response.json()}")
+            raise ValueError(f"{response.json()}")
+
+        if not response.ok:
+            response_data = response.json()
+            error_message = response_data.get('detail', 'An unknown error occurred.')
+            error(f"Error checking eval run name: {error_message}")
+            raise JudgmentAPIError(error_message)
+
+    except requests.exceptions.RequestException as e:
+        error(f"Failed to check if experiment type exists: {str(e)}")
+        raise JudgmentAPIError(f"Failed to check if experiment type exists: {str(e)}")

 def check_eval_run_name_exists(eval_name: str, project_name: str, judgment_api_key: str, organization_id: str) -> None:
     """
@@ -243,7 +282,7 @@ def check_eval_run_name_exists(eval_name: str, project_name: str, judgment_api_k
         raise JudgmentAPIError(f"Failed to check if eval run name exists: {str(e)}")


-def log_evaluation_results(
+def log_evaluation_results(scoring_results: List[ScoringResult], run: Union[EvaluationRun, SequenceRun]) -> str:
     """
     Logs evaluation results to the Judgment API database.

@@ -264,7 +303,7 @@ def log_evaluation_results(merged_results: List[ScoringResult], run: Union[Evalu
                 "X-Organization-Id": run.organization_id
             },
             json={
-                "results":
+                "results": scoring_results,
                 "run": run.model_dump(warnings=False)
             },
             verify=True
@@ -288,6 +327,51 @@ def log_evaluation_results(merged_results: List[ScoringResult], run: Union[Evalu
         error(f"Failed to save evaluation results to DB: {str(e)}")
         raise ValueError(f"Failed to save evaluation results to DB: {str(e)}")

+def retrieve_sequence_from_trace(trace_id: str, parent_span: str, judgment_api_key: str, organization_id: str) -> Sequence:
+    """
+    Retrieves a sequence from a trace ID.
+    """
+    """
+    Logs evaluation results to the Judgment API database.
+
+    Args:
+        merged_results (List[ScoringResult]): The results to log
+        evaluation_run (EvaluationRun): The evaluation run containing project info and API key
+
+    Raises:
+        JudgmentAPIError: If there's an API error during logging
+        ValueError: If there's a validation error with the results
+    """
+    try:
+        res = requests.post(
+            JUDGMENT_RETRIEVE_SEQUENCE_FROM_TRACE_API_URL,
+            headers={
+                "Content-Type": "application/json",
+                "Authorization": f"Bearer {judgment_api_key}",
+                "X-Organization-Id": organization_id
+            },
+            json={
+                "trace_id": trace_id,
+                "trace_span_id": parent_span,
+            },
+            verify=True
+        )
+
+        if not res.ok:
+            response_data = res.json()
+            error_message = response_data.get('detail', 'An unknown error occurred.')
+            error(f"Error {res.status_code}: {error_message}")
+            raise JudgmentAPIError(error_message)
+
+        return Sequence(**res.json())
+    except requests.exceptions.RequestException as e:
+        error(f"Request failed while saving evaluation results to DB: {str(e)}")
+        raise JudgmentAPIError(f"Request failed while saving evaluation results to DB: {str(e)}")
+    except Exception as e:
+        error(f"Failed to save evaluation results to DB: {str(e)}")
+        raise ValueError(f"Failed to save evaluation results to DB: {str(e)}")
+
+
 def run_with_spinner(message: str, func, *args, **kwargs) -> Any:
     """Run a function with a spinner in the terminal."""
     spinner = itertools.cycle(['|', '/', '-', '\\'])
@@ -318,23 +402,20 @@ def run_with_spinner(message: str, func, *args, **kwargs) -> Any:

     return result

-def check_examples(examples: List[Example], scorers: List[APIJudgmentScorer]) -> None:
+def check_examples(examples: List[Example], scorers: List[Union[APIJudgmentScorer, JudgevalScorer]]) -> None:
     """
     Checks if the example contains the necessary parameters for the scorer.
     """
     for scorer in scorers:
-
-
-
-
-
-
-
-
-
-            print(f"WARNING: Example {example.example_id} is missing the following parameters: {missing_params} for scorer {scorer.score_type.value}")
-
-def run_sequence_eval(sequence_run: SequenceRun, override: bool = False, ignore_errors: bool = True) -> List[ScoringResult]:
+        for example in examples:
+            missing_params = []
+            for param in scorer.required_params:
+                if getattr(example, param.value) is None:
+                    missing_params.append(f"'{param.value}'")
+            if missing_params:
+                print(f"WARNING: Example {example.example_id} is missing the following parameters: {missing_params} for scorer {scorer.score_type.value}")
+
+def run_sequence_eval(sequence_run: SequenceRun, override: bool = False, ignore_errors: bool = True, function: Optional[Callable] = None, tracer: Optional[Union[Tracer, BaseCallbackHandler]] = None, examples: Optional[List[Example]] = None) -> List[ScoringResult]:
     # Call endpoint to check to see if eval run name exists (if we DON'T want to override and DO want to log results)
     if not override and sequence_run.log_results and not sequence_run.append:
         check_eval_run_name_exists(
@@ -344,13 +425,41 @@ def run_sequence_eval(sequence_run: SequenceRun, override: bool = False, ignore_
             sequence_run.organization_id
         )

+    if sequence_run.append:
+        # Check that the current experiment, if one exists, has the same type (examples of sequences)
+        check_experiment_type(
+            sequence_run.eval_name,
+            sequence_run.project_name,
+            sequence_run.judgment_api_key,
+            sequence_run.organization_id,
+            True
+        )
+
+    if function and tracer:
+        new_sequences: List[Sequence] = []
+        for example in examples:
+            if example.input:
+                result = run_with_spinner("Running agent function: ", function, **example.input)
+            else:
+                result = run_with_spinner("Running agent function: ", function)
+        for i, trace in enumerate(tracer.traces):
+            trace_id = trace['trace_id']
+            parent_span = trace['entries'][0]['span_id']
+            new_sequence = retrieve_sequence_from_trace(trace_id, parent_span, sequence_run.judgment_api_key, sequence_run.organization_id)
+            new_sequence.expected_tools = examples[i].expected_tools
+            new_sequences.append(new_sequence)
+        sequence_run.sequences = new_sequences
+
+    for sequence in sequence_run.sequences:
+        sequence.scorers = sequence_run.scorers
+
     # Execute evaluation using Judgment API
     info("Starting API evaluation")
     try: # execute an EvaluationRun with just JudgmentScorers
         debug("Sending request to Judgment API")
         response_data: List[Dict] = run_with_spinner("Running Sequence Evaluation: ", execute_api_sequence_eval, sequence_run)
-
-        info(f"Received {len(
+        scoring_results = [ScoringResult(**result) for result in response_data["results"]]
+        info(f"Received {len(scoring_results)} results from API")
     except JudgmentAPIError as e:
         error(f"An error occurred while executing the Judgment API request: {str(e)}")
         raise JudgmentAPIError(f"An error occurred while executing the Judgment API request: {str(e)}")
@@ -359,14 +468,12 @@ def run_sequence_eval(sequence_run: SequenceRun, override: bool = False, ignore_

     # Convert the response data to `ScoringResult` objects
     debug("Processing API results")
-    api_results = []
-    for result in response_data["results"]:
-        api_results.append(ScoringResult(**result))
-
     # TODO: allow for custom scorer on sequences
     if sequence_run.log_results:
-        pretty_str = run_with_spinner("Logging Results: ", log_evaluation_results,
+        pretty_str = run_with_spinner("Logging Results: ", log_evaluation_results, response_data["results"], sequence_run)
         rprint(pretty_str)
+
+    return scoring_results



@@ -404,6 +511,16 @@ def run_eval(evaluation_run: EvaluationRun, override: bool = False, ignore_error
             evaluation_run.organization_id
         )

+    if evaluation_run.append:
+        # Check that the current experiment, if one exists, has the same type (examples of sequences)
+        check_experiment_type(
+            evaluation_run.eval_name,
+            evaluation_run.project_name,
+            evaluation_run.judgment_api_key,
+            evaluation_run.organization_id,
+            False
+        )
+
     # Set example IDs if not already set
     debug("Initializing examples with IDs and timestamps")
     for idx, example in enumerate(evaluation_run.examples):
@@ -539,7 +656,8 @@ def run_eval(evaluation_run: EvaluationRun, override: bool = False, ignore_error
     # )
     # print(merged_results)
     if evaluation_run.log_results:
-
+        send_results = [scoring_result.model_dump(warnings=False) for scoring_result in merged_results]
+        pretty_str = run_with_spinner("Logging Results: ", log_evaluation_results, send_results, evaluation_run)
         rprint(pretty_str)

     for i, result in enumerate(merged_results):
@@ -564,34 +682,31 @@ def assert_test(scoring_results: List[ScoringResult]) -> None:

             # Create a test case context with all relevant fields
             test_case = {
-
-                'actual_output': result.data_object.actual_output,
-                'expected_output': result.data_object.expected_output,
-                'context': result.data_object.context,
-                'retrieval_context': result.data_object.retrieval_context,
-                'additional_metadata': result.data_object.additional_metadata,
-                'tools_called': result.data_object.tools_called,
-                'expected_tools': result.data_object.expected_tools,
-                'failed_scorers': []
+                "failed_scorers": []
             }
             if result.scorers_data:
                 # If the result was not successful, check each scorer_data
                 for scorer_data in result.scorers_data:
                     if not scorer_data.success:
+                        if scorer_data.name == "Tool Order":
+                            # Remove threshold, evaluation model for Tool Order scorer
+                            scorer_data.threshold = None
+                            scorer_data.evaluation_model = None
                         test_case['failed_scorers'].append(scorer_data)
             failed_cases.append(test_case)

     if failed_cases:
+
         error_msg = f"The following test cases failed: \n"
         for fail_case in failed_cases:
-            error_msg += f"\nInput: {fail_case['input']}\n"
-            error_msg += f"Actual Output: {fail_case['actual_output']}\n"
-            error_msg += f"Expected Output: {fail_case['expected_output']}\n"
-            error_msg += f"Context: {fail_case['context']}\n"
-            error_msg += f"Retrieval Context: {fail_case['retrieval_context']}\n"
-            error_msg += f"Additional Metadata: {fail_case['additional_metadata']}\n"
-            error_msg += f"Tools Called: {fail_case['tools_called']}\n"
-            error_msg += f"Expected Tools: {fail_case['expected_tools']}\n"
+            # error_msg += f"\nInput: {fail_case['input']}\n"
+            # error_msg += f"Actual Output: {fail_case['actual_output']}\n"
+            # error_msg += f"Expected Output: {fail_case['expected_output']}\n"
+            # error_msg += f"Context: {fail_case['context']}\n"
+            # error_msg += f"Retrieval Context: {fail_case['retrieval_context']}\n"
+            # error_msg += f"Additional Metadata: {fail_case['additional_metadata']}\n"
+            # error_msg += f"Tools Called: {fail_case['tools_called']}\n"
+            # error_msg += f"Expected Tools: {fail_case['expected_tools']}\n"

             for fail_scorer in fail_case['failed_scorers']:

@@ -609,6 +724,37 @@ def assert_test(scoring_results: List[ScoringResult]) -> None:
                     f"Additional Metadata: {fail_scorer.additional_metadata}\n"
                 )
             error_msg += "-"*100
-
-
+
+    total_tests = len(scoring_results)
+    failed_tests = len(failed_cases)
+    passed_tests = total_tests - failed_tests
+
+    # Print summary with colors
+    rprint("\n" + "="*80)
+    if failed_tests == 0:
+        rprint(f"[bold green]🎉 ALL TESTS PASSED! {passed_tests}/{total_tests} tests successful[/bold green]")
+    else:
+        rprint(f"[bold red]⚠️ TEST RESULTS: {passed_tests}/{total_tests} passed ({failed_tests} failed)[/bold red]")
+    rprint("="*80 + "\n")
+
+    # Print individual test cases
+    for i, result in enumerate(scoring_results):
+        test_num = i + 1
+        if result.success:
+            rprint(f"[green]✓ Test {test_num}: PASSED[/green]")
+        else:
+            rprint(f"[red]✗ Test {test_num}: FAILED[/red]")
+            if result.scorers_data:
+                for scorer_data in result.scorers_data:
+                    if not scorer_data.success:
+                        rprint(f" [yellow]Scorer: {scorer_data.name}[/yellow]")
+                        rprint(f" [red] Score: {scorer_data.score}[/red]")
+                        rprint(f" [red] Reason: {scorer_data.reason}[/red]")
+                        if scorer_data.error:
+                            rprint(f" [red] Error: {scorer_data.error}[/red]")
+                        rprint(" " + "-"*40)
+
+    rprint("\n" + "="*80)
+    if failed_tests > 0:
+        raise AssertionError(failed_cases)

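
Note: when both function and tracer are supplied, run_sequence_eval above rebuilds sequences from tracer.traces. A sketch of the minimal record shape that loop relies on; only the keys trace_id, entries, and span_id come from the diff, the values are placeholders.

example_trace = {
    "trace_id": "4f9d2c9e-0000-4000-8000-000000000000",  # looked up server-side by retrieve_sequence_from_trace
    "entries": [
        {"span_id": "root-span"},  # entries[0]["span_id"] is sent as trace_span_id
    ],
}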
judgeval/scorers/__init__.py
CHANGED
@@ -16,6 +16,7 @@ from judgeval.scorers.judgeval_scorers.api_scorers import (
     InstructionAdherenceScorer,
     GroundednessScorer,
     DerailmentScorer,
+    ToolOrderScorer,
 )
 from judgeval.scorers.judgeval_scorers.classifiers import (
     Text2SQLScorer,
@@ -41,4 +42,5 @@ __all__ = [
     "InstructionAdherenceScorer",
     "GroundednessScorer",
     "DerailmentScorer",
+    "ToolOrderScorer",
 ]

judgeval/scorers/judgeval_scorers/api_scorers/__init__.py
CHANGED
@@ -12,6 +12,7 @@ from judgeval.scorers.judgeval_scorers.api_scorers.comparison import ComparisonS
 from judgeval.scorers.judgeval_scorers.api_scorers.instruction_adherence import InstructionAdherenceScorer
 from judgeval.scorers.judgeval_scorers.api_scorers.groundedness import GroundednessScorer
 from judgeval.scorers.judgeval_scorers.api_scorers.derailment_scorer import DerailmentScorer
+from judgeval.scorers.judgeval_scorers.api_scorers.tool_order import ToolOrderScorer
 __all__ = [
     "ExecutionOrderScorer",
     "JSONCorrectnessScorer",
@@ -27,4 +28,5 @@ __all__ = [
     "InstructionAdherenceScorer",
     "GroundednessScorer",
     "DerailmentScorer",
+    "ToolOrderScorer",
 ]

judgeval/scorers/judgeval_scorers/api_scorers/tool_order.py
ADDED
@@ -0,0 +1,18 @@
+"""
+`judgeval` tool order scorer
+"""
+
+# Internal imports
+from judgeval.scorers.api_scorer import APIJudgmentScorer
+from judgeval.constants import APIScorer
+
+class ToolOrderScorer(APIJudgmentScorer):
+    def __init__(self, threshold: float=1.0):
+        super().__init__(
+            threshold=threshold,
+            score_type=APIScorer.TOOL_ORDER,
+        )
+
+    @property
+    def __name__(self):
+        return "Tool Order"