judgeval 0.0.36__py3-none-any.whl → 0.0.37__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- judgeval/common/tracer.py +565 -858
- judgeval/common/utils.py +18 -0
- judgeval/constants.py +3 -1
- judgeval/data/__init__.py +4 -0
- judgeval/data/datasets/dataset.py +0 -2
- judgeval/data/example.py +29 -7
- judgeval/data/sequence.py +5 -4
- judgeval/data/sequence_run.py +4 -3
- judgeval/data/trace.py +129 -0
- judgeval/evaluation_run.py +1 -1
- judgeval/integrations/langgraph.py +18 -17
- judgeval/judgment_client.py +77 -64
- judgeval/run_evaluation.py +126 -29
- judgeval/scorers/__init__.py +2 -0
- judgeval/scorers/judgeval_scorers/api_scorers/__init__.py +2 -0
- judgeval/scorers/judgeval_scorers/api_scorers/tool_order.py +18 -0
- judgeval/scorers/score.py +1 -1
- judgeval/utils/data_utils.py +57 -0
- judgeval-0.0.37.dist-info/METADATA +214 -0
- {judgeval-0.0.36.dist-info → judgeval-0.0.37.dist-info}/RECORD +22 -19
- judgeval-0.0.36.dist-info/METADATA +0 -169
- {judgeval-0.0.36.dist-info → judgeval-0.0.37.dist-info}/WHEEL +0 -0
- {judgeval-0.0.36.dist-info → judgeval-0.0.37.dist-info}/licenses/LICENSE.md +0 -0
judgeval/judgment_client.py
CHANGED
@@ -2,7 +2,8 @@
 Implements the JudgmentClient to interact with the Judgment API.
 """
 import os
-from
+from uuid import uuid4
+from typing import Optional, List, Dict, Any, Union, Callable
 import requests
 
 from judgeval.constants import ROOT_API
@@ -33,7 +34,11 @@ from judgeval.constants import (
     JUDGMENT_PROJECT_DELETE_API_URL,
     JUDGMENT_PROJECT_CREATE_API_URL
 )
+from judgeval.utils.data_utils import add_from_yaml
 from judgeval.common.exceptions import JudgmentAPIError
+from langchain_core.callbacks import BaseCallbackHandler
+from judgeval.common.tracer import Tracer
+from judgeval.common.utils import validate_api_key
 from pydantic import BaseModel
 from judgeval.rules import Rule
 
@@ -63,7 +68,7 @@ class JudgmentClient(metaclass=SingletonMeta):
         self.eval_dataset_client = EvalDatasetClient(judgment_api_key, organization_id)
 
         # Verify API key is valid
-        result, response =
+        result, response = validate_api_key(judgment_api_key)
         if not result:
             # May be bad to output their invalid API key...
             raise JudgmentAPIError(f"Issue with passed in Judgment API key: {response}")
@@ -74,7 +79,7 @@ class JudgmentClient(metaclass=SingletonMeta):
         self,
         examples: List[Example],
         scorers: List[Union[APIJudgmentScorer, JudgevalScorer]],
-        model: Union[str, List[str], JudgevalJudge],
+        model: Optional[Union[str, List[str], JudgevalJudge]] = "gpt-4.1",
         aggregator: Optional[str] = None,
         metadata: Optional[Dict[str, Any]] = None,
         log_results: bool = True,
@@ -102,9 +107,11 @@ class JudgmentClient(metaclass=SingletonMeta):
 
     def run_sequence_evaluation(
         self,
-        sequences: List[Sequence],
-        model: Union[str, List[str], JudgevalJudge],
         scorers: List[Union[APIJudgmentScorer, JudgevalScorer]],
+        model: Optional[Union[str, List[str], JudgevalJudge]] = "gpt-4.1",
+        sequences: Optional[List[Sequence]] = None,
+        examples: Optional[List[Example]] = None,
+        test_file: Optional[str] = None,
         aggregator: Optional[str] = None,
         project_name: str = "default_project",
         eval_run_name: str = "default_eval_sequence",
@@ -112,40 +119,40 @@ class JudgmentClient(metaclass=SingletonMeta):
         append: bool = False,
         override: bool = False,
         ignore_errors: bool = True,
-        rules: Optional[List[Rule]] = None
+        rules: Optional[List[Rule]] = None,
+        function: Optional[Callable] = None,
+        tracer: Optional[Union[Tracer, BaseCallbackHandler]] = None
     ) -> List[ScoringResult]:
-        try:
-
-
-
-
-
-
-
-
-
-
-
-
-
-
+        try:
+
+            if test_file:
+                try:
+                    examples = add_from_yaml(test_file)
+                except FileNotFoundError:
+                    raise FileNotFoundError(f"Test file not found: {test_file}")
+
+            if examples and not function:
+                raise ValueError("Cannot pass in examples without a function")
+
+            if sequences and function:
+                raise ValueError("Cannot pass in sequences and function")
+
+            if examples and sequences:
+                raise ValueError("Cannot pass in both examples and sequences")
 
-            flattened_sequences = flatten_sequence_list(sequences)
-            for sequence in flattened_sequences:
-                sequence.scorers = scorers
-
             sequence_run = SequenceRun(
                 project_name=project_name,
                 eval_name=eval_run_name,
                 sequences=sequences,
+                scorers=scorers,
                 model=model,
                 aggregator=aggregator,
                 log_results=log_results,
                 append=append,
                 judgment_api_key=self.judgment_api_key,
-                organization_id=self.organization_id
+                organization_id=self.organization_id,
             )
-            return run_sequence_eval(sequence_run, override, ignore_errors)
+            return run_sequence_eval(sequence_run, override, ignore_errors, function, tracer, examples)
         except ValueError as e:
             raise ValueError(f"Please check your SequenceRun object, one or more fields are invalid: \n{str(e)}")
         except Exception as e:
@@ -155,7 +162,7 @@ class JudgmentClient(metaclass=SingletonMeta):
         self,
         examples: Union[List[Example], List[CustomExample]],
         scorers: List[Union[APIJudgmentScorer, JudgevalScorer]],
-        model: Union[str, List[str], JudgevalJudge],
+        model: Optional[Union[str, List[str], JudgevalJudge]] = "gpt-4.1",
         aggregator: Optional[str] = None,
         metadata: Optional[Dict[str, Any]] = None,
         log_results: bool = True,
@@ -396,24 +403,6 @@ class JudgmentClient(metaclass=SingletonMeta):
             raise ValueError(f"Error deleting project: {response.json()}")
         return response.json()
 
-    def _validate_api_key(self):
-        """
-        Validates that the user api key is valid
-        """
-        response = requests.post(
-            f"{ROOT_API}/validate_api_key/",
-            headers={
-                "Content-Type": "application/json",
-                "Authorization": f"Bearer {self.judgment_api_key}",
-            },
-            json={}, # Empty body now
-            verify=True
-        )
-        if response.status_code == 200:
-            return True, response.json()
-        else:
-            return False, response.json().get("detail", "Error validating API key")
-
     def fetch_classifier_scorer(self, slug: str) -> ClassifierScorer:
         """
         Fetches a classifier scorer configuration from the Judgment API.
@@ -499,22 +488,26 @@ class JudgmentClient(metaclass=SingletonMeta):
 
     def assert_test(
         self,
-        examples: List[Example],
         scorers: List[Union[APIJudgmentScorer, JudgevalScorer]],
-
+        examples: Optional[List[Example]] = None,
+        model: Optional[Union[str, List[str], JudgevalJudge]] = "gpt-4.1",
+        test_file: Optional[str] = None,
         aggregator: Optional[str] = None,
         metadata: Optional[Dict[str, Any]] = None,
         log_results: bool = True,
-        project_name: str = "
-        eval_run_name: str =
+        project_name: str = "default_test",
+        eval_run_name: str = str(uuid4()),
         override: bool = False,
-        rules: Optional[List[Rule]] = None
+        rules: Optional[List[Rule]] = None,
+        function: Optional[Callable] = None,
+        tracer: Optional[Union[Tracer, BaseCallbackHandler]] = None
     ) -> None:
         """
         Asserts a test by running the evaluation and checking the results for success
 
         Args:
-            examples (List[Example]): The examples to evaluate
+            examples (Optional[List[Example]]): The examples to evaluate. Must be provided if test_file is not.
+            test_file (Optional[str]): Path to a YAML file containing test examples. Must be provided if examples is not.
             scorers (List[Union[APIJudgmentScorer, JudgevalScorer]]): A list of scorers to use for evaluation
             model (Union[str, List[str], JudgevalJudge]): The model used as a judge when using LLM as a Judge
             aggregator (Optional[str]): The aggregator to use for evaluation if using Mixture of Judges
@@ -525,17 +518,37 @@ class JudgmentClient(metaclass=SingletonMeta):
             override (bool): Whether to override an existing evaluation run with the same name
             rules (Optional[List[Rule]]): Rules to evaluate against scoring results
         """
-
-
-
-
-
-
-
-
-
-
-
-
+        # Validate that exactly one of examples or test_file is provided
+        if (examples is None and test_file is None) or (examples is not None and test_file is not None):
+            raise ValueError("Exactly one of 'examples' or 'test_file' must be provided, but not both")
+
+        if function:
+            results = self.run_sequence_evaluation(
+                examples=examples,
+                scorers=scorers,
+                model=model,
+                aggregator=aggregator,
+                log_results=log_results,
+                project_name=project_name,
+                eval_run_name=eval_run_name,
+                override=override,
+                rules=rules,
+                function=function,
+                tracer=tracer,
+                test_file=test_file
+            )
+        else:
+            results = self.run_evaluation(
+                examples=examples,
+                scorers=scorers,
+                model=model,
+                aggregator=aggregator,
+                metadata=metadata,
+                log_results=log_results,
+                project_name=project_name,
+                eval_run_name=eval_run_name,
+                override=override,
+                rules=rules
+            )
 
         assert_test(results)
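Taken together, the judgment_client.py changes let assert_test accept either in-memory examples or a YAML test file, and route agent runs through run_sequence_evaluation when a function and tracer are supplied. A minimal sketch of how a caller might use the new surface; the agent function my_agent, the tests.yaml path, and the Tracer construction are assumptions, not part of this diff:

```python
from judgeval.judgment_client import JudgmentClient
from judgeval.common.tracer import Tracer
from judgeval.scorers import ToolOrderScorer

client = JudgmentClient()                      # assumes API key/org are configured in the environment
tracer = Tracer(project_name="default_test")   # hypothetical Tracer setup

def my_agent(question: str) -> str:            # hypothetical agent entry point
    ...

# New in 0.0.37: exactly one of test_file/examples must be given, and passing a
# function plus tracer sends the run down the sequence-evaluation path.
client.assert_test(
    scorers=[ToolOrderScorer(threshold=1.0)],
    test_file="tests.yaml",                    # hypothetical path to a YAML test file
    function=my_agent,
    tracer=tracer,
)
```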
judgeval/run_evaluation.py
CHANGED
@@ -4,7 +4,7 @@ import time
 import sys
 import itertools
 import threading
-from typing import List, Dict, Any, Union
+from typing import List, Dict, Any, Union, Optional, Callable
 from datetime import datetime
 from rich import print as rprint
 
@@ -12,7 +12,9 @@ from judgeval.data import (
     ScorerData,
     ScoringResult,
     Example,
-    CustomExample
+    CustomExample,
+    Sequence,
+    Trace
 )
 from judgeval.scorers import (
     JudgevalScorer,
@@ -26,7 +28,8 @@ from judgeval.constants import (
     JUDGMENT_SEQUENCE_EVAL_API_URL,
     JUDGMENT_EVAL_LOG_API_URL,
     MAX_CONCURRENT_EVALUATIONS,
-    JUDGMENT_ADD_TO_RUN_EVAL_QUEUE_API_URL
+    JUDGMENT_ADD_TO_RUN_EVAL_QUEUE_API_URL,
+    JUDGMENT_RETRIEVE_SEQUENCE_FROM_TRACE_API_URL
 )
 from judgeval.common.exceptions import JudgmentAPIError
 from judgeval.common.logger import (
@@ -37,6 +40,8 @@ from judgeval.common.logger import (
 )
 from judgeval.evaluation_run import EvaluationRun
 from judgeval.data.sequence_run import SequenceRun
+from judgeval.common.tracer import Tracer
+from langchain_core.callbacks import BaseCallbackHandler
 
 def send_to_rabbitmq(evaluation_run: EvaluationRun) -> None:
     """
@@ -277,7 +282,7 @@ def check_eval_run_name_exists(eval_name: str, project_name: str, judgment_api_k
         raise JudgmentAPIError(f"Failed to check if eval run name exists: {str(e)}")
 
 
-def log_evaluation_results(
+def log_evaluation_results(scoring_results: List[ScoringResult], run: Union[EvaluationRun, SequenceRun]) -> str:
     """
     Logs evaluation results to the Judgment API database.
 
@@ -298,7 +303,7 @@ def log_evaluation_results(merged_results: List[ScoringResult], run: Union[Evalu
                 "X-Organization-Id": run.organization_id
             },
             json={
-                "results":
+                "results": scoring_results,
                 "run": run.model_dump(warnings=False)
             },
             verify=True
@@ -322,6 +327,51 @@ def log_evaluation_results(merged_results: List[ScoringResult], run: Union[Evalu
         error(f"Failed to save evaluation results to DB: {str(e)}")
         raise ValueError(f"Failed to save evaluation results to DB: {str(e)}")
 
+def retrieve_sequence_from_trace(trace_id: str, parent_span: str, judgment_api_key: str, organization_id: str) -> Sequence:
+    """
+    Retrieves a sequence from a trace ID.
+    """
+    """
+    Logs evaluation results to the Judgment API database.
+
+    Args:
+        merged_results (List[ScoringResult]): The results to log
+        evaluation_run (EvaluationRun): The evaluation run containing project info and API key
+
+    Raises:
+        JudgmentAPIError: If there's an API error during logging
+        ValueError: If there's a validation error with the results
+    """
+    try:
+        res = requests.post(
+            JUDGMENT_RETRIEVE_SEQUENCE_FROM_TRACE_API_URL,
+            headers={
+                "Content-Type": "application/json",
+                "Authorization": f"Bearer {judgment_api_key}",
+                "X-Organization-Id": organization_id
+            },
+            json={
+                "trace_id": trace_id,
+                "trace_span_id": parent_span,
+            },
+            verify=True
+        )
+
+        if not res.ok:
+            response_data = res.json()
+            error_message = response_data.get('detail', 'An unknown error occurred.')
+            error(f"Error {res.status_code}: {error_message}")
+            raise JudgmentAPIError(error_message)
+
+        return Sequence(**res.json())
+    except requests.exceptions.RequestException as e:
+        error(f"Request failed while saving evaluation results to DB: {str(e)}")
+        raise JudgmentAPIError(f"Request failed while saving evaluation results to DB: {str(e)}")
+    except Exception as e:
+        error(f"Failed to save evaluation results to DB: {str(e)}")
+        raise ValueError(f"Failed to save evaluation results to DB: {str(e)}")
+
+
 def run_with_spinner(message: str, func, *args, **kwargs) -> Any:
     """Run a function with a spinner in the terminal."""
     spinner = itertools.cycle(['|', '/', '-', '\\'])
@@ -365,7 +415,7 @@ def check_examples(examples: List[Example], scorers: List[Union[APIJudgmentScore
             if missing_params:
                 print(f"WARNING: Example {example.example_id} is missing the following parameters: {missing_params} for scorer {scorer.score_type.value}")
 
-def run_sequence_eval(sequence_run: SequenceRun, override: bool = False, ignore_errors: bool = True) -> List[ScoringResult]:
+def run_sequence_eval(sequence_run: SequenceRun, override: bool = False, ignore_errors: bool = True, function: Optional[Callable] = None, tracer: Optional[Union[Tracer, BaseCallbackHandler]] = None, examples: Optional[List[Example]] = None) -> List[ScoringResult]:
     # Call endpoint to check to see if eval run name exists (if we DON'T want to override and DO want to log results)
     if not override and sequence_run.log_results and not sequence_run.append:
         check_eval_run_name_exists(
@@ -384,15 +434,32 @@ def run_sequence_eval(sequence_run: SequenceRun, override: bool = False, ignore_
             sequence_run.organization_id,
             True
         )
-
 
+    if function and tracer:
+        new_sequences: List[Sequence] = []
+        for example in examples:
+            if example.input:
+                result = run_with_spinner("Running agent function: ", function, **example.input)
+            else:
+                result = run_with_spinner("Running agent function: ", function)
+        for i, trace in enumerate(tracer.traces):
+            trace_id = trace['trace_id']
+            parent_span = trace['entries'][0]['span_id']
+            new_sequence = retrieve_sequence_from_trace(trace_id, parent_span, sequence_run.judgment_api_key, sequence_run.organization_id)
+            new_sequence.expected_tools = examples[i].expected_tools
+            new_sequences.append(new_sequence)
+        sequence_run.sequences = new_sequences
+
+    for sequence in sequence_run.sequences:
+        sequence.scorers = sequence_run.scorers
+
     # Execute evaluation using Judgment API
     info("Starting API evaluation")
     try: # execute an EvaluationRun with just JudgmentScorers
         debug("Sending request to Judgment API")
         response_data: List[Dict] = run_with_spinner("Running Sequence Evaluation: ", execute_api_sequence_eval, sequence_run)
-
-        info(f"Received {len(
+        scoring_results = [ScoringResult(**result) for result in response_data["results"]]
+        info(f"Received {len(scoring_results)} results from API")
     except JudgmentAPIError as e:
         error(f"An error occurred while executing the Judgment API request: {str(e)}")
         raise JudgmentAPIError(f"An error occurred while executing the Judgment API request: {str(e)}")
@@ -405,6 +472,8 @@ def run_sequence_eval(sequence_run: SequenceRun, override: bool = False, ignore_
     if sequence_run.log_results:
         pretty_str = run_with_spinner("Logging Results: ", log_evaluation_results, response_data["results"], sequence_run)
         rprint(pretty_str)
+
+    return scoring_results
 
 
 
@@ -587,7 +656,7 @@ def run_eval(evaluation_run: EvaluationRun, override: bool = False, ignore_error
     # )
     # print(merged_results)
     if evaluation_run.log_results:
-        send_results = [
+        send_results = [scoring_result.model_dump(warnings=False) for scoring_result in merged_results]
         pretty_str = run_with_spinner("Logging Results: ", log_evaluation_results, send_results, evaluation_run)
         rprint(pretty_str)
 
@@ -613,34 +682,31 @@ def assert_test(scoring_results: List[ScoringResult]) -> None:
 
         # Create a test case context with all relevant fields
         test_case = {
-
-            'actual_output': result.data_object.actual_output,
-            'expected_output': result.data_object.expected_output,
-            'context': result.data_object.context,
-            'retrieval_context': result.data_object.retrieval_context,
-            'additional_metadata': result.data_object.additional_metadata,
-            'tools_called': result.data_object.tools_called,
-            'expected_tools': result.data_object.expected_tools,
-            'failed_scorers': []
+            "failed_scorers": []
         }
         if result.scorers_data:
            # If the result was not successful, check each scorer_data
            for scorer_data in result.scorers_data:
                if not scorer_data.success:
+                    if scorer_data.name == "Tool Order":
+                        # Remove threshold, evaluation model for Tool Order scorer
+                        scorer_data.threshold = None
+                        scorer_data.evaluation_model = None
                    test_case['failed_scorers'].append(scorer_data)
            failed_cases.append(test_case)
 
    if failed_cases:
+
        error_msg = f"The following test cases failed: \n"
        for fail_case in failed_cases:
-            error_msg += f"\nInput: {fail_case['input']}\n"
-            error_msg += f"Actual Output: {fail_case['actual_output']}\n"
-            error_msg += f"Expected Output: {fail_case['expected_output']}\n"
-            error_msg += f"Context: {fail_case['context']}\n"
-            error_msg += f"Retrieval Context: {fail_case['retrieval_context']}\n"
-            error_msg += f"Additional Metadata: {fail_case['additional_metadata']}\n"
-            error_msg += f"Tools Called: {fail_case['tools_called']}\n"
-            error_msg += f"Expected Tools: {fail_case['expected_tools']}\n"
+            # error_msg += f"\nInput: {fail_case['input']}\n"
+            # error_msg += f"Actual Output: {fail_case['actual_output']}\n"
+            # error_msg += f"Expected Output: {fail_case['expected_output']}\n"
+            # error_msg += f"Context: {fail_case['context']}\n"
+            # error_msg += f"Retrieval Context: {fail_case['retrieval_context']}\n"
+            # error_msg += f"Additional Metadata: {fail_case['additional_metadata']}\n"
+            # error_msg += f"Tools Called: {fail_case['tools_called']}\n"
+            # error_msg += f"Expected Tools: {fail_case['expected_tools']}\n"
 
            for fail_scorer in fail_case['failed_scorers']:
 
@@ -658,6 +724,37 @@ def assert_test(scoring_results: List[ScoringResult]) -> None:
                    f"Additional Metadata: {fail_scorer.additional_metadata}\n"
                )
                error_msg += "-"*100
-
-
+
+    total_tests = len(scoring_results)
+    failed_tests = len(failed_cases)
+    passed_tests = total_tests - failed_tests
+
+    # Print summary with colors
+    rprint("\n" + "="*80)
+    if failed_tests == 0:
+        rprint(f"[bold green]🎉 ALL TESTS PASSED! {passed_tests}/{total_tests} tests successful[/bold green]")
+    else:
+        rprint(f"[bold red]⚠️ TEST RESULTS: {passed_tests}/{total_tests} passed ({failed_tests} failed)[/bold red]")
+    rprint("="*80 + "\n")
+
+    # Print individual test cases
+    for i, result in enumerate(scoring_results):
+        test_num = i + 1
+        if result.success:
+            rprint(f"[green]✓ Test {test_num}: PASSED[/green]")
+        else:
+            rprint(f"[red]✗ Test {test_num}: FAILED[/red]")
+            if result.scorers_data:
+                for scorer_data in result.scorers_data:
+                    if not scorer_data.success:
+                        rprint(f"  [yellow]Scorer: {scorer_data.name}[/yellow]")
+                        rprint(f"  [red]  Score: {scorer_data.score}[/red]")
+                        rprint(f"  [red]  Reason: {scorer_data.reason}[/red]")
+                        if scorer_data.error:
+                            rprint(f"  [red]  Error: {scorer_data.error}[/red]")
+                        rprint("  " + "-"*40)
+
+    rprint("\n" + "="*80)
+    if failed_tests > 0:
+        raise AssertionError(failed_cases)
 
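The new function-and-tracer path in run_sequence_eval pairs each recorded trace with the example at the same index: the agent function is run once per example, every trace in tracer.traces is then resolved into a Sequence through the new retrieve-sequence-from-trace endpoint, and that sequence inherits the matching example's expected_tools before scoring. A simplified, standalone sketch of that pairing; retrieve_sequence_from_trace is stubbed out via the fetch_sequence parameter, and the argument types are loosened for illustration:

```python
from typing import Any, Callable, List

def build_sequences(examples: List[Any], tracer: Any, function: Callable,
                    fetch_sequence: Callable[[str, str], Any]) -> List[Any]:
    """Mirrors the 0.0.37 flow: run the agent per example, then map traces back to examples by index."""
    for example in examples:
        # example.input is treated as a dict of keyword arguments for the agent function
        function(**example.input) if example.input else function()

    sequences = []
    for i, trace in enumerate(tracer.traces):
        trace_id = trace["trace_id"]
        parent_span = trace["entries"][0]["span_id"]      # root span of the recorded trace
        sequence = fetch_sequence(trace_id, parent_span)  # wraps the retrieve-from-trace API call
        sequence.expected_tools = examples[i].expected_tools
        sequences.append(sequence)
    return sequences
```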
judgeval/scorers/__init__.py
CHANGED
@@ -16,6 +16,7 @@ from judgeval.scorers.judgeval_scorers.api_scorers import (
     InstructionAdherenceScorer,
     GroundednessScorer,
     DerailmentScorer,
+    ToolOrderScorer,
 )
 from judgeval.scorers.judgeval_scorers.classifiers import (
     Text2SQLScorer,
@@ -41,4 +42,5 @@ __all__ = [
     "InstructionAdherenceScorer",
     "GroundednessScorer",
     "DerailmentScorer",
+    "ToolOrderScorer",
 ]
judgeval/scorers/judgeval_scorers/api_scorers/__init__.py
CHANGED
@@ -12,6 +12,7 @@ from judgeval.scorers.judgeval_scorers.api_scorers.comparison import ComparisonS
 from judgeval.scorers.judgeval_scorers.api_scorers.instruction_adherence import InstructionAdherenceScorer
 from judgeval.scorers.judgeval_scorers.api_scorers.groundedness import GroundednessScorer
 from judgeval.scorers.judgeval_scorers.api_scorers.derailment_scorer import DerailmentScorer
+from judgeval.scorers.judgeval_scorers.api_scorers.tool_order import ToolOrderScorer
 __all__ = [
     "ExecutionOrderScorer",
     "JSONCorrectnessScorer",
@@ -27,4 +28,5 @@ __all__ = [
     "InstructionAdherenceScorer",
     "GroundednessScorer",
     "DerailmentScorer",
+    "ToolOrderScorer",
 ]
judgeval/scorers/judgeval_scorers/api_scorers/tool_order.py
ADDED
@@ -0,0 +1,18 @@
+"""
+`judgeval` tool order scorer
+"""
+
+# Internal imports
+from judgeval.scorers.api_scorer import APIJudgmentScorer
+from judgeval.constants import APIScorer
+
+class ToolOrderScorer(APIJudgmentScorer):
+    def __init__(self, threshold: float=1.0):
+        super().__init__(
+            threshold=threshold,
+            score_type=APIScorer.TOOL_ORDER,
+        )
+
+    @property
+    def __name__(self):
+        return "Tool Order"
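ToolOrderScorer is a thin wrapper around the server-side APIScorer.TOOL_ORDER score type, so the actual comparison of called versus expected tools happens in the Judgment API. A minimal sketch of attaching it to an example; the expected_tools shape follows the YAML format documented in data_utils.py below, while the tool names and the dict-shaped input are hypothetical:

```python
from judgeval.data import Example
from judgeval.scorers import ToolOrderScorer

# threshold defaults to 1.0 in the constructor shown in the diff above
scorer = ToolOrderScorer()

example = Example(
    input={"question": "What is the weather in Boston?"},   # assumed kwargs for an agent function
    expected_tools=[
        {"tool_name": "search_weather", "parameters": {"city": "Boston"}},  # hypothetical tools
        {"tool_name": "format_answer", "parameters": {}},
    ],
)
```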
judgeval/scorers/score.py
CHANGED
@@ -243,7 +243,7 @@ async def score_with_indicator(
 async def a_execute_scoring(
     examples: Union[List[Example], List[CustomExample]],
     scorers: List[JudgevalScorer],
-    model: Optional[Union[str, List[str], JudgevalJudge]] =
+    model: Optional[Union[str, List[str], JudgevalJudge]] = "gpt-4.1",
     ignore_errors: bool = True,
     skip_on_missing_params: bool = True,
     show_indicator: bool = True,
judgeval/utils/data_utils.py
ADDED
@@ -0,0 +1,57 @@
+import yaml
+from judgeval.common.logger import (
+    debug,
+    info,
+    error,
+    example_logging_context
+)
+
+from judgeval.data import Example
+
+
+def add_from_yaml(file_path: str) -> None:
+    debug(f"Loading dataset from YAML file: {file_path}")
+    """
+    Adds examples from a YAML file.
+
+    The format of the YAML file is expected to be a dictionary with one key: "examples".
+    The value of the key is a list of dictionaries, where each dictionary represents an example.
+
+    The YAML file is expected to have the following format:
+    examples:
+      - input: "test input"
+        actual_output: "test output"
+        expected_output: "expected output"
+        context:
+          - "context1"
+          - "context2"
+        retrieval_context:
+          - "retrieval1"
+        additional_metadata:
+          key: "value"
+        tools_called:
+          - "tool1"
+        expected_tools:
+          - {tool_name: "tool1", parameters: {"query": "test query 1"}}
+          - {tool_name: "tool2", parameters: {"query": "test query 2"}}
+        name: "test example"
+        example_id: null
+        timestamp: "20241230_160117"
+        trace_id: "123"
+    """
+    try:
+        with open(file_path, "r") as file:
+            payload = yaml.safe_load(file)
+            if payload is None:
+                raise ValueError("The YAML file is empty.")
+            examples = payload.get("examples", [])
+    except FileNotFoundError:
+        error(f"YAML file not found: {file_path}")
+        raise FileNotFoundError(f"The file {file_path} was not found.")
+    except yaml.YAMLError:
+        error(f"Invalid YAML file: {file_path}")
+        raise ValueError(f"The file {file_path} is not a valid YAML file.")
+
+    info(f"Added {len(examples)} examples from YAML")
+    new_examples = [Example(**e) for e in examples]
+    return new_examples