judgeval 0.0.19__py3-none-any.whl → 0.0.21__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- judgeval/common/tracer.py +65 -2
- judgeval/constants.py +2 -1
- judgeval/data/api_example.py +3 -16
- judgeval/data/datasets/dataset.py +114 -2
- judgeval/data/example.py +16 -15
- judgeval/data/result.py +3 -3
- judgeval/judgment_client.py +20 -3
- judgeval/run_evaluation.py +62 -8
- judgeval/scorers/__init__.py +2 -2
- judgeval/scorers/api_scorer.py +3 -1
- judgeval/scorers/judgeval_scorers/__init__.py +6 -6
- judgeval/scorers/judgeval_scorers/api_scorers/__init__.py +2 -2
- judgeval/scorers/judgeval_scorers/api_scorers/answer_correctness.py +10 -2
- judgeval/scorers/judgeval_scorers/api_scorers/answer_relevancy.py +9 -2
- judgeval/scorers/judgeval_scorers/api_scorers/comparison.py +10 -2
- judgeval/scorers/judgeval_scorers/api_scorers/contextual_precision.py +11 -2
- judgeval/scorers/judgeval_scorers/api_scorers/contextual_recall.py +11 -2
- judgeval/scorers/judgeval_scorers/api_scorers/contextual_relevancy.py +10 -3
- judgeval/scorers/judgeval_scorers/api_scorers/execution_order.py +43 -0
- judgeval/scorers/judgeval_scorers/api_scorers/faithfulness.py +10 -2
- judgeval/scorers/judgeval_scorers/api_scorers/groundedness.py +10 -2
- judgeval/scorers/judgeval_scorers/api_scorers/hallucination.py +10 -2
- judgeval/scorers/judgeval_scorers/api_scorers/instruction_adherence.py +9 -2
- judgeval/scorers/judgeval_scorers/api_scorers/json_correctness.py +9 -2
- judgeval/scorers/judgeval_scorers/api_scorers/summarization.py +10 -3
- judgeval/scorers/judgeval_scorers/local_implementations/__init__.py +2 -2
- judgeval/scorers/judgeval_scorers/local_implementations/execution_order/__init__.py +3 -0
- judgeval/scorers/judgeval_scorers/local_implementations/{tool_correctness/tool_correctness_scorer.py → execution_order/execution_order.py} +3 -3
- {judgeval-0.0.19.dist-info → judgeval-0.0.21.dist-info}/METADATA +7 -3
- {judgeval-0.0.19.dist-info → judgeval-0.0.21.dist-info}/RECORD +32 -32
- judgeval/scorers/judgeval_scorers/api_scorers/tool_correctness.py +0 -19
- judgeval/scorers/judgeval_scorers/local_implementations/tool_correctness/__init__.py +0 -3
- {judgeval-0.0.19.dist-info → judgeval-0.0.21.dist-info}/WHEEL +0 -0
- {judgeval-0.0.19.dist-info → judgeval-0.0.21.dist-info}/licenses/LICENSE.md +0 -0
judgeval/common/tracer.py
CHANGED
@@ -10,7 +10,9 @@ import os
 import time
 import uuid
 import warnings
+from contextvars import ContextVar
 from contextlib import contextmanager
+from collections import defaultdict
 from dataclasses import dataclass, field
 from datetime import datetime
 from http import HTTPStatus
@@ -36,6 +38,7 @@ from judgeval.constants import (
     RABBITMQ_PORT,
     RABBITMQ_QUEUE,
     JUDGMENT_TRACES_DELETE_API_URL,
+    JUDGMENT_PROJECT_DELETE_API_URL,
     JUDGMENT_TRACES_ADD_TO_EVAL_QUEUE_API_URL
 )
 from judgeval.judgment_client import JudgmentClient
@@ -53,7 +56,7 @@ from langchain_core.utils.function_calling import convert_to_openai_tool
 from langchain_core.callbacks import CallbackManager, BaseCallbackHandler
 from langchain_core.agents import AgentAction, AgentFinish
 from langchain_core.outputs import LLMResult
-
+from langchain_core.tracers.context import register_configure_hook
 from langchain_core.messages.ai import AIMessage
 from langchain_core.messages.tool import ToolMessage
 from langchain_core.messages.base import BaseMessage
@@ -250,7 +253,8 @@ class TraceManagerClient:
             raise ValueError(f"Failed to save trace data: {response.text}")
 
         if not empty_save and "ui_results_url" in response.json():
-
+            pretty_str = f"\n🔍 You can view your trace data here: [rgb(106,0,255)][link={response.json()['ui_results_url']}]View Trace[/link]\n"
+            rprint(pretty_str)
 
     def delete_trace(self, trace_id: str):
         """
@@ -293,6 +297,27 @@ class TraceManagerClient:
             raise ValueError(f"Failed to delete trace: {response.text}")
 
         return response.json()
+
+    def delete_project(self, project_name: str):
+        """
+        Deletes a project from the server. Which also deletes all evaluations and traces associated with the project.
+        """
+        response = requests.delete(
+            JUDGMENT_PROJECT_DELETE_API_URL,
+            json={
+                "project_name": project_name,
+            },
+            headers={
+                "Content-Type": "application/json",
+                "Authorization": f"Bearer {self.judgment_api_key}",
+                "X-Organization-Id": self.organization_id
+            }
+        )
+
+        if response.status_code != HTTPStatus.OK:
+            raise ValueError(f"Failed to delete traces: {response.text}")
+
+        return response.json()
 
 
 class TraceClient:
@@ -962,6 +987,10 @@ def _format_output_data(client: ApiClient, response: Any) -> dict:
 class JudgevalCallbackHandler(BaseCallbackHandler):
     def __init__(self, trace_client: TraceClient):
         self.trace_client = trace_client
+        self.previous_node = "__start__"
+        self.executed_node_tools = []
+        self.executed_nodes = []
+        self.executed_tools = []
         self.openai_count = 1
 
     def start_span(self, name: str, span_type: SpanType = "span"):
@@ -1049,6 +1078,23 @@ class JudgevalCallbackHandler(BaseCallbackHandler):
         # End the retriever span
         self.end_span(self.trace_client._current_span, span_type="retriever")
 
+    def on_chain_start(
+        self,
+        serialized: Dict[str, Any],
+        inputs: Dict[str, Any],
+        *,
+        run_id: UUID,
+        parent_run_id: Optional[UUID] = None,
+        tags: Optional[List[str]] = None,
+        metadata: Optional[Dict[str, Any]] = None,
+        **kwargs: Any
+    ) -> None:
+        node = metadata.get("langgraph_node")
+        if node != None and node != "__start__" and node != self.previous_node:
+            self.executed_node_tools.append(node)
+            self.executed_nodes.append(node)
+        self.previous_node = node
+
     def on_tool_start(
         self,
         serialized: Optional[dict[str, Any]],
@@ -1060,6 +1106,8 @@ class JudgevalCallbackHandler(BaseCallbackHandler):
     ):
         name = serialized["name"]
         self.start_span(name, span_type="tool")
+        self.executed_node_tools.append(f"{self.previous_node}:{name}")
+        self.executed_tools.append(name)
        self.trace_client.record_input({
             'args': input_str,
             'kwargs': kwargs
@@ -1128,3 +1176,18 @@ class JudgevalCallbackHandler(BaseCallbackHandler):
             'args': str(messages),
             'kwargs': kwargs
         })
+
+judgeval_callback_handler_var: ContextVar[Optional[JudgevalCallbackHandler]] = ContextVar(
+    "judgeval_callback_handler", default=None
+)
+
+def set_global_handler(handler: JudgevalCallbackHandler):
+    judgeval_callback_handler_var.set(handler)
+
+def clear_global_handler():
+    judgeval_callback_handler_var.set(None)
+
+register_configure_hook(
+    context_var=judgeval_callback_handler_var,
+    inheritable=True,
+)
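
The tracer changes above add LangGraph-aware bookkeeping (executed_nodes, executed_tools, executed_node_tools) and a process-wide handler registered through langchain_core's register_configure_hook. A minimal usage sketch, not part of the diff; the graph, payload, and trace_client arguments are assumed to come from your own setup:

from judgeval.common.tracer import (
    JudgevalCallbackHandler,
    set_global_handler,
    clear_global_handler,
)

def run_graph_with_tracing(graph, payload, trace_client):
    # Hedged sketch: only the three imports above come from this diff.
    handler = JudgevalCallbackHandler(trace_client)
    set_global_handler(handler)  # picked up by LangChain via register_configure_hook,
                                 # no explicit callbacks=[...] argument needed
    try:
        result = graph.invoke(payload)
    finally:
        clear_global_handler()
    print("nodes visited:", handler.executed_nodes)          # recorded by on_chain_start
    print("node:tool pairs:", handler.executed_node_tools)   # recorded by on_tool_start
    return result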
judgeval/constants.py
CHANGED
@@ -22,7 +22,7 @@ class APIScorer(str, Enum):
     CONTEXTUAL_RELEVANCY = "contextual_relevancy"
     CONTEXTUAL_PRECISION = "contextual_precision"
     INSTRUCTION_ADHERENCE = "instruction_adherence"
-    TOOL_CORRECTNESS = "tool_correctness"
+    EXECUTION_ORDER = "execution_order"
     JSON_CORRECTNESS = "json_correctness"
     COMPARISON = "comparison"
     GROUNDEDNESS = "groundedness"
@@ -48,6 +48,7 @@ JUDGMENT_EVAL_LOG_API_URL = f"{ROOT_API}/log_eval_results/"
 JUDGMENT_EVAL_FETCH_API_URL = f"{ROOT_API}/fetch_eval_results/"
 JUDGMENT_EVAL_DELETE_API_URL = f"{ROOT_API}/delete_eval_results_by_project_and_run_name/"
 JUDGMENT_EVAL_DELETE_PROJECT_API_URL = f"{ROOT_API}/delete_eval_results_by_project/"
+JUDGMENT_PROJECT_DELETE_API_URL = f"{ROOT_API}/projects/delete/"
 JUDGMENT_TRACES_FETCH_API_URL = f"{ROOT_API}/traces/fetch/"
 JUDGMENT_TRACES_SAVE_API_URL = f"{ROOT_API}/traces/save/"
 JUDGMENT_TRACES_DELETE_API_URL = f"{ROOT_API}/traces/delete/"
judgeval/data/api_example.py
CHANGED
@@ -1,4 +1,4 @@
-from typing import List, Optional, Dict, Any
+from typing import List, Optional, Dict, Any, Union
 from pydantic import BaseModel, ConfigDict, model_validator
 
 from judgeval.data.example import Example
@@ -13,8 +13,8 @@ class ProcessExample(BaseModel):
     """
     name: str
     input: Optional[str] = None
-    actual_output: Optional[str] = None
-    expected_output: Optional[str] = None
+    actual_output: Optional[Union[str, List[str]]] = None
+    expected_output: Optional[Union[str, List[str]]] = None
     context: Optional[list] = None
     retrieval_context: Optional[list] = None
     tools_called: Optional[list] = None
@@ -57,19 +57,6 @@ class ProcessExample(BaseModel):
 
     def update_run_duration(self, run_duration: float):
         self.run_duration = run_duration
-
-    @model_validator(mode="before")
-    def check_input(cls, values: Dict[str, Any]):
-        input = values.get("input")
-        actual_output = values.get("actual_output")
-
-        if (input is None or actual_output is None):
-            error(f"Validation error: Required fields missing. input={input}, actual_output={actual_output}")
-            raise ValueError(
-                "'input' and 'actual_output' must be provided."
-            )
-
-        return values
 
 
 def create_process_example(

judgeval/data/datasets/dataset.py
CHANGED
@@ -3,6 +3,7 @@ import csv
 import datetime
 import json
 import os
+import yaml
 from dataclasses import dataclass, field
 from typing import List, Union, Literal
 
@@ -190,6 +191,76 @@ class EvalDataset:
         for g in ground_truths:
             self.add_ground_truth(g)
 
+    def add_from_yaml(self, file_path: str) -> None:
+        debug(f"Loading dataset from YAML file: {file_path}")
+        """
+        Adds examples and ground truths from a YAML file.
+
+        The format of the YAML file is expected to be a dictionary with two keys: "examples" and "ground_truths".
+        The value of each key is a list of dictionaries, where each dictionary represents an example or ground truth.
+
+        The YAML file is expected to have the following format:
+        ground_truths:
+        - input: "test input"
+          actual_output: null
+          expected_output: "expected output"
+          context:
+          - "context1"
+          retrieval_context:
+          - "retrieval1"
+          additional_metadata:
+            key: "value"
+          comments: "test comment"
+          tools_called:
+          - "tool1"
+          expected_tools:
+          - "tool1"
+          source_file: "test.py"
+          trace_id: "094121"
+        examples:
+        - input: "test input"
+          actual_output: "test output"
+          expected_output: "expected output"
+          context:
+          - "context1"
+          - "context2"
+          retrieval_context:
+          - "retrieval1"
+          additional_metadata:
+            key: "value"
+          tools_called:
+          - "tool1"
+          expected_tools:
+          - "tool1"
+          - "tool2"
+          name: "test example"
+          example_id: null
+          timestamp: "20241230_160117"
+          trace_id: "123"
+        """
+        try:
+            with open(file_path, "r") as file:
+                payload = yaml.safe_load(file)
+                if payload is None:
+                    raise ValueError("The YAML file is empty.")
+                examples = payload.get("examples", [])
+                ground_truths = payload.get("ground_truths", [])
+        except FileNotFoundError:
+            error(f"YAML file not found: {file_path}")
+            raise FileNotFoundError(f"The file {file_path} was not found.")
+        except yaml.YAMLError:
+            error(f"Invalid YAML file: {file_path}")
+            raise ValueError(f"The file {file_path} is not a valid YAML file.")
+
+        info(f"Added {len(examples)} examples and {len(ground_truths)} ground truths from YAML")
+        new_examples = [Example(**e) for e in examples]
+        for e in new_examples:
+            self.add_example(e)
+
+        new_ground_truths = [GroundTruthExample(**g) for g in ground_truths]
+        for g in new_ground_truths:
+            self.add_ground_truth(g)
+
     def add_example(self, e: Example) -> None:
         self.examples = self.examples + [e]
         # TODO if we need to add rank, then we need to do it here
@@ -197,7 +268,7 @@ class EvalDataset:
     def add_ground_truth(self, g: GroundTruthExample) -> None:
         self.ground_truths = self.ground_truths + [g]
 
-    def save_as(self, file_type: Literal["json", "csv"], dir_path: str, save_name: str = None) -> None:
+    def save_as(self, file_type: Literal["json", "csv", "yaml"], dir_path: str, save_name: str = None) -> None:
         """
         Saves the dataset as a file. Save both the ground truths and examples.
 
@@ -266,8 +337,49 @@ class EvalDataset:
                         g.trace_id
                     ]
                 )
+        elif file_type == "yaml":
+            with open(complete_path, "w") as file:
+                yaml_data = {
+                    "examples": [
+                        {
+                            "input": e.input,
+                            "actual_output": e.actual_output,
+                            "expected_output": e.expected_output,
+                            "context": e.context,
+                            "retrieval_context": e.retrieval_context,
+                            "additional_metadata": e.additional_metadata,
+                            "tools_called": e.tools_called,
+                            "expected_tools": e.expected_tools,
+                            "name": e.name,
+                            "comments": None,  # Example does not have comments
+                            "source_file": None,  # Example does not have source file
+                            "example": True,  # Adding an Example
+                            "trace_id": e.trace_id
+                        }
+                        for e in self.examples
+                    ],
+                    "ground_truths": [
+                        {
+                            "input": g.input,
+                            "actual_output": g.actual_output,
+                            "expected_output": g.expected_output,
+                            "context": g.context,
+                            "retrieval_context": g.retrieval_context,
+                            "additional_metadata": g.additional_metadata,
+                            "tools_called": g.tools_called,
+                            "expected_tools": g.expected_tools,
+                            "name": None,  # GroundTruthExample does not have name
+                            "comments": g.comments,
+                            "source_file": g.source_file,
+                            "example": False,  # Adding a GroundTruthExample, not an Example
+                            "trace_id": g.trace_id
+                        }
+                        for g in self.ground_truths
+                    ]
+                }
+                yaml.dump(yaml_data, file, default_flow_style=False)
         else:
-            ACCEPTABLE_FILE_TYPES = ["json", "csv"]
+            ACCEPTABLE_FILE_TYPES = ["json", "csv", "yaml"]
             raise TypeError(f"Invalid file type: {file_type}. Please choose from {ACCEPTABLE_FILE_TYPES}")
 
     def __iter__(self):
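
The dataset changes above add YAML import/export. A short sketch of loading a hand-written file with the new add_from_yaml and saving with the new "yaml" file type; the no-argument EvalDataset() constructor is an assumption, not something shown in this diff:

from judgeval.data.datasets.dataset import EvalDataset

# Hedged sketch: a minimal file following the format documented in add_from_yaml above.
yaml_text = """\
examples:
  - input: "test input"
    actual_output: "test output"
    expected_output: "expected output"
"""

with open("demo_dataset.yaml", "w") as f:
    f.write(yaml_text)

ds = EvalDataset()                      # assumption: default-constructible
ds.add_from_yaml("demo_dataset.yaml")   # builds Example objects and appends them
print(len(ds.examples))                 # -> 1

# Saving now also accepts the new "yaml" file type:
ds.save_as("yaml", dir_path=".", save_name="demo_copy")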
judgeval/data/example.py
CHANGED
@@ -2,11 +2,13 @@
 Classes for representing examples in a dataset.
 """
 
-
+
+from typing import Optional, Any, Dict, List, Union
 from uuid import uuid4
 from pydantic import BaseModel, Field, field_validator
 from enum import Enum
 from datetime import datetime
+import time
 
 
 class ExampleParams(Enum):
@@ -22,9 +24,9 @@ class ExampleParams(Enum):
 
 
 class Example(BaseModel):
-    input: str
-    actual_output: str
-    expected_output: Optional[str] = None
+    input: Optional[str] = None
+    actual_output: Optional[Union[str, List[str]]] = None
+    expected_output: Optional[Union[str, List[str]]] = None
     context: Optional[List[str]] = None
     retrieval_context: Optional[List[str]] = None
     additional_metadata: Optional[Dict[str, Any]] = None
@@ -37,12 +39,6 @@ class Example(BaseModel):
     trace_id: Optional[str] = None
 
     def __init__(self, **data):
-        # Check that required fields are provided
-        if 'input' not in data:
-            raise ValueError("Example must be initialized with 'input' field.")
-        if 'actual_output' not in data:
-            raise ValueError("Example must be initialized with 'actual_output' field.")
-
         if 'example_id' not in data:
             data['example_id'] = str(uuid4())
         # Set timestamp if not provided
@@ -53,22 +49,27 @@ class Example(BaseModel):
     @field_validator('input', mode='before')
     @classmethod
     def validate_input(cls, v):
-        if not v or not isinstance(v, str):
+        if v is not None and (not v or not isinstance(v, str)):
             raise ValueError(f"Input must be a non-empty string but got '{v}' of type {type(v)}")
         return v
 
     @field_validator('actual_output', mode='before')
     @classmethod
     def validate_actual_output(cls, v):
-        if not
-
+        if v is not None:
+            if not isinstance(v, (str, list)):
+                raise ValueError(f"Actual output must be a string or a list of strings but got {v} of type {type(v)}")
+            if isinstance(v, list) and not all(isinstance(item, str) for item in v):
+                raise ValueError(f"All items in actual_output must be strings but got {v}")
         return v
 
     @field_validator('expected_output', mode='before')
     @classmethod
     def validate_expected_output(cls, v):
-        if v is not None and not isinstance(v, str):
-            raise ValueError(f"Expected output must be a string or None but got {v} of type {type(v)}")
+        if v is not None and not isinstance(v, (str, list)):
+            raise ValueError(f"Expected output must be a string, a list of strings, or None but got {v} of type {type(v)}")
+        if isinstance(v, list) and not all(isinstance(item, str) for item in v):
+            raise ValueError(f"All items in expected_output must be strings but got {v}")
         return v
 
     @field_validator('context', 'retrieval_context', 'tools_called', 'expected_tools', mode='before')
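
Example now tolerates an omitted input and accepts list-valued outputs (in 0.0.19 both input and actual_output were required strings). A small illustration; the field values are made up:

from judgeval.data.example import Example

ex = Example(
    actual_output=["search_web", "summarize", "send_email"],  # list[str] is now accepted
    expected_output=["search_web", "send_email"],
    # `input` can be omitted entirely; in 0.0.19 this raised a ValueError
)
print(ex.example_id)  # still auto-filled with a uuid4 when not supplied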
judgeval/data/result.py
CHANGED
@@ -1,5 +1,5 @@
 from dataclasses import dataclass
-from typing import List, Union, Optional, Dict, Any
+from typing import List, Union, Optional, Dict, Any, Union
 
 from judgeval.data import ScorerData, ProcessExample
 
@@ -30,8 +30,8 @@ class ScoringResult:
 
     # Inputs from the original example
     input: Optional[str] = None
-    actual_output: Optional[str] = None
-    expected_output: Optional[str] = None
+    actual_output: Optional[Union[str, List[str]]] = None
+    expected_output: Optional[Union[str, List[str]]] = None
     context: Optional[List[str]] = None
     retrieval_context: Optional[List[str]] = None
     additional_metadata: Optional[Dict[str, Any]] = None
judgeval/judgment_client.py
CHANGED
@@ -27,7 +27,8 @@ from judgeval.judges import JudgevalJudge
 from judgeval.constants import (
     JUDGMENT_EVAL_FETCH_API_URL,
     JUDGMENT_EVAL_DELETE_API_URL,
-    JUDGMENT_EVAL_DELETE_PROJECT_API_URL
+    JUDGMENT_EVAL_DELETE_PROJECT_API_URL,
+    JUDGMENT_PROJECT_DELETE_API_URL
 )
 from judgeval.common.exceptions import JudgmentAPIError
 from pydantic import BaseModel
@@ -156,7 +157,7 @@ class JudgmentClient:
         metadata: Optional[Dict[str, Any]] = None,
         project_name: str = "",
         eval_run_name: str = "",
-        log_results: bool =
+        log_results: bool = True,
         use_judgment: bool = True,
         rules: Optional[List[Rule]] = None
     ) -> List[ScoringResult]:
@@ -362,7 +363,6 @@ class JudgmentClient:
         response = requests.delete(JUDGMENT_EVAL_DELETE_PROJECT_API_URL,
                                    json={
                                        "project_name": project_name,
-                                       "judgment_api_key": self.judgment_api_key,
                                    },
                                    headers={
                                        "Content-Type": "application/json",
@@ -372,6 +372,23 @@ class JudgmentClient:
         if response.status_code != requests.codes.ok:
             raise ValueError(f"Error deleting eval results: {response.json()}")
         return response.json()
+
+    def delete_project(self, project_name: str) -> bool:
+        """
+        Deletes a project from the server. Which also deletes all evaluations and traces associated with the project.
+        """
+        response = requests.delete(JUDGMENT_PROJECT_DELETE_API_URL,
+                                   json={
+                                       "project_name": project_name,
+                                   },
+                                   headers={
+                                       "Content-Type": "application/json",
+                                       "Authorization": f"Bearer {self.judgment_api_key}",
+                                       "X-Organization-Id": self.organization_id
+                                   })
+        if response.status_code != requests.codes.ok:
+            raise ValueError(f"Error deleting project: {response.json()}")
+        return response.json()
 
     def _validate_api_key(self):
         """
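
JudgmentClient gains delete_project, mirroring the TraceManagerClient method added in tracer.py. A usage sketch; how the client resolves its API key and organization id is an assumption (e.g. from environment variables), not shown in this diff:

from judgeval.judgment_client import JudgmentClient

client = JudgmentClient()            # assumption: credentials come from the environment
client.delete_project("my_project")  # DELETE {ROOT_API}/projects/delete/ — removes the
                                     # project along with its evaluations and traces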
judgeval/run_evaluation.py
CHANGED
@@ -1,12 +1,17 @@
 import asyncio
 import requests
-
+import time
+import sys
+import itertools
+import threading
+from typing import List, Dict, Any
 from datetime import datetime
 from rich import print as rprint
 
 from judgeval.data import (
     ScorerData,
-    ScoringResult
+    ScoringResult,
+    Example
 )
 from judgeval.scorers import (
     JudgevalScorer,
@@ -14,7 +19,6 @@ from judgeval.scorers import (
     ClassifierScorer
 )
 from judgeval.scorers.score import a_execute_scoring
-
 from judgeval.constants import (
     ROOT_API,
     JUDGMENT_EVAL_API_URL,
@@ -185,7 +189,7 @@ def check_eval_run_name_exists(eval_name: str, project_name: str, judgment_api_k
         raise JudgmentAPIError(f"Failed to check if eval run name exists: {str(e)}")
 
 
-def log_evaluation_results(merged_results: List[ScoringResult], evaluation_run: EvaluationRun) ->
+def log_evaluation_results(merged_results: List[ScoringResult], evaluation_run: EvaluationRun) -> str:
     """
     Logs evaluation results to the Judgment API database.
 
@@ -220,7 +224,9 @@ def log_evaluation_results(merged_results: List[ScoringResult], evaluation_run:
             raise JudgmentAPIError(error_message)
 
         if "ui_results_url" in res.json():
-
+            url = res.json()['ui_results_url']
+            pretty_str = f"\n🔍 You can view your evaluation results here: [rgb(106,0,255)][link={url}]View Results[/link]\n"
+            return pretty_str
 
     except requests.exceptions.RequestException as e:
         error(f"Request failed while saving evaluation results to DB: {str(e)}")
@@ -229,6 +235,51 @@ def log_evaluation_results(merged_results: List[ScoringResult], evaluation_run:
         error(f"Failed to save evaluation results to DB: {str(e)}")
         raise ValueError(f"Failed to save evaluation results to DB: {str(e)}")
 
+def run_with_spinner(message: str, func, *args, **kwargs) -> Any:
+    """Run a function with a spinner in the terminal."""
+    spinner = itertools.cycle(['|', '/', '-', '\\'])
+
+    def display_spinner():
+        while not stop_spinner_event.is_set():
+            sys.stdout.write(f'\r{message}{next(spinner)}')
+            sys.stdout.flush()
+            time.sleep(0.1)
+
+    stop_spinner_event = threading.Event()
+    spinner_thread = threading.Thread(target=display_spinner)
+    spinner_thread.start()
+
+    try:
+        result = func(*args, **kwargs)
+    except Exception as e:
+        error(f"An error occurred: {str(e)}")
+        stop_spinner_event.set()
+        spinner_thread.join()
+        raise e
+    finally:
+        stop_spinner_event.set()
+        spinner_thread.join()
+
+        sys.stdout.write('\r' + ' ' * (len(message) + 1) + '\r')
+        sys.stdout.flush()
+
+    return result
+
+def check_examples(examples: List[Example], scorers: List[APIJudgmentScorer]) -> None:
+    """
+    Checks if the example contains the necessary parameters for the scorer.
+    """
+    for scorer in scorers:
+        if isinstance(scorer, APIJudgmentScorer):
+            for example in examples:
+                missing_params = []
+                for param in scorer.required_params:
+                    if getattr(example, param.value) is None:
+                        missing_params.append(f"'{param.value}'")
+                if missing_params:
+                    # We do this because we want to inform users that an example is missing parameters for a scorer
+                    # Example ID (usually random UUID) does not provide any helpful information for the user but printing the entire example is overdoing it
+                    print(f"WARNING: Example {example.example_id} is missing the following parameters: {missing_params} for scorer {scorer.score_type.value}")
 
 
 def run_eval(evaluation_run: EvaluationRun, override: bool = False) -> List[ScoringResult]:
@@ -253,7 +304,7 @@ def run_eval(evaluation_run: EvaluationRun, override: bool = False) -> List[Scor
     Returns:
         List[ScoringResult]: The results of the evaluation. Each result is a dictionary containing the fields of a `ScoringResult` object.
     """
-
+
     # Call endpoint to check to see if eval run name exists (if we DON'T want to override and DO want to log results)
     if not override and evaluation_run.log_results:
         check_eval_run_name_exists(
@@ -306,6 +357,7 @@ def run_eval(evaluation_run: EvaluationRun, override: bool = False) -> List[Scor
 
     # Execute evaluation using Judgment API
     if judgment_scorers:
+        check_examples(evaluation_run.examples, evaluation_run.scorers)
         info("Starting API evaluation")
         debug(f"Creating API evaluation run with {len(judgment_scorers)} scorers")
         try: # execute an EvaluationRun with just JudgmentScorers
@@ -323,7 +375,7 @@ def run_eval(evaluation_run: EvaluationRun, override: bool = False) -> List[Scor
                 rules=evaluation_run.rules
             )
             debug("Sending request to Judgment API")
-            response_data: List[Dict] =
+            response_data: List[Dict] = run_with_spinner("Running Evaluation: ", execute_api_eval, api_evaluation_run)
             info(f"Received {len(response_data['results'])} results from API")
         except JudgmentAPIError as e:
             error(f"An error occurred while executing the Judgment API request: {str(e)}")
@@ -352,6 +404,7 @@ def run_eval(evaluation_run: EvaluationRun, override: bool = False) -> List[Scor
             api_results.append(ScoringResult(**filtered_result))
     # Run local evals
     if local_scorers: # List[JudgevalScorer]
+        # We should be removing local scorers soon
         info("Starting local evaluation")
         for example in evaluation_run.examples:
             with example_logging_context(example.timestamp, example.example_id):
@@ -389,7 +442,8 @@ def run_eval(evaluation_run: EvaluationRun, override: bool = False) -> List[Scor
     # )
 
     if evaluation_run.log_results:
-        log_evaluation_results
+        pretty_str = run_with_spinner("Logging Results: ", log_evaluation_results, merged_results, evaluation_run)
+        rprint(pretty_str)
 
     for i, result in enumerate(merged_results):
         if not result.scorers_data: # none of the scorers could be executed on this example
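
run_with_spinner is a generic wrapper, so it can be exercised on its own. A tiny sketch, assuming the import path matches the module shown above:

import time
from judgeval.run_evaluation import run_with_spinner

def slow_add(a, b):
    time.sleep(1.0)   # stand-in for a long-running API call
    return a + b

total = run_with_spinner("Running Evaluation: ", slow_add, 2, 3)
print(total)  # 5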
judgeval/scorers/__init__.py
CHANGED
@@ -2,7 +2,7 @@ from judgeval.scorers.api_scorer import APIJudgmentScorer
 from judgeval.scorers.judgeval_scorer import JudgevalScorer
 from judgeval.scorers.prompt_scorer import PromptScorer, ClassifierScorer
 from judgeval.scorers.judgeval_scorers import (
-
+    ExecutionOrderScorer,
     JSONCorrectnessScorer,
     SummarizationScorer,
     HallucinationScorer,
@@ -24,7 +24,7 @@ __all__ = [
     "JudgevalScorer",
     "PromptScorer",
     "ClassifierScorer",
-    "
+    "ExecutionOrderScorer",
     "JSONCorrectnessScorer",
     "SummarizationScorer",
     "HallucinationScorer",
judgeval/scorers/api_scorer.py
CHANGED
@@ -5,8 +5,9 @@ Scores `Example`s using ready-made Judgment evaluators.
 """
 
 from pydantic import BaseModel, field_validator
+from typing import List
 from judgeval.common.logger import debug, info, warning, error
-
+from judgeval.data import ExampleParams
 from judgeval.constants import APIScorer, UNBOUNDED_SCORERS
 
 
@@ -20,6 +21,7 @@ class APIJudgmentScorer(BaseModel):
     """
     score_type: APIScorer
     threshold: float
+    required_params: List[ExampleParams] = []  # List of the required parameters on examples for the scorer
 
     @field_validator('threshold')
     def validate_threshold(cls, v, info):
judgeval/scorers/judgeval_scorers/__init__.py
CHANGED
@@ -2,7 +2,7 @@ from typing import Type, Optional, Any
 
 # Import implementations
 from judgeval.scorers.judgeval_scorers.api_scorers import (
-
+    ExecutionOrderScorer as APIExecutionOrderScorer,
     JSONCorrectnessScorer as APIJSONCorrectnessScorer,
     SummarizationScorer as APISummarizationScorer,
     HallucinationScorer as APIHallucinationScorer,
@@ -24,7 +24,7 @@ from judgeval.scorers.judgeval_scorers.local_implementations import (
     ContextualRelevancyScorer as LocalContextualRelevancyScorer,
     FaithfulnessScorer as LocalFaithfulnessScorer,
     JsonCorrectnessScorer as LocalJsonCorrectnessScorer,
-
+    ExecutionOrderScorer as LocalExecutionOrderScorer,
     HallucinationScorer as LocalHallucinationScorer,
     SummarizationScorer as LocalSummarizationScorer,
     AnswerCorrectnessScorer as LocalAnswerCorrectnessScorer,
@@ -98,9 +98,9 @@ AnswerRelevancyScorer = ScorerWrapper(
     local_implementation=LocalAnswerRelevancyScorer
 )
 
-
-    api_implementation=
-    local_implementation=
+ExecutionOrderScorer = ScorerWrapper(
+    api_implementation=APIExecutionOrderScorer,
+    local_implementation=LocalExecutionOrderScorer
 )
 
 JSONCorrectnessScorer = ScorerWrapper(
@@ -154,7 +154,7 @@ GroundednessScorer = ScorerWrapper(
 )
 
 __all__ = [
-    "
+    "ExecutionOrderScorer",
     "JSONCorrectnessScorer",
     "SummarizationScorer",
     "HallucinationScorer",
judgeval/scorers/judgeval_scorers/api_scorers/__init__.py
CHANGED
@@ -1,4 +1,4 @@
-from judgeval.scorers.judgeval_scorers.api_scorers.
+from judgeval.scorers.judgeval_scorers.api_scorers.execution_order import ExecutionOrderScorer
 from judgeval.scorers.judgeval_scorers.api_scorers.json_correctness import JSONCorrectnessScorer
 from judgeval.scorers.judgeval_scorers.api_scorers.summarization import SummarizationScorer
 from judgeval.scorers.judgeval_scorers.api_scorers.hallucination import HallucinationScorer
@@ -13,7 +13,7 @@ from judgeval.scorers.judgeval_scorers.api_scorers.instruction_adherence import
 from judgeval.scorers.judgeval_scorers.api_scorers.groundedness import GroundednessScorer
 
 __all__ = [
-    "
+    "ExecutionOrderScorer",
     "JSONCorrectnessScorer",
     "SummarizationScorer",
     "HallucinationScorer",
judgeval/scorers/judgeval_scorers/api_scorers/answer_correctness.py
CHANGED
@@ -8,11 +8,19 @@ TODO add link to docs page for this scorer
 # Internal imports
 from judgeval.scorers.api_scorer import APIJudgmentScorer
 from judgeval.constants import APIScorer
-
+from judgeval.data import ExampleParams
 
 class AnswerCorrectnessScorer(APIJudgmentScorer):
     def __init__(self, threshold: float):
-        super().__init__(
+        super().__init__(
+            threshold=threshold,
+            score_type=APIScorer.ANSWER_CORRECTNESS,
+            required_params=[
+                ExampleParams.INPUT,
+                ExampleParams.ACTUAL_OUTPUT,
+                ExampleParams.EXPECTED_OUTPUT,
+            ]
+        )
 
     @property
     def __name__(self):
judgeval/scorers/judgeval_scorers/api_scorers/answer_relevancy.py
CHANGED
@@ -8,11 +8,18 @@ TODO add link to docs page for this scorer
 # Internal imports
 from judgeval.scorers.api_scorer import APIJudgmentScorer
 from judgeval.constants import APIScorer
-
+from judgeval.data import ExampleParams
 
 class AnswerRelevancyScorer(APIJudgmentScorer):
     def __init__(self, threshold: float):
-        super().__init__(
+        super().__init__(
+            threshold=threshold,
+            score_type=APIScorer.ANSWER_RELEVANCY,
+            required_params=[
+                ExampleParams.INPUT,
+                ExampleParams.ACTUAL_OUTPUT,
+            ]
+        )
 
     @property
     def __name__(self):
judgeval/scorers/judgeval_scorers/api_scorers/comparison.py
CHANGED
@@ -9,12 +9,20 @@ TODO add link to docs page for this scorer
 from judgeval.scorers.api_scorer import APIJudgmentScorer
 from judgeval.constants import APIScorer
 from typing import Optional, Dict
-
+from judgeval.data import ExampleParams
 class ComparisonScorer(APIJudgmentScorer):
     kwargs: Optional[Dict] = None
 
     def __init__(self, threshold: float, criteria: str, description: str):
-        super().__init__(
+        super().__init__(
+            threshold=threshold,
+            score_type=APIScorer.COMPARISON,
+            required_params=[
+                ExampleParams.INPUT,
+                ExampleParams.ACTUAL_OUTPUT,
+                ExampleParams.EXPECTED_OUTPUT,
+            ]
+        )
         self.kwargs = {"criteria": criteria, "description": description}
 
     @property
judgeval/scorers/judgeval_scorers/api_scorers/contextual_precision.py
CHANGED
@@ -8,11 +8,20 @@ TODO add link to docs page for this scorer
 # Internal imports
 from judgeval.scorers.api_scorer import APIJudgmentScorer
 from judgeval.constants import APIScorer
-
+from judgeval.data import ExampleParams
 
 class ContextualPrecisionScorer(APIJudgmentScorer):
     def __init__(self, threshold: float):
-        super().__init__(
+        super().__init__(
+            threshold=threshold,
+            score_type=APIScorer.CONTEXTUAL_PRECISION,
+            required_params=[
+                ExampleParams.INPUT,
+                ExampleParams.ACTUAL_OUTPUT,
+                ExampleParams.RETRIEVAL_CONTEXT,
+                ExampleParams.EXPECTED_OUTPUT,
+            ]
+        )
 
     @property
     def __name__(self):
judgeval/scorers/judgeval_scorers/api_scorers/contextual_recall.py
CHANGED
@@ -8,12 +8,21 @@ TODO add link to docs page for this scorer
 # Internal imports
 from judgeval.scorers.api_scorer import APIJudgmentScorer
 from judgeval.constants import APIScorer
+from judgeval.data import ExampleParams
 
 
 class ContextualRecallScorer(APIJudgmentScorer):
     def __init__(self, threshold: float):
-        super().__init__(
-
+        super().__init__(
+            threshold=threshold,
+            score_type=APIScorer.CONTEXTUAL_RECALL,
+            required_params=[
+                ExampleParams.INPUT,
+                ExampleParams.ACTUAL_OUTPUT,
+                ExampleParams.EXPECTED_OUTPUT,
+                ExampleParams.RETRIEVAL_CONTEXT,
+            ]
+        )
     @property
     def __name__(self):
         return "Contextual Recall"
judgeval/scorers/judgeval_scorers/api_scorers/contextual_relevancy.py
CHANGED
@@ -8,15 +8,22 @@ TODO add link to docs page for this scorer
 # Internal imports
 from judgeval.scorers.api_scorer import APIJudgmentScorer
 from judgeval.constants import APIScorer
-
+from judgeval.data import ExampleParams
 
 class ContextualRelevancyScorer(APIJudgmentScorer):
     """
     Scorer that checks if the output of a model is relevant to the retrieval context
     """
     def __init__(self, threshold: float):
-        super().__init__(
-
+        super().__init__(
+            threshold=threshold,
+            score_type=APIScorer.CONTEXTUAL_RELEVANCY,
+            required_params=[
+                ExampleParams.INPUT,
+                ExampleParams.ACTUAL_OUTPUT,
+                ExampleParams.RETRIEVAL_CONTEXT,
+            ]
+        )
     @property
     def __name__(self):
         return "Contextual Relevancy"
judgeval/scorers/judgeval_scorers/api_scorers/execution_order.py
ADDED
@@ -0,0 +1,43 @@
+"""
+`judgeval` tool correctness scorer
+
+TODO add link to docs page for this scorer
+
+"""
+
+# Internal imports
+from judgeval.scorers.api_scorer import APIJudgmentScorer
+from judgeval.constants import APIScorer
+from typing import Optional, Dict, List
+from judgeval.data import ExampleParams
+
+class ExecutionOrderScorer(APIJudgmentScorer):
+    kwargs: Optional[Dict] = None
+
+    def __init__(self, threshold: float, should_exact_match: bool = False, should_consider_ordering: bool = False):
+        super().__init__(
+            threshold=threshold,
+            score_type=APIScorer.EXECUTION_ORDER,
+            required_params=[
+                ExampleParams.ACTUAL_OUTPUT,
+                ExampleParams.EXPECTED_OUTPUT,
+            ]
+        )
+        self.kwargs = {"should_exact_match": should_exact_match, "should_consider_ordering": should_consider_ordering}
+
+    @property
+    def __name__(self):
+        return "Execution Order"
+
+    def to_dict(self) -> dict:
+        """
+        Converts the scorer configuration to a dictionary format.
+
+        Returns:
+            dict: A dictionary containing the scorer's configuration
+        """
+        return {
+            "score_type": self.score_type,
+            "threshold": self.threshold,
+            "kwargs": self.kwargs
+        }
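
A construction sketch for the new API-side ExecutionOrderScorer; the threshold and flag values are illustrative only:

from judgeval.scorers.judgeval_scorers.api_scorers.execution_order import ExecutionOrderScorer

scorer = ExecutionOrderScorer(
    threshold=1.0,
    should_exact_match=True,         # require the exact tool sequence
    should_consider_ordering=True,   # penalize out-of-order calls
)
print(scorer.to_dict())  # {'score_type': ..., 'threshold': 1.0, 'kwargs': {...}}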
judgeval/scorers/judgeval_scorers/api_scorers/faithfulness.py
CHANGED
@@ -8,11 +8,19 @@ TODO add link to docs page for this scorer
 # Internal imports
 from judgeval.scorers.api_scorer import APIJudgmentScorer
 from judgeval.constants import APIScorer
-
+from judgeval.data import ExampleParams
 
 class FaithfulnessScorer(APIJudgmentScorer):
     def __init__(self, threshold: float):
-        super().__init__(
+        super().__init__(
+            threshold=threshold,
+            score_type=APIScorer.FAITHFULNESS,
+            required_params=[
+                ExampleParams.INPUT,
+                ExampleParams.ACTUAL_OUTPUT,
+                ExampleParams.RETRIEVAL_CONTEXT,
+            ]
+        )
 
     @property
     def __name__(self):
judgeval/scorers/judgeval_scorers/api_scorers/groundedness.py
CHANGED
@@ -8,11 +8,19 @@ TODO add link to docs page for this scorer
 # Internal imports
 from judgeval.scorers.api_scorer import APIJudgmentScorer
 from judgeval.constants import APIScorer
-
+from judgeval.data import ExampleParams
 
 class GroundednessScorer(APIJudgmentScorer):
     def __init__(self, threshold: float):
-        super().__init__(
+        super().__init__(
+            threshold=threshold,
+            score_type=APIScorer.GROUNDEDNESS,
+            required_params=[
+                ExampleParams.INPUT,
+                ExampleParams.ACTUAL_OUTPUT,
+                ExampleParams.RETRIEVAL_CONTEXT,
+            ]
+        )
 
     @property
     def __name__(self):
judgeval/scorers/judgeval_scorers/api_scorers/hallucination.py
CHANGED
@@ -8,11 +8,19 @@ TODO add link to docs page for this scorer
 # Internal imports
 from judgeval.scorers.api_scorer import APIJudgmentScorer
 from judgeval.constants import APIScorer
-
+from judgeval.data import ExampleParams
 
 class HallucinationScorer(APIJudgmentScorer):
     def __init__(self, threshold: float):
-        super().__init__(
+        super().__init__(
+            threshold=threshold,
+            score_type=APIScorer.HALLUCINATION,
+            required_params=[
+                ExampleParams.INPUT,
+                ExampleParams.ACTUAL_OUTPUT,
+                ExampleParams.CONTEXT,
+            ]
+        )
 
     @property
     def __name__(self):
judgeval/scorers/judgeval_scorers/api_scorers/instruction_adherence.py
CHANGED
@@ -8,11 +8,18 @@ TODO add link to docs page for this scorer
 # Internal imports
 from judgeval.scorers.api_scorer import APIJudgmentScorer
 from judgeval.constants import APIScorer
-
+from judgeval.data import ExampleParams
 
 class InstructionAdherenceScorer(APIJudgmentScorer):
     def __init__(self, threshold: float):
-        super().__init__(
+        super().__init__(
+            threshold=threshold,
+            score_type=APIScorer.INSTRUCTION_ADHERENCE,
+            required_params=[
+                ExampleParams.INPUT,
+                ExampleParams.ACTUAL_OUTPUT,
+            ]
+        )
 
     @property
     def __name__(self):
judgeval/scorers/judgeval_scorers/api_scorers/json_correctness.py
CHANGED
@@ -11,13 +11,20 @@ from pydantic import BaseModel, Field
 # Internal imports
 from judgeval.scorers.api_scorer import APIJudgmentScorer
 from judgeval.constants import APIScorer
-
+from judgeval.data import ExampleParams
 
 class JSONCorrectnessScorer(APIJudgmentScorer):
     json_schema: BaseModel = Field(None, exclude=True)
 
     def __init__(self, threshold: float, json_schema: BaseModel):
-        super().__init__(
+        super().__init__(
+            threshold=threshold,
+            score_type=APIScorer.JSON_CORRECTNESS,
+            required_params=[
+                ExampleParams.INPUT,
+                ExampleParams.ACTUAL_OUTPUT,
+            ]
+        )
         object.__setattr__(self, 'json_schema', json_schema)
 
     def to_dict(self):
judgeval/scorers/judgeval_scorers/api_scorers/summarization.py
CHANGED
@@ -7,12 +7,19 @@ TODO add link to docs page for this scorer
 
 # Internal imports
 from judgeval.scorers.api_scorer import APIJudgmentScorer
-from judgeval.constants import APIScorer
-
+from judgeval.constants import APIScorer
+from judgeval.data import ExampleParams
 
 class SummarizationScorer(APIJudgmentScorer):
     def __init__(self, threshold: float):
-        super().__init__(
+        super().__init__(
+            threshold=threshold,
+            score_type=APIScorer.SUMMARIZATION,
+            required_params=[
+                ExampleParams.INPUT,
+                ExampleParams.ACTUAL_OUTPUT,
+            ]
+        )
 
     @property
     def __name__(self):
judgeval/scorers/judgeval_scorers/local_implementations/__init__.py
CHANGED
@@ -4,7 +4,7 @@ from judgeval.scorers.judgeval_scorers.local_implementations.contextual_recall.c
 from judgeval.scorers.judgeval_scorers.local_implementations.contextual_relevancy.contextual_relevancy_scorer import ContextualRelevancyScorer
 from judgeval.scorers.judgeval_scorers.local_implementations.faithfulness.faithfulness_scorer import FaithfulnessScorer
 from judgeval.scorers.judgeval_scorers.local_implementations.json_correctness.json_correctness_scorer import JsonCorrectnessScorer
-from judgeval.scorers.judgeval_scorers.local_implementations.
+from judgeval.scorers.judgeval_scorers.local_implementations.execution_order.execution_order import ExecutionOrderScorer
 from judgeval.scorers.judgeval_scorers.local_implementations.hallucination.hallucination_scorer import HallucinationScorer
 from judgeval.scorers.judgeval_scorers.local_implementations.summarization.summarization_scorer import SummarizationScorer
 from judgeval.scorers.judgeval_scorers.local_implementations.answer_correctness.answer_correctness_scorer import AnswerCorrectnessScorer
@@ -20,7 +20,7 @@ __all__ = [
     "ContextualRelevancyScorer",
     "FaithfulnessScorer",
     "JsonCorrectnessScorer",
-    "
+    "ExecutionOrderScorer",
     "HallucinationScorer",
     "SummarizationScorer",
     "InstructionAdherenceScorer",
judgeval/scorers/judgeval_scorers/local_implementations/{tool_correctness/tool_correctness_scorer.py → execution_order/execution_order.py}
RENAMED
@@ -45,7 +45,7 @@ def get_lcs(seq1, seq2):
     return lcs[::-1]
 
 
-class ToolCorrectnessScorer(JudgevalScorer):
+class ExecutionOrderScorer(JudgevalScorer):
     def __init__(
         self,
         threshold: float = 0.5,
@@ -56,7 +56,7 @@ class ToolCorrectnessScorer(JudgevalScorer):
         should_consider_ordering: bool = False,
     ):
         super().__init__(
-            score_type=APIScorer.TOOL_CORRECTNESS,
+            score_type=APIScorer.EXECUTION_ORDER,
             threshold=1 if strict_mode else threshold,
             evaluation_model=None,
             include_reason=include_reason,
@@ -152,5 +152,5 @@ class ToolCorrectnessScorer(JudgevalScorer):
 
     @property
     def __name__(self):
-        return "Tool Correctness"
+        return "Execution Order"
 
{judgeval-0.0.19.dist-info → judgeval-0.0.21.dist-info}/METADATA
CHANGED
@@ -1,6 +1,6 @@
 Metadata-Version: 2.4
 Name: judgeval
-Version: 0.0.19
+Version: 0.0.21
 Summary: Judgeval Package
 Project-URL: Homepage, https://github.com/JudgmentLabs/judgeval
 Project-URL: Issues, https://github.com/JudgmentLabs/judgeval/issues
@@ -12,9 +12,15 @@ Classifier: Programming Language :: Python :: 3
 Requires-Python: >=3.11
 Requires-Dist: anthropic
 Requires-Dist: fastapi
+Requires-Dist: langchain
+Requires-Dist: langchain-anthropic
+Requires-Dist: langchain-core
+Requires-Dist: langchain-huggingface
+Requires-Dist: langchain-openai
 Requires-Dist: litellm
 Requires-Dist: nest-asyncio
 Requires-Dist: openai
+Requires-Dist: openpyxl
 Requires-Dist: pandas
 Requires-Dist: pika
 Requires-Dist: python-dotenv==1.0.1
@@ -23,8 +29,6 @@ Requires-Dist: supabase
 Requires-Dist: together
 Requires-Dist: uvicorn
 Provides-Extra: dev
-Requires-Dist: langfuse==2.50.3; extra == 'dev'
-Requires-Dist: patronus; extra == 'dev'
 Requires-Dist: pytest-asyncio>=0.25.0; extra == 'dev'
 Requires-Dist: pytest-mock>=3.14.0; extra == 'dev'
 Requires-Dist: pytest>=8.3.4; extra == 'dev'
{judgeval-0.0.19.dist-info → judgeval-0.0.21.dist-info}/RECORD
CHANGED
@@ -1,23 +1,23 @@
 judgeval/__init__.py,sha256=dtXxsCmI4eEsZdGSUMy8P_pA0bc2-OSGAgb2C__yJoA,252
 judgeval/clients.py,sha256=6VQmEqmfCngUdS2MuPBIpHvtDFqOENm8-_BmMvjLyRQ,944
-judgeval/constants.py,sha256=
+judgeval/constants.py,sha256=VhJppAECTUDQwzC_FpzJw2wPlkYoogsadHxaJIY_J8U,5073
 judgeval/evaluation_run.py,sha256=RgJD60lJsunNQzObjo7iXnAzXWgubCLOAAuuamAAuoI,6354
-judgeval/judgment_client.py,sha256=
+judgeval/judgment_client.py,sha256=5lqp9X67qPzBUu7kQYETslsc3L5JjxrDVgVLslF07A0,24173
 judgeval/rules.py,sha256=ebsiDEBVAnYTQxwVNvh_RpmKeWBnjQXgHs8KofTjcAs,15526
-judgeval/run_evaluation.py,sha256=
+judgeval/run_evaluation.py,sha256=YOzkyeWl-r3vaz0jB5nM-1VULi7ALmJ9_f58ENqexXk,23827
 judgeval/common/__init__.py,sha256=7d24BRxtncpMj3AAJCj8RS7TqgjXmW777HVZH6-3sBs,289
 judgeval/common/exceptions.py,sha256=U-TxHLn7oVMezsMuoYouNDb2XuS8RCggfntYf5_6u4E,565
 judgeval/common/logger.py,sha256=KO75wWXCxhUHUMvLaTU31ZzOk6tkZBa7heQ7y0f-zFE,6062
-judgeval/common/tracer.py,sha256=
+judgeval/common/tracer.py,sha256=WFjFNf3NZ2BN8UAu2MG0F3Om9LgJNma3m_GrxyXgJqE,46655
 judgeval/common/utils.py,sha256=LUQV5JfDr6wj7xHAJoNq-gofNZ6mjXbeKrGKzBME1KM,33533
 judgeval/data/__init__.py,sha256=QykVE22Qf-I2f1g-jC9-iQyLNXgDmX1-vHbCgZg8Ra8,558
-judgeval/data/api_example.py,sha256=
-judgeval/data/example.py,sha256=
+judgeval/data/api_example.py,sha256=dzkrQ0xno08y6qNfqL2djXbapUyc2B2aQ5iANn0o4CY,3667
+judgeval/data/example.py,sha256=BhGBhamFWgH6wtvrRYM8dGtDfXh-cDxDhtNL5Gbdz_M,5892
 judgeval/data/ground_truth.py,sha256=OTBs3VZe-Wp0vEXEsq14GPZHYtpWT16bhGQTycIvkKc,2057
-judgeval/data/result.py,sha256=
+judgeval/data/result.py,sha256=4fgjKtUmT3br7K6fkRiNIxTGKUuwMeGyRLqzkpxwXKE,4436
 judgeval/data/scorer_data.py,sha256=JVlaTx1EP2jw2gh3Vgx1CSEsvIFABAN26IquKyxwiJQ,3273
 judgeval/data/datasets/__init__.py,sha256=IdNKhQv9yYZ_op0rdBacrFaFVmiiYQ3JTzXzxOTsEVQ,176
-judgeval/data/datasets/dataset.py,sha256=
+judgeval/data/datasets/dataset.py,sha256=LrBK8y3y1R9_BKmXxTzdXMMIQvXlq7tf7TM-u7jgSxE,16839
 judgeval/data/datasets/eval_dataset_client.py,sha256=QsfHyFC4WePV7uJGYUVjiIwtk1Ie_VpWUrnd2Q4kKdU,11479
 judgeval/data/datasets/utils.py,sha256=6DpGCPmGFNOKIGNcVCOSjTOdWemrpAuYnlo778sGG7g,2455
 judgeval/judges/__init__.py,sha256=6X7VSwrwsdxGBNxCyapVRWGghhKOy3MVxFNMQ62kCXM,308
@@ -26,33 +26,33 @@ judgeval/judges/litellm_judge.py,sha256=EIL58Teptv8DzZUO3yP2RDQCDq-aoBB6HPZzPdK6
 judgeval/judges/mixture_of_judges.py,sha256=IJoi4Twk8ze1CJWVEp69k6TSqTCTGrmVYQ0qdffer60,15549
 judgeval/judges/together_judge.py,sha256=l00hhPerAZXg3oYBd8cyMtWsOTNt_0FIqoxhKJKQe3k,2302
 judgeval/judges/utils.py,sha256=9lvUxziGV86ISvVFxYBWc09TWFyAQgUTyPf_a9mD5Rs,2686
-judgeval/scorers/__init__.py,sha256=
-judgeval/scorers/api_scorer.py,sha256=
+judgeval/scorers/__init__.py,sha256=gkeKJvjXhswCnkEyjijrVvGVM3Om86egrZ-PUOGvNvI,1158
+judgeval/scorers/api_scorer.py,sha256=NQ_CrrUPhSUk1k2Q8rKpCG_TU2FT32sFEqvb-Yi54B0,2688
 judgeval/scorers/base_scorer.py,sha256=xdUlY3CnLdCQ1Z5iUeY22Bim5v-OQruZmaVF_4Y1mC0,2183
 judgeval/scorers/exceptions.py,sha256=eGW5CuJgZ5YJBFrE4FHDSF651PO1dKAZ379mJ8gOsfo,178
 judgeval/scorers/judgeval_scorer.py,sha256=oIkfoGXA09wL_vcK1DRibzQSA-MFNa-hmw1IhGBErf8,6592
 judgeval/scorers/prompt_scorer.py,sha256=PaAs2qRolw1P3_I061Xvk9qzvF4O-JR8g_39RqXnHcM,17728
 judgeval/scorers/score.py,sha256=GALVmeApP1Cyih2vY93zRaU6RShtW4jJDG47Pm6yfnw,18657
 judgeval/scorers/utils.py,sha256=iHQVTlIANbmCTXz9kTeSdOytgUZ_T74Re61ajqsk_WQ,6827
-judgeval/scorers/judgeval_scorers/__init__.py,sha256
-judgeval/scorers/judgeval_scorers/api_scorers/__init__.py,sha256=
-judgeval/scorers/judgeval_scorers/api_scorers/answer_correctness.py,sha256=
-judgeval/scorers/judgeval_scorers/api_scorers/answer_relevancy.py,sha256=
-judgeval/scorers/judgeval_scorers/api_scorers/comparison.py,sha256=
-judgeval/scorers/judgeval_scorers/api_scorers/contextual_precision.py,sha256=
-judgeval/scorers/judgeval_scorers/api_scorers/contextual_recall.py,sha256=
-judgeval/scorers/judgeval_scorers/api_scorers/contextual_relevancy.py,sha256=
-judgeval/scorers/judgeval_scorers/api_scorers/
-judgeval/scorers/judgeval_scorers/api_scorers/
-judgeval/scorers/judgeval_scorers/api_scorers/
-judgeval/scorers/judgeval_scorers/api_scorers/
-judgeval/scorers/judgeval_scorers/api_scorers/
-judgeval/scorers/judgeval_scorers/api_scorers/
-judgeval/scorers/judgeval_scorers/api_scorers/
+judgeval/scorers/judgeval_scorers/__init__.py,sha256=xFRb62sp4JmBUSeuAB_pC_7kEGp-lGdqCRIu9--Bbdg,5992
+judgeval/scorers/judgeval_scorers/api_scorers/__init__.py,sha256=mZ6b_5Dl04k3PaG24ICBajB_j43ody1II1OJhO1DkXo,1648
+judgeval/scorers/judgeval_scorers/api_scorers/answer_correctness.py,sha256=Fnd9CVIOZ73sWEWymsU5eBrrZqPFjMZ0BKpeW-PDyTg,711
+judgeval/scorers/judgeval_scorers/api_scorers/answer_relevancy.py,sha256=oETeN9K0HSIRdL2SDqn82Vskpwh5SlKnZvs5VDm2OBU,658
+judgeval/scorers/judgeval_scorers/api_scorers/comparison.py,sha256=kuzf9OWvpY38yYSwlBgneLkUZwJNM4FQqvbS66keA90,1249
+judgeval/scorers/judgeval_scorers/api_scorers/contextual_precision.py,sha256=tpSuzFAaW8X9xqA0aLLKwh7qmBK0Pc_bJZMIe_q412U,770
+judgeval/scorers/judgeval_scorers/api_scorers/contextual_recall.py,sha256=pFVhk4pLtQ-FnNlbI-dFF-SIh69Jza7erHqiPkFWoBo,758
+judgeval/scorers/judgeval_scorers/api_scorers/contextual_relevancy.py,sha256=RQ6DZwEhChfecd89Ey-T7ke--7qTaXZlRsNxwH8gaME,823
+judgeval/scorers/judgeval_scorers/api_scorers/execution_order.py,sha256=Pb3CiNF2Ca826B92wJCVAi_68lJjLhqqCKwQKaflSUg,1294
+judgeval/scorers/judgeval_scorers/api_scorers/faithfulness.py,sha256=-BwOapqjryYNKNydtdkUiKIij76dY0O1jBmdc6dKazQ,692
+judgeval/scorers/judgeval_scorers/api_scorers/groundedness.py,sha256=ntEEeTANEOsGlcbiTAF_3r6BeSJEaVDns8po8T0L6Vg,692
+judgeval/scorers/judgeval_scorers/api_scorers/hallucination.py,sha256=k5gDOki-8KXrZXydvdSqDt3NZqQ28hXoOCHQf6jNxr4,686
+judgeval/scorers/judgeval_scorers/api_scorers/instruction_adherence.py,sha256=XnSGEkQfwVqaqnHEGMCsxNiHVzrsrej48uDbLoWc8CQ,678
+judgeval/scorers/judgeval_scorers/api_scorers/json_correctness.py,sha256=mMKEuR87_yanEuZJ5YSGFMHDD_oLVZ6-rQuciFaDOMA,1095
+judgeval/scorers/judgeval_scorers/api_scorers/summarization.py,sha256=QmWB8bVbDYHY5FcF0rYZE_3c2XXgMLRmR6aXJWfdMC4,655
 judgeval/scorers/judgeval_scorers/classifiers/__init__.py,sha256=Qt81W5ZCwMvBAne0LfQDb8xvg5iOG1vEYP7WizgwAZo,67
 judgeval/scorers/judgeval_scorers/classifiers/text2sql/__init__.py,sha256=8iTzMvou1Dr8pybul6lZHKjc9Ye2-0_racRGYkhEdTY,74
 judgeval/scorers/judgeval_scorers/classifiers/text2sql/text2sql_scorer.py,sha256=ly72Z7s_c8NID6-nQnuW8qEGEW2MqdvpJ-5WfXzbAQg,2579
-judgeval/scorers/judgeval_scorers/local_implementations/__init__.py,sha256=
+judgeval/scorers/judgeval_scorers/local_implementations/__init__.py,sha256=k_t-THIAtsk7lNvm9faj0u24dPZjn7qRbZ8YGjQ21xs,1926
 judgeval/scorers/judgeval_scorers/local_implementations/answer_correctness/__init__.py,sha256=cxxUEspgoIdSzJbwIIioamC0-xDqhYVfYAWxaYF-D_Y,177
 judgeval/scorers/judgeval_scorers/local_implementations/answer_correctness/answer_correctness_scorer.py,sha256=3Dpm8BIIe0Th2p0ccO5bb-le93lywjOLSo712HwEIUE,10196
 judgeval/scorers/judgeval_scorers/local_implementations/answer_correctness/prompts.py,sha256=hBUqEd8Hy3g8peOVjpSmRb31fPtpodDzdRUonhKRl30,6686
@@ -71,6 +71,8 @@ judgeval/scorers/judgeval_scorers/local_implementations/contextual_recall/prompt
 judgeval/scorers/judgeval_scorers/local_implementations/contextual_relevancy/__init__.py,sha256=JPCvrekKLbl_xdD49evhtiFIVocuegCpCBkn1auzTSE,184
 judgeval/scorers/judgeval_scorers/local_implementations/contextual_relevancy/contextual_relevancy_scorer.py,sha256=BtVgE7z-9PHfFRcvn96aEG5mXVcWBweVyty934hZdiU,8915
 judgeval/scorers/judgeval_scorers/local_implementations/contextual_relevancy/prompts.py,sha256=uO-8Uo7VrXu4xWpxjIx6_UI3aw5KuJxubSHb71Nzm6Q,4574
+judgeval/scorers/judgeval_scorers/local_implementations/execution_order/__init__.py,sha256=DpOHbjYEhVmP-RiaTEa5PZHpoPvduNXG5p6k9lR0AS0,157
+judgeval/scorers/judgeval_scorers/local_implementations/execution_order/execution_order.py,sha256=y-Ag8YuzEvExUIj4qU7y53INVLH9L_TUTJLIxCIdAQo,5458
 judgeval/scorers/judgeval_scorers/local_implementations/faithfulness/__init__.py,sha256=NbkSqPwxgF4T8KsvuIWhVyRwdOlo7mNHMFuRStTFnvk,154
 judgeval/scorers/judgeval_scorers/local_implementations/faithfulness/faithfulness_scorer.py,sha256=LPVTGHBBJSpE6TrgzZQS2_vw4P9HiUYmykrwo6UMdws,11251
 judgeval/scorers/judgeval_scorers/local_implementations/faithfulness/prompts.py,sha256=vNLjF4NKZJSV4VNenHzoAUB2xVZz6tt_5AzryKmOVrI,11690
@@ -84,11 +86,9 @@ judgeval/scorers/judgeval_scorers/local_implementations/json_correctness/json_co
 judgeval/scorers/judgeval_scorers/local_implementations/summarization/__init__.py,sha256=mv6-XeLSV5yj1H98YYV2iTYVd88zKftZJP42Lgl6R80,89
 judgeval/scorers/judgeval_scorers/local_implementations/summarization/prompts.py,sha256=6GnRz2h-6Fwt4sl__0RgQOyo3n3iDO4MNuHWxdu-rrM,10242
 judgeval/scorers/judgeval_scorers/local_implementations/summarization/summarization_scorer.py,sha256=Qk7lwHgRPYeGoxTOyclAh1VfGItfvHJ6l1t7Nk3SWFM,20927
-judgeval/scorers/judgeval_scorers/local_implementations/tool_correctness/__init__.py,sha256=JUB3TMqS1OHr6PqpIGqkyiBNbyfUaw7lZuUATjU3_ek,168
-judgeval/scorers/judgeval_scorers/local_implementations/tool_correctness/tool_correctness_scorer.py,sha256=8ucE8UrA44Mr-wHgVsFNU9gKunkPxe87VPYrFVi949g,5461
 judgeval/tracer/__init__.py,sha256=wy3DYpH8U_z0GO_K_gOSkK0tTTD-u5eLDo0T5xIBoAc,147
 judgeval/utils/alerts.py,sha256=RgW5R9Dn3Jtim0OyAYDbNzjoX2s6SA4Mw16GyyaikjI,1424
-judgeval-0.0.
-judgeval-0.0.
-judgeval-0.0.
-judgeval-0.0.
+judgeval-0.0.21.dist-info/METADATA,sha256=jQW4w6jGNaHvPWTcqX3ZGr_SKeCpNl7DsNr-cwrYHsA,1378
+judgeval-0.0.21.dist-info/WHEEL,sha256=qtCwoSJWgHk21S1Kb4ihdzI2rlJ1ZKaIurTj_ngOhyQ,87
+judgeval-0.0.21.dist-info/licenses/LICENSE.md,sha256=tKmCg7k5QOmxPK19XMfzim04QiQJPmgIm0pAn55IJwk,11352
+judgeval-0.0.21.dist-info/RECORD,,
judgeval/scorers/judgeval_scorers/api_scorers/tool_correctness.py
DELETED
@@ -1,19 +0,0 @@
-"""
-`judgeval` tool correctness scorer
-
-TODO add link to docs page for this scorer
-
-"""
-
-# Internal imports
-from judgeval.scorers.api_scorer import APIJudgmentScorer
-from judgeval.constants import APIScorer
-
-
-class ToolCorrectnessScorer(APIJudgmentScorer):
-    def __init__(self, threshold: float):
-        super().__init__(threshold=threshold, score_type=APIScorer.TOOL_CORRECTNESS)
-
-    @property
-    def __name__(self):
-        return "Tool Correctness"
{judgeval-0.0.19.dist-info → judgeval-0.0.21.dist-info}/WHEEL
File without changes

{judgeval-0.0.19.dist-info → judgeval-0.0.21.dist-info}/licenses/LICENSE.md
File without changes