judgeval 0.0.8__py3-none-any.whl → 0.0.10__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
judgeval/common/tracer.py CHANGED
@@ -2,6 +2,7 @@
 Tracing system for judgeval that allows for function tracing using decorators.
 """
 
+import os
 import time
 import functools
 import requests
@@ -20,6 +21,7 @@ import json
 import warnings
 from pydantic import BaseModel
 from http import HTTPStatus
+from rich import print as rprint
 
 from judgeval.constants import JUDGMENT_TRACES_SAVE_API_URL
 from judgeval.judgment_client import JudgmentClient
@@ -121,8 +123,29 @@ class TraceEntry:
 
         Handles special cases:
         - Pydantic models are converted using model_dump()
+        - We try to serialize into JSON, then string, then the base representation (__repr__)
         - Non-serializable objects return None with a warning
         """
+
+        def safe_stringify(output, function_name):
+            """
+            Safely converts an object to a string or repr, handling serialization issues gracefully.
+            """
+            try:
+                return str(output)
+            except (TypeError, OverflowError, ValueError):
+                pass
+
+            try:
+                return repr(output)
+            except (TypeError, OverflowError, ValueError):
+                pass
+
+            warnings.warn(
+                f"Output for function {function_name} is not JSON serializable and could not be converted to string. Setting to None."
+            )
+            return None
+
         if isinstance(self.output, BaseModel):
            return self.output.model_dump()
 
@@ -131,8 +154,7 @@ class TraceEntry:
            json.dumps(self.output)
            return self.output
        except (TypeError, OverflowError, ValueError):
-            warnings.warn(f"Output for function {self.function} is not JSON serializable. Setting to None.")
-            return None
+            return safe_stringify(self.output, self.function)
 
 class TraceClient:
     """Client for managing a single trace context"""
@@ -361,6 +383,24 @@ class TraceClient:
        raw_entries = [entry.to_dict() for entry in self.entries]
        condensed_entries = self.condense_trace(raw_entries)
 
+        # Calculate total token counts from LLM API calls
+        total_prompt_tokens = 0
+        total_completion_tokens = 0
+        total_tokens = 0
+
+        for entry in condensed_entries:
+            if entry.get("span_type") == "llm" and isinstance(entry.get("output"), dict):
+                usage = entry["output"].get("usage", {})
+                # Handle OpenAI/Together format
+                if "prompt_tokens" in usage:
+                    total_prompt_tokens += usage.get("prompt_tokens", 0)
+                    total_completion_tokens += usage.get("completion_tokens", 0)
+                # Handle Anthropic format
+                elif "input_tokens" in usage:
+                    total_prompt_tokens += usage.get("input_tokens", 0)
+                    total_completion_tokens += usage.get("output_tokens", 0)
+                total_tokens += usage.get("total_tokens", 0)
+
        # Create trace document
        trace_data = {
            "trace_id": self.trace_id,
@@ -370,10 +410,10 @@ class TraceClient:
            "created_at": datetime.fromtimestamp(self.start_time).isoformat(),
            "duration": total_duration,
            "token_counts": {
-                "prompt_tokens": 0,  # Dummy value
-                "completion_tokens": 0,  # Dummy value
-                "total_tokens": 0,  # Dummy value
-            },  # TODO: Add token counts
+                "prompt_tokens": total_prompt_tokens,
+                "completion_tokens": total_completion_tokens,
+                "total_tokens": total_tokens,
+            },
            "entries": condensed_entries,
            "empty_save": empty_save,
            "overwrite": overwrite
@@ -393,6 +433,9 @@ class TraceClient:
        elif response.status_code != HTTPStatus.OK:
            raise ValueError(f"Failed to save trace data: {response.text}")
 
+        if not empty_save and "ui_results_url" in response.json():
+            rprint(f"\n🔍 You can view your trace data here: [rgb(106,0,255)]{response.json()['ui_results_url']}[/]\n")
+
        return self.trace_id, trace_data
 
 class Tracer:
@@ -403,7 +446,7 @@ class Tracer:
            cls._instance = super(Tracer, cls).__new__(cls)
        return cls._instance
 
-    def __init__(self, api_key: str):
+    def __init__(self, api_key: str = os.getenv("JUDGMENT_API_KEY")):
        if not hasattr(self, 'initialized'):
 
            if not api_key:
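Two of the tracer changes above are visible to callers: Tracer() can now be constructed without an explicit key (the default falls back to the JUDGMENT_API_KEY environment variable), and saved traces report token counts aggregated from "llm" spans instead of the previous dummy zeros. A minimal sketch, assuming JUDGMENT_API_KEY is exported before the module is imported (the default argument is evaluated at import time):

    from judgeval.common.tracer import Tracer

    tracer = Tracer()  # api_key defaults to os.getenv("JUDGMENT_API_KEY")

    # Usage shapes the new aggregation recognizes on an "llm" span's output (values are illustrative):
    openai_style = {"usage": {"prompt_tokens": 12, "completion_tokens": 34, "total_tokens": 46}}
    anthropic_style = {"usage": {"input_tokens": 12, "output_tokens": 34}}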
judgeval/constants.py CHANGED
@@ -36,6 +36,7 @@ ROOT_API = os.getenv("JUDGMENT_API_URL", "https://api.judgmentlabs.ai")
 JUDGMENT_EVAL_API_URL = f"{ROOT_API}/evaluate/"
 JUDGMENT_DATASETS_PUSH_API_URL = f"{ROOT_API}/datasets/push/"
 JUDGMENT_DATASETS_PULL_API_URL = f"{ROOT_API}/datasets/pull/"
+JUDGMENT_DATASETS_PULL_ALL_API_URL = f"{ROOT_API}/datasets/get_all_stats/"
 JUDGMENT_EVAL_LOG_API_URL = f"{ROOT_API}/log_eval_results/"
 JUDGMENT_EVAL_FETCH_API_URL = f"{ROOT_API}/fetch_eval_results/"
 JUDGMENT_TRACES_SAVE_API_URL = f"{ROOT_API}/traces/save/"
@@ -1,4 +1,5 @@
 from judgeval.data.datasets.dataset import EvalDataset
 from judgeval.data.datasets.ground_truth import GroundTruthExample
+from judgeval.data.datasets.eval_dataset_client import EvalDatasetClient
 
-__all__ = ["EvalDataset", "GroundTruthExample"]
+__all__ = ["EvalDataset", "EvalDatasetClient", "GroundTruthExample"]
@@ -2,16 +2,11 @@ import ast
 import csv
 import datetime
 import json
-from rich.console import Console
-from rich.progress import Progress, SpinnerColumn, TextColumn
-import requests
 from dataclasses import dataclass, field
 import os
 from typing import List, Optional, Union, Literal
 
-from judgeval.constants import JUDGMENT_DATASETS_PUSH_API_URL, JUDGMENT_DATASETS_PULL_API_URL
 from judgeval.data.datasets.ground_truth import GroundTruthExample
-from judgeval.data.datasets.utils import ground_truths_to_examples, examples_to_ground_truths
 from judgeval.data import Example
 from judgeval.common.logger import debug, error, warning, info
 
@@ -37,120 +32,6 @@ class EvalDataset:
        self._id = None
        self.judgment_api_key = judgment_api_key
 
-    def push(self, alias: str, overwrite: Optional[bool] = False) -> bool:
-        debug(f"Pushing dataset with alias '{alias}' (overwrite={overwrite})")
-        if overwrite:
-            warning(f"Overwrite enabled for alias '{alias}'")
-        """
-        Pushes the dataset to Judgment platform
-
-        Mock request:
-        {
-            "alias": alias,
-            "ground_truths": [...],
-            "examples": [...],
-            "overwrite": overwrite
-        } ==>
-        {
-            "_alias": alias,
-            "_id": "..."  # ID of the dataset
-        }
-        """
-        with Progress(
-            SpinnerColumn(style="rgb(106,0,255)"),
-            TextColumn("[progress.description]{task.description}"),
-            transient=False,
-        ) as progress:
-            task_id = progress.add_task(
-                f"Pushing [rgb(106,0,255)]'{alias}' to Judgment...",
-                total=100,
-            )
-            content = {
-                "alias": alias,
-                "ground_truths": [g.to_dict() for g in self.ground_truths],
-                "examples": [e.to_dict() for e in self.examples],
-                "overwrite": overwrite,
-                "judgment_api_key": self.judgment_api_key
-            }
-            try:
-                response = requests.post(
-                    JUDGMENT_DATASETS_PUSH_API_URL,
-                    json=content
-                )
-                if response.status_code == 500:
-                    error(f"Server error during push: {content.get('message')}")
-                    return False
-                response.raise_for_status()
-            except requests.exceptions.HTTPError as err:
-                if response.status_code == 422:
-                    error(f"Validation error during push: {err.response.json()}")
-                else:
-                    error(f"HTTP error during push: {err}")
-
-            info(f"Successfully pushed dataset with alias '{alias}'")
-            payload = response.json()
-            self._alias = payload.get("_alias")
-            self._id = payload.get("_id")
-            progress.update(
-                task_id,
-                description=f"{progress.tasks[task_id].description} [rgb(25,227,160)]Done!)",
-            )
-            return True
-
-    def pull(self, alias: str):
-        debug(f"Pulling dataset with alias '{alias}'")
-        """
-        Pulls the dataset from Judgment platform
-
-        Mock request:
-        {
-            "alias": alias,
-            "user_id": user_id
-        }
-        ==>
-        {
-            "ground_truths": [...],
-            "examples": [...],
-            "_alias": alias,
-            "_id": "..."  # ID of the dataset
-        }
-        """
-        # Make a POST request to the Judgment API to get the dataset
-
-        with Progress(
-            SpinnerColumn(style="rgb(106,0,255)"),
-            TextColumn("[progress.description]{task.description}"),
-            transient=False,
-        ) as progress:
-            task_id = progress.add_task(
-                f"Pulling [rgb(106,0,255)]'{alias}'[/rgb(106,0,255)] from Judgment...",
-                total=100,
-            )
-            request_body = {
-                "alias": alias,
-                "judgment_api_key": self.judgment_api_key
-            }
-
-            try:
-                response = requests.post(
-                    JUDGMENT_DATASETS_PULL_API_URL,
-                    json=request_body
-                )
-                response.raise_for_status()
-            except requests.exceptions.RequestException as e:
-                error(f"Error pulling dataset: {str(e)}")
-                raise
-
-            info(f"Successfully pulled dataset with alias '{alias}'")
-            payload = response.json()
-            self.ground_truths = [GroundTruthExample(**g) for g in payload.get("ground_truths", [])]
-            self.examples = [Example(**e) for e in payload.get("examples", [])]
-            self._alias = payload.get("_alias")
-            self._id = payload.get("_id")
-            progress.update(
-                task_id,
-                description=f"{progress.tasks[task_id].description} [rgb(25,227,160)]Done!)",
-            )
 
     def add_from_json(self, file_path: str) -> None:
        debug(f"Loading dataset from JSON file: {file_path}")
@@ -402,6 +283,4 @@ class EvalDataset:
            f"_alias={self._alias}, "
            f"_id={self._id}"
            f")"
-        )
-
-
+        )
@@ -0,0 +1,193 @@
+
+from typing import Optional
+import requests
+from rich.progress import Progress, SpinnerColumn, TextColumn
+
+from judgeval.common.logger import debug, error, warning, info
+from judgeval.constants import (
+    JUDGMENT_DATASETS_PUSH_API_URL,
+    JUDGMENT_DATASETS_PULL_API_URL,
+    JUDGMENT_DATASETS_PULL_ALL_API_URL
+)
+from judgeval.data import Example
+from judgeval.data.datasets import EvalDataset
+from judgeval.data.datasets.ground_truth import GroundTruthExample
+
+
+
+
+class EvalDatasetClient:
+    def __init__(self, judgment_api_key: str):
+        self.judgment_api_key = judgment_api_key
+
+    def create_dataset(self) -> EvalDataset:
+        return EvalDataset(judgment_api_key=self.judgment_api_key)
+
+    def push(self, dataset: EvalDataset, alias: str, overwrite: Optional[bool] = False) -> bool:
+        debug(f"Pushing dataset with alias '{alias}' (overwrite={overwrite})")
+        if overwrite:
+            warning(f"Overwrite enabled for alias '{alias}'")
+        """
+        Pushes the dataset to Judgment platform
+
+        Mock request:
+        dataset = {
+            "alias": alias,
+            "ground_truths": [...],
+            "examples": [...],
+            "overwrite": overwrite
+        } ==>
+        {
+            "_alias": alias,
+            "_id": "..."  # ID of the dataset
+        }
+        """
+        with Progress(
+            SpinnerColumn(style="rgb(106,0,255)"),
+            TextColumn("[progress.description]{task.description}"),
+            transient=False,
+        ) as progress:
+            task_id = progress.add_task(
+                f"Pushing [rgb(106,0,255)]'{alias}' to Judgment...",
+                total=100,
+            )
+            content = {
+                "alias": alias,
+                "ground_truths": [g.to_dict() for g in dataset.ground_truths],
+                "examples": [e.to_dict() for e in dataset.examples],
+                "overwrite": overwrite,
+                "judgment_api_key": dataset.judgment_api_key
+            }
+            try:
+                response = requests.post(
+                    JUDGMENT_DATASETS_PUSH_API_URL,
+                    json=content
+                )
+                if response.status_code == 500:
+                    error(f"Server error during push: {content.get('message')}")
+                    return False
+                response.raise_for_status()
+            except requests.exceptions.HTTPError as err:
+                if response.status_code == 422:
+                    error(f"Validation error during push: {err.response.json()}")
+                else:
+                    error(f"HTTP error during push: {err}")
+
+            info(f"Successfully pushed dataset with alias '{alias}'")
+            payload = response.json()
+            dataset._alias = payload.get("_alias")
+            dataset._id = payload.get("_id")
+            progress.update(
+                task_id,
+                description=f"{progress.tasks[task_id].description} [rgb(25,227,160)]Done!)",
+            )
+            return True
+
+    def pull(self, alias: str) -> EvalDataset:
+        debug(f"Pulling dataset with alias '{alias}'")
+        """
+        Pulls the dataset from Judgment platform
+
+        Mock request:
+        {
+            "alias": alias,
+            "user_id": user_id
+        }
+        ==>
+        {
+            "ground_truths": [...],
+            "examples": [...],
+            "_alias": alias,
+            "_id": "..."  # ID of the dataset
+        }
+        """
+        # Make a POST request to the Judgment API to get the dataset
+        dataset = self.create_dataset()
+
+        with Progress(
+            SpinnerColumn(style="rgb(106,0,255)"),
+            TextColumn("[progress.description]{task.description}"),
+            transient=False,
+        ) as progress:
+            task_id = progress.add_task(
+                f"Pulling [rgb(106,0,255)]'{alias}'[/rgb(106,0,255)] from Judgment...",
+                total=100,
+            )
+            request_body = {
+                "alias": alias,
+                "judgment_api_key": self.judgment_api_key
+            }
+
+            try:
+                response = requests.post(
+                    JUDGMENT_DATASETS_PULL_API_URL,
+                    json=request_body
+                )
+                response.raise_for_status()
+            except requests.exceptions.RequestException as e:
+                error(f"Error pulling dataset: {str(e)}")
+                raise
+
+            info(f"Successfully pulled dataset with alias '{alias}'")
+            payload = response.json()
+            dataset.ground_truths = [GroundTruthExample(**g) for g in payload.get("ground_truths", [])]
+            dataset.examples = [Example(**e) for e in payload.get("examples", [])]
+            dataset._alias = payload.get("_alias")
+            dataset._id = payload.get("_id")
+            progress.update(
+                task_id,
+                description=f"{progress.tasks[task_id].description} [rgb(25,227,160)]Done!)",
+            )
+
+        return dataset
+
+    def pull_all_user_dataset_stats(self) -> dict:
+        debug(f"Pulling user datasets stats for user_id: {self.judgment_api_key}'")
+        """
+        Pulls the user datasets stats from Judgment platform
+
+        Mock request:
+        {
+            "user_id": user_id
+        }
+        ==>
+        {
+            "test_dataset_1": {"examples_count": len(dataset1.examples), "ground_truths_count": len(dataset1.ground_truths)},
+            "test_dataset_2": {"examples_count": len(dataset2.examples), "ground_truths_count": len(dataset2.ground_truths)},
+            ...
+        }
+        """
+        # Make a POST request to the Judgment API to get the dataset
+
+        with Progress(
+            SpinnerColumn(style="rgb(106,0,255)"),
+            TextColumn("[progress.description]{task.description}"),
+            transient=False,
+        ) as progress:
+            task_id = progress.add_task(
+                f"Pulling [rgb(106,0,255)]' datasets'[/rgb(106,0,255)] from Judgment...",
+                total=100,
+            )
+            request_body = {
+                "judgment_api_key": self.judgment_api_key
+            }
+
+            try:
+                response = requests.post(
+                    JUDGMENT_DATASETS_PULL_ALL_API_URL,
+                    json=request_body
+                )
+                response.raise_for_status()
+            except requests.exceptions.RequestException as e:
+                error(f"Error pulling dataset: {str(e)}")
+                raise
+
+            info(f"Successfully pulled datasets for userid: {self.judgment_api_key}'")
+            payload = response.json()
+
+            progress.update(
+                task_id,
+                description=f"{progress.tasks[task_id].description} [rgb(25,227,160)]Done!)",
+            )
+
+        return payload
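The push and pull logic removed from EvalDataset above now lives in this new EvalDatasetClient. A minimal usage sketch, assuming a valid API key (the key and alias values are placeholders):

    from judgeval.data.datasets import EvalDatasetClient

    client = EvalDatasetClient(judgment_api_key="<your-api-key>")
    dataset = client.create_dataset()             # empty EvalDataset bound to this key
    client.push(dataset, "my-dataset")            # returns True on success
    pulled = client.pull("my-dataset")            # returns a populated EvalDataset
    stats = client.pull_all_user_dataset_stats()  # e.g. {"my-dataset": {"examples_count": 3, "ground_truths_count": 1}}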
judgeval/data/result.py CHANGED
@@ -1,5 +1,5 @@
 from dataclasses import dataclass
-from typing import List, Union, Optional
+from typing import List, Union, Optional, Dict, Any
 
 from judgeval.data import ScorerData, ProcessExample
 
@@ -18,6 +18,9 @@ class ScoringResult:
        expected_output (Optional[str]): The expected output of the example
        context (Optional[List[str]]): The context of the example
        retrieval_context (Optional[List[str]]): The retrieval context of the example
+        additional_metadata (Optional[Dict[str, Any]]): The additional metadata of the example
+        tools_called (Optional[List[str]]): The tools called by the example
+        expected_tools (Optional[List[str]]): The expected tools of the example
        trace_id (Optional[str]): The trace id of the example
 
    """
@@ -31,6 +34,9 @@ class ScoringResult:
    expected_output: Optional[str] = None
    context: Optional[List[str]] = None
    retrieval_context: Optional[List[str]] = None
+    additional_metadata: Optional[Dict[str, Any]] = None
+    tools_called: Optional[List[str]] = None
+    expected_tools: Optional[List[str]] = None
    trace_id: Optional[str] = None
 
    example_id: Optional[str] = None
@@ -46,6 +52,9 @@ class ScoringResult:
            "expected_output": self.expected_output,
            "context": self.context,
            "retrieval_context": self.retrieval_context,
+            "additional_metadata": self.additional_metadata,
+            "tools_called": self.tools_called,
+            "expected_tools": self.expected_tools,
            "trace_id": self.trace_id,
            "example_id": self.example_id
        }
@@ -59,6 +68,9 @@ class ScoringResult:
            expected_output={self.expected_output}, \
            context={self.context}, \
            retrieval_context={self.retrieval_context}, \
+            additional_metadata={self.additional_metadata}, \
+            tools_called={self.tools_called}, \
+            expected_tools={self.expected_tools}, \
            trace_id={self.trace_id})"
 
 
@@ -79,5 +91,8 @@ def generate_scoring_result(
        expected_output=process_example.expected_output,
        context=process_example.context,
        retrieval_context=process_example.retrieval_context,
+        additional_metadata=process_example.additional_metadata,
+        tools_called=process_example.tools_called,
+        expected_tools=process_example.expected_tools,
        trace_id=process_example.trace_id
    )
@@ -6,7 +6,7 @@ from typing import Optional, List, Dict, Any, Union
 import requests
 
 from judgeval.constants import ROOT_API
-from judgeval.data.datasets import EvalDataset
+from judgeval.data.datasets import EvalDataset, EvalDatasetClient
 from judgeval.data import (
    ScoringResult,
    Example
@@ -36,6 +36,7 @@ class EvalRunRequestBody(BaseModel):
 class JudgmentClient:
    def __init__(self, judgment_api_key: str = os.getenv("JUDGMENT_API_KEY")):
        self.judgment_api_key = judgment_api_key
+        self.eval_dataset_client = EvalDatasetClient(judgment_api_key)
 
        # Verify API key is valid
        result, response = self._validate_api_key()
@@ -121,7 +122,7 @@ class JudgmentClient:
            raise ValueError(f"Please check your EvaluationRun object, one or more fields are invalid: \n{str(e)}")
 
    def create_dataset(self) -> EvalDataset:
-        return EvalDataset(judgment_api_key=self.judgment_api_key)
+        return self.eval_dataset_client.create_dataset()
 
    def push_dataset(self, alias: str, dataset: EvalDataset, overwrite: Optional[bool] = False) -> bool:
        """
@@ -137,7 +138,7 @@ class JudgmentClient:
        """
        # Set judgment_api_key just in case it was not set
        dataset.judgment_api_key = self.judgment_api_key
-        return dataset.push(alias, overwrite)
+        return self.eval_dataset_client.push(dataset, alias, overwrite)
 
    def pull_dataset(self, alias: str) -> EvalDataset:
        """
@@ -149,9 +150,20 @@ class JudgmentClient:
        Returns:
            EvalDataset: The retrieved dataset
        """
-        dataset = EvalDataset(judgment_api_key=self.judgment_api_key)
-        dataset.pull(alias)
-        return dataset
+        return self.eval_dataset_client.pull(alias)
+
+    def pull_all_user_dataset_stats(self) -> dict:
+        """
+        Retrieves all dataset stats from the Judgment platform for the user.
+
+        Args:
+            alias (str): The name of the dataset to retrieve
+
+        Returns:
+            EvalDataset: The retrieved dataset
+        """
+        return self.eval_dataset_client.pull_all_user_dataset_stats()
+
 
    # Maybe add option where you can pass in the EvaluationRun object and it will pull the eval results from the backend
    def pull_eval(self, project_name: str, eval_run_name: str) -> List[Dict[str, Union[str, List[ScoringResult]]]]:
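In practice these dataset operations are reached through JudgmentClient, which now delegates to an internal EvalDatasetClient. A sketch of the same flow, assuming JUDGMENT_API_KEY is set so the constructor default resolves:

    from judgeval.judgment_client import JudgmentClient

    client = JudgmentClient()
    dataset = client.create_dataset()
    client.push_dataset("my-dataset", dataset)
    same_dataset = client.pull_dataset("my-dataset")
    all_stats = client.pull_all_user_dataset_stats()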
@@ -97,6 +97,13 @@ def merge_results(api_results: List[ScoringResult], local_results: List[ScoringR
            raise ValueError("The API and local results are not aligned.")
        if api_result.retrieval_context != local_result.retrieval_context:
            raise ValueError("The API and local results are not aligned.")
+        if api_result.additional_metadata != local_result.additional_metadata:
+            raise ValueError("The API and local results are not aligned.")
+        if api_result.tools_called != local_result.tools_called:
+            raise ValueError("The API and local results are not aligned.")
+        if api_result.expected_tools != local_result.expected_tools:
+            raise ValueError("The API and local results are not aligned.")
+
 
        # Merge ScorerData from the API and local scorers together
        api_scorer_data = api_result.scorers_data
@@ -254,6 +261,12 @@ def run_eval(evaluation_run: EvaluationRun, override: bool = False) -> List[Scor
            debug(f"Context: {example.context}")
        if example.retrieval_context:
            debug(f"Retrieval context: {example.retrieval_context}")
+        if example.additional_metadata:
+            debug(f"Additional metadata: {example.additional_metadata}")
+        if example.tools_called:
+            debug(f"Tools called: {example.tools_called}")
+        if example.expected_tools:
+            debug(f"Expected tools: {example.expected_tools}")
 
    debug(f"Starting evaluation run with {len(evaluation_run.examples)} examples")
 
@@ -379,6 +392,9 @@ def assert_test(scoring_results: List[ScoringResult]) -> None:
                'expected_output': result.expected_output,
                'context': result.context,
                'retrieval_context': result.retrieval_context,
+                'additional_metadata': result.additional_metadata,
+                'tools_called': result.tools_called,
+                'expected_tools': result.expected_tools,
                'eval_run_name': result.eval_run_name,
                'failed_scorers': []
            }
@@ -397,6 +413,9 @@ def assert_test(scoring_results: List[ScoringResult]) -> None:
            error_msg += f"Expected Output: {fail_case['expected_output']}\n"
            error_msg += f"Context: {fail_case['context']}\n"
            error_msg += f"Retrieval Context: {fail_case['retrieval_context']}\n"
+            error_msg += f"Additional Metadata: {fail_case['additional_metadata']}\n"
+            error_msg += f"Tools Called: {fail_case['tools_called']}\n"
+            error_msg += f"Expected Tools: {fail_case['expected_tools']}\n"
            error_msg += f"Eval Run Name: {fail_case['eval_run_name']}\n"
 
            for fail_scorer in fail_case['failed_scorers']:
@@ -13,6 +13,7 @@ from judgeval.scorers.judgeval_scorers import (
    AnswerRelevancyScorer,
    ScorerWrapper,
    AnswerCorrectnessScorer,
+    Text2SQLScorer,
 )
 
 __all__ = [
@@ -31,4 +32,5 @@ __all__ = [
    "AnswerRelevancyScorer",
    "ScorerWrapper",
    "AnswerCorrectnessScorer",
+    "Text2SQLScorer",
 ]
@@ -28,6 +28,9 @@ from judgeval.scorers.judgeval_scorers.local_implementations import (
    AnswerCorrectnessScorer as LocalAnswerCorrectnessScorer
 )
 
+from judgeval.scorers.judgeval_scorers.classifiers import Text2SQLScorer
+
+
 class ScorerWrapper:
    """
    Wrapper class that can dynamically load either API or local implementation of a scorer.
@@ -141,4 +144,5 @@ __all__ = [
    "ContextualPrecisionScorer",
    "ContextualRecallScorer",
    "AnswerRelevancyScorer",
+    "Text2SQLScorer",
 ]
@@ -0,0 +1,3 @@
+from .text2sql import Text2SQLScorer
+
+__all__ = ["Text2SQLScorer"]
@@ -0,0 +1,3 @@
+from .text2sql_scorer import Text2SQLScorer
+
+__all__ = ["Text2SQLScorer"]
@@ -0,0 +1,54 @@
+"""
+ClassifierScorer implementation for basic Text-to-SQL evaluation.
+
+Takes a natural language query, a corresponding LLM-generated SQL query, and a table schema + (optional) metadata.
+Determines if the LLM-generated SQL query is valid and works for the natural language query.
+"""
+from judgeval.scorers import ClassifierScorer
+
+Text2SQLScorer = ClassifierScorer(
+    "Text to SQL",
+    slug="text2sql-1010101010",
+    threshold=1.0,
+    conversation=[{
+        "role": "system",
+        "content": """You will be given a natural language query, a corresponding LLM-generated SQL query, and a table schema + (optional) metadata.
+
+** TASK INSTRUCTIONS **
+Your task is to decide whether the LLM generated SQL query properly filters for what the natural language query is asking, based on the table schema + (optional) metadata.
+Additionally, you should check if the SQL query is valid based on the table schema (checking for syntax errors, false column names, etc.)
+
+** TIPS **
+- Look for correct references to the table schema for column names, table names, etc.
+- Check that the SQL query can be executed; make sure JOINs, GROUP BYs, ORDER BYs, etc. are valid with respect to the table schema.
+- Check that aggregation functions (COUNT, SUM, AVG, etc.) are used appropriately with GROUP BY clauses
+- Verify that WHERE conditions use the correct operators and data types for comparisons
+- Ensure LIMIT and OFFSET clauses make sense for the query's purpose
+- Check that JOINs use the correct keys and maintain referential integrity
+- Verify that ORDER BY clauses use valid column names and sort directions
+- Check for proper handling of NULL values where relevant
+- Ensure subqueries are properly constructed and correlated when needed
+- EVEN IF THE QUERY IS VALID, IF IT DOESN'T WORK FOR THE NATURAL LANGUAGE QUERY, YOU SHOULD CHOOSE "N" AS THE ANSWER.
+
+** FORMATTING YOUR ANSWER **
+If the SQL query is valid and works for the natural language query, choose option "Y" and otherwise "N". Provide a justification for your decision; if you choose "N", explain what about the LLM-generated SQL query is incorrect, or explain why it doesn't address the natural language query.
+IF YOUR JUSTIFICATION SHOWS THAT THE SQL QUERY IS VALID AND WORKS FOR THE NATURAL LANGUAGE QUERY, YOU SHOULD CHOOSE "Y" AS THE ANSWER.
+IF THE SQL QUERY IS INVALID, YOU SHOULD CHOOSE "N" AS THE ANSWER.
+
+** YOUR TURN **
+Natural language query:
+{{input}}
+
+LLM generated SQL query:
+{{actual_output}}
+
+Table schema:
+{{context}}
+"""
+    }],
+    options={
+        "Y": 1.0,
+        "N": 0.0
+    }
+)
+
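Text2SQLScorer is a ClassifierScorer instance with threshold 1.0, so only a "Y" verdict passes; its prompt fills {{input}} with the natural language question, {{actual_output}} with the generated SQL, and {{context}} with the schema. A hypothetical sketch of the data it expects (it assumes Example exposes input, actual_output, and context fields matching those placeholders; this diff does not show Example's fields):

    from judgeval.data import Example
    from judgeval.scorers import Text2SQLScorer

    example = Example(
        input="How many users signed up in the last 7 days?",
        actual_output="SELECT COUNT(*) FROM users WHERE signup_date >= DATE('now', '-7 day');",
        context=["CREATE TABLE users (id INTEGER PRIMARY KEY, signup_date TEXT);"],
    )
    # run this example through an evaluation with Text2SQLScorer as the scorer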
@@ -1,6 +1,6 @@
 Metadata-Version: 2.4
 Name: judgeval
-Version: 0.0.8
+Version: 0.0.10
 Summary: Judgeval Package
 Project-URL: Homepage, https://github.com/JudgmentLabs/judgeval
 Project-URL: Issues, https://github.com/JudgmentLabs/judgeval/issues
@@ -1,21 +1,22 @@
 judgeval/__init__.py,sha256=xiiG4CkeaOtey4fusCd9CBz0BVqzTIbV-K2EFIU0rUM,283
 judgeval/clients.py,sha256=Ns5ljrgPPXUMo7fSPJxO12H64lcPyKeQPIVG_RMi2cM,1162
-judgeval/constants.py,sha256=5O1jWvxMCRyMSWhmkrvPqfBctx42c7kMtgTS7ORVcFw,1965
+judgeval/constants.py,sha256=qwWc3EOpXSn9SHq5rylkHhnzH5WldedqSMCToa7vgZk,2040
 judgeval/evaluation_run.py,sha256=KcIS7mDR_9XEdqYrJXFcrLz5IDMof34HcD5VtjZgV8w,5884
-judgeval/judgment_client.py,sha256=lVVVDxRQ750nd0wT827dca94YzThNjuFWWJ-BTFW7lg,11367
-judgeval/run_evaluation.py,sha256=A9jjtWPH2_5W43a1f98R8u-8PuVczoJZNCZIyCoRqi8,18918
+judgeval/judgment_client.py,sha256=jMeayUI-Z-GX4mVMVC9t5f7ENKLQ8dOepScYu5Yytf0,11777
+judgeval/run_evaluation.py,sha256=YOQ6s9RuUrXPTgoYexf7r6Hl1QKIMSTdvHl9kw-ZMzw,20103
 judgeval/common/__init__.py,sha256=7d24BRxtncpMj3AAJCj8RS7TqgjXmW777HVZH6-3sBs,289
 judgeval/common/exceptions.py,sha256=U-TxHLn7oVMezsMuoYouNDb2XuS8RCggfntYf5_6u4E,565
 judgeval/common/logger.py,sha256=QXN3UMymmKu2iMEMEgATLBnMDjGr_pE2iOSEFoICgg8,6092
-judgeval/common/tracer.py,sha256=JWUmsjxs2N6Cu5nol7vRbwWKFRLHJlwCnHWgg3W17GM,23812
+judgeval/common/tracer.py,sha256=k5g9ZLeM-fLdV_q9NpodN8gW4nLTIXsbxeTaXVjm9jk,25658
 judgeval/common/utils.py,sha256=3WRyyX0tvnnj_VAVlEdtZrfzyWj6zfX04xdpCtE1m5Y,33736
 judgeval/data/__init__.py,sha256=YferxwmUqoBi18hrdgro0BD0h4pt20LAqISeUzGMcVU,474
 judgeval/data/api_example.py,sha256=vwWFbI6eJr5VgURCRbuSiMtEXLUbTCih_BcaqEBy-pg,4108
 judgeval/data/example.py,sha256=lymGZ3jG818-r2vyFunt6OLFrhESOyJnbhao_ljTjlA,2471
-judgeval/data/result.py,sha256=CVp_mZrBbKjIH9rPB6rg7T2jY1jUy7JVyI7_kUbRC7w,3490
+judgeval/data/result.py,sha256=8FIO-bFKPegZuByKRjA2_sumjb8oGWQ5ZeQ1RVz5z2w,4393
 judgeval/data/scorer_data.py,sha256=pYljblCPZrlMIv5Eg7R-clnmsqzUBAwokKjZpwa0DXE,3280
-judgeval/data/datasets/__init__.py,sha256=Xh6TSsCcEsJeYjjubfeGa3WU8YQfuwKXH3jR9EeDFgg,171
-judgeval/data/datasets/dataset.py,sha256=9GGspdKDhMw2dJAS7ZvOZHSoNGwMzCtgnFYDe6y4yog,16484
+judgeval/data/datasets/__init__.py,sha256=eO6ayeM_bTGwIt0eDSlTBIIBvXvIWRWWSfYZrZROPiQ,265
+judgeval/data/datasets/dataset.py,sha256=AGdU21vZ4iVjqbjQ7JY-u29FzJrdDFTgdvhzvYVJNyo,11833
+judgeval/data/datasets/eval_dataset_client.py,sha256=TaCDzymGFNFjGRrieEdQB8dT8xqNPpsEi2XLGFyrJno,7113
 judgeval/data/datasets/ground_truth.py,sha256=OTBs3VZe-Wp0vEXEsq14GPZHYtpWT16bhGQTycIvkKc,2057
 judgeval/data/datasets/utils.py,sha256=lQxyl7mevct7JcDSyIrU_8QOzT-EYPWEvoUiAeOdeek,2502
 judgeval/judges/__init__.py,sha256=tyQ5KY88Kp1Ctfw2IJxnVEpy8DnFCtmy04JdPOpp-As,339
@@ -24,7 +25,7 @@ judgeval/judges/litellm_judge.py,sha256=EIL58Teptv8DzZUO3yP2RDQCDq-aoBB6HPZzPdK6
 judgeval/judges/mixture_of_judges.py,sha256=OuGWCuXyqe7s_Y74ij90TJFRfHU-VAFyJVVrwBM0RO0,15532
 judgeval/judges/together_judge.py,sha256=x3jf-tq77QPXHeeoF739f69hE_0VceXD9FHLrVFdGVA,2275
 judgeval/judges/utils.py,sha256=YUvivcGV1OKLPMJ9N6aTvhA0r_zzJ2NXriPguiiaVaY,2110
-judgeval/scorers/__init__.py,sha256=3rq2VtszrJk9gZ3oAMVd7EGlSugr8aRlHWprMDgQPaQ,956
+judgeval/scorers/__init__.py,sha256=XcDdLn_s16rSQob0896oj4JXTA8-Xfl271TUEBj6Oew,998
 judgeval/scorers/api_scorer.py,sha256=88kCWr6IetLFn3ziTPG-lwDWvMhFUC6xfINU1MJBoho,2125
 judgeval/scorers/base_scorer.py,sha256=mbOReG88fWaqCnC8F0u5QepRlzgVkuOz89KEKYxrmMc,1794
 judgeval/scorers/exceptions.py,sha256=eGW5CuJgZ5YJBFrE4FHDSF651PO1dKAZ379mJ8gOsfo,178
@@ -32,7 +33,7 @@ judgeval/scorers/judgeval_scorer.py,sha256=14SZ3sBZtGNM3BCegKebkNad9LTs5Tyhs0kD6
 judgeval/scorers/prompt_scorer.py,sha256=bUv8eZNy1XGVM1gNMt33dgIVX6zj63bGAV6O0o0c7yg,17821
 judgeval/scorers/score.py,sha256=zJKG21h9Njyj2vS36CAFK2wlbOcHSKgrLgHV5_25KKw,18630
 judgeval/scorers/utils.py,sha256=dtueaJm8e3Ph3wj1vC-srzadgK_CoIlOefdvMQ-cwK8,6826
-judgeval/scorers/judgeval_scorers/__init__.py,sha256=077QnuBfw9Sy9RP2TF2oKCtt5PbaqBZLyiP-gczKShk,5092
+judgeval/scorers/judgeval_scorers/__init__.py,sha256=D12jJAKTcfmz8fDBkYeOmdzZMZsURuODIJ5p7Nk1lWE,5189
 judgeval/scorers/judgeval_scorers/api_scorers/__init__.py,sha256=zFwH2TC5AFlpDRfVKc6GN4YTtnmeyALl-JRLoZD_Jco,1284
 judgeval/scorers/judgeval_scorers/api_scorers/answer_correctness.py,sha256=690G5askjE8dcbKPGvCF6JxAEM9QJUqb-3K-D6lI6oM,463
 judgeval/scorers/judgeval_scorers/api_scorers/answer_relevancy.py,sha256=CqvvjV7AZqPlXh-PZaPKYPILHr15u4bIYiKBFjlk5i0,457
@@ -44,6 +45,9 @@ judgeval/scorers/judgeval_scorers/api_scorers/hallucination.py,sha256=ffYwH3CexP
 judgeval/scorers/judgeval_scorers/api_scorers/json_correctness.py,sha256=CAZBQKwNSqpqAoOgStYfr-yP1Brug_6VRimRIQY-zdg,894
 judgeval/scorers/judgeval_scorers/api_scorers/summarization.py,sha256=-E3oxYbI0D_0q-_fGWh2jQHW9O4Pu7I7xvLWsHU6cn8,450
 judgeval/scorers/judgeval_scorers/api_scorers/tool_correctness.py,sha256=17ppPXm962ew67GU5m0npzbPu3CuhgdKY_KmfPvKfu4,457
+judgeval/scorers/judgeval_scorers/classifiers/__init__.py,sha256=Qt81W5ZCwMvBAne0LfQDb8xvg5iOG1vEYP7WizgwAZo,67
+judgeval/scorers/judgeval_scorers/classifiers/text2sql/__init__.py,sha256=8iTzMvou1Dr8pybul6lZHKjc9Ye2-0_racRGYkhEdTY,74
+judgeval/scorers/judgeval_scorers/classifiers/text2sql/text2sql_scorer.py,sha256=ly72Z7s_c8NID6-nQnuW8qEGEW2MqdvpJ-5WfXzbAQg,2579
 judgeval/scorers/judgeval_scorers/local_implementations/__init__.py,sha256=ZDbmYHwIbPD75Gj9JKtEWnpBdSVGGRmbn1_IOR6GR-c,1627
 judgeval/scorers/judgeval_scorers/local_implementations/answer_correctness/__init__.py,sha256=cxxUEspgoIdSzJbwIIioamC0-xDqhYVfYAWxaYF-D_Y,177
 judgeval/scorers/judgeval_scorers/local_implementations/answer_correctness/answer_correctness_scorer.py,sha256=PDThn6SzqxgMXT7BpQs2TEBOsgfD5fi6fnKk31qaCTo,10227
@@ -73,7 +77,7 @@ judgeval/scorers/judgeval_scorers/local_implementations/summarization/prompts.py
 judgeval/scorers/judgeval_scorers/local_implementations/summarization/summarization_scorer.py,sha256=CBuE6oCxMzTdJoXFt_YPWBte88kedEQ9t3g52ZRztGY,21086
 judgeval/scorers/judgeval_scorers/local_implementations/tool_correctness/__init__.py,sha256=JUB3TMqS1OHr6PqpIGqkyiBNbyfUaw7lZuUATjU3_ek,168
 judgeval/scorers/judgeval_scorers/local_implementations/tool_correctness/tool_correctness_scorer.py,sha256=CYGRJY5EuyICYzHrmFdLykwXakX8AC7G3Bhj7p6szfY,5493
-judgeval-0.0.8.dist-info/METADATA,sha256=91SMIPO60Q_Ab7yTjL2sKmPgmfl6Bji6_QAzkjaOHlk,1204
-judgeval-0.0.8.dist-info/WHEEL,sha256=qtCwoSJWgHk21S1Kb4ihdzI2rlJ1ZKaIurTj_ngOhyQ,87
-judgeval-0.0.8.dist-info/licenses/LICENSE.md,sha256=tKmCg7k5QOmxPK19XMfzim04QiQJPmgIm0pAn55IJwk,11352
-judgeval-0.0.8.dist-info/RECORD,,
+judgeval-0.0.10.dist-info/METADATA,sha256=i9jeAPs3jY5hAHAdE_rlen4qJdEk0eAqQ0BOzMie97I,1205
+judgeval-0.0.10.dist-info/WHEEL,sha256=qtCwoSJWgHk21S1Kb4ihdzI2rlJ1ZKaIurTj_ngOhyQ,87
+judgeval-0.0.10.dist-info/licenses/LICENSE.md,sha256=tKmCg7k5QOmxPK19XMfzim04QiQJPmgIm0pAn55IJwk,11352
+judgeval-0.0.10.dist-info/RECORD,,