judgeval 0.0.30__py3-none-any.whl → 0.0.32__py3-none-any.whl
This diff shows the contents of publicly released versions of the package as they appear in their public registry. It is provided for informational purposes only.
- judgeval/__init__.py +3 -1
- judgeval/common/tracer.py +352 -117
- judgeval/constants.py +5 -3
- judgeval/data/__init__.py +4 -0
- judgeval/data/custom_example.py +18 -0
- judgeval/data/datasets/dataset.py +5 -1
- judgeval/data/datasets/eval_dataset_client.py +64 -5
- judgeval/data/example.py +1 -0
- judgeval/data/result.py +7 -6
- judgeval/data/sequence.py +55 -0
- judgeval/data/sequence_run.py +44 -0
- judgeval/evaluation_run.py +12 -7
- judgeval/integrations/langgraph.py +89 -72
- judgeval/judgment_client.py +70 -68
- judgeval/run_evaluation.py +87 -13
- judgeval/scorers/__init__.py +2 -0
- judgeval/scorers/judgeval_scorer.py +3 -0
- judgeval/scorers/judgeval_scorers/__init__.py +7 -0
- judgeval/scorers/judgeval_scorers/api_scorers/__init__.py +2 -1
- judgeval/scorers/judgeval_scorers/api_scorers/derailment_scorer.py +21 -0
- judgeval/scorers/score.py +6 -5
- judgeval/version_check.py +22 -0
- {judgeval-0.0.30.dist-info → judgeval-0.0.32.dist-info}/METADATA +1 -1
- {judgeval-0.0.30.dist-info → judgeval-0.0.32.dist-info}/RECORD +26 -22
- judgeval/data/custom_api_example.py +0 -91
- {judgeval-0.0.30.dist-info → judgeval-0.0.32.dist-info}/WHEEL +0 -0
- {judgeval-0.0.30.dist-info → judgeval-0.0.32.dist-info}/licenses/LICENSE.md +0 -0
judgeval/run_evaluation.py
CHANGED
@@ -4,14 +4,15 @@ import time
 import sys
 import itertools
 import threading
-from typing import List, Dict, Any
+from typing import List, Dict, Any, Union
 from datetime import datetime
 from rich import print as rprint

 from judgeval.data import (
     ScorerData,
     ScoringResult,
-    Example
+    Example,
+    CustomExample
 )
 from judgeval.scorers import (
     JudgevalScorer,
@@ -22,6 +23,7 @@ from judgeval.scorers.score import a_execute_scoring
 from judgeval.constants import (
     ROOT_API,
     JUDGMENT_EVAL_API_URL,
+    JUDGMENT_SEQUENCE_EVAL_API_URL,
     JUDGMENT_EVAL_LOG_API_URL,
     MAX_CONCURRENT_EVALUATIONS,
     JUDGMENT_ADD_TO_RUN_EVAL_QUEUE_API_URL
@@ -34,7 +36,7 @@ from judgeval.common.logger import (
     example_logging_context
 )
 from judgeval.evaluation_run import EvaluationRun
-
+from judgeval.data.sequence_run import SequenceRun

 def send_to_rabbitmq(evaluation_run: EvaluationRun) -> None:
     """
@@ -91,6 +93,36 @@ def execute_api_eval(evaluation_run: EvaluationRun) -> List[Dict]:
         raise JudgmentAPIError(error_message)
     return response_data

+def execute_api_sequence_eval(sequence_run: SequenceRun) -> List[Dict]:
+    """
+    Executes an evaluation of a list of `Example`s using one or more `JudgmentScorer`s via the Judgment API.
+    """
+
+    try:
+        # submit API request to execute evals
+        payload = sequence_run.model_dump(warnings=False)
+        response = requests.post(
+            JUDGMENT_SEQUENCE_EVAL_API_URL,
+            headers={
+                "Content-Type": "application/json",
+                "Authorization": f"Bearer {sequence_run.judgment_api_key}",
+                "X-Organization-Id": sequence_run.organization_id
+            },
+            json=payload,
+            verify=True
+        )
+        response_data = response.json()
+    except Exception as e:
+        error(f"Error: {e}")
+        details = response.json().get("detail", "No details provided")
+        raise JudgmentAPIError("An error occurred while executing the Judgment API request: " + details)
+    # Check if the response status code is not 2XX
+    # Add check for the duplicate eval run name
+    if not response.ok:
+        error_message = response_data.get('detail', 'An unknown error occurred.')
+        error(f"Error: {error_message=}")
+        raise JudgmentAPIError(error_message)
+    return response_data

 def merge_results(api_results: List[ScoringResult], local_results: List[ScoringResult]) -> List[ScoringResult]:
     """
@@ -197,8 +229,8 @@ def check_eval_run_name_exists(eval_name: str, project_name: str, judgment_api_k
     )

     if response.status_code == 409:
-        error(f"Eval run name '{eval_name}' already exists for this project. Please choose a different name or set the `
-        raise ValueError(f"Eval run name '{eval_name}' already exists for this project. Please choose a different name or set the `
+        error(f"Eval run name '{eval_name}' already exists for this project. Please choose a different name, set the `override` flag to true, or set the `append` flag to true.")
+        raise ValueError(f"Eval run name '{eval_name}' already exists for this project. Please choose a different name, set the `override` flag to true, or set the `append` flag to true.")

     if not response.ok:
         response_data = response.json()
@@ -211,7 +243,7 @@ def check_eval_run_name_exists(eval_name: str, project_name: str, judgment_api_k
         raise JudgmentAPIError(f"Failed to check if eval run name exists: {str(e)}")


-def log_evaluation_results(merged_results: List[ScoringResult],
+def log_evaluation_results(merged_results: List[ScoringResult], run: Union[EvaluationRun, SequenceRun]) -> str:
     """
     Logs evaluation results to the Judgment API database.

@@ -228,13 +260,12 @@ def log_evaluation_results(merged_results: List[ScoringResult], evaluation_run:
            JUDGMENT_EVAL_LOG_API_URL,
            headers={
                "Content-Type": "application/json",
-                "Authorization": f"Bearer {
-                "X-Organization-Id":
+                "Authorization": f"Bearer {run.judgment_api_key}",
+                "X-Organization-Id": run.organization_id
            },
            json={
-                "results": [result.
-                "
-                "eval_name": evaluation_run.eval_name,
+                "results": [result.model_dump(warnings=False) for result in merged_results],
+                "run": run.model_dump(warnings=False)
            },
            verify=True
        )
@@ -303,6 +334,42 @@ def check_examples(examples: List[Example], scorers: List[APIJudgmentScorer]) ->
             # Example ID (usually random UUID) does not provide any helpful information for the user but printing the entire example is overdoing it
             print(f"WARNING: Example {example.example_id} is missing the following parameters: {missing_params} for scorer {scorer.score_type.value}")

+def run_sequence_eval(sequence_run: SequenceRun, override: bool = False, ignore_errors: bool = True, async_execution: bool = False) -> List[ScoringResult]:
+    # Call endpoint to check to see if eval run name exists (if we DON'T want to override and DO want to log results)
+    if not override and sequence_run.log_results and not sequence_run.append:
+        check_eval_run_name_exists(
+            sequence_run.eval_name,
+            sequence_run.project_name,
+            sequence_run.judgment_api_key,
+            sequence_run.organization_id
+        )
+
+    # Execute evaluation using Judgment API
+    info("Starting API evaluation")
+    try:  # execute an EvaluationRun with just JudgmentScorers
+        debug("Sending request to Judgment API")
+        response_data: List[Dict] = run_with_spinner("Running Sequence Evaluation: ", execute_api_sequence_eval, sequence_run)
+
+        info(f"Received {len(response_data['results'])} results from API")
+    except JudgmentAPIError as e:
+        error(f"An error occurred while executing the Judgment API request: {str(e)}")
+        raise JudgmentAPIError(f"An error occurred while executing the Judgment API request: {str(e)}")
+    except ValueError as e:
+        raise ValueError(f"Please check your SequenceRun object, one or more fields are invalid: {str(e)}")
+
+    # Convert the response data to `ScoringResult` objects
+    debug("Processing API results")
+    api_results = []
+    for result in response_data["results"]:
+        api_results.append(ScoringResult(**result))
+
+    # TODO: allow for custom scorer on sequences
+    if sequence_run.log_results:
+        pretty_str = run_with_spinner("Logging Results: ", log_evaluation_results, api_results, sequence_run)
+        rprint(pretty_str)
+
+
+
 def run_eval(evaluation_run: EvaluationRun, override: bool = False, ignore_errors: bool = True, async_execution: bool = False) -> List[ScoringResult]:
     """
     Executes an evaluation of `Example`s using one or more `Scorer`s
@@ -329,7 +396,7 @@ def run_eval(evaluation_run: EvaluationRun, override: bool = False, ignore_error
     """

     # Call endpoint to check to see if eval run name exists (if we DON'T want to override and DO want to log results)
-    if not override and evaluation_run.log_results:
+    if not override and evaluation_run.log_results and not evaluation_run.append:
        check_eval_run_name_exists(
            evaluation_run.eval_name,
            evaluation_run.project_name,
@@ -373,12 +440,20 @@ def run_eval(evaluation_run: EvaluationRun, override: bool = False, ignore_error
            local_scorers.append(scorer)
            debug(f"Added local scorer: {type(scorer).__name__}")

+    custom_example_check = [scorer.custom_example for scorer in local_scorers]
+    if any(custom_example_check) and not all(custom_example_check):
+        error("All scorers must be custom scorers if using custom examples")
+        raise ValueError("All scorers must be custom scorers if using custom examples")
+
     debug(f"Found {len(judgment_scorers)} judgment scorers and {len(local_scorers)} local scorers")

     api_results: List[ScoringResult] = []
     local_results: List[ScoringResult] = []

     if async_execution:
+        if len(local_scorers) > 0:
+            error("Local scorers are not supported in async execution")
+
        check_examples(evaluation_run.examples, evaluation_run.scorers)
        info("Starting async evaluation")
        payload = evaluation_run.model_dump(warnings=False)
@@ -396,7 +471,6 @@ def run_eval(evaluation_run: EvaluationRun, override: bool = False, ignore_error
    else:
        if judgment_scorers:
            # Execute evaluation using Judgment API
-            check_examples(evaluation_run.examples, evaluation_run.scorers)
            info("Starting API evaluation")
            debug(f"Creating API evaluation run with {len(judgment_scorers)} scorers")
            try:  # execute an EvaluationRun with just JudgmentScorers
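For orientation, a minimal sketch of how the new sequence path might be driven directly. Only the SequenceRun fields that this diff actually reads are shown (eval_name, project_name, judgment_api_key, organization_id, log_results, append); the sequence payload itself is defined in judgeval/data/sequence_run.py, which is not included above, so treat the field names and values here as assumptions rather than the documented API.

    import os

    from judgeval.data.sequence_run import SequenceRun
    from judgeval.run_evaluation import run_sequence_eval

    # Hypothetical values; SequenceRun will also need its sequence/scorer
    # payload fields, which this diff does not show.
    sequence_run = SequenceRun(
        eval_name="checkout-flow-run-1",
        project_name="my-project",
        judgment_api_key=os.environ["JUDGMENT_API_KEY"],
        organization_id=os.environ["JUDGMENT_ORG_ID"],
        log_results=True,   # log scoring results to the Judgment backend when done
        append=False,       # when True, the duplicate run-name check is skipped
    )

    run_sequence_eval(sequence_run)  # POSTs to JUDGMENT_SEQUENCE_EVAL_API_URL, then logs results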
judgeval/scorers/__init__.py
CHANGED
@@ -17,6 +17,7 @@ from judgeval.scorers.judgeval_scorers import (
     ComparisonScorer,
     InstructionAdherenceScorer,
     GroundednessScorer,
+    DerailmentScorer,
 )

 __all__ = [
@@ -39,4 +40,5 @@ __all__ = [
     "ComparisonScorer",
     "InstructionAdherenceScorer",
     "GroundednessScorer",
+    "DerailmentScorer",
 ]
judgeval/scorers/judgeval_scorer.py
CHANGED
@@ -34,6 +34,7 @@ class JudgevalScorer:
     async_mode: bool = True  # Whether to run the scorer in async mode
     verbose_mode: bool = True  # Whether to run the scorer in verbose mode
     include_reason: bool = False  # Whether to include the reason in the output
+    custom_example: bool = False  # Whether the scorer corresponds to CustomExamples
     error: Optional[str] = None  # The error message if the scorer failed
     evaluation_cost: Optional[float] = None  # The cost of running the scorer
     verbose_logs: Optional[str] = None  # The verbose logs of the scorer
@@ -52,6 +53,7 @@ class JudgevalScorer:
         async_mode: bool = True,
         verbose_mode: bool = True,
         include_reason: bool = False,
+        custom_example: bool = False,
         error: Optional[str] = None,
         evaluation_cost: Optional[float] = None,
         verbose_logs: Optional[str] = None,
@@ -78,6 +80,7 @@ class JudgevalScorer:
         self.async_mode = async_mode
         self.verbose_mode = verbose_mode
         self.include_reason = include_reason
+        self.custom_example = custom_example
         self.error = error
         self.evaluation_cost = evaluation_cost
         self.verbose_logs = verbose_logs
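The new custom_example flag is how run_eval (above) decides whether a batch of local scorers operates on CustomExamples: mixing flagged and unflagged local scorers now raises a ValueError. A minimal sketch of a local scorer opting in, assuming the usual JudgevalScorer subclassing pattern; the constructor arguments other than custom_example and the score_example hook are not shown in this diff and are assumptions here.

    from judgeval.data import CustomExample
    from judgeval.scorers import JudgevalScorer

    class LengthCheckScorer(JudgevalScorer):
        # Illustrative only: score_type/threshold names mirror judgeval's
        # custom-scorer pattern rather than quoting this release's exact interface.
        def __init__(self, threshold: float = 0.5):
            super().__init__(
                score_type="length_check",
                threshold=threshold,
                custom_example=True,  # new in 0.0.32: scorer consumes CustomExamples
            )

        def score_example(self, example: CustomExample, *args, **kwargs) -> None:
            # toy logic so the sketch is complete; a real scorer would inspect the example
            self.score = 1.0
            self.success = self.score >= self.threshold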
judgeval/scorers/judgeval_scorers/__init__.py
CHANGED
@@ -15,6 +15,7 @@ from judgeval.scorers.judgeval_scorers.api_scorers import (
     ComparisonScorer as APIComparisonScorer,
     InstructionAdherenceScorer as APIInstructionAdherenceScorer,
     GroundednessScorer as APIGroundednessScorer,
+    DerailmentScorer as APIDerailmentScorer,
 )

 from judgeval.scorers.judgeval_scorers.local_implementations import (
@@ -153,6 +154,11 @@ GroundednessScorer = ScorerWrapper(
     api_implementation=APIGroundednessScorer,
 )

+DerailmentScorer = ScorerWrapper(
+    api_implementation=APIDerailmentScorer,
+    local_implementation=LocalInstructionAdherenceScorer  # TODO: add local implementation
+)
+
 __all__ = [
     "ExecutionOrderScorer",
     "JSONCorrectnessScorer",
@@ -166,4 +172,5 @@ __all__ = [
     "Text2SQLScorer",
     "ComparisonScorer",
     "GroundednessScorer",
+    "DerailmentScorer",
 ]
judgeval/scorers/judgeval_scorers/api_scorers/__init__.py
CHANGED
@@ -11,7 +11,7 @@ from judgeval.scorers.judgeval_scorers.api_scorers.answer_correctness import Ans
 from judgeval.scorers.judgeval_scorers.api_scorers.comparison import ComparisonScorer
 from judgeval.scorers.judgeval_scorers.api_scorers.instruction_adherence import InstructionAdherenceScorer
 from judgeval.scorers.judgeval_scorers.api_scorers.groundedness import GroundednessScorer
-
+from judgeval.scorers.judgeval_scorers.api_scorers.derailment_scorer import DerailmentScorer
 __all__ = [
     "ExecutionOrderScorer",
     "JSONCorrectnessScorer",
@@ -26,4 +26,5 @@ __all__ = [
     "ComparisonScorer",
     "InstructionAdherenceScorer",
     "GroundednessScorer",
+    "DerailmentScorer",
 ]
judgeval/scorers/judgeval_scorers/api_scorers/derailment_scorer.py
ADDED
@@ -0,0 +1,21 @@
+"""
+`judgeval` answer relevancy scorer
+
+TODO add link to docs page for this scorer
+
+"""
+
+# Internal imports
+from judgeval.scorers.api_scorer import APIJudgmentScorer
+from judgeval.constants import APIScorer
+
+class DerailmentScorer(APIJudgmentScorer):
+    def __init__(self, threshold: float):
+        super().__init__(
+            threshold=threshold,
+            score_type=APIScorer.DERAILMENT,
+        )
+
+    @property
+    def __name__(self):
+        return "Derailment"
judgeval/scorers/score.py
CHANGED
@@ -11,6 +11,7 @@ from rich.progress import Progress, SpinnerColumn, TextColumn

 from judgeval.data import (
     Example,
+    CustomExample,
     ScoringResult,
     generate_scoring_result,
     create_scorer_data,
@@ -240,7 +241,7 @@ async def score_with_indicator(


 async def a_execute_scoring(
-    examples: List[Example],
+    examples: Union[List[Example], List[CustomExample]],
     scorers: List[JudgevalScorer],
     model: Optional[Union[str, List[str], JudgevalJudge]] = None,
     ignore_errors: bool = True,
@@ -256,7 +257,7 @@ async def a_execute_scoring(
     Each `Example` will be evaluated by all of the `JudgevalScorer`s in the `scorers` list.

     Args:
-        examples (List[Example]): A list of `Example` objects to be evaluated.
+        examples (Union[List[Example], List[CustomExample]]): A list of `Example` objects to be evaluated.
         scorers (List[JudgevalScorer]): A list of `JudgevalScorer` objects to evaluate the examples.
         model (Union[str, List[str], JudgevalJudge]): The model to use for evaluation.
         ignore_errors (bool): Whether to ignore errors during evaluation.
@@ -313,7 +314,7 @@ async def a_execute_scoring(
                 debug(f"Scorer threshold: {scorer.threshold}")
                 if hasattr(scorer, 'model'):
                     debug(f"Scorer model: {type(scorer.model).__name__}")
-            if isinstance(ex, Example):
+            if isinstance(ex, Example) or isinstance(ex, CustomExample):
                 if len(scorers) == 0:
                     pbar.update(1)
                     continue
@@ -339,7 +340,7 @@ async def a_execute_scoring(
             await asyncio.gather(*tasks)
     else:
         for i, ex in enumerate(examples):
-            if isinstance(ex, Example):
+            if isinstance(ex, Example) or isinstance(ex, CustomExample):
                 if len(scorers) == 0:
                     continue

@@ -366,7 +367,7 @@ async def a_execute_scoring(

 async def a_eval_examples_helper(
     scorers: List[JudgevalScorer],
-    example: Example,
+    example: Union[Example, CustomExample],
     scoring_results: List[ScoringResult],
     score_index: int,
     ignore_errors: bool,
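With these changes, a_execute_scoring accepts CustomExample lists as long as every local scorer sets custom_example=True (enforced in run_eval above). A hedged sketch of driving it directly, reusing the hypothetical LengthCheckScorer from the judgeval_scorer.py section; CustomExample's fields live in judgeval/data/custom_example.py, which is not part of this diff, so the constructor arguments and model name below are assumptions:

    import asyncio

    from judgeval.data import CustomExample
    from judgeval.scorers.score import a_execute_scoring

    # Field names on CustomExample are assumed; model choice is illustrative.
    examples = [CustomExample(input={"question": "hi"}, actual_output={"answer": "hello"})]

    results = asyncio.run(a_execute_scoring(examples, [LengthCheckScorer()], model="gpt-4o-mini"))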
judgeval/version_check.py
ADDED
@@ -0,0 +1,22 @@
+import importlib.metadata
+import requests
+import threading
+
+def check_latest_version(package_name: str = "judgeval"):
+    def _check():
+        try:
+            current_version = importlib.metadata.version(package_name)
+            response = requests.get(f"https://pypi.org/pypi/{package_name}/json", timeout=2)
+            latest_version = response.json()["info"]["version"]
+
+            if current_version != latest_version:
+                print(
+                    f"\033[93mUPDATE AVAILABLE:\033[0m You are using '{package_name}=={current_version}', "
+                    f"but the latest version is '{latest_version}'. While this version is still supported, "
+                    f"we recommend upgrading to avoid potential issues or missing features: "
+                    f"`pip install --upgrade {package_name}`"
+                )
+        except Exception:
+            pass
+
+    threading.Thread(target=_check, daemon=True).start()
{judgeval-0.0.30.dist-info → judgeval-0.0.32.dist-info}/RECORD
CHANGED
@@ -1,46 +1,50 @@
-judgeval/__init__.py,sha256=
+judgeval/__init__.py,sha256=x9HWt4waJwJMAqTuJSg2MezF9Zg-macEjeU-ajbly-8,330
 judgeval/clients.py,sha256=6VQmEqmfCngUdS2MuPBIpHvtDFqOENm8-_BmMvjLyRQ,944
-judgeval/constants.py,sha256=
-judgeval/evaluation_run.py,sha256=
-judgeval/judgment_client.py,sha256=
+judgeval/constants.py,sha256=_XmVAkebMyGrDvvanAVlMgVd4p6MLHdEVsTQFI0kz1k,5411
+judgeval/evaluation_run.py,sha256=WGzx-Ug2qhSmunFo8NrmSstBRsOUc5KpKq0Lc51rqsM,6739
+judgeval/judgment_client.py,sha256=k0q2s5A0RkhF9ElD9o-KWN10H36t3Of2PrvNF-silf8,26141
 judgeval/rules.py,sha256=B0ZL0pn72D4Jnlr0zMQ6CPHi7D8AQQRariXCVsiCMiI,20542
-judgeval/run_evaluation.py,sha256=
+judgeval/run_evaluation.py,sha256=hnEY8QckEviXYNJutf-6tLFq2DWCzqWV1EVyPvrVXyA,28512
+judgeval/version_check.py,sha256=bvJEidB7rAeXozoUbN9Yb97QOR_s2hgvpvj74jJ5HlY,943
 judgeval/common/__init__.py,sha256=7d24BRxtncpMj3AAJCj8RS7TqgjXmW777HVZH6-3sBs,289
 judgeval/common/exceptions.py,sha256=U-TxHLn7oVMezsMuoYouNDb2XuS8RCggfntYf5_6u4E,565
 judgeval/common/logger.py,sha256=KO75wWXCxhUHUMvLaTU31ZzOk6tkZBa7heQ7y0f-zFE,6062
-judgeval/common/tracer.py,sha256=
+judgeval/common/tracer.py,sha256=owRRfIZXPUOVCCn0macygnf18mcp8am1eULGnZXD0Kk,68876
 judgeval/common/utils.py,sha256=LUQV5JfDr6wj7xHAJoNq-gofNZ6mjXbeKrGKzBME1KM,33533
-judgeval/data/__init__.py,sha256=
-judgeval/data/
-judgeval/data/example.py,sha256=
-judgeval/data/result.py,sha256=
+judgeval/data/__init__.py,sha256=xuKx_KCVHGp6CXvQuVmKl3v7pJp-qDaz0NccKxwjtO0,481
+judgeval/data/custom_example.py,sha256=QRBqiRiZS8UgVeTRHY0r1Jzm6yAYsyg6zmHxQGxdiQs,739
+judgeval/data/example.py,sha256=cJrmPGLel_P2sy1UaRvuVSAi35EnA9XMR11Lhp4aDLo,5930
+judgeval/data/result.py,sha256=Gb9tiSDsk1amXgh0cFG6JmlW_BMKxS2kuTwNA0rrHjA,3184
 judgeval/data/scorer_data.py,sha256=JVlaTx1EP2jw2gh3Vgx1CSEsvIFABAN26IquKyxwiJQ,3273
+judgeval/data/sequence.py,sha256=Fkk2HJGnPboH-Fvwgxub_ryG0eUXa3cbsj7ZD0qkeBo,2204
+judgeval/data/sequence_run.py,sha256=RmYjfWKMWg-pcF5PLeiWfrhuDkjDZi5VEmAIEXN3Ib0,2104
 judgeval/data/datasets/__init__.py,sha256=IdNKhQv9yYZ_op0rdBacrFaFVmiiYQ3JTzXzxOTsEVQ,176
-judgeval/data/datasets/dataset.py,sha256=
-judgeval/data/datasets/eval_dataset_client.py,sha256=
-judgeval/integrations/langgraph.py,sha256=
+judgeval/data/datasets/dataset.py,sha256=dhLo30hvpmmOK2R6O5wDs_neawUJ4lS8bb4S42SufNQ,13034
+judgeval/data/datasets/eval_dataset_client.py,sha256=xjj66BO9Es9IxXqzQe1RT_e0kpeKlt7OrhRoSuj4KHM,15085
+judgeval/integrations/langgraph.py,sha256=J-cQfFP52TjJewdSTe-fcsUC4HDvjNbXoxmbmF0SgiE,11743
 judgeval/judges/__init__.py,sha256=6X7VSwrwsdxGBNxCyapVRWGghhKOy3MVxFNMQ62kCXM,308
 judgeval/judges/base_judge.py,sha256=ch_S7uBB7lyv44Lf1d7mIGFpveOO58zOkkpImKgd9_4,994
 judgeval/judges/litellm_judge.py,sha256=EIL58Teptv8DzZUO3yP2RDQCDq-aoBB6HPZzPdK6KTg,2424
 judgeval/judges/mixture_of_judges.py,sha256=IJoi4Twk8ze1CJWVEp69k6TSqTCTGrmVYQ0qdffer60,15549
 judgeval/judges/together_judge.py,sha256=l00hhPerAZXg3oYBd8cyMtWsOTNt_0FIqoxhKJKQe3k,2302
 judgeval/judges/utils.py,sha256=9lvUxziGV86ISvVFxYBWc09TWFyAQgUTyPf_a9mD5Rs,2686
-judgeval/scorers/__init__.py,sha256=
+judgeval/scorers/__init__.py,sha256=Z_88Sr45gLFAIbMHzG1BF24TUQGCDiuP9QpmVFvSYJM,1204
 judgeval/scorers/api_scorer.py,sha256=NQ_CrrUPhSUk1k2Q8rKpCG_TU2FT32sFEqvb-Yi54B0,2688
 judgeval/scorers/base_scorer.py,sha256=xdUlY3CnLdCQ1Z5iUeY22Bim5v-OQruZmaVF_4Y1mC0,2183
 judgeval/scorers/exceptions.py,sha256=eGW5CuJgZ5YJBFrE4FHDSF651PO1dKAZ379mJ8gOsfo,178
-judgeval/scorers/judgeval_scorer.py,sha256=
+judgeval/scorers/judgeval_scorer.py,sha256=79-JJurqHP-qTaWNWInx4SjvQYwXc9lvfPPNgwsh2yA,6773
 judgeval/scorers/prompt_scorer.py,sha256=PaAs2qRolw1P3_I061Xvk9qzvF4O-JR8g_39RqXnHcM,17728
-judgeval/scorers/score.py,sha256=
+judgeval/scorers/score.py,sha256=r9QiT4-LIvivcJ6XxByrbswKSO8eQTtAD1UlXT_lcmo,18741
 judgeval/scorers/utils.py,sha256=iHQVTlIANbmCTXz9kTeSdOytgUZ_T74Re61ajqsk_WQ,6827
-judgeval/scorers/judgeval_scorers/__init__.py,sha256=
-judgeval/scorers/judgeval_scorers/api_scorers/__init__.py,sha256=
+judgeval/scorers/judgeval_scorers/__init__.py,sha256=kSmQWKeBvLeZMfLYNQSc2qbJYo1MFIQnf3P-D4ltuSM,6232
+judgeval/scorers/judgeval_scorers/api_scorers/__init__.py,sha256=_sDUBxSG536KGqXNi6dFpaYKghjEAadxBxaaxV9HuuE,1764
 judgeval/scorers/judgeval_scorers/api_scorers/answer_correctness.py,sha256=Fnd9CVIOZ73sWEWymsU5eBrrZqPFjMZ0BKpeW-PDyTg,711
 judgeval/scorers/judgeval_scorers/api_scorers/answer_relevancy.py,sha256=oETeN9K0HSIRdL2SDqn82Vskpwh5SlKnZvs5VDm2OBU,658
 judgeval/scorers/judgeval_scorers/api_scorers/comparison.py,sha256=kuzf9OWvpY38yYSwlBgneLkUZwJNM4FQqvbS66keA90,1249
 judgeval/scorers/judgeval_scorers/api_scorers/contextual_precision.py,sha256=tpSuzFAaW8X9xqA0aLLKwh7qmBK0Pc_bJZMIe_q412U,770
 judgeval/scorers/judgeval_scorers/api_scorers/contextual_recall.py,sha256=pFVhk4pLtQ-FnNlbI-dFF-SIh69Jza7erHqiPkFWoBo,758
 judgeval/scorers/judgeval_scorers/api_scorers/contextual_relevancy.py,sha256=RQ6DZwEhChfecd89Ey-T7ke--7qTaXZlRsNxwH8gaME,823
+judgeval/scorers/judgeval_scorers/api_scorers/derailment_scorer.py,sha256=V9WPuwNMm097V7IknKs8UkmAk0yjnBXTcJha_BHXxTA,475
 judgeval/scorers/judgeval_scorers/api_scorers/execution_order.py,sha256=Pb3CiNF2Ca826B92wJCVAi_68lJjLhqqCKwQKaflSUg,1294
 judgeval/scorers/judgeval_scorers/api_scorers/faithfulness.py,sha256=-BwOapqjryYNKNydtdkUiKIij76dY0O1jBmdc6dKazQ,692
 judgeval/scorers/judgeval_scorers/api_scorers/groundedness.py,sha256=ntEEeTANEOsGlcbiTAF_3r6BeSJEaVDns8po8T0L6Vg,692
@@ -87,7 +91,7 @@ judgeval/scorers/judgeval_scorers/local_implementations/summarization/prompts.py
 judgeval/scorers/judgeval_scorers/local_implementations/summarization/summarization_scorer.py,sha256=Qk7lwHgRPYeGoxTOyclAh1VfGItfvHJ6l1t7Nk3SWFM,20927
 judgeval/tracer/__init__.py,sha256=wy3DYpH8U_z0GO_K_gOSkK0tTTD-u5eLDo0T5xIBoAc,147
 judgeval/utils/alerts.py,sha256=O19Xj7DA0YVjl8PWiuH4zfdZeu3yiLVvHfY8ah2wG0g,2759
-judgeval-0.0.
-judgeval-0.0.
-judgeval-0.0.
-judgeval-0.0.
+judgeval-0.0.32.dist-info/METADATA,sha256=RJzqlHJwfYiOXEcyEEO5WQBM0DC1zQDuoN-Plix6U38,5418
+judgeval-0.0.32.dist-info/WHEEL,sha256=qtCwoSJWgHk21S1Kb4ihdzI2rlJ1ZKaIurTj_ngOhyQ,87
+judgeval-0.0.32.dist-info/licenses/LICENSE.md,sha256=tKmCg7k5QOmxPK19XMfzim04QiQJPmgIm0pAn55IJwk,11352
+judgeval-0.0.32.dist-info/RECORD,,
judgeval/data/custom_api_example.py
DELETED
@@ -1,91 +0,0 @@
-from typing import List, Optional, Dict, Any, Union
-from pydantic import BaseModel, ConfigDict, model_validator
-
-from judgeval.data.example import Example
-from judgeval.data.custom_example import CustomExample
-from judgeval.data.scorer_data import ScorerData
-from judgeval.common.logger import debug, error
-
-class ProcessExample(BaseModel):
-    """
-    ProcessExample is an `Example` object that contains intermediate information
-    about an undergoing evaluation on the original `Example`. It is used purely for
-    internal operations and keeping track of the evaluation process.
-    """
-    name: str
-    # input: Optional[str] = None
-    # actual_output: Optional[Union[str, List[str]]] = None
-    # expected_output: Optional[Union[str, List[str]]] = None
-    # context: Optional[list] = None
-    # retrieval_context: Optional[list] = None
-    # tools_called: Optional[list] = None
-    # expected_tools: Optional[list] = None
-
-    # make these optional, not all test cases in a conversation will be evaluated
-    success: Optional[bool] = None
-    scorers_data: Optional[List[ScorerData]] = None
-    run_duration: Optional[float] = None
-    evaluation_cost: Optional[float] = None
-
-    order: Optional[int] = None
-    # These should map 1 to 1 from golden
-    additional_metadata: Optional[Dict] = None
-    comments: Optional[str] = None
-    trace_id: Optional[str] = None
-    model_config = ConfigDict(arbitrary_types_allowed=True)
-
-    def update_scorer_data(self, scorer_data: ScorerData):
-        """
-        Updates scorer data field of test case after the scorers have been
-        evaluated on this test case.
-        """
-        debug(f"Updating scorer data for example '{self.name}' with scorer: {scorer_data}")
-        # self.scorers_data is a list of ScorerData objects that contain the
-        # evaluation results of each scorer on this test case
-        if self.scorers_data is None:
-            self.scorers_data = [scorer_data]
-        else:
-            self.scorers_data.append(scorer_data)
-
-        if self.success is None:
-            # self.success will be None when it is a message
-            # in that case we will be setting success for the first time
-            self.success = scorer_data.success
-        else:
-            if scorer_data.success is False:
-                debug(f"Example '{self.name}' marked as failed due to scorer: {scorer_data}")
-                self.success = False
-
-    def update_run_duration(self, run_duration: float):
-        self.run_duration = run_duration
-
-
-def create_process_custom_example(
-    example: CustomExample,
-) -> ProcessExample:
-    """
-    When an LLM Test Case is executed, we track its progress using an ProcessExample.
-
-    This will track things like the success of the test case, as well as the metadata (such as verdicts and claims in Faithfulness).
-    """
-    success = True
-    if example.name is not None:
-        name = example.name
-    else:
-        name = "Test Case Placeholder"
-        debug(f"No name provided for example, using default name: {name}")
-    order = None
-    scorers_data = []
-
-    debug(f"Creating ProcessExample for: {name}")
-    process_ex = ProcessExample(
-        name=name,
-        success=success,
-        scorers_data=scorers_data,
-        run_duration=None,
-        evaluation_cost=None,
-        order=order,
-        additional_metadata=example.additional_metadata,
-        trace_id=example.trace_id
-    )
-    return process_ex
{judgeval-0.0.30.dist-info → judgeval-0.0.32.dist-info}/WHEEL
File without changes
{judgeval-0.0.30.dist-info → judgeval-0.0.32.dist-info}/licenses/LICENSE.md
File without changes