judgeval 0.0.44__py3-none-any.whl → 0.0.46__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- judgeval/__init__.py +5 -4
- judgeval/clients.py +6 -6
- judgeval/common/__init__.py +7 -2
- judgeval/common/exceptions.py +2 -3
- judgeval/common/logger.py +74 -49
- judgeval/common/s3_storage.py +30 -23
- judgeval/common/tracer.py +1273 -939
- judgeval/common/utils.py +416 -244
- judgeval/constants.py +73 -61
- judgeval/data/__init__.py +1 -1
- judgeval/data/custom_example.py +3 -2
- judgeval/data/datasets/dataset.py +80 -54
- judgeval/data/datasets/eval_dataset_client.py +131 -181
- judgeval/data/example.py +67 -43
- judgeval/data/result.py +11 -9
- judgeval/data/scorer_data.py +4 -2
- judgeval/data/tool.py +25 -16
- judgeval/data/trace.py +57 -29
- judgeval/data/trace_run.py +5 -11
- judgeval/evaluation_run.py +22 -82
- judgeval/integrations/langgraph.py +546 -184
- judgeval/judges/base_judge.py +1 -2
- judgeval/judges/litellm_judge.py +33 -11
- judgeval/judges/mixture_of_judges.py +128 -78
- judgeval/judges/together_judge.py +22 -9
- judgeval/judges/utils.py +14 -5
- judgeval/judgment_client.py +259 -271
- judgeval/rules.py +169 -142
- judgeval/run_evaluation.py +462 -305
- judgeval/scorers/api_scorer.py +20 -11
- judgeval/scorers/exceptions.py +1 -0
- judgeval/scorers/judgeval_scorer.py +77 -58
- judgeval/scorers/judgeval_scorers/api_scorers/__init__.py +46 -15
- judgeval/scorers/judgeval_scorers/api_scorers/answer_correctness.py +3 -2
- judgeval/scorers/judgeval_scorers/api_scorers/answer_relevancy.py +3 -2
- judgeval/scorers/judgeval_scorers/api_scorers/classifier_scorer.py +12 -11
- judgeval/scorers/judgeval_scorers/api_scorers/comparison.py +7 -5
- judgeval/scorers/judgeval_scorers/api_scorers/contextual_precision.py +3 -2
- judgeval/scorers/judgeval_scorers/api_scorers/contextual_recall.py +3 -2
- judgeval/scorers/judgeval_scorers/api_scorers/contextual_relevancy.py +5 -2
- judgeval/scorers/judgeval_scorers/api_scorers/derailment_scorer.py +2 -1
- judgeval/scorers/judgeval_scorers/api_scorers/execution_order.py +17 -8
- judgeval/scorers/judgeval_scorers/api_scorers/faithfulness.py +3 -2
- judgeval/scorers/judgeval_scorers/api_scorers/groundedness.py +3 -2
- judgeval/scorers/judgeval_scorers/api_scorers/hallucination.py +3 -2
- judgeval/scorers/judgeval_scorers/api_scorers/instruction_adherence.py +3 -2
- judgeval/scorers/judgeval_scorers/api_scorers/json_correctness.py +8 -9
- judgeval/scorers/judgeval_scorers/api_scorers/summarization.py +4 -4
- judgeval/scorers/judgeval_scorers/api_scorers/tool_dependency.py +5 -5
- judgeval/scorers/judgeval_scorers/api_scorers/tool_order.py +5 -2
- judgeval/scorers/judgeval_scorers/classifiers/text2sql/text2sql_scorer.py +9 -10
- judgeval/scorers/prompt_scorer.py +48 -37
- judgeval/scorers/score.py +86 -53
- judgeval/scorers/utils.py +11 -7
- judgeval/tracer/__init__.py +1 -1
- judgeval/utils/alerts.py +23 -12
- judgeval/utils/{data_utils.py → file_utils.py} +5 -9
- judgeval/utils/requests.py +29 -0
- judgeval/version_check.py +5 -2
- {judgeval-0.0.44.dist-info → judgeval-0.0.46.dist-info}/METADATA +79 -135
- judgeval-0.0.46.dist-info/RECORD +69 -0
- judgeval-0.0.44.dist-info/RECORD +0 -68
- {judgeval-0.0.44.dist-info → judgeval-0.0.46.dist-info}/WHEEL +0 -0
- {judgeval-0.0.44.dist-info → judgeval-0.0.46.dist-info}/licenses/LICENSE.md +0 -0
judgeval/scorers/score.py
CHANGED
@@ -2,15 +2,14 @@
 Infrastructure for executing evaluations of `Example`s using one or more `JudgevalScorer`s.
 """
 
-
 import asyncio
-import time
+import time
 from tqdm.asyncio import tqdm_asyncio
 from typing import List, Union, Optional, Callable
 from rich.progress import Progress, SpinnerColumn, TextColumn
 
 from judgeval.data import (
-    Example,
+    Example,
     CustomExample,
     ScoringResult,
     generate_scoring_result,
@@ -22,6 +21,7 @@ from judgeval.common.exceptions import MissingTestCaseParamsError
 from judgeval.common.logger import example_logging_context, debug, error, warning, info
 from judgeval.judges import JudgevalJudge
 
+
 async def safe_a_score_example(
     scorer: JudgevalScorer,
     example: Example,
@@ -35,32 +35,42 @@ async def safe_a_score_example(
     Args:
         scorer (JudgevalScorer): The `JudgevalScorer` to use for scoring the example.
         example (Example): The `Example` to be scored.
-
-        ignore_errors (bool): Whether to ignore errors during the evaluation.
+
+        ignore_errors (bool): Whether to ignore errors during the evaluation.
             If set to false, any error will be raised and stop the evaluation.
             If set to true, the error will be stored in the `error` attribute of the `JudgevalScorer` and the `success` attribute will be set to False.
-
-        skip_on_missing_params (bool): Whether to skip the test case if required parameters are missing.
+
+        skip_on_missing_params (bool): Whether to skip the test case if required parameters are missing.
     """
     debug(f"Starting safe_a_score_example for example {example.example_id}")
     try:
         await scorer.a_score_example(example, _show_indicator=False)
         info(f"Successfully scored example {example.example_id}")
     except MissingTestCaseParamsError as e:
-        if skip_on_missing_params: # Skip the example if the scorer requires parameters that are missing
+        if (
+            skip_on_missing_params
+        ): # Skip the example if the scorer requires parameters that are missing
             with example_logging_context(example.created_at, example.example_id):
-                warning(f"Skipping example {example.example_id} due to missing parameters")
+                warning(
+                    f"Skipping example {example.example_id} due to missing parameters"
+                )
             scorer.skipped = True
             return
         else:
-            if ignore_errors: # Gracefully handle the error, does not stop the evaluation
+            if (
+                ignore_errors
+            ): # Gracefully handle the error, does not stop the evaluation
                 scorer.error = str(e)
                 scorer.success = False
                 with example_logging_context(example.created_at, example.example_id):
-                    warning(f"Ignoring errors for example {example.example_id}: {str(e)} due to missing parameters")
+                    warning(
+                        f"Ignoring errors for example {example.example_id}: {str(e)} due to missing parameters"
+                    )
             else: # Raise the error and stop the evaluation
                 with example_logging_context(example.created_at, example.example_id):
-                    error(f"Stopping example {example.example_id}: {str(e)} due to missing parameters")
+                    error(
+                        f"Stopping example {example.example_id}: {str(e)} due to missing parameters"
+                    )
                 raise
     except TypeError: # in case a_score_example does not accept _show_indicator
         try:
@@ -69,17 +79,27 @@ async def safe_a_score_example(
             if skip_on_missing_params:
                 scorer.skipped = True
                 with example_logging_context(example.created_at, example.example_id):
-                    warning(f"Skipping example {example.example_id} due to missing parameters")
+                    warning(
+                        f"Skipping example {example.example_id} due to missing parameters"
+                    )
                 return
             else:
                 if ignore_errors:
                     scorer.error = str(e)
-                    scorer.success = False
-                    with example_logging_context(example.created_at, example.example_id):
-                        warning(f"Ignoring errors for example {example.example_id}: {str(e)} due to missing parameters")
+                    scorer.success = False
+                    with example_logging_context(
+                        example.created_at, example.example_id
+                    ):
+                        warning(
+                            f"Ignoring errors for example {example.example_id}: {str(e)} due to missing parameters"
+                        )
                 else:
-                    with example_logging_context(example.created_at, example.example_id):
-                        error(f"Stopping example {example.example_id}: {str(e)} due to missing parameters")
+                    with example_logging_context(
+                        example.created_at, example.example_id
+                    ):
+                        error(
+                            f"Stopping example {example.example_id}: {str(e)} due to missing parameters"
+                        )
                     raise
     except Exception as e:
         if ignore_errors:
@@ -121,7 +141,7 @@ async def score_task(
     """
     while not progress.finished:
         start_time = time.perf_counter()
-
+
         try:
             await scorer.a_score_example(example, _show_indicator=False)
             finish_text = "Completed"
@@ -129,7 +149,9 @@ async def score_task(
            if skip_on_missing_params:
                scorer.skipped = True
                with example_logging_context(example.created_at, example.example_id):
-                    debug(f"Skipping example {example.example_id} due to missing parameters")
+                    debug(
+                        f"Skipping example {example.example_id} due to missing parameters"
+                    )
                return
            else:
                if ignore_errors:
@@ -137,8 +159,12 @@ async def score_task(
                    scorer.success = False # Override success
                    finish_text = "Failed"
                else:
-                    with example_logging_context(example.created_at, example.example_id):
-                        error(f"Stopping example {example.example_id}: {str(e)} due to missing parameters")
+                    with example_logging_context(
+                        example.created_at, example.example_id
+                    ):
+                        error(
+                            f"Stopping example {example.example_id}: {str(e)} due to missing parameters"
+                        )
                    raise
        except TypeError:
            try:
@@ -147,8 +173,12 @@ async def score_task(
            except MissingTestCaseParamsError as e:
                if skip_on_missing_params:
                    scorer.skipped = True
-                    with example_logging_context(example.created_at, example.example_id):
-                        debug(f"Skipping example {example.example_id} due to missing parameters")
+                    with example_logging_context(
+                        example.created_at, example.example_id
+                    ):
+                        debug(
+                            f"Skipping example {example.example_id} due to missing parameters"
+                        )
                    return
                else:
                    if ignore_errors:
@@ -156,8 +186,12 @@ async def score_task(
                        scorer.success = False # Override success
                        finish_text = "Failed"
                    else:
-                        with example_logging_context(example.created_at, example.example_id):
-                            error(f"Stopping example {example.example_id}: {str(e)} due to missing parameters")
+                        with example_logging_context(
+                            example.created_at, example.example_id
+                        ):
+                            error(
+                                f"Stopping example {example.example_id}: {str(e)} due to missing parameters"
+                            )
                        raise
        except Exception as e:
            if ignore_errors:
@@ -165,7 +199,9 @@ async def score_task(
                scorer.success = False # Override success
                finish_text = "Failed"
                with example_logging_context(example.created_at, example.example_id):
-                    warning(f"Ignoring errors for example {example.example_id}: {str(e)}")
+                    warning(
+                        f"Ignoring errors for example {example.example_id}: {str(e)}"
+                    )
            else:
                with example_logging_context(example.created_at, example.example_id):
                    error(f"Stopping example {example.example_id}: {str(e)}")
@@ -213,9 +249,7 @@ async def score_with_indicator(
    tasks = []
    for scorer in scorers:
        task_id = progress.add_task(
-            description=scorer_console_msg(
-                scorer, async_mode=True
-            ),
+            description=scorer_console_msg(scorer, async_mode=True),
            total=100,
        ) # Add task to progress bar
        tasks.append(
@@ -231,9 +265,7 @@ async def score_with_indicator(
        await asyncio.gather(*tasks)
    else:
        tasks = [
-            safe_a_score_example(
-                scorer, example, ignore_errors, skip_on_missing_params
-            )
+            safe_a_score_example(scorer, example, ignore_errors, skip_on_missing_params)
            for scorer in scorers
        ]
 
@@ -280,7 +312,7 @@ async def a_execute_scoring(
                return await func(*args, **kwargs)
            except Exception as e:
                print(f"Error executing function: {e}")
-                if kwargs.get('ignore_errors', False):
+                if kwargs.get("ignore_errors", False):
                    # Simply return None when ignoring errors, as expected by the test
                    return None
                # If we're not ignoring errors, propagate the exception
@@ -290,12 +322,13 @@ async def a_execute_scoring(
    for scorer in scorers:
        scorer.verbose_mode = verbose_mode
 
-    # Add model to scorers
+    # Add model to scorers
    for scorer in scorers:
        scorer._add_model(model)
 
    scoring_results: List[ScoringResult] = [None for _ in examples]
    tasks = []
+    cloned_scorers: List[JudgevalScorer]
 
    if show_indicator and _use_bar_indicator:
        with tqdm_asyncio(
@@ -311,18 +344,16 @@ async def a_execute_scoring(
                debug(f"Using {len(scorers)} scorers")
                for scorer in scorers:
                    debug(f"Using scorer: {type(scorer).__name__}")
-                    if hasattr(scorer, 'threshold'):
+                    if hasattr(scorer, "threshold"):
                        debug(f"Scorer threshold: {scorer.threshold}")
-                    if hasattr(scorer, 'model'):
+                    if hasattr(scorer, "model"):
                        debug(f"Scorer model: {type(scorer.model).__name__}")
                if isinstance(ex, Example) or isinstance(ex, CustomExample):
                    if len(scorers) == 0:
                        pbar.update(1)
                        continue
-
-                    cloned_scorers = clone_scorers(
-                        scorers
-                    )
+
+                    cloned_scorers = clone_scorers(scorers)
                    task = execute_with_semaphore(
                        func=a_eval_examples_helper,
                        scorers=cloned_scorers,
@@ -345,9 +376,7 @@ async def a_execute_scoring(
            if len(scorers) == 0:
                continue
 
-            cloned_scorers = clone_scorers(
-                scorers
-            )
+            cloned_scorers = clone_scorers(scorers)
            task = execute_with_semaphore(
                func=a_eval_examples_helper,
                scorers=cloned_scorers,
@@ -376,10 +405,10 @@ async def a_eval_examples_helper(
    show_indicator: bool,
    _use_bar_indicator: bool,
    pbar: Optional[tqdm_asyncio] = None,
-) -> None:
+) -> None:
    """
    Evaluate a single example asynchronously using a list of scorers.
-
+
    Args:
        scorers (List[JudgevalScorer]): List of JudgevalScorer objects to evaluate the example.
        example (Example): The example to be evaluated.
@@ -410,23 +439,27 @@ async def a_eval_examples_helper(
        show_indicator=show_metrics_indicator,
    ) # execute the scoring functions of each scorer on the example
 
-    # Now that all the scoring functions of each scorer have executed, we collect
+    # Now that all the scoring functions of each scorer have executed, we collect
    # the results and update the ScoringResult with the scorer data
    success = True
    scorer_data_list = []
    for scorer in scorers:
        # At this point, the scorer has been executed and already contains data.
-        if getattr(scorer, 'skipped', False):
+        if getattr(scorer, "skipped", False):
            continue
-        scorer_data = create_scorer_data(scorer) # Fetch scorer data from completed scorer evaluation
+        scorer_data = create_scorer_data(
+            scorer
+        ) # Fetch scorer data from completed scorer evaluation
        success = success and scorer_data.success
        scorer_data_list.append(scorer_data)
-
+
    scoring_end_time = time.perf_counter()
    run_duration = scoring_end_time - scoring_start_time
-
-    scoring_result = generate_scoring_result(example, scorer_data_list, run_duration, success)
+
+    scoring_result = generate_scoring_result(
+        example, scorer_data_list, run_duration, success
+    )
    scoring_results[score_index] = scoring_result
-
+
    if pbar is not None:
        pbar.update(1)
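Most of the score.py hunks above are formatter rewraps; the behavior they touch is easier to see outside diff form. The sketch below illustrates the general pattern that a_execute_scoring and safe_a_score_example follow: bound concurrency with an asyncio.Semaphore, give each example its own copies of the scorers, and either record or re-raise per-scorer errors depending on an ignore_errors flag. It is a minimal illustration only; ToyScorer, run_scorer, and score_all are hypothetical names, not judgeval APIs.

import asyncio
import copy
from dataclasses import dataclass
from typing import List, Optional


@dataclass
class ToyScorer:  # hypothetical stand-in for a JudgevalScorer
    name: str
    error: Optional[str] = None
    success: bool = True


async def run_scorer(scorer: ToyScorer, example: str, ignore_errors: bool) -> None:
    try:
        await asyncio.sleep(0.01)  # pretend to score the example
        if example == "bad":
            raise ValueError("missing params")
    except Exception as exc:
        if ignore_errors:
            scorer.error = str(exc)  # record the failure and keep going
            scorer.success = False
        else:
            raise  # stop the whole evaluation run


async def score_all(
    examples: List[str],
    scorers: List[ToyScorer],
    max_concurrent: int = 10,
    ignore_errors: bool = True,
):
    semaphore = asyncio.Semaphore(max_concurrent)

    async def bounded(example: str, cloned: List[ToyScorer]):
        async with semaphore:  # cap how many examples are scored at once
            await asyncio.gather(
                *(run_scorer(s, example, ignore_errors) for s in cloned)
            )
            return example, cloned

    # Each example gets its own scorer copies so per-example state never collides.
    tasks = [bounded(ex, copy.deepcopy(scorers)) for ex in examples]
    return await asyncio.gather(*tasks)


if __name__ == "__main__":
    results = asyncio.run(score_all(["good", "bad"], [ToyScorer("faithfulness")]))
    for example, cloned in results:
        print(example, [(s.name, s.success, s.error) for s in cloned])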
judgeval/scorers/utils.py
CHANGED
@@ -83,7 +83,9 @@ def scorer_progress_meter(
        yield
 
 
-def parse_response_json(llm_response: str, scorer: Optional[JudgevalScorer] = None) -> dict:
+def parse_response_json(
+    llm_response: str, scorer: Optional[JudgevalScorer] = None
+) -> dict:
    """
    Extracts JSON output from an LLM response and returns it as a dictionary.
 
@@ -100,8 +102,12 @@ def parse_response_json(llm_response: str, scorer: Optional[JudgevalScorer] = None) -> dict:
        llm_response = llm_response + "}"
        end = len(llm_response)
 
-    json_str = llm_response[start:end] if start != -1 and end != 0 else "" # extract the JSON string
-    json_str = re.sub(r",\s*([\]}])", r"\1", json_str) # Remove trailing comma if present
+    json_str = (
+        llm_response[start:end] if start != -1 and end != 0 else ""
+    ) # extract the JSON string
+    json_str = re.sub(
+        r",\s*([\]}])", r"\1", json_str
+    ) # Remove trailing comma if present
 
    try:
        return json.loads(json_str)
@@ -131,7 +137,7 @@ def create_verbose_logs(metric: JudgevalScorer, steps: List[str]) -> str:
    Args:
        metric (JudgevalScorer): The scorer object.
        steps (List[str]): The steps to be included in the verbose logs.
-
+
    Returns:
        str: The verbose logs (Concatenated steps).
    """
@@ -157,7 +163,7 @@ def get_or_create_event_loop() -> asyncio.AbstractEventLoop:
 
    Returns:
        asyncio.AbstractEventLoop: The current or newly created event loop.
-
+
    Raises:
        RuntimeError: If the event loop is closed.
    """
@@ -205,5 +211,3 @@ def check_example_params(
        error_str = f"{missing_params_str} fields in example cannot be None for the '{scorer.__name__}' scorer"
        scorer.error = error_str
        raise MissingExampleParamsError(error_str)
-
-
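The parse_response_json hunk above only rewraps two statements, but the recovery step they implement is easy to miss in diff form: slice the text between the outermost braces, strip any trailing comma before a closing bracket, then call json.loads. The standalone sketch below reproduces that step under one assumption: the hunk does not show how start and end are computed in the helper, so the find/rfind calls here are a guess.

import json
import re


def extract_json(llm_response: str) -> dict:
    """Best-effort JSON recovery, mirroring the steps visible in the diff above."""
    start = llm_response.find("{")
    end = llm_response.rfind("}") + 1
    json_str = llm_response[start:end] if start != -1 and end != 0 else ""
    # Remove a trailing comma before ] or } (same regex as in parse_response_json).
    json_str = re.sub(r",\s*([\]}])", r"\1", json_str)
    return json.loads(json_str)


print(extract_json('Here you go: {"score": 0.9, "reasons": ["ok",],}'))
# {'score': 0.9, 'reasons': ['ok']}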
judgeval/tracer/__init__.py
CHANGED
judgeval/utils/alerts.py
CHANGED
@@ -1,19 +1,23 @@
 """
 Handling alerts in Judgeval.
 """
+
 from enum import Enum
 from typing import Dict, Any, List, Optional
 from pydantic import BaseModel
 
+
 class AlertStatus(str, Enum):
     """Status of an alert evaluation."""
+
     TRIGGERED = "triggered"
     NOT_TRIGGERED = "not_triggered"
 
+
 class AlertResult(BaseModel):
     """
     Result of a rule evaluation.
-
+
     Attributes:
         rule_name: Name of the rule that was evaluated
         rule_id: Unique identifier of the rule
@@ -25,26 +29,29 @@ class AlertResult(BaseModel):
        project_id: Optional project identifier
        trace_span_id: Optional trace span identifier
    """
+
    rule_name: str
    rule_id: Optional[str] = None # The unique identifier of the rule
    status: AlertStatus
    conditions_result: List[Dict[str, Any]] = []
    metadata: Dict[str, Any] = {}
-    notification: Optional[Any] = None # NotificationConfig when triggered, None otherwise
+    notification: Optional[Any] = (
+        None # NotificationConfig when triggered, None otherwise
+    )
    combine_type: Optional[str] = None # "all" or "any"
    project_id: Optional[str] = None # Project identifier
    trace_span_id: Optional[str] = None # Trace span identifier
-
+
    @property
    def example_id(self) -> Optional[str]:
        """Get example_id from metadata for backward compatibility"""
        return self.metadata.get("example_id")
-
+
    @property
    def timestamp(self) -> Optional[str]:
        """Get timestamp from metadata for backward compatibility"""
        return self.metadata.get("timestamp")
-
+
    @property
    def conditions_results(self) -> List[Dict[str, Any]]:
        """Backwards compatibility property for the conditions_result field"""
@@ -53,15 +60,19 @@ class AlertResult(BaseModel):
    def model_dump(self, **kwargs):
        """
        Convert the AlertResult to a dictionary for JSON serialization.
-
+
        Args:
            **kwargs: Additional arguments to pass to Pydantic's model_dump
-
+
        Returns:
            dict: Dictionary representation of the AlertResult
        """
-        data = super().model_dump(**kwargs) if hasattr(super(), "model_dump") else super().dict(**kwargs)
-
+        data = (
+            super().model_dump(**kwargs)
+            if hasattr(super(), "model_dump")
+            else super().dict(**kwargs)
+        )
+
        # Handle the NotificationConfig object if it exists
        if hasattr(self, "notification") and self.notification is not None:
            if hasattr(self.notification, "model_dump"):
@@ -76,7 +87,7 @@ class AlertResult(BaseModel):
                    "communication_methods": notif.communication_methods,
                    "email_addresses": notif.email_addresses,
                    "slack_channels": getattr(notif, "slack_channels", []),
-                    "send_at": notif.send_at
+                    "send_at": notif.send_at,
                }
-
-        return data
+
+        return data
judgeval/utils/{data_utils.py → file_utils.py}
RENAMED
@@ -1,15 +1,11 @@
 import yaml
-from judgeval.common.logger import (
-    debug,
-    info,
-    error,
-    example_logging_context
-)
+from typing import List
+from judgeval.common.logger import debug, info, error
 
 from judgeval.data import Example
 
 
-def add_from_yaml(file_path: str) -> None:
+def get_examples_from_yaml(file_path: str) -> List[Example] | None:
    debug(f"Loading dataset from YAML file: {file_path}")
    """
    Adds examples from a YAML file.
@@ -51,7 +47,7 @@ def add_from_yaml(file_path: str) -> None:
    except yaml.YAMLError:
        error(f"Invalid YAML file: {file_path}")
        raise ValueError(f"The file {file_path} is not a valid YAML file.")
-
+
    info(f"Added {len(examples)} examples from YAML")
    new_examples = [Example(**e) for e in examples]
-    return new_examples
+    return new_examples
judgeval/utils/requests.py
ADDED
@@ -0,0 +1,29 @@
+import requests as requests_original
+from requests.adapters import HTTPAdapter
+from urllib3.util.retry import Retry
+from http import HTTPStatus
+
+
+class RetrySession(requests_original.Session):
+    def __init__(
+        self,
+        retries=3,
+        backoff_factor=0.5,
+        status_forcelist=[HTTPStatus.BAD_GATEWAY, HTTPStatus.SERVICE_UNAVAILABLE],
+    ):
+        super().__init__()
+
+        retry_strategy = Retry(
+            total=retries,
+            read=retries,
+            connect=retries,
+            backoff_factor=backoff_factor,
+            status_forcelist=status_forcelist,
+        )
+
+        adapter = HTTPAdapter(max_retries=retry_strategy)
+        self.mount("http://", adapter)
+        self.mount("https://", adapter)
+
+
+requests = RetrySession()
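The hunk above adds judgeval/utils/requests.py: a requests.Session subclass that mounts an HTTPAdapter configured with urllib3 Retry, plus a module-level requests instance that other modules import in place of the stock library (as version_check.py does below). A brief usage sketch, with a placeholder URL and retry settings that simply restate the defaults visible in the diff:

from judgeval.utils.requests import requests  # module-level RetrySession instance

# Drop-in for `import requests`: same .get/.post API, but connection/read errors
# and 502/503 responses are retried up to 3 times with a 0.5 backoff factor.
response = requests.get("https://api.example.com/health", timeout=2)
response.raise_for_status()
print(response.status_code)

Because RetrySession subclasses requests_original.Session, existing call sites keep the same API and only need the one-line import swap shown in the version_check.py hunk that follows.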
judgeval/version_check.py
CHANGED
@@ -1,12 +1,15 @@
 import importlib.metadata
-import requests
+from judgeval.utils.requests import requests
 import threading
 
+
 def check_latest_version(package_name: str = "judgeval"):
     def _check():
         try:
             current_version = importlib.metadata.version(package_name)
-            response = requests.get(f"https://pypi.org/pypi/{package_name}/json", timeout=2)
+            response = requests.get(
+                f"https://pypi.org/pypi/{package_name}/json", timeout=2
+            )
             latest_version = response.json()["info"]["version"]
 
             if current_version != latest_version:
|