deepeval 3.7.0__tar.gz → 3.7.2__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- {deepeval-3.7.0 → deepeval-3.7.2}/PKG-INFO +3 -2
- {deepeval-3.7.0 → deepeval-3.7.2}/README.md +1 -0
- {deepeval-3.7.0 → deepeval-3.7.2}/deepeval/__init__.py +0 -4
- deepeval-3.7.2/deepeval/_version.py +1 -0
- {deepeval-3.7.0 → deepeval-3.7.2}/deepeval/cli/main.py +7 -0
- {deepeval-3.7.0 → deepeval-3.7.2}/deepeval/confident/api.py +6 -1
- {deepeval-3.7.0 → deepeval-3.7.2}/deepeval/config/settings.py +5 -0
- {deepeval-3.7.0 → deepeval-3.7.2}/deepeval/evaluate/compare.py +219 -4
- {deepeval-3.7.0 → deepeval-3.7.2}/deepeval/evaluate/types.py +6 -0
- {deepeval-3.7.0 → deepeval-3.7.2}/deepeval/evaluate/utils.py +30 -0
- {deepeval-3.7.0 → deepeval-3.7.2}/deepeval/key_handler.py +1 -0
- {deepeval-3.7.0 → deepeval-3.7.2}/deepeval/metrics/arena_g_eval/arena_g_eval.py +5 -1
- {deepeval-3.7.0 → deepeval-3.7.2}/deepeval/metrics/arena_g_eval/utils.py +5 -5
- {deepeval-3.7.0 → deepeval-3.7.2}/deepeval/metrics/conversational_g_eval/conversational_g_eval.py +9 -18
- {deepeval-3.7.0 → deepeval-3.7.2}/deepeval/metrics/g_eval/g_eval.py +5 -1
- {deepeval-3.7.0 → deepeval-3.7.2}/deepeval/metrics/g_eval/utils.py +1 -1
- {deepeval-3.7.0 → deepeval-3.7.2}/deepeval/metrics/multimodal_metrics/multimodal_g_eval/multimodal_g_eval.py +5 -1
- {deepeval-3.7.0 → deepeval-3.7.2}/deepeval/metrics/utils.py +1 -1
- {deepeval-3.7.0 → deepeval-3.7.2}/deepeval/models/llms/gemini_model.py +27 -5
- {deepeval-3.7.0 → deepeval-3.7.2}/deepeval/openai_agents/callback_handler.py +12 -3
- {deepeval-3.7.0 → deepeval-3.7.2}/deepeval/prompt/prompt.py +25 -14
- {deepeval-3.7.0 → deepeval-3.7.2}/deepeval/simulator/template.py +1 -1
- {deepeval-3.7.0 → deepeval-3.7.2}/deepeval/synthesizer/config.py +9 -0
- {deepeval-3.7.0 → deepeval-3.7.2}/deepeval/synthesizer/schema.py +23 -0
- {deepeval-3.7.0 → deepeval-3.7.2}/deepeval/synthesizer/synthesizer.py +1137 -2
- deepeval-3.7.2/deepeval/synthesizer/templates/__init__.py +12 -0
- {deepeval-3.7.0 → deepeval-3.7.2}/deepeval/synthesizer/templates/template.py +554 -1
- {deepeval-3.7.0 → deepeval-3.7.2}/deepeval/synthesizer/templates/template_extraction.py +32 -0
- {deepeval-3.7.0 → deepeval-3.7.2}/deepeval/synthesizer/templates/template_prompt.py +262 -0
- {deepeval-3.7.0 → deepeval-3.7.2}/deepeval/test_case/__init__.py +2 -1
- {deepeval-3.7.0 → deepeval-3.7.2}/deepeval/test_case/arena_test_case.py +15 -4
- {deepeval-3.7.0 → deepeval-3.7.2}/deepeval/test_case/mllm_test_case.py +45 -22
- {deepeval-3.7.0 → deepeval-3.7.2}/deepeval/test_run/cache.py +31 -10
- {deepeval-3.7.0 → deepeval-3.7.2}/deepeval/test_run/hyperparameters.py +5 -1
- {deepeval-3.7.0 → deepeval-3.7.2}/deepeval/test_run/test_run.py +28 -9
- {deepeval-3.7.0 → deepeval-3.7.2}/deepeval/tracing/tracing.py +1 -1
- {deepeval-3.7.0 → deepeval-3.7.2}/deepeval/utils.py +4 -0
- {deepeval-3.7.0 → deepeval-3.7.2}/pyproject.toml +3 -3
- deepeval-3.7.0/deepeval/_version.py +0 -1
- deepeval-3.7.0/deepeval/synthesizer/templates/__init__.py +0 -3
- {deepeval-3.7.0 → deepeval-3.7.2}/LICENSE.md +0 -0
- {deepeval-3.7.0 → deepeval-3.7.2}/deepeval/annotation/__init__.py +0 -0
- {deepeval-3.7.0 → deepeval-3.7.2}/deepeval/annotation/annotation.py +0 -0
- {deepeval-3.7.0 → deepeval-3.7.2}/deepeval/annotation/api.py +0 -0
- {deepeval-3.7.0 → deepeval-3.7.2}/deepeval/anthropic/__init__.py +0 -0
- {deepeval-3.7.0 → deepeval-3.7.2}/deepeval/anthropic/extractors.py +0 -0
- {deepeval-3.7.0 → deepeval-3.7.2}/deepeval/anthropic/patch.py +0 -0
- {deepeval-3.7.0 → deepeval-3.7.2}/deepeval/anthropic/utils.py +0 -0
- {deepeval-3.7.0 → deepeval-3.7.2}/deepeval/benchmarks/__init__.py +0 -0
- {deepeval-3.7.0 → deepeval-3.7.2}/deepeval/benchmarks/arc/__init__.py +0 -0
- {deepeval-3.7.0 → deepeval-3.7.2}/deepeval/benchmarks/arc/arc.py +0 -0
- {deepeval-3.7.0 → deepeval-3.7.2}/deepeval/benchmarks/arc/mode.py +0 -0
- {deepeval-3.7.0 → deepeval-3.7.2}/deepeval/benchmarks/arc/template.py +0 -0
- {deepeval-3.7.0 → deepeval-3.7.2}/deepeval/benchmarks/base_benchmark.py +0 -0
- {deepeval-3.7.0 → deepeval-3.7.2}/deepeval/benchmarks/bbq/__init__.py +0 -0
- {deepeval-3.7.0 → deepeval-3.7.2}/deepeval/benchmarks/bbq/bbq.py +0 -0
- {deepeval-3.7.0 → deepeval-3.7.2}/deepeval/benchmarks/bbq/task.py +0 -0
- {deepeval-3.7.0 → deepeval-3.7.2}/deepeval/benchmarks/bbq/template.py +0 -0
- {deepeval-3.7.0 → deepeval-3.7.2}/deepeval/benchmarks/big_bench_hard/__init__.py +0 -0
- {deepeval-3.7.0 → deepeval-3.7.2}/deepeval/benchmarks/big_bench_hard/big_bench_hard.py +0 -0
- {deepeval-3.7.0 → deepeval-3.7.2}/deepeval/benchmarks/big_bench_hard/cot_prompts/__init__.py +0 -0
- {deepeval-3.7.0 → deepeval-3.7.2}/deepeval/benchmarks/big_bench_hard/cot_prompts/boolean_expressions.txt +0 -0
- {deepeval-3.7.0 → deepeval-3.7.2}/deepeval/benchmarks/big_bench_hard/cot_prompts/causal_judgement.txt +0 -0
- {deepeval-3.7.0 → deepeval-3.7.2}/deepeval/benchmarks/big_bench_hard/cot_prompts/date_understanding.txt +0 -0
- {deepeval-3.7.0 → deepeval-3.7.2}/deepeval/benchmarks/big_bench_hard/cot_prompts/disambiguation_qa.txt +0 -0
- {deepeval-3.7.0 → deepeval-3.7.2}/deepeval/benchmarks/big_bench_hard/cot_prompts/dyck_languages.txt +0 -0
- {deepeval-3.7.0 → deepeval-3.7.2}/deepeval/benchmarks/big_bench_hard/cot_prompts/formal_fallacies.txt +0 -0
- {deepeval-3.7.0 → deepeval-3.7.2}/deepeval/benchmarks/big_bench_hard/cot_prompts/geometric_shapes.txt +0 -0
- {deepeval-3.7.0 → deepeval-3.7.2}/deepeval/benchmarks/big_bench_hard/cot_prompts/hyperbaton.txt +0 -0
- {deepeval-3.7.0 → deepeval-3.7.2}/deepeval/benchmarks/big_bench_hard/cot_prompts/logical_deduction_five_objects.txt +0 -0
- {deepeval-3.7.0 → deepeval-3.7.2}/deepeval/benchmarks/big_bench_hard/cot_prompts/logical_deduction_seven_objects.txt +0 -0
- {deepeval-3.7.0 → deepeval-3.7.2}/deepeval/benchmarks/big_bench_hard/cot_prompts/logical_deduction_three_objects.txt +0 -0
- {deepeval-3.7.0 → deepeval-3.7.2}/deepeval/benchmarks/big_bench_hard/cot_prompts/movie_recommendation.txt +0 -0
- {deepeval-3.7.0 → deepeval-3.7.2}/deepeval/benchmarks/big_bench_hard/cot_prompts/multistep_arithmetic_two.txt +0 -0
- {deepeval-3.7.0 → deepeval-3.7.2}/deepeval/benchmarks/big_bench_hard/cot_prompts/navigate.txt +0 -0
- {deepeval-3.7.0 → deepeval-3.7.2}/deepeval/benchmarks/big_bench_hard/cot_prompts/object_counting.txt +0 -0
- {deepeval-3.7.0 → deepeval-3.7.2}/deepeval/benchmarks/big_bench_hard/cot_prompts/penguins_in_a_table.txt +0 -0
- {deepeval-3.7.0 → deepeval-3.7.2}/deepeval/benchmarks/big_bench_hard/cot_prompts/reasoning_about_colored_objects.txt +0 -0
- {deepeval-3.7.0 → deepeval-3.7.2}/deepeval/benchmarks/big_bench_hard/cot_prompts/ruin_names.txt +0 -0
- {deepeval-3.7.0 → deepeval-3.7.2}/deepeval/benchmarks/big_bench_hard/cot_prompts/salient_translation_error_detection.txt +0 -0
- {deepeval-3.7.0 → deepeval-3.7.2}/deepeval/benchmarks/big_bench_hard/cot_prompts/snarks.txt +0 -0
- {deepeval-3.7.0 → deepeval-3.7.2}/deepeval/benchmarks/big_bench_hard/cot_prompts/sports_understanding.txt +0 -0
- {deepeval-3.7.0 → deepeval-3.7.2}/deepeval/benchmarks/big_bench_hard/cot_prompts/temporal_sequences.txt +0 -0
- {deepeval-3.7.0 → deepeval-3.7.2}/deepeval/benchmarks/big_bench_hard/cot_prompts/tracking_shuffled_objects_five_objects.txt +0 -0
- {deepeval-3.7.0 → deepeval-3.7.2}/deepeval/benchmarks/big_bench_hard/cot_prompts/tracking_shuffled_objects_seven_objects.txt +0 -0
- {deepeval-3.7.0 → deepeval-3.7.2}/deepeval/benchmarks/big_bench_hard/cot_prompts/tracking_shuffled_objects_three_objects.txt +0 -0
- {deepeval-3.7.0 → deepeval-3.7.2}/deepeval/benchmarks/big_bench_hard/cot_prompts/web_of_lies.txt +0 -0
- {deepeval-3.7.0 → deepeval-3.7.2}/deepeval/benchmarks/big_bench_hard/cot_prompts/word_sorting.txt +0 -0
- {deepeval-3.7.0 → deepeval-3.7.2}/deepeval/benchmarks/big_bench_hard/shot_prompts/__init__.py +0 -0
- {deepeval-3.7.0 → deepeval-3.7.2}/deepeval/benchmarks/big_bench_hard/shot_prompts/boolean_expressions.txt +0 -0
- {deepeval-3.7.0 → deepeval-3.7.2}/deepeval/benchmarks/big_bench_hard/shot_prompts/causal_judgement.txt +0 -0
- {deepeval-3.7.0 → deepeval-3.7.2}/deepeval/benchmarks/big_bench_hard/shot_prompts/date_understanding.txt +0 -0
- {deepeval-3.7.0 → deepeval-3.7.2}/deepeval/benchmarks/big_bench_hard/shot_prompts/disambiguation_qa.txt +0 -0
- {deepeval-3.7.0 → deepeval-3.7.2}/deepeval/benchmarks/big_bench_hard/shot_prompts/dyck_languages.txt +0 -0
- {deepeval-3.7.0 → deepeval-3.7.2}/deepeval/benchmarks/big_bench_hard/shot_prompts/formal_fallacies.txt +0 -0
- {deepeval-3.7.0 → deepeval-3.7.2}/deepeval/benchmarks/big_bench_hard/shot_prompts/geometric_shapes.txt +0 -0
- {deepeval-3.7.0 → deepeval-3.7.2}/deepeval/benchmarks/big_bench_hard/shot_prompts/hyperbaton.txt +0 -0
- {deepeval-3.7.0 → deepeval-3.7.2}/deepeval/benchmarks/big_bench_hard/shot_prompts/logical_deduction_five_objects.txt +0 -0
- {deepeval-3.7.0 → deepeval-3.7.2}/deepeval/benchmarks/big_bench_hard/shot_prompts/logical_deduction_seven_objects.txt +0 -0
- {deepeval-3.7.0 → deepeval-3.7.2}/deepeval/benchmarks/big_bench_hard/shot_prompts/logical_deduction_three_objects.txt +0 -0
- {deepeval-3.7.0 → deepeval-3.7.2}/deepeval/benchmarks/big_bench_hard/shot_prompts/movie_recommendation.txt +0 -0
- {deepeval-3.7.0 → deepeval-3.7.2}/deepeval/benchmarks/big_bench_hard/shot_prompts/multistep_arithmetic_two.txt +0 -0
- {deepeval-3.7.0 → deepeval-3.7.2}/deepeval/benchmarks/big_bench_hard/shot_prompts/navigate.txt +0 -0
- {deepeval-3.7.0 → deepeval-3.7.2}/deepeval/benchmarks/big_bench_hard/shot_prompts/object_counting.txt +0 -0
- {deepeval-3.7.0 → deepeval-3.7.2}/deepeval/benchmarks/big_bench_hard/shot_prompts/penguins_in_a_table.txt +0 -0
- {deepeval-3.7.0 → deepeval-3.7.2}/deepeval/benchmarks/big_bench_hard/shot_prompts/reasoning_about_colored_objects.txt +0 -0
- {deepeval-3.7.0 → deepeval-3.7.2}/deepeval/benchmarks/big_bench_hard/shot_prompts/ruin_names.txt +0 -0
- {deepeval-3.7.0 → deepeval-3.7.2}/deepeval/benchmarks/big_bench_hard/shot_prompts/salient_translation_error_detection.txt +0 -0
- {deepeval-3.7.0 → deepeval-3.7.2}/deepeval/benchmarks/big_bench_hard/shot_prompts/snarks.txt +0 -0
- {deepeval-3.7.0 → deepeval-3.7.2}/deepeval/benchmarks/big_bench_hard/shot_prompts/sports_understanding.txt +0 -0
- {deepeval-3.7.0 → deepeval-3.7.2}/deepeval/benchmarks/big_bench_hard/shot_prompts/temporal_sequences.txt +0 -0
- {deepeval-3.7.0 → deepeval-3.7.2}/deepeval/benchmarks/big_bench_hard/shot_prompts/tracking_shuffled_objects_five_objects.txt +0 -0
- {deepeval-3.7.0 → deepeval-3.7.2}/deepeval/benchmarks/big_bench_hard/shot_prompts/tracking_shuffled_objects_seven_objects.txt +0 -0
- {deepeval-3.7.0 → deepeval-3.7.2}/deepeval/benchmarks/big_bench_hard/shot_prompts/tracking_shuffled_objects_three_objects.txt +0 -0
- {deepeval-3.7.0 → deepeval-3.7.2}/deepeval/benchmarks/big_bench_hard/shot_prompts/web_of_lies.txt +0 -0
- {deepeval-3.7.0 → deepeval-3.7.2}/deepeval/benchmarks/big_bench_hard/shot_prompts/word_sorting.txt +0 -0
- {deepeval-3.7.0 → deepeval-3.7.2}/deepeval/benchmarks/big_bench_hard/task.py +0 -0
- {deepeval-3.7.0 → deepeval-3.7.2}/deepeval/benchmarks/big_bench_hard/template.py +0 -0
- {deepeval-3.7.0 → deepeval-3.7.2}/deepeval/benchmarks/bool_q/__init__.py +0 -0
- {deepeval-3.7.0 → deepeval-3.7.2}/deepeval/benchmarks/bool_q/bool_q.py +0 -0
- {deepeval-3.7.0 → deepeval-3.7.2}/deepeval/benchmarks/bool_q/template.py +0 -0
- {deepeval-3.7.0 → deepeval-3.7.2}/deepeval/benchmarks/drop/__init__.py +0 -0
- {deepeval-3.7.0 → deepeval-3.7.2}/deepeval/benchmarks/drop/drop.py +0 -0
- {deepeval-3.7.0 → deepeval-3.7.2}/deepeval/benchmarks/drop/task.py +0 -0
- {deepeval-3.7.0 → deepeval-3.7.2}/deepeval/benchmarks/drop/template.py +0 -0
- {deepeval-3.7.0 → deepeval-3.7.2}/deepeval/benchmarks/equity_med_qa/__init__.py +0 -0
- {deepeval-3.7.0 → deepeval-3.7.2}/deepeval/benchmarks/equity_med_qa/equity_med_qa.py +0 -0
- {deepeval-3.7.0 → deepeval-3.7.2}/deepeval/benchmarks/equity_med_qa/task.py +0 -0
- {deepeval-3.7.0 → deepeval-3.7.2}/deepeval/benchmarks/equity_med_qa/template.py +0 -0
- {deepeval-3.7.0 → deepeval-3.7.2}/deepeval/benchmarks/gsm8k/__init__.py +0 -0
- {deepeval-3.7.0 → deepeval-3.7.2}/deepeval/benchmarks/gsm8k/gsm8k.py +0 -0
- {deepeval-3.7.0 → deepeval-3.7.2}/deepeval/benchmarks/gsm8k/template.py +0 -0
- {deepeval-3.7.0 → deepeval-3.7.2}/deepeval/benchmarks/hellaswag/__init__.py +0 -0
- {deepeval-3.7.0 → deepeval-3.7.2}/deepeval/benchmarks/hellaswag/hellaswag.py +0 -0
- {deepeval-3.7.0 → deepeval-3.7.2}/deepeval/benchmarks/hellaswag/task.py +0 -0
- {deepeval-3.7.0 → deepeval-3.7.2}/deepeval/benchmarks/hellaswag/template.py +0 -0
- {deepeval-3.7.0 → deepeval-3.7.2}/deepeval/benchmarks/human_eval/__init__.py +0 -0
- {deepeval-3.7.0 → deepeval-3.7.2}/deepeval/benchmarks/human_eval/human_eval.py +0 -0
- {deepeval-3.7.0 → deepeval-3.7.2}/deepeval/benchmarks/human_eval/task.py +0 -0
- {deepeval-3.7.0 → deepeval-3.7.2}/deepeval/benchmarks/human_eval/template.py +0 -0
- {deepeval-3.7.0 → deepeval-3.7.2}/deepeval/benchmarks/ifeval/__init__.py +0 -0
- {deepeval-3.7.0 → deepeval-3.7.2}/deepeval/benchmarks/ifeval/ifeval.py +0 -0
- {deepeval-3.7.0 → deepeval-3.7.2}/deepeval/benchmarks/ifeval/template.py +0 -0
- {deepeval-3.7.0 → deepeval-3.7.2}/deepeval/benchmarks/lambada/__init__.py +0 -0
- {deepeval-3.7.0 → deepeval-3.7.2}/deepeval/benchmarks/lambada/lambada.py +0 -0
- {deepeval-3.7.0 → deepeval-3.7.2}/deepeval/benchmarks/lambada/template.py +0 -0
- {deepeval-3.7.0 → deepeval-3.7.2}/deepeval/benchmarks/logi_qa/__init__.py +0 -0
- {deepeval-3.7.0 → deepeval-3.7.2}/deepeval/benchmarks/logi_qa/logi_qa.py +0 -0
- {deepeval-3.7.0 → deepeval-3.7.2}/deepeval/benchmarks/logi_qa/task.py +0 -0
- {deepeval-3.7.0 → deepeval-3.7.2}/deepeval/benchmarks/logi_qa/template.py +0 -0
- {deepeval-3.7.0 → deepeval-3.7.2}/deepeval/benchmarks/math_qa/__init__.py +0 -0
- {deepeval-3.7.0 → deepeval-3.7.2}/deepeval/benchmarks/math_qa/math_qa.py +0 -0
- {deepeval-3.7.0 → deepeval-3.7.2}/deepeval/benchmarks/math_qa/task.py +0 -0
- {deepeval-3.7.0 → deepeval-3.7.2}/deepeval/benchmarks/math_qa/template.py +0 -0
- {deepeval-3.7.0 → deepeval-3.7.2}/deepeval/benchmarks/mmlu/__init__.py +0 -0
- {deepeval-3.7.0 → deepeval-3.7.2}/deepeval/benchmarks/mmlu/mmlu.py +0 -0
- {deepeval-3.7.0 → deepeval-3.7.2}/deepeval/benchmarks/mmlu/task.py +0 -0
- {deepeval-3.7.0 → deepeval-3.7.2}/deepeval/benchmarks/mmlu/template.py +0 -0
- {deepeval-3.7.0 → deepeval-3.7.2}/deepeval/benchmarks/modes/__init__.py +0 -0
- {deepeval-3.7.0 → deepeval-3.7.2}/deepeval/benchmarks/results.py +0 -0
- {deepeval-3.7.0 → deepeval-3.7.2}/deepeval/benchmarks/schema.py +0 -0
- {deepeval-3.7.0 → deepeval-3.7.2}/deepeval/benchmarks/squad/__init__.py +0 -0
- {deepeval-3.7.0 → deepeval-3.7.2}/deepeval/benchmarks/squad/squad.py +0 -0
- {deepeval-3.7.0 → deepeval-3.7.2}/deepeval/benchmarks/squad/task.py +0 -0
- {deepeval-3.7.0 → deepeval-3.7.2}/deepeval/benchmarks/squad/template.py +0 -0
- {deepeval-3.7.0 → deepeval-3.7.2}/deepeval/benchmarks/tasks/__init__.py +0 -0
- {deepeval-3.7.0 → deepeval-3.7.2}/deepeval/benchmarks/truthful_qa/__init__.py +0 -0
- {deepeval-3.7.0 → deepeval-3.7.2}/deepeval/benchmarks/truthful_qa/mode.py +0 -0
- {deepeval-3.7.0 → deepeval-3.7.2}/deepeval/benchmarks/truthful_qa/task.py +0 -0
- {deepeval-3.7.0 → deepeval-3.7.2}/deepeval/benchmarks/truthful_qa/template.py +0 -0
- {deepeval-3.7.0 → deepeval-3.7.2}/deepeval/benchmarks/truthful_qa/truthful_qa.py +0 -0
- {deepeval-3.7.0 → deepeval-3.7.2}/deepeval/benchmarks/utils.py +0 -0
- {deepeval-3.7.0 → deepeval-3.7.2}/deepeval/benchmarks/winogrande/__init__.py +0 -0
- {deepeval-3.7.0 → deepeval-3.7.2}/deepeval/benchmarks/winogrande/template.py +0 -0
- {deepeval-3.7.0 → deepeval-3.7.2}/deepeval/benchmarks/winogrande/winogrande.py +0 -0
- {deepeval-3.7.0 → deepeval-3.7.2}/deepeval/cli/__init__.py +0 -0
- {deepeval-3.7.0 → deepeval-3.7.2}/deepeval/cli/dotenv_handler.py +0 -0
- {deepeval-3.7.0 → deepeval-3.7.2}/deepeval/cli/server.py +0 -0
- {deepeval-3.7.0 → deepeval-3.7.2}/deepeval/cli/test.py +0 -0
- {deepeval-3.7.0 → deepeval-3.7.2}/deepeval/cli/types.py +0 -0
- {deepeval-3.7.0 → deepeval-3.7.2}/deepeval/cli/utils.py +0 -0
- {deepeval-3.7.0 → deepeval-3.7.2}/deepeval/confident/__init__.py +0 -0
- {deepeval-3.7.0 → deepeval-3.7.2}/deepeval/confident/types.py +0 -0
- {deepeval-3.7.0 → deepeval-3.7.2}/deepeval/config/__init__.py +0 -0
- {deepeval-3.7.0 → deepeval-3.7.2}/deepeval/config/logging.py +0 -0
- {deepeval-3.7.0 → deepeval-3.7.2}/deepeval/config/settings_manager.py +0 -0
- {deepeval-3.7.0 → deepeval-3.7.2}/deepeval/config/utils.py +0 -0
- {deepeval-3.7.0 → deepeval-3.7.2}/deepeval/constants.py +0 -0
- {deepeval-3.7.0 → deepeval-3.7.2}/deepeval/contextvars.py +0 -0
- {deepeval-3.7.0 → deepeval-3.7.2}/deepeval/dataset/__init__.py +0 -0
- {deepeval-3.7.0 → deepeval-3.7.2}/deepeval/dataset/api.py +0 -0
- {deepeval-3.7.0 → deepeval-3.7.2}/deepeval/dataset/dataset.py +0 -0
- {deepeval-3.7.0 → deepeval-3.7.2}/deepeval/dataset/golden.py +0 -0
- {deepeval-3.7.0 → deepeval-3.7.2}/deepeval/dataset/test_run_tracer.py +0 -0
- {deepeval-3.7.0 → deepeval-3.7.2}/deepeval/dataset/types.py +0 -0
- {deepeval-3.7.0 → deepeval-3.7.2}/deepeval/dataset/utils.py +0 -0
- {deepeval-3.7.0 → deepeval-3.7.2}/deepeval/errors.py +0 -0
- {deepeval-3.7.0 → deepeval-3.7.2}/deepeval/evaluate/__init__.py +0 -0
- {deepeval-3.7.0 → deepeval-3.7.2}/deepeval/evaluate/api.py +0 -0
- {deepeval-3.7.0 → deepeval-3.7.2}/deepeval/evaluate/configs.py +0 -0
- {deepeval-3.7.0 → deepeval-3.7.2}/deepeval/evaluate/evaluate.py +0 -0
- {deepeval-3.7.0 → deepeval-3.7.2}/deepeval/evaluate/execute.py +0 -0
- {deepeval-3.7.0 → deepeval-3.7.2}/deepeval/integrations/__init__.py +0 -0
- {deepeval-3.7.0 → deepeval-3.7.2}/deepeval/integrations/crewai/__init__.py +0 -0
- {deepeval-3.7.0 → deepeval-3.7.2}/deepeval/integrations/crewai/handler.py +0 -0
- {deepeval-3.7.0 → deepeval-3.7.2}/deepeval/integrations/crewai/subs.py +0 -0
- {deepeval-3.7.0 → deepeval-3.7.2}/deepeval/integrations/crewai/tool.py +0 -0
- {deepeval-3.7.0 → deepeval-3.7.2}/deepeval/integrations/crewai/wrapper.py +0 -0
- {deepeval-3.7.0 → deepeval-3.7.2}/deepeval/integrations/hugging_face/__init__.py +0 -0
- {deepeval-3.7.0 → deepeval-3.7.2}/deepeval/integrations/hugging_face/callback.py +0 -0
- {deepeval-3.7.0 → deepeval-3.7.2}/deepeval/integrations/hugging_face/rich_manager.py +0 -0
- {deepeval-3.7.0 → deepeval-3.7.2}/deepeval/integrations/hugging_face/tests/test_callbacks.py +0 -0
- {deepeval-3.7.0 → deepeval-3.7.2}/deepeval/integrations/hugging_face/utils.py +0 -0
- {deepeval-3.7.0 → deepeval-3.7.2}/deepeval/integrations/langchain/__init__.py +0 -0
- {deepeval-3.7.0 → deepeval-3.7.2}/deepeval/integrations/langchain/callback.py +0 -0
- {deepeval-3.7.0 → deepeval-3.7.2}/deepeval/integrations/langchain/patch.py +0 -0
- {deepeval-3.7.0 → deepeval-3.7.2}/deepeval/integrations/langchain/utils.py +0 -0
- {deepeval-3.7.0 → deepeval-3.7.2}/deepeval/integrations/llama_index/__init__.py +0 -0
- {deepeval-3.7.0 → deepeval-3.7.2}/deepeval/integrations/llama_index/handler.py +0 -0
- {deepeval-3.7.0 → deepeval-3.7.2}/deepeval/integrations/llama_index/utils.py +0 -0
- {deepeval-3.7.0 → deepeval-3.7.2}/deepeval/integrations/pydantic_ai/__init__.py +0 -0
- {deepeval-3.7.0 → deepeval-3.7.2}/deepeval/integrations/pydantic_ai/agent.py +0 -0
- {deepeval-3.7.0 → deepeval-3.7.2}/deepeval/integrations/pydantic_ai/instrumentator.py +0 -0
- {deepeval-3.7.0 → deepeval-3.7.2}/deepeval/integrations/pydantic_ai/otel.py +0 -0
- {deepeval-3.7.0 → deepeval-3.7.2}/deepeval/integrations/pydantic_ai/test_instrumentator.py +0 -0
- {deepeval-3.7.0 → deepeval-3.7.2}/deepeval/metrics/__init__.py +0 -0
- {deepeval-3.7.0 → deepeval-3.7.2}/deepeval/metrics/answer_relevancy/__init__.py +0 -0
- {deepeval-3.7.0 → deepeval-3.7.2}/deepeval/metrics/answer_relevancy/answer_relevancy.py +0 -0
- {deepeval-3.7.0 → deepeval-3.7.2}/deepeval/metrics/answer_relevancy/schema.py +0 -0
- {deepeval-3.7.0 → deepeval-3.7.2}/deepeval/metrics/answer_relevancy/template.py +0 -0
- {deepeval-3.7.0 → deepeval-3.7.2}/deepeval/metrics/api.py +0 -0
- {deepeval-3.7.0 → deepeval-3.7.2}/deepeval/metrics/arena_g_eval/__init__.py +0 -0
- {deepeval-3.7.0 → deepeval-3.7.2}/deepeval/metrics/arena_g_eval/schema.py +0 -0
- {deepeval-3.7.0 → deepeval-3.7.2}/deepeval/metrics/arena_g_eval/template.py +0 -0
- {deepeval-3.7.0 → deepeval-3.7.2}/deepeval/metrics/argument_correctness/__init__.py +0 -0
- {deepeval-3.7.0 → deepeval-3.7.2}/deepeval/metrics/argument_correctness/argument_correctness.py +0 -0
- {deepeval-3.7.0 → deepeval-3.7.2}/deepeval/metrics/argument_correctness/schema.py +0 -0
- {deepeval-3.7.0 → deepeval-3.7.2}/deepeval/metrics/argument_correctness/template.py +0 -0
- {deepeval-3.7.0 → deepeval-3.7.2}/deepeval/metrics/base_metric.py +0 -0
- {deepeval-3.7.0 → deepeval-3.7.2}/deepeval/metrics/bias/__init__.py +0 -0
- {deepeval-3.7.0 → deepeval-3.7.2}/deepeval/metrics/bias/bias.py +0 -0
- {deepeval-3.7.0 → deepeval-3.7.2}/deepeval/metrics/bias/schema.py +0 -0
- {deepeval-3.7.0 → deepeval-3.7.2}/deepeval/metrics/bias/template.py +0 -0
- {deepeval-3.7.0 → deepeval-3.7.2}/deepeval/metrics/contextual_precision/__init__.py +0 -0
- {deepeval-3.7.0 → deepeval-3.7.2}/deepeval/metrics/contextual_precision/contextual_precision.py +0 -0
- {deepeval-3.7.0 → deepeval-3.7.2}/deepeval/metrics/contextual_precision/schema.py +0 -0
- {deepeval-3.7.0 → deepeval-3.7.2}/deepeval/metrics/contextual_precision/template.py +0 -0
- {deepeval-3.7.0 → deepeval-3.7.2}/deepeval/metrics/contextual_recall/__init__.py +0 -0
- {deepeval-3.7.0 → deepeval-3.7.2}/deepeval/metrics/contextual_recall/contextual_recall.py +0 -0
- {deepeval-3.7.0 → deepeval-3.7.2}/deepeval/metrics/contextual_recall/schema.py +0 -0
- {deepeval-3.7.0 → deepeval-3.7.2}/deepeval/metrics/contextual_recall/template.py +0 -0
- {deepeval-3.7.0 → deepeval-3.7.2}/deepeval/metrics/contextual_relevancy/__init__.py +0 -0
- {deepeval-3.7.0 → deepeval-3.7.2}/deepeval/metrics/contextual_relevancy/contextual_relevancy.py +0 -0
- {deepeval-3.7.0 → deepeval-3.7.2}/deepeval/metrics/contextual_relevancy/schema.py +0 -0
- {deepeval-3.7.0 → deepeval-3.7.2}/deepeval/metrics/contextual_relevancy/template.py +0 -0
- {deepeval-3.7.0 → deepeval-3.7.2}/deepeval/metrics/conversation_completeness/__init__.py +0 -0
- {deepeval-3.7.0 → deepeval-3.7.2}/deepeval/metrics/conversation_completeness/conversation_completeness.py +0 -0
- {deepeval-3.7.0 → deepeval-3.7.2}/deepeval/metrics/conversation_completeness/schema.py +0 -0
- {deepeval-3.7.0 → deepeval-3.7.2}/deepeval/metrics/conversation_completeness/template.py +0 -0
- {deepeval-3.7.0 → deepeval-3.7.2}/deepeval/metrics/conversational_dag/__init__.py +0 -0
- {deepeval-3.7.0 → deepeval-3.7.2}/deepeval/metrics/conversational_dag/conversational_dag.py +0 -0
- {deepeval-3.7.0 → deepeval-3.7.2}/deepeval/metrics/conversational_dag/nodes.py +0 -0
- {deepeval-3.7.0 → deepeval-3.7.2}/deepeval/metrics/conversational_dag/templates.py +0 -0
- {deepeval-3.7.0 → deepeval-3.7.2}/deepeval/metrics/conversational_g_eval/__init__.py +0 -0
- {deepeval-3.7.0 → deepeval-3.7.2}/deepeval/metrics/conversational_g_eval/schema.py +0 -0
- {deepeval-3.7.0 → deepeval-3.7.2}/deepeval/metrics/conversational_g_eval/template.py +0 -0
- {deepeval-3.7.0 → deepeval-3.7.2}/deepeval/metrics/dag/__init__.py +0 -0
- {deepeval-3.7.0 → deepeval-3.7.2}/deepeval/metrics/dag/dag.py +0 -0
- {deepeval-3.7.0 → deepeval-3.7.2}/deepeval/metrics/dag/graph.py +0 -0
- {deepeval-3.7.0 → deepeval-3.7.2}/deepeval/metrics/dag/nodes.py +0 -0
- {deepeval-3.7.0 → deepeval-3.7.2}/deepeval/metrics/dag/schema.py +0 -0
- {deepeval-3.7.0 → deepeval-3.7.2}/deepeval/metrics/dag/templates.py +0 -0
- {deepeval-3.7.0 → deepeval-3.7.2}/deepeval/metrics/dag/utils.py +0 -0
- {deepeval-3.7.0 → deepeval-3.7.2}/deepeval/metrics/exact_match/__init__.py +0 -0
- {deepeval-3.7.0 → deepeval-3.7.2}/deepeval/metrics/exact_match/exact_match.py +0 -0
- {deepeval-3.7.0 → deepeval-3.7.2}/deepeval/metrics/faithfulness/__init__.py +0 -0
- {deepeval-3.7.0 → deepeval-3.7.2}/deepeval/metrics/faithfulness/faithfulness.py +0 -0
- {deepeval-3.7.0 → deepeval-3.7.2}/deepeval/metrics/faithfulness/schema.py +0 -0
- {deepeval-3.7.0 → deepeval-3.7.2}/deepeval/metrics/faithfulness/template.py +0 -0
- {deepeval-3.7.0 → deepeval-3.7.2}/deepeval/metrics/g_eval/__init__.py +0 -0
- {deepeval-3.7.0 → deepeval-3.7.2}/deepeval/metrics/g_eval/schema.py +0 -0
- {deepeval-3.7.0 → deepeval-3.7.2}/deepeval/metrics/g_eval/template.py +0 -0
- {deepeval-3.7.0 → deepeval-3.7.2}/deepeval/metrics/goal_accuracy/__init__.py +0 -0
- {deepeval-3.7.0 → deepeval-3.7.2}/deepeval/metrics/goal_accuracy/goal_accuracy.py +0 -0
- {deepeval-3.7.0 → deepeval-3.7.2}/deepeval/metrics/goal_accuracy/schema.py +0 -0
- {deepeval-3.7.0 → deepeval-3.7.2}/deepeval/metrics/goal_accuracy/template.py +0 -0
- {deepeval-3.7.0 → deepeval-3.7.2}/deepeval/metrics/hallucination/__init__.py +0 -0
- {deepeval-3.7.0 → deepeval-3.7.2}/deepeval/metrics/hallucination/hallucination.py +0 -0
- {deepeval-3.7.0 → deepeval-3.7.2}/deepeval/metrics/hallucination/schema.py +0 -0
- {deepeval-3.7.0 → deepeval-3.7.2}/deepeval/metrics/hallucination/template.py +0 -0
- {deepeval-3.7.0 → deepeval-3.7.2}/deepeval/metrics/indicator.py +0 -0
- {deepeval-3.7.0 → deepeval-3.7.2}/deepeval/metrics/json_correctness/__init__.py +0 -0
- {deepeval-3.7.0 → deepeval-3.7.2}/deepeval/metrics/json_correctness/json_correctness.py +0 -0
- {deepeval-3.7.0 → deepeval-3.7.2}/deepeval/metrics/json_correctness/schema.py +0 -0
- {deepeval-3.7.0 → deepeval-3.7.2}/deepeval/metrics/json_correctness/template.py +0 -0
- {deepeval-3.7.0 → deepeval-3.7.2}/deepeval/metrics/knowledge_retention/__init__.py +0 -0
- {deepeval-3.7.0 → deepeval-3.7.2}/deepeval/metrics/knowledge_retention/knowledge_retention.py +0 -0
- {deepeval-3.7.0 → deepeval-3.7.2}/deepeval/metrics/knowledge_retention/schema.py +0 -0
- {deepeval-3.7.0 → deepeval-3.7.2}/deepeval/metrics/knowledge_retention/template.py +0 -0
- {deepeval-3.7.0 → deepeval-3.7.2}/deepeval/metrics/mcp/__init__.py +0 -0
- {deepeval-3.7.0 → deepeval-3.7.2}/deepeval/metrics/mcp/mcp_task_completion.py +0 -0
- {deepeval-3.7.0 → deepeval-3.7.2}/deepeval/metrics/mcp/multi_turn_mcp_use_metric.py +0 -0
- {deepeval-3.7.0 → deepeval-3.7.2}/deepeval/metrics/mcp/schema.py +0 -0
- {deepeval-3.7.0 → deepeval-3.7.2}/deepeval/metrics/mcp/template.py +0 -0
- {deepeval-3.7.0 → deepeval-3.7.2}/deepeval/metrics/mcp_use_metric/__init__.py +0 -0
- {deepeval-3.7.0 → deepeval-3.7.2}/deepeval/metrics/mcp_use_metric/mcp_use_metric.py +0 -0
- {deepeval-3.7.0 → deepeval-3.7.2}/deepeval/metrics/mcp_use_metric/schema.py +0 -0
- {deepeval-3.7.0 → deepeval-3.7.2}/deepeval/metrics/mcp_use_metric/template.py +0 -0
- {deepeval-3.7.0 → deepeval-3.7.2}/deepeval/metrics/misuse/__init__.py +0 -0
- {deepeval-3.7.0 → deepeval-3.7.2}/deepeval/metrics/misuse/misuse.py +0 -0
- {deepeval-3.7.0 → deepeval-3.7.2}/deepeval/metrics/misuse/schema.py +0 -0
- {deepeval-3.7.0 → deepeval-3.7.2}/deepeval/metrics/misuse/template.py +0 -0
- {deepeval-3.7.0 → deepeval-3.7.2}/deepeval/metrics/multimodal_metrics/__init__.py +0 -0
- {deepeval-3.7.0 → deepeval-3.7.2}/deepeval/metrics/multimodal_metrics/image_coherence/__init__.py +0 -0
- {deepeval-3.7.0 → deepeval-3.7.2}/deepeval/metrics/multimodal_metrics/image_coherence/image_coherence.py +0 -0
- {deepeval-3.7.0 → deepeval-3.7.2}/deepeval/metrics/multimodal_metrics/image_coherence/schema.py +0 -0
- {deepeval-3.7.0 → deepeval-3.7.2}/deepeval/metrics/multimodal_metrics/image_coherence/template.py +0 -0
- {deepeval-3.7.0 → deepeval-3.7.2}/deepeval/metrics/multimodal_metrics/image_editing/__init__.py +0 -0
- {deepeval-3.7.0 → deepeval-3.7.2}/deepeval/metrics/multimodal_metrics/image_editing/image_editing.py +0 -0
- {deepeval-3.7.0 → deepeval-3.7.2}/deepeval/metrics/multimodal_metrics/image_editing/schema.py +0 -0
- {deepeval-3.7.0 → deepeval-3.7.2}/deepeval/metrics/multimodal_metrics/image_editing/template.py +0 -0
- {deepeval-3.7.0 → deepeval-3.7.2}/deepeval/metrics/multimodal_metrics/image_helpfulness/__init__.py +0 -0
- {deepeval-3.7.0 → deepeval-3.7.2}/deepeval/metrics/multimodal_metrics/image_helpfulness/image_helpfulness.py +0 -0
- {deepeval-3.7.0 → deepeval-3.7.2}/deepeval/metrics/multimodal_metrics/image_helpfulness/schema.py +0 -0
- {deepeval-3.7.0 → deepeval-3.7.2}/deepeval/metrics/multimodal_metrics/image_helpfulness/template.py +0 -0
- {deepeval-3.7.0 → deepeval-3.7.2}/deepeval/metrics/multimodal_metrics/image_reference/__init__.py +0 -0
- {deepeval-3.7.0 → deepeval-3.7.2}/deepeval/metrics/multimodal_metrics/image_reference/image_reference.py +0 -0
- {deepeval-3.7.0 → deepeval-3.7.2}/deepeval/metrics/multimodal_metrics/image_reference/schema.py +0 -0
- {deepeval-3.7.0 → deepeval-3.7.2}/deepeval/metrics/multimodal_metrics/image_reference/template.py +0 -0
- {deepeval-3.7.0 → deepeval-3.7.2}/deepeval/metrics/multimodal_metrics/multimodal_answer_relevancy/__init__.py +0 -0
- {deepeval-3.7.0 → deepeval-3.7.2}/deepeval/metrics/multimodal_metrics/multimodal_answer_relevancy/multimodal_answer_relevancy.py +0 -0
- {deepeval-3.7.0 → deepeval-3.7.2}/deepeval/metrics/multimodal_metrics/multimodal_answer_relevancy/schema.py +0 -0
- {deepeval-3.7.0 → deepeval-3.7.2}/deepeval/metrics/multimodal_metrics/multimodal_answer_relevancy/template.py +0 -0
- {deepeval-3.7.0 → deepeval-3.7.2}/deepeval/metrics/multimodal_metrics/multimodal_contextual_precision/__init__.py +0 -0
- {deepeval-3.7.0 → deepeval-3.7.2}/deepeval/metrics/multimodal_metrics/multimodal_contextual_precision/multimodal_contextual_precision.py +0 -0
- {deepeval-3.7.0 → deepeval-3.7.2}/deepeval/metrics/multimodal_metrics/multimodal_contextual_precision/schema.py +0 -0
- {deepeval-3.7.0 → deepeval-3.7.2}/deepeval/metrics/multimodal_metrics/multimodal_contextual_precision/template.py +0 -0
- {deepeval-3.7.0 → deepeval-3.7.2}/deepeval/metrics/multimodal_metrics/multimodal_contextual_recall/__init__.py +0 -0
- {deepeval-3.7.0 → deepeval-3.7.2}/deepeval/metrics/multimodal_metrics/multimodal_contextual_recall/multimodal_contextual_recall.py +0 -0
- {deepeval-3.7.0 → deepeval-3.7.2}/deepeval/metrics/multimodal_metrics/multimodal_contextual_recall/schema.py +0 -0
- {deepeval-3.7.0 → deepeval-3.7.2}/deepeval/metrics/multimodal_metrics/multimodal_contextual_recall/template.py +0 -0
- {deepeval-3.7.0 → deepeval-3.7.2}/deepeval/metrics/multimodal_metrics/multimodal_contextual_relevancy/__init__.py +0 -0
- {deepeval-3.7.0 → deepeval-3.7.2}/deepeval/metrics/multimodal_metrics/multimodal_contextual_relevancy/multimodal_contextual_relevancy.py +0 -0
- {deepeval-3.7.0 → deepeval-3.7.2}/deepeval/metrics/multimodal_metrics/multimodal_contextual_relevancy/schema.py +0 -0
- {deepeval-3.7.0 → deepeval-3.7.2}/deepeval/metrics/multimodal_metrics/multimodal_contextual_relevancy/template.py +0 -0
- {deepeval-3.7.0 → deepeval-3.7.2}/deepeval/metrics/multimodal_metrics/multimodal_faithfulness/__init__.py +0 -0
- {deepeval-3.7.0 → deepeval-3.7.2}/deepeval/metrics/multimodal_metrics/multimodal_faithfulness/multimodal_faithfulness.py +0 -0
- {deepeval-3.7.0 → deepeval-3.7.2}/deepeval/metrics/multimodal_metrics/multimodal_faithfulness/schema.py +0 -0
- {deepeval-3.7.0 → deepeval-3.7.2}/deepeval/metrics/multimodal_metrics/multimodal_faithfulness/template.py +0 -0
- {deepeval-3.7.0 → deepeval-3.7.2}/deepeval/metrics/multimodal_metrics/multimodal_g_eval/__init__.py +0 -0
- {deepeval-3.7.0 → deepeval-3.7.2}/deepeval/metrics/multimodal_metrics/multimodal_g_eval/schema.py +0 -0
- {deepeval-3.7.0 → deepeval-3.7.2}/deepeval/metrics/multimodal_metrics/multimodal_g_eval/template.py +0 -0
- {deepeval-3.7.0 → deepeval-3.7.2}/deepeval/metrics/multimodal_metrics/multimodal_g_eval/utils.py +0 -0
- {deepeval-3.7.0 → deepeval-3.7.2}/deepeval/metrics/multimodal_metrics/multimodal_tool_correctness/__init__.py +0 -0
- {deepeval-3.7.0 → deepeval-3.7.2}/deepeval/metrics/multimodal_metrics/multimodal_tool_correctness/multimodal_tool_correctness.py +0 -0
- {deepeval-3.7.0 → deepeval-3.7.2}/deepeval/metrics/multimodal_metrics/text_to_image/__init__.py +0 -0
- {deepeval-3.7.0 → deepeval-3.7.2}/deepeval/metrics/multimodal_metrics/text_to_image/schema.py +0 -0
- {deepeval-3.7.0 → deepeval-3.7.2}/deepeval/metrics/multimodal_metrics/text_to_image/template.py +0 -0
- {deepeval-3.7.0 → deepeval-3.7.2}/deepeval/metrics/multimodal_metrics/text_to_image/text_to_image.py +0 -0
- {deepeval-3.7.0 → deepeval-3.7.2}/deepeval/metrics/non_advice/__init__.py +0 -0
- {deepeval-3.7.0 → deepeval-3.7.2}/deepeval/metrics/non_advice/non_advice.py +0 -0
- {deepeval-3.7.0 → deepeval-3.7.2}/deepeval/metrics/non_advice/schema.py +0 -0
- {deepeval-3.7.0 → deepeval-3.7.2}/deepeval/metrics/non_advice/template.py +0 -0
- {deepeval-3.7.0 → deepeval-3.7.2}/deepeval/metrics/pattern_match/__init__.py +0 -0
- {deepeval-3.7.0 → deepeval-3.7.2}/deepeval/metrics/pattern_match/pattern_match.py +0 -0
- {deepeval-3.7.0 → deepeval-3.7.2}/deepeval/metrics/pii_leakage/__init__.py +0 -0
- {deepeval-3.7.0 → deepeval-3.7.2}/deepeval/metrics/pii_leakage/pii_leakage.py +0 -0
- {deepeval-3.7.0 → deepeval-3.7.2}/deepeval/metrics/pii_leakage/schema.py +0 -0
- {deepeval-3.7.0 → deepeval-3.7.2}/deepeval/metrics/pii_leakage/template.py +0 -0
- {deepeval-3.7.0 → deepeval-3.7.2}/deepeval/metrics/plan_adherence/__init__.py +0 -0
- {deepeval-3.7.0 → deepeval-3.7.2}/deepeval/metrics/plan_adherence/plan_adherence.py +0 -0
- {deepeval-3.7.0 → deepeval-3.7.2}/deepeval/metrics/plan_adherence/schema.py +0 -0
- {deepeval-3.7.0 → deepeval-3.7.2}/deepeval/metrics/plan_adherence/template.py +0 -0
- {deepeval-3.7.0 → deepeval-3.7.2}/deepeval/metrics/plan_quality/__init__.py +0 -0
- {deepeval-3.7.0 → deepeval-3.7.2}/deepeval/metrics/plan_quality/plan_quality.py +0 -0
- {deepeval-3.7.0 → deepeval-3.7.2}/deepeval/metrics/plan_quality/schema.py +0 -0
- {deepeval-3.7.0 → deepeval-3.7.2}/deepeval/metrics/plan_quality/template.py +0 -0
- {deepeval-3.7.0 → deepeval-3.7.2}/deepeval/metrics/prompt_alignment/__init__.py +0 -0
- {deepeval-3.7.0 → deepeval-3.7.2}/deepeval/metrics/prompt_alignment/prompt_alignment.py +0 -0
- {deepeval-3.7.0 → deepeval-3.7.2}/deepeval/metrics/prompt_alignment/schema.py +0 -0
- {deepeval-3.7.0 → deepeval-3.7.2}/deepeval/metrics/prompt_alignment/template.py +0 -0
- {deepeval-3.7.0 → deepeval-3.7.2}/deepeval/metrics/ragas.py +0 -0
- {deepeval-3.7.0 → deepeval-3.7.2}/deepeval/metrics/role_adherence/__init__.py +0 -0
- {deepeval-3.7.0 → deepeval-3.7.2}/deepeval/metrics/role_adherence/role_adherence.py +0 -0
- {deepeval-3.7.0 → deepeval-3.7.2}/deepeval/metrics/role_adherence/schema.py +0 -0
- {deepeval-3.7.0 → deepeval-3.7.2}/deepeval/metrics/role_adherence/template.py +0 -0
- {deepeval-3.7.0 → deepeval-3.7.2}/deepeval/metrics/role_violation/__init__.py +0 -0
- {deepeval-3.7.0 → deepeval-3.7.2}/deepeval/metrics/role_violation/role_violation.py +0 -0
- {deepeval-3.7.0 → deepeval-3.7.2}/deepeval/metrics/role_violation/schema.py +0 -0
- {deepeval-3.7.0 → deepeval-3.7.2}/deepeval/metrics/role_violation/template.py +0 -0
- {deepeval-3.7.0 → deepeval-3.7.2}/deepeval/metrics/step_efficiency/__init__.py +0 -0
- {deepeval-3.7.0 → deepeval-3.7.2}/deepeval/metrics/step_efficiency/schema.py +0 -0
- {deepeval-3.7.0 → deepeval-3.7.2}/deepeval/metrics/step_efficiency/step_efficiency.py +0 -0
- {deepeval-3.7.0 → deepeval-3.7.2}/deepeval/metrics/step_efficiency/template.py +0 -0
- {deepeval-3.7.0 → deepeval-3.7.2}/deepeval/metrics/summarization/__init__.py +0 -0
- {deepeval-3.7.0 → deepeval-3.7.2}/deepeval/metrics/summarization/schema.py +0 -0
- {deepeval-3.7.0 → deepeval-3.7.2}/deepeval/metrics/summarization/summarization.py +0 -0
- {deepeval-3.7.0 → deepeval-3.7.2}/deepeval/metrics/summarization/template.py +0 -0
- {deepeval-3.7.0 → deepeval-3.7.2}/deepeval/metrics/task_completion/__init__.py +0 -0
- {deepeval-3.7.0 → deepeval-3.7.2}/deepeval/metrics/task_completion/schema.py +0 -0
- {deepeval-3.7.0 → deepeval-3.7.2}/deepeval/metrics/task_completion/task_completion.py +0 -0
- {deepeval-3.7.0 → deepeval-3.7.2}/deepeval/metrics/task_completion/template.py +0 -0
- {deepeval-3.7.0 → deepeval-3.7.2}/deepeval/metrics/tool_correctness/__init__.py +0 -0
- {deepeval-3.7.0 → deepeval-3.7.2}/deepeval/metrics/tool_correctness/schema.py +0 -0
- {deepeval-3.7.0 → deepeval-3.7.2}/deepeval/metrics/tool_correctness/template.py +0 -0
- {deepeval-3.7.0 → deepeval-3.7.2}/deepeval/metrics/tool_correctness/tool_correctness.py +0 -0
- {deepeval-3.7.0 → deepeval-3.7.2}/deepeval/metrics/tool_use/__init__.py +0 -0
- {deepeval-3.7.0 → deepeval-3.7.2}/deepeval/metrics/tool_use/schema.py +0 -0
- {deepeval-3.7.0 → deepeval-3.7.2}/deepeval/metrics/tool_use/template.py +0 -0
- {deepeval-3.7.0 → deepeval-3.7.2}/deepeval/metrics/tool_use/tool_use.py +0 -0
- {deepeval-3.7.0 → deepeval-3.7.2}/deepeval/metrics/topic_adherence/__init__.py +0 -0
- {deepeval-3.7.0 → deepeval-3.7.2}/deepeval/metrics/topic_adherence/schema.py +0 -0
- {deepeval-3.7.0 → deepeval-3.7.2}/deepeval/metrics/topic_adherence/template.py +0 -0
- {deepeval-3.7.0 → deepeval-3.7.2}/deepeval/metrics/topic_adherence/topic_adherence.py +0 -0
- {deepeval-3.7.0 → deepeval-3.7.2}/deepeval/metrics/toxicity/__init__.py +0 -0
- {deepeval-3.7.0 → deepeval-3.7.2}/deepeval/metrics/toxicity/schema.py +0 -0
- {deepeval-3.7.0 → deepeval-3.7.2}/deepeval/metrics/toxicity/template.py +0 -0
- {deepeval-3.7.0 → deepeval-3.7.2}/deepeval/metrics/toxicity/toxicity.py +0 -0
- {deepeval-3.7.0 → deepeval-3.7.2}/deepeval/metrics/turn_relevancy/__init__.py +0 -0
- {deepeval-3.7.0 → deepeval-3.7.2}/deepeval/metrics/turn_relevancy/schema.py +0 -0
- {deepeval-3.7.0 → deepeval-3.7.2}/deepeval/metrics/turn_relevancy/template.py +0 -0
- {deepeval-3.7.0 → deepeval-3.7.2}/deepeval/metrics/turn_relevancy/turn_relevancy.py +0 -0
- {deepeval-3.7.0 → deepeval-3.7.2}/deepeval/model_integrations/__init__.py +0 -0
- {deepeval-3.7.0 → deepeval-3.7.2}/deepeval/model_integrations/types.py +0 -0
- {deepeval-3.7.0 → deepeval-3.7.2}/deepeval/model_integrations/utils.py +0 -0
- {deepeval-3.7.0 → deepeval-3.7.2}/deepeval/models/__init__.py +0 -0
- {deepeval-3.7.0 → deepeval-3.7.2}/deepeval/models/_summac_model.py +0 -0
- {deepeval-3.7.0 → deepeval-3.7.2}/deepeval/models/answer_relevancy_model.py +0 -0
- {deepeval-3.7.0 → deepeval-3.7.2}/deepeval/models/base_model.py +0 -0
- {deepeval-3.7.0 → deepeval-3.7.2}/deepeval/models/detoxify_model.py +0 -0
- {deepeval-3.7.0 → deepeval-3.7.2}/deepeval/models/embedding_models/__init__.py +0 -0
- {deepeval-3.7.0 → deepeval-3.7.2}/deepeval/models/embedding_models/azure_embedding_model.py +0 -0
- {deepeval-3.7.0 → deepeval-3.7.2}/deepeval/models/embedding_models/local_embedding_model.py +0 -0
- {deepeval-3.7.0 → deepeval-3.7.2}/deepeval/models/embedding_models/ollama_embedding_model.py +0 -0
- {deepeval-3.7.0 → deepeval-3.7.2}/deepeval/models/embedding_models/openai_embedding_model.py +0 -0
- {deepeval-3.7.0 → deepeval-3.7.2}/deepeval/models/hallucination_model.py +0 -0
- {deepeval-3.7.0 → deepeval-3.7.2}/deepeval/models/llms/__init__.py +0 -0
- {deepeval-3.7.0 → deepeval-3.7.2}/deepeval/models/llms/amazon_bedrock_model.py +0 -0
- {deepeval-3.7.0 → deepeval-3.7.2}/deepeval/models/llms/anthropic_model.py +0 -0
- {deepeval-3.7.0 → deepeval-3.7.2}/deepeval/models/llms/azure_model.py +0 -0
- {deepeval-3.7.0 → deepeval-3.7.2}/deepeval/models/llms/deepseek_model.py +0 -0
- {deepeval-3.7.0 → deepeval-3.7.2}/deepeval/models/llms/grok_model.py +0 -0
- {deepeval-3.7.0 → deepeval-3.7.2}/deepeval/models/llms/kimi_model.py +0 -0
- {deepeval-3.7.0 → deepeval-3.7.2}/deepeval/models/llms/litellm_model.py +0 -0
- {deepeval-3.7.0 → deepeval-3.7.2}/deepeval/models/llms/local_model.py +0 -0
- {deepeval-3.7.0 → deepeval-3.7.2}/deepeval/models/llms/ollama_model.py +0 -0
- {deepeval-3.7.0 → deepeval-3.7.2}/deepeval/models/llms/openai_model.py +0 -0
- {deepeval-3.7.0 → deepeval-3.7.2}/deepeval/models/llms/utils.py +0 -0
- {deepeval-3.7.0 → deepeval-3.7.2}/deepeval/models/mlllms/__init__.py +0 -0
- {deepeval-3.7.0 → deepeval-3.7.2}/deepeval/models/mlllms/gemini_model.py +0 -0
- {deepeval-3.7.0 → deepeval-3.7.2}/deepeval/models/mlllms/ollama_model.py +0 -0
- {deepeval-3.7.0 → deepeval-3.7.2}/deepeval/models/mlllms/openai_model.py +0 -0
- {deepeval-3.7.0 → deepeval-3.7.2}/deepeval/models/retry_policy.py +0 -0
- {deepeval-3.7.0 → deepeval-3.7.2}/deepeval/models/summac_model.py +0 -0
- {deepeval-3.7.0 → deepeval-3.7.2}/deepeval/models/unbias_model.py +0 -0
- {deepeval-3.7.0 → deepeval-3.7.2}/deepeval/models/utils.py +0 -0
- {deepeval-3.7.0 → deepeval-3.7.2}/deepeval/openai/__init__.py +0 -0
- {deepeval-3.7.0 → deepeval-3.7.2}/deepeval/openai/extractors.py +0 -0
- {deepeval-3.7.0 → deepeval-3.7.2}/deepeval/openai/patch.py +0 -0
- {deepeval-3.7.0 → deepeval-3.7.2}/deepeval/openai/utils.py +0 -0
- {deepeval-3.7.0 → deepeval-3.7.2}/deepeval/openai_agents/__init__.py +0 -0
- {deepeval-3.7.0 → deepeval-3.7.2}/deepeval/openai_agents/agent.py +0 -0
- {deepeval-3.7.0 → deepeval-3.7.2}/deepeval/openai_agents/extractors.py +0 -0
- {deepeval-3.7.0 → deepeval-3.7.2}/deepeval/openai_agents/patch.py +0 -0
- {deepeval-3.7.0 → deepeval-3.7.2}/deepeval/openai_agents/runner.py +0 -0
- {deepeval-3.7.0 → deepeval-3.7.2}/deepeval/plugins/__init__.py +0 -0
- {deepeval-3.7.0 → deepeval-3.7.2}/deepeval/plugins/plugin.py +0 -0
- {deepeval-3.7.0 → deepeval-3.7.2}/deepeval/progress_context.py +0 -0
- {deepeval-3.7.0 → deepeval-3.7.2}/deepeval/prompt/__init__.py +0 -0
- {deepeval-3.7.0 → deepeval-3.7.2}/deepeval/prompt/api.py +0 -0
- {deepeval-3.7.0 → deepeval-3.7.2}/deepeval/prompt/utils.py +0 -0
- {deepeval-3.7.0 → deepeval-3.7.2}/deepeval/py.typed +0 -0
- {deepeval-3.7.0 → deepeval-3.7.2}/deepeval/red_teaming/README.md +0 -0
- {deepeval-3.7.0 → deepeval-3.7.2}/deepeval/scorer/__init__.py +0 -0
- {deepeval-3.7.0 → deepeval-3.7.2}/deepeval/scorer/scorer.py +0 -0
- {deepeval-3.7.0 → deepeval-3.7.2}/deepeval/simulator/__init__.py +0 -0
- {deepeval-3.7.0 → deepeval-3.7.2}/deepeval/simulator/conversation_simulator.py +0 -0
- {deepeval-3.7.0 → deepeval-3.7.2}/deepeval/simulator/schema.py +0 -0
- {deepeval-3.7.0 → deepeval-3.7.2}/deepeval/singleton.py +0 -0
- {deepeval-3.7.0 → deepeval-3.7.2}/deepeval/synthesizer/__init__.py +0 -0
- {deepeval-3.7.0 → deepeval-3.7.2}/deepeval/synthesizer/base_synthesizer.py +0 -0
- {deepeval-3.7.0 → deepeval-3.7.2}/deepeval/synthesizer/chunking/__init__.py +0 -0
- {deepeval-3.7.0 → deepeval-3.7.2}/deepeval/synthesizer/chunking/context_generator.py +0 -0
- {deepeval-3.7.0 → deepeval-3.7.2}/deepeval/synthesizer/chunking/doc_chunker.py +0 -0
- {deepeval-3.7.0 → deepeval-3.7.2}/deepeval/synthesizer/types.py +0 -0
- {deepeval-3.7.0 → deepeval-3.7.2}/deepeval/synthesizer/utils.py +0 -0
- {deepeval-3.7.0 → deepeval-3.7.2}/deepeval/telemetry.py +0 -0
- {deepeval-3.7.0 → deepeval-3.7.2}/deepeval/test_case/api.py +0 -0
- {deepeval-3.7.0 → deepeval-3.7.2}/deepeval/test_case/conversational_test_case.py +0 -0
- {deepeval-3.7.0 → deepeval-3.7.2}/deepeval/test_case/llm_test_case.py +0 -0
- {deepeval-3.7.0 → deepeval-3.7.2}/deepeval/test_case/mcp.py +0 -0
- {deepeval-3.7.0 → deepeval-3.7.2}/deepeval/test_case/utils.py +0 -0
- {deepeval-3.7.0 → deepeval-3.7.2}/deepeval/test_run/__init__.py +0 -0
- {deepeval-3.7.0 → deepeval-3.7.2}/deepeval/test_run/api.py +0 -0
- {deepeval-3.7.0 → deepeval-3.7.2}/deepeval/test_run/hooks.py +0 -0
- {deepeval-3.7.0 → deepeval-3.7.2}/deepeval/tracing/__init__.py +0 -0
- {deepeval-3.7.0 → deepeval-3.7.2}/deepeval/tracing/api.py +0 -0
- {deepeval-3.7.0 → deepeval-3.7.2}/deepeval/tracing/context.py +0 -0
- {deepeval-3.7.0 → deepeval-3.7.2}/deepeval/tracing/offline_evals/__init__.py +0 -0
- {deepeval-3.7.0 → deepeval-3.7.2}/deepeval/tracing/offline_evals/api.py +0 -0
- {deepeval-3.7.0 → deepeval-3.7.2}/deepeval/tracing/offline_evals/span.py +0 -0
- {deepeval-3.7.0 → deepeval-3.7.2}/deepeval/tracing/offline_evals/thread.py +0 -0
- {deepeval-3.7.0 → deepeval-3.7.2}/deepeval/tracing/offline_evals/trace.py +0 -0
- {deepeval-3.7.0 → deepeval-3.7.2}/deepeval/tracing/otel/__init__.py +0 -0
- {deepeval-3.7.0 → deepeval-3.7.2}/deepeval/tracing/otel/exporter.py +0 -0
- {deepeval-3.7.0 → deepeval-3.7.2}/deepeval/tracing/otel/test_exporter.py +0 -0
- {deepeval-3.7.0 → deepeval-3.7.2}/deepeval/tracing/otel/utils.py +0 -0
- {deepeval-3.7.0 → deepeval-3.7.2}/deepeval/tracing/patchers.py +0 -0
- {deepeval-3.7.0 → deepeval-3.7.2}/deepeval/tracing/perf_epoch_bridge.py +0 -0
- {deepeval-3.7.0 → deepeval-3.7.2}/deepeval/tracing/trace_context.py +0 -0
- {deepeval-3.7.0 → deepeval-3.7.2}/deepeval/tracing/trace_test_manager.py +0 -0
- {deepeval-3.7.0 → deepeval-3.7.2}/deepeval/tracing/types.py +0 -0
- {deepeval-3.7.0 → deepeval-3.7.2}/deepeval/tracing/utils.py +0 -0
{deepeval-3.7.0 → deepeval-3.7.2}/PKG-INFO

@@ -1,6 +1,6 @@
 Metadata-Version: 2.1
 Name: deepeval
-Version: 3.7.0
+Version: 3.7.2
 Summary: The LLM Evaluation Framework
 Home-page: https://github.com/confident-ai/deepeval
 License: Apache-2.0

@@ -32,7 +32,7 @@ Requires-Dist: pyfiglet
 Requires-Dist: pytest
 Requires-Dist: pytest-asyncio
 Requires-Dist: pytest-repeat
-Requires-Dist: pytest-rerunfailures
+Requires-Dist: pytest-rerunfailures
 Requires-Dist: pytest-xdist
 Requires-Dist: python-dotenv (>=1.1.1,<2.0.0)
 Requires-Dist: requests (>=2.31.0,<3.0.0)
{deepeval-3.7.0 → deepeval-3.7.2}/README.md

@@ -439,6 +439,7 @@ Using `.env.local` or `.env` is optional. If they are missing, DeepEval uses you
 ```bash
 cp .env.example .env.local
 # then edit .env.local (ignored by git)
+```
 
 <br />
 
deepeval-3.7.2/deepeval/_version.py (new file)

@@ -0,0 +1 @@
+__version__: str = "3.7.2"
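The package version now lives in this dedicated `_version.py`. Assuming the top-level package re-exports it (which the `X-DeepEval-Version` header in `confident/api.py` below relies on), a quick check looks like:

```python
import deepeval

# _version.py is now the single source of truth for the version string.
print(deepeval.__version__)  # -> "3.7.2"
```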
{deepeval-3.7.0 → deepeval-3.7.2}/deepeval/cli/main.py

@@ -1484,6 +1484,11 @@ def set_gemini_model_env(
     google_cloud_location: Optional[str] = typer.Option(
         None, "--location", help="Google Cloud location"
     ),
+    google_service_account_key: Optional[str] = typer.Option(
+        None,
+        "--service-account-key",
+        help="Google Service Account Key for Gemini",
+    ),
     save: Optional[str] = typer.Option(
         None,
         "--save",

@@ -1513,6 +1518,8 @@ def set_gemini_model_env(
         settings.GOOGLE_CLOUD_PROJECT = google_cloud_project
     if google_cloud_location:
         settings.GOOGLE_CLOUD_LOCATION = google_cloud_location
+    if google_service_account_key:
+        settings.GOOGLE_SERVICE_ACCOUNT_KEY = google_service_account_key
     if model_name:
         settings.GEMINI_MODEL_NAME = model_name
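A minimal sketch of the new option in use. The `--service-account-key` flag comes straight from the `typer.Option` declaration above; the `deepeval set-gemini` command alias and the `get_settings` import path are assumptions, not confirmed by this diff:

```python
# Assumed CLI alias for set_gemini_model_env:
#
#   deepeval set-gemini \
#       --model-name "gemini-1.5-pro" \
#       --service-account-key "$(cat service-account.json)"
#
# Afterwards the key should be readable from the settings object:
from deepeval.config.settings import get_settings  # assumed import path

settings = get_settings()
if settings.GOOGLE_SERVICE_ACCOUNT_KEY:
    print("Gemini calls will authenticate with the stored service account key")
```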
{deepeval-3.7.0 → deepeval-3.7.2}/deepeval/confident/api.py

@@ -27,6 +27,10 @@ retryable_exceptions = requests.exceptions.SSLError
 
 
 def get_base_api_url():
+    s = get_settings()
+    if s.CONFIDENT_BASE_URL:
+        base_url = s.CONFIDENT_BASE_URL.rstrip("/")
+        return base_url
     region = KEY_FILE_HANDLER.fetch_data(KeyValues.CONFIDENT_REGION)
     if region == "EU":
         return API_BASE_URL_EU

@@ -87,6 +91,7 @@ class Endpoints(Enum):
     DATASET_ALIAS_QUEUE_ENDPOINT = "/v1/datasets/:alias/queue"
 
     TEST_RUN_ENDPOINT = "/v1/test-run"
+    EXPERIMENT_ENDPOINT = "/v1/experiment"
     METRIC_DATA_ENDPOINT = "/v1/metric-data"
     TRACES_ENDPOINT = "/v1/traces"
     ANNOTATIONS_ENDPOINT = "/v1/annotations"

@@ -115,7 +120,7 @@ class Api:
         self.api_key = api_key
         self._headers = {
             "Content-Type": "application/json",
-            "
+            "CONFIDENT-API-KEY": api_key,
             "X-DeepEval-Version": deepeval.__version__,
         }
         self.base_api_url = get_base_api_url()
{deepeval-3.7.0 → deepeval-3.7.2}/deepeval/config/settings.py

@@ -229,6 +229,11 @@ class Settings(BaseSettings):
     API_KEY: Optional[SecretStr] = None
     CONFIDENT_API_KEY: Optional[SecretStr] = None
 
+    # ======
+    # Base URL for Confident AI API server
+    # ======
+    CONFIDENT_BASE_URL: Optional[str] = None
+
     # General
     TEMPERATURE: Optional[confloat(ge=0, le=2)] = None
 
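Together with the `get_base_api_url()` change in `confident/api.py` above, this setting lets the SDK target a self-hosted or proxied Confident AI server. A sketch, assuming `Settings` (a pydantic `BaseSettings`) picks the field up from the environment like its other fields:

```python
import os

# Set before deepeval loads its settings. get_base_api_url() strips any
# trailing slash, so both spellings resolve to the same base URL.
os.environ["CONFIDENT_BASE_URL"] = "https://confident.internal.example.com/"

from deepeval.confident.api import get_base_api_url

print(get_base_api_url())  # -> "https://confident.internal.example.com"
```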
{deepeval-3.7.0 → deepeval-3.7.2}/deepeval/evaluate/compare.py

@@ -1,5 +1,6 @@
 from typing import Optional, List, Dict, Callable
 import asyncio
+import time
 from rich.progress import (
     Progress,
     TextColumn,
@@ -8,24 +9,74 @@ from rich.progress import (
     TaskProgressColumn,
 )
 from collections import Counter
+import json
 
 from deepeval.errors import MissingTestCaseParamsError
 from deepeval.evaluate.configs import AsyncConfig, DisplayConfig, ErrorConfig
-from deepeval.test_case import ArenaTestCase
+from deepeval.test_case import ArenaTestCase, Contestant
+from deepeval.test_case.api import create_api_test_case
 from deepeval.metrics import ArenaGEval
-from deepeval.utils import
-
+from deepeval.utils import (
+    add_pbar,
+    update_pbar,
+    custom_console,
+    get_or_create_event_loop,
+    open_browser,
+)
+from deepeval.test_run.test_run import (
+    TestRun,
+    MetricData,
+    TestRunEncoder,
+    MetricScores,
+    console,
+)
+from deepeval.test_run.hyperparameters import (
+    process_hyperparameters,
+)
+from deepeval.confident.api import Api, Endpoints, HttpMethods, is_confident
 from deepeval.telemetry import capture_evaluation_run
+from deepeval.test_run.api import LLMApiTestCase
+from deepeval.evaluate.utils import create_arena_metric_data
+from deepeval.evaluate.types import PostExperimentRequest
 
 
 def compare(
     test_cases: List[ArenaTestCase],
     metric: ArenaGEval,
+    name: str = "compare()",
     # Configs
     async_config: Optional[AsyncConfig] = AsyncConfig(),
     display_config: Optional[DisplayConfig] = DisplayConfig(),
     error_config: Optional[ErrorConfig] = ErrorConfig(),
 ) -> Dict[str, int]:
+
+    # Prepare test run map
+    unique_contestant_names = set(
+        [
+            contestant.name
+            for test_case in test_cases
+            for contestant in test_case.contestants
+        ]
+    )
+    test_run_map: Dict[str, TestRun] = {}
+    for contestant_name in unique_contestant_names:
+        test_run = TestRun(
+            identifier=contestant_name,
+            test_passed=0,
+            test_failed=0,
+        )
+        test_run.metrics_scores = [
+            MetricScores(
+                metric=metric.name,
+                scores=[],
+                passes=0,
+                fails=0,
+                errors=0,
+            )
+        ]
+        test_run_map[contestant_name] = test_run
+
+    start_time = time.time()
     with capture_evaluation_run("compare()"):
         if async_config.run_async:
             loop = get_or_create_event_loop()
@@ -39,6 +90,7 @@ def compare(
                     throttle_value=async_config.throttle_value,
                     max_concurrent=async_config.max_concurrent,
                     skip_on_missing_params=error_config.skip_on_missing_params,
+                    test_run_map=test_run_map,
                 )
             )
         else:
@@ -49,7 +101,10 @@ def compare(
                 verbose_mode=display_config.verbose_mode,
                 show_indicator=display_config.show_indicator,
                 skip_on_missing_params=error_config.skip_on_missing_params,
+                test_run_map=test_run_map,
             )
+    end_time = time.time()
+    run_duration = end_time - start_time
 
     # Aggregate winners
     winner_counts = Counter()
@@ -57,7 +112,13 @@ def compare(
         if winner:
             winner_counts[winner] += 1
 
-
+    process_test_runs(test_run_map=test_run_map, test_cases=test_cases)
+    wrap_up_experiment(
+        name=name,
+        test_runs=list(test_run_map.values()),
+        winner_counts=winner_counts,
+        run_duration=run_duration,
+    )
     return dict(winner_counts)
 
 
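Taken together, the `compare()` changes above build a per-contestant `TestRun` and post the whole arena as a nameable experiment. A hedged usage sketch follows; `ArenaTestCase`, `Contestant`, and `ArenaGEval` are confirmed by the imports in this diff, but the exact constructor fields shown here are inferred from how the diff reads `contestant.name`, `contestant.test_case`, and `contestant.hyperparameters`, and may differ slightly:

```python
from deepeval.evaluate.compare import compare
from deepeval.metrics import ArenaGEval
from deepeval.test_case import ArenaTestCase, Contestant, LLMTestCase, LLMTestCaseParams

metric = ArenaGEval(
    name="Helpfulness",
    criteria="Pick the contestant whose actual output best answers the input.",
    evaluation_params=[LLMTestCaseParams.INPUT, LLMTestCaseParams.ACTUAL_OUTPUT],
)

test_cases = [
    ArenaTestCase(
        contestants=[
            Contestant(
                name="model-a",
                test_case=LLMTestCase(
                    input="What does deepeval do?",
                    actual_output="It is an open-source LLM evaluation framework.",
                ),
                hyperparameters={"model": "model-a-v1"},  # optional, per this diff
            ),
            Contestant(
                name="model-b",
                test_case=LLMTestCase(
                    input="What does deepeval do?",
                    actual_output="It tests things.",
                ),
            ),
        ]
    )
]

# `name` is new in 3.7.2: it labels the experiment posted to Confident AI.
winner_counts = compare(test_cases=test_cases, metric=metric, name="helpfulness-arena")
print(winner_counts)  # e.g. {"model-a": 1}
```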
@@ -70,6 +131,7 @@ async def a_execute_arena_test_cases(
     throttle_value: int,
     skip_on_missing_params: bool,
     max_concurrent: int,
+    test_run_map: Dict[str, TestRun],
 ) -> List[str]:
     semaphore = asyncio.Semaphore(max_concurrent)
 
@@ -104,6 +166,8 @@ async def a_execute_arena_test_cases(
                 else metric.verbose_mode
             ),
         )
+
+        start_time = time.perf_counter()
         winner = await _a_handle_metric_measurement(
             metric=metric_copy,
             test_case=test_case,
@@ -112,10 +176,21 @@ async def a_execute_arena_test_cases(
             _progress=progress,
             _pbar_id=pbar_test_case_id,
         )
+        end_time = time.perf_counter()
+        run_duration = end_time - start_time
+
         if winner:
             winners.append(winner)
 
         update_pbar(progress, pbar_id)
+        update_test_run_map(
+            test_case=test_case,
+            index=index,
+            test_run_map=test_run_map,
+            metric_copy=metric_copy,
+            winner=winner,
+            run_duration=run_duration,
+        )
 
     # Create tasks for all test cases
     if show_indicator:
@@ -156,6 +231,7 @@ def execute_arena_test_cases(
     skip_on_missing_params: bool,
     show_indicator: bool,
     verbose_mode: Optional[bool] = None,
+    test_run_map: Optional[Dict[str, TestRun]] = None,
 ) -> List[str]:
     """
     Non-async version of comparing arena test cases.
@@ -183,6 +259,8 @@ def execute_arena_test_cases(
                 else metric.verbose_mode
             ),
         )
+
+        start_time = time.perf_counter()
         winner = _handle_metric_measurement(
             metric=metric_copy,
             test_case=test_case,
@@ -191,10 +269,21 @@ def execute_arena_test_cases(
             _progress=progress,
             _pbar_id=pbar_test_case_id,
         )
+        end_time = time.perf_counter()
+        run_duration = end_time - start_time
+
         if winner:
             winners.append(winner)
 
         update_pbar(progress, pbar_id)
+        update_test_run_map(
+            test_case=test_case,
+            index=i,
+            test_run_map=test_run_map,
+            metric_copy=metric_copy,
+            winner=winner,
+            run_duration=run_duration,
+        )
 
     if show_indicator:
         progress = Progress(
@@ -313,3 +402,129 @@ async def _a_handle_metric_measurement(
             return None
         else:
             raise
+
+
+def update_test_run_map(
+    test_case: ArenaTestCase,
+    index: int,
+    test_run_map: Dict[str, TestRun],
+    metric_copy: ArenaGEval,
+    winner: str,
+    run_duration: float,
+):
+    for contestant in test_case.contestants:
+        test_run = test_run_map.get(contestant.name)
+
+        # update test cases in test run
+        api_test_case: LLMApiTestCase = create_api_test_case(
+            test_case=contestant.test_case, index=index
+        )
+        metric_data: MetricData = create_arena_metric_data(
+            metric_copy, contestant.name
+        )
+        api_test_case.update_metric_data(metric_data)
+        api_test_case.update_run_duration(run_duration)
+        test_run.add_test_case(api_test_case)
+
+        # update other test run attributes
+        if test_run.run_duration is None:
+            test_run.run_duration = 0.0
+        test_run.run_duration += run_duration
+
+        # Ensure test_passed and test_failed are initialized
+        if test_run.test_passed is None:
+            test_run.test_passed = 0
+        if test_run.test_failed is None:
+            test_run.test_failed = 0
+
+        if winner == contestant:
+            test_run.test_passed += 1
+        else:
+            test_run.test_failed += 1
+
+        # update metric scores
+        test_run.metrics_scores[0].metric = metric_copy.name
+        test_run.metrics_scores[0].scores.append(
+            1 if winner == contestant else 0
+        )
+        test_run.metrics_scores[0].passes += 1 if winner == contestant else 0
+        test_run.metrics_scores[0].fails += 1 if winner != contestant else 0
+        test_run.metrics_scores[0].errors += 0
+
+
+def process_test_runs(
+    test_run_map: Dict[str, TestRun],
+    test_cases: List[ArenaTestCase],
+):
+    hyperparameters_map = {
+        contestant_name: {} for contestant_name in test_run_map.keys()
+    }
+
+    for test_case in test_cases:
+        for contestant in test_case.contestants:
+            if contestant.hyperparameters:
+                hyperparameters_map[contestant.name].update(
+                    contestant.hyperparameters
+                )
+
+    for contestant_name, hyperparameters in hyperparameters_map.items():
+        test_run = test_run_map.get(contestant_name)
+        test_run.hyperparameters = process_hyperparameters(hyperparameters)
+
+
+def wrap_up_experiment(
+    name: str,
+    test_runs: List[TestRun],
+    winner_counts: Counter,
+    run_duration: float,
+):
+    winner_breakdown = []
+    for contestant, wins in winner_counts.most_common():
+        winner_breakdown.append(
+            f" » [bold green]{contestant}[/bold green]: {wins} wins"
+        )
+    winner_text = (
+        "\n".join(winner_breakdown) if winner_breakdown else "No winners"
+    )
+    console.print(
+        f"\n🎉 Arena completed! (time taken: {round(run_duration, 2)}s | token cost: {test_runs[0].evaluation_cost if test_runs else 0} USD)\n"
+        f"🏆 Results ({sum(winner_counts.values())} total test cases):\n"
+        f"{winner_text}\n\n"
+    )
+
+    if not is_confident():
+        console.print(
+            f"{'=' * 80}\n"
+            f"\n» Want to share experiments with your team? ❤️ 🏟️\n"
+            f" » Run [bold]'deepeval login'[/bold] to analyze and save arena results on [rgb(106,0,255)]Confident AI[/rgb(106,0,255)].\n\n"
+        )
+        return
+
+    try:
+        api = Api()
+        experiment_request = PostExperimentRequest(
+            testRuns=test_runs, name=name
+        )
+
+        try:
+            body = experiment_request.model_dump(
+                by_alias=True, exclude_none=True
+            )
+        except AttributeError:
+            body = experiment_request.dict(by_alias=True, exclude_none=True)
+        json_str = json.dumps(body, cls=TestRunEncoder)
+        body = json.loads(json_str)
+
+        _, link = api.send_request(
+            method=HttpMethods.POST,
+            endpoint=Endpoints.EXPERIMENT_ENDPOINT,
+            body=body,
+        )
+        console.print(
+            "[rgb(5,245,141)]✓[/rgb(5,245,141)] Done 🎉! View results on "
+            f"[link={link}]{link}[/link]"
+        )
+        open_browser(link)
+
+    except Exception:
+        raise
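Taken together, the three new helpers fold a batch of arena comparisons into one TestRun per contestant: update_test_run_map records each comparison's outcome and duration, process_test_runs merges contestant hyperparameters into the matching TestRun, and wrap_up_experiment prints the scoreboard and posts a PostExperimentRequest to Confident AI. One caveat worth flagging: update_test_run_map compares winner (a string) against contestant, which iterates wrapper objects, so unless that class defines string equality the intended check is presumably winner == contestant.name. Below is a minimal sketch of how the pieces plug together, assuming the test_cases, metric_copy, and prebuilt test_run_map from the surrounding compare() flow, and with simplified _handle_metric_measurement arguments:

    from collections import Counter
    import time

    winner_counts = Counter()
    overall_start = time.perf_counter()

    for index, test_case in enumerate(test_cases):
        start = time.perf_counter()
        winner = _handle_metric_measurement(metric=metric_copy, test_case=test_case)
        run_duration = time.perf_counter() - start
        if winner:
            winner_counts[winner] += 1
        # Fold this comparison into every contestant's TestRun
        update_test_run_map(
            test_case=test_case,
            index=index,
            test_run_map=test_run_map,
            metric_copy=metric_copy,
            winner=winner,
            run_duration=run_duration,
        )

    # Merge hyperparameters, then print the scoreboard and upload
    process_test_runs(test_run_map=test_run_map, test_cases=test_cases)
    wrap_up_experiment(
        name="my-arena",  # hypothetical experiment name
        test_runs=list(test_run_map.values()),
        winner_counts=winner_counts,
        run_duration=time.perf_counter() - overall_start,
    )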
{deepeval-3.7.0 → deepeval-3.7.2}/deepeval/evaluate/types.py
RENAMED
@@ -4,6 +4,7 @@ from pydantic import BaseModel
 
 from deepeval.test_run.api import MetricData, TurnApi
 from deepeval.test_case import MLLMImage
+from deepeval.test_run import TestRun
 
 
 @dataclass
@@ -29,3 +30,8 @@ class EvaluationResult(BaseModel):
     test_results: List[TestResult]
     confident_link: Optional[str]
     test_run_id: Optional[str]
+
+
+class PostExperimentRequest(BaseModel):
+    testRuns: List[TestRun]
+    name: Optional[str]
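PostExperimentRequest is the payload wrap_up_experiment serializes when uploading arena results. A small sketch of the dual-path serialization used there, assuming a single existing TestRun named test_run:

    request = PostExperimentRequest(testRuns=[test_run], name="my-arena")
    try:
        # pydantic v2
        body = request.model_dump(by_alias=True, exclude_none=True)
    except AttributeError:
        # pydantic v1 fallback
        body = request.dict(by_alias=True, exclude_none=True)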
{deepeval-3.7.0 → deepeval-3.7.2}/deepeval/evaluate/utils.py
RENAMED
@@ -8,6 +8,7 @@ from deepeval.utils import format_turn
 from deepeval.test_run.test_run import TestRunResultDisplay
 from deepeval.dataset import Golden
 from deepeval.metrics import (
+    ArenaGEval,
     BaseMetric,
     BaseConversationalMetric,
     BaseMultimodalMetric,
@@ -84,6 +85,35 @@ def create_metric_data(metric: BaseMetric) -> MetricData:
     )
 
 
+def create_arena_metric_data(metric: ArenaGEval, contestant: str) -> MetricData:
+    if metric.error is not None:
+        return MetricData(
+            name=metric.__name__,
+            threshold=1,
+            score=None,
+            reason=None,
+            success=False,
+            strictMode=True,
+            evaluationModel=metric.evaluation_model,
+            error=metric.error,
+            evaluationCost=metric.evaluation_cost,
+            verboseLogs=metric.verbose_logs,
+        )
+    else:
+        return MetricData(
+            name=metric.__name__,
+            score=1 if contestant == metric.winner else 0,
+            threshold=1,
+            reason=metric.reason,
+            success=metric.is_successful(),
+            strictMode=True,
+            evaluationModel=metric.evaluation_model,
+            error=None,
+            evaluationCost=metric.evaluation_cost,
+            verboseLogs=metric.verbose_logs,
+        )
+
+
 def create_test_result(
     api_test_case: Union[LLMApiTestCase, ConversationalApiTestCase],
 ) -> TestResult:
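create_arena_metric_data flattens a single ArenaGEval verdict into per-contestant MetricData: the winner scores 1 and everyone else 0, with the threshold pinned to 1 and strict mode on, so exactly one contestant passes each comparison. A sketch, assuming metric is an ArenaGEval that has already been measured:

    data = create_arena_metric_data(metric, "gpt-4")  # "gpt-4" is a placeholder contestant name
    # Binary outcome: score is 1 only when the contestant matches metric.winner
    assert data.score in (0, 1)
    assert data.threshold == 1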
{deepeval-3.7.0 → deepeval-3.7.2}/deepeval/key_handler.py
RENAMED
@@ -70,6 +70,7 @@ class ModelKeyValues(Enum):
     GOOGLE_GENAI_USE_VERTEXAI = "GOOGLE_GENAI_USE_VERTEXAI"
     GOOGLE_CLOUD_PROJECT = "GOOGLE_CLOUD_PROJECT"
     GOOGLE_CLOUD_LOCATION = "GOOGLE_CLOUD_LOCATION"
+    GOOGLE_SERVICE_ACCOUNT_KEY = "GOOGLE_SERVICE_ACCOUNT_KEY"
     # LiteLLM
     USE_LITELLM = "USE_LITELLM"
     LITELLM_MODEL_NAME = "LITELLM_MODEL_NAME"
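The new GOOGLE_SERVICE_ACCOUNT_KEY entry gives the Gemini/Vertex AI integration a stored slot for service-account credentials next to the existing project and location keys, presumably consumed by the gemini_model.py changes in this release. A sketch of reading it back, assuming the module's existing KEY_FILE_HANDLER.fetch_data accessor:

    from deepeval.key_handler import KEY_FILE_HANDLER, ModelKeyValues

    # Returns the stored value, or None if it was never configured
    service_account_key = KEY_FILE_HANDLER.fetch_data(
        ModelKeyValues.GOOGLE_SERVICE_ACCOUNT_KEY
    )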
{deepeval-3.7.0 → deepeval-3.7.2}/deepeval/metrics/arena_g_eval/arena_g_eval.py
RENAMED
@@ -46,7 +46,11 @@ class ArenaGEval(BaseArenaMetric):
         self.criteria = criteria
         self.model, self.using_native_model = initialize_model(model)
         self.evaluation_model = self.model.get_model_name()
-        self.evaluation_steps = evaluation_steps
+        self.evaluation_steps = (
+            evaluation_steps
+            if evaluation_steps and len(evaluation_steps) > 0
+            else None
+        )
         self.async_mode = async_mode
         self.verbose_mode = verbose_mode
         self._include_g_eval_suffix = _include_g_eval_suffix
{deepeval-3.7.0 → deepeval-3.7.2}/deepeval/metrics/arena_g_eval/utils.py
RENAMED
@@ -89,10 +89,10 @@ class FormattedArenaTestCase:
 def format_arena_test_case(
     evaluation_params: List[LLMTestCaseParams], test_case: ArenaTestCase
 ) -> Tuple[FormattedArenaTestCase, Dict[str, str]]:
-    case = next(iter(test_case.contestants.values()))
+    case = next(iter([case.test_case for case in test_case.contestants]))
 
     # Create dummy name mapping
-    real_names = list(test_case.contestants.keys())
+    real_names = list([case.name for case in test_case.contestants])
     available_fake_names = FAKE_NAMES.copy()
     random.shuffle(available_fake_names)
 
@@ -119,10 +119,10 @@ def format_arena_test_case(
             else None
         ),
         contestants={
-            contestant: construct_formatted_llm_test_case(
-                evaluation_params, test_case
+            contestant.name: construct_formatted_llm_test_case(
+                evaluation_params, contestant.test_case
             )
-            for contestant, test_case in test_case.contestants.items()
+            for contestant in test_case.contestants
         },
         dummy_to_real_names=dummy_to_real_names,
     )
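Both fixes track the new ArenaTestCase shape: contestants is now a list of wrapper objects exposing .name and .test_case instead of a name-to-test-case mapping. A sketch of the implied construction; the wrapper's actual class name comes from the arena_test_case.py change (not shown here), so Contestant below is an assumption:

    from deepeval.test_case import ArenaTestCase, LLMTestCase
    from deepeval.test_case import Contestant  # assumed export name

    arena_case = ArenaTestCase(
        contestants=[
            Contestant(
                name="gpt-4",
                test_case=LLMTestCase(input="Hi", actual_output="Hello!"),
            ),
            Contestant(
                name="claude",
                test_case=LLMTestCase(input="Hi", actual_output="Hey there!"),
            ),
        ],
    )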
{deepeval-3.7.0 → deepeval-3.7.2}/deepeval/metrics/conversational_g_eval/conversational_g_eval.py
RENAMED
@@ -9,6 +9,8 @@ from deepeval.metrics.g_eval.utils import (
     construct_conversational_g_eval_turn_params_string,
     construct_non_turns_test_case_string,
     format_rubrics,
+    validate_and_sort_rubrics,
+    validate_criteria_and_evaluation_steps,
 )
 from deepeval.test_case import (
     TurnParams,
@@ -63,27 +65,16 @@ class ConversationalGEval(BaseConversationalMetric):
 
         self.evaluation_params = evaluation_params
 
-
-        if criteria is None and evaluation_steps is None:
-            raise ValueError(
-                "Either 'criteria' or 'evaluation_steps' must be provided."
-            )
-
-        # Check if criteria is provided, it cannot be an empty string
-        if criteria is not None and not criteria.strip():
-            raise ValueError("Criteria provided cannot be an empty string.")
-
-        # Check if evaluation_steps is provided, it cannot be an empty list
-        if evaluation_steps is not None and len(evaluation_steps) == 0:
-            raise ValueError(
-                "'evaluation_steps' must not be an empty list. Either omit evaluation steps or include a non-empty list of steps."
-            )
-
+        validate_criteria_and_evaluation_steps(criteria, evaluation_steps)
         self.criteria = criteria
-        self.rubric = rubric
+        self.rubric = validate_and_sort_rubrics(rubric)
         self.model, self.using_native_model = initialize_model(model)
         self.evaluation_model = self.model.get_model_name()
-        self.evaluation_steps = evaluation_steps
+        self.evaluation_steps = (
+            evaluation_steps
+            if evaluation_steps and len(evaluation_steps) > 0
+            else None
+        )
         self.threshold = 1 if strict_mode else threshold
         self.strict_mode = strict_mode
         self.async_mode = async_mode
{deepeval-3.7.0 → deepeval-3.7.2}/deepeval/metrics/g_eval/g_eval.py
RENAMED
@@ -61,7 +61,11 @@ class GEval(BaseMetric):
         self.score_range_span = self.score_range[1] - self.score_range[0]
         self.model, self.using_native_model = initialize_model(model)
         self.evaluation_model = self.model.get_model_name()
-        self.evaluation_steps = evaluation_steps
+        self.evaluation_steps = (
+            evaluation_steps
+            if evaluation_steps and len(evaluation_steps) > 0
+            else None
+        )
         self.threshold = 1 if strict_mode else threshold
         self.top_logprobs = top_logprobs
         self.strict_mode = strict_mode
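The same guard is applied across ArenaGEval, ConversationalGEval, GEval, and MultimodalGEval: an empty evaluation_steps list is normalized to None, so the metric falls back to generating steps from criteria rather than evaluating against an empty plan. For example, using GEval's documented constructor arguments:

    from deepeval.metrics import GEval
    from deepeval.test_case import LLMTestCaseParams

    metric = GEval(
        name="Correctness",
        criteria="Is the actual output factually correct?",
        evaluation_params=[LLMTestCaseParams.ACTUAL_OUTPUT],
        evaluation_steps=[],  # now treated the same as passing None
    )
    assert metric.evaluation_steps is None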
{deepeval-3.7.0 → deepeval-3.7.2}/deepeval/metrics/g_eval/utils.py
RENAMED
@@ -77,7 +77,7 @@ def validate_criteria_and_evaluation_steps(
 def validate_and_sort_rubrics(
     rubrics: Optional[List[Rubric]] = None,
 ) -> Optional[List[Rubric]]:
-    if rubrics is None:
+    if rubrics is None or len(rubrics) == 0:
         return None
 
     # Sort rubrics by start of range
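The widened guard mirrors the evaluation_steps handling above: an empty rubric list now short-circuits before the sorting logic instead of being passed through. In effect:

    from deepeval.metrics.g_eval.utils import validate_and_sort_rubrics

    assert validate_and_sort_rubrics(None) is None
    assert validate_and_sort_rubrics([]) is None  # new in 3.7.2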
{deepeval-3.7.0 → deepeval-3.7.2}/deepeval/metrics/multimodal_metrics/multimodal_g_eval/multimodal_g_eval.py
RENAMED
@@ -64,7 +64,11 @@ class MultimodalGEval(BaseMultimodalMetric):
         self.rubric = validate_and_sort_rubrics(rubric)
         self.model, self.using_native_model = initialize_multimodal_model(model)
         self.evaluation_model = self.model.get_model_name()
-        self.evaluation_steps = evaluation_steps
+        self.evaluation_steps = (
+            evaluation_steps
+            if evaluation_steps and len(evaluation_steps) > 0
+            else None
+        )
         self.threshold = 1 if strict_mode else threshold
         self.top_logprobs = top_logprobs
         self.strict_mode = strict_mode
{deepeval-3.7.0 → deepeval-3.7.2}/deepeval/metrics/utils.py
RENAMED
@@ -270,7 +270,7 @@ def check_arena_test_case_params(
             f"Expected ArenaTestCase, got {type(arena_test_case).__name__}"
         )
 
-    cases = list(arena_test_case.contestants.values())
+    cases = [contestant.test_case for contestant in arena_test_case.contestants]
     ref_input = cases[0].input
     for case in cases[1:]:
         if case.input != ref_input:
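With contestants now a list of wrappers, the validation pulls each contestant's test_case before checking that every contestant answered the same input. The check is equivalent to the following sketch (the real code raises on a mismatch rather than asserting):

    cases = [contestant.test_case for contestant in arena_test_case.contestants]
    ref_input = cases[0].input
    # Every contestant must have been run against the same input
    assert all(case.input == ref_input for case in cases)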