PyPI - deepeval - Versions diffs - 3.7.0__py3-none-any.whl → 3.7.1__py3-none-any.whl - Mend

deepeval 3.7.0__py3-none-any.whl → 3.7.1__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (33)

deepeval/__init__.py +0 -4
deepeval/_version.py +1 -1
deepeval/cli/main.py +7 -0
deepeval/confident/api.py +6 -1
deepeval/config/settings.py +5 -0
deepeval/evaluate/compare.py +215 -4
deepeval/evaluate/types.py +6 -0
deepeval/evaluate/utils.py +30 -0
deepeval/key_handler.py +1 -0
deepeval/metrics/arena_g_eval/arena_g_eval.py +5 -1
deepeval/metrics/arena_g_eval/utils.py +5 -5
deepeval/metrics/conversational_g_eval/conversational_g_eval.py +9 -18
deepeval/metrics/g_eval/g_eval.py +5 -1
deepeval/metrics/g_eval/utils.py +1 -1
deepeval/metrics/multimodal_metrics/multimodal_g_eval/multimodal_g_eval.py +5 -1
deepeval/metrics/utils.py +1 -1
deepeval/models/llms/gemini_model.py +27 -5
deepeval/openai_agents/callback_handler.py +12 -3
deepeval/prompt/prompt.py +25 -14
deepeval/simulator/template.py +1 -1
deepeval/test_case/__init__.py +2 -1
deepeval/test_case/arena_test_case.py +15 -4
deepeval/test_case/mllm_test_case.py +45 -22
deepeval/test_run/cache.py +31 -10
deepeval/test_run/hyperparameters.py +5 -1
deepeval/test_run/test_run.py +28 -9
deepeval/tracing/tracing.py +1 -1
deepeval/utils.py +4 -0
{deepeval-3.7.0.dist-info → deepeval-3.7.1.dist-info}/METADATA +2 -2
{deepeval-3.7.0.dist-info → deepeval-3.7.1.dist-info}/RECORD +33 -33
{deepeval-3.7.0.dist-info → deepeval-3.7.1.dist-info}/LICENSE.md +0 -0
{deepeval-3.7.0.dist-info → deepeval-3.7.1.dist-info}/WHEEL +0 -0
{deepeval-3.7.0.dist-info → deepeval-3.7.1.dist-info}/entry_points.txt +0 -0

deepeval/openai_agents/callback_handler.py CHANGED Viewed

@@ -1,13 +1,21 @@
+from time import perf_counter
 from deepeval.tracing.tracing import (
     Observer,
     current_span_context,
     trace_manager,
 )
-from deepeval.openai_agents.extractors import *
+from deepeval.openai_agents.extractors import (
+    update_span_properties,
+    update_trace_properties_from_span_data,
+)
 from deepeval.tracing.context import current_trace_context
 from deepeval.tracing.utils import make_json_serializable
-from time import perf_counter
-from deepeval.tracing.types import TraceSpanStatus
+from deepeval.tracing.types import (
+    BaseSpan,
+    LlmSpan,
+    TraceSpanStatus,
+)
 try:
     from agents.tracing import Span, Trace, TracingProcessor
@@ -18,6 +26,7 @@ try:
         GenerationSpanData,
         GuardrailSpanData,
         HandoffSpanData,
+        MCPListToolsSpanData,
         ResponseSpanData,
         SpanData,
     )

deepeval/prompt/prompt.py CHANGED Viewed

@@ -1,3 +1,8 @@
+import logging
+import time
+import json
+import os
 from enum import Enum
 from typing import Optional, List, Dict, Type, Literal
 from rich.progress import Progress, SpinnerColumn, TextColumn, BarColumn
@@ -5,12 +10,11 @@ from rich.console import Console
 import time
 import json
 import os
-from pydantic import BaseModel, ValidationError, ConfigDict
+from pydantic import BaseModel, ValidationError
 import asyncio
-import portalocker
 import threading
-from deepeval.utils import make_model_config
+from deepeval.utils import make_model_config, is_read_only_env
 from deepeval.prompt.api import (
     PromptHttpResponse,
@@ -24,9 +28,6 @@ from deepeval.prompt.api import (
     ModelSettings,
     OutputSchema,
     OutputType,
-    ReasoningEffort,
-    Verbosity,
-    ModelProvider,
 )
 from deepeval.prompt.utils import (
     interpolate_text,
@@ -36,6 +37,18 @@ from deepeval.prompt.utils import (
 from deepeval.confident.api import Api, Endpoints, HttpMethods
 from deepeval.constants import HIDDEN_DIR
+logger = logging.getLogger(__name__)
+portalocker = None
+if not is_read_only_env():
+    try:
+        import portalocker
+    except Exception as e:
+        logger.warning("failed to import portalocker: %s", e)
+else:
+    logger.warning("READ_ONLY filesystem: skipping disk cache for prompts.")
 CACHE_FILE_NAME = f"{HIDDEN_DIR}/.deepeval-prompt-cache.json"
 VERSION_CACHE_KEY = "version"
 LABEL_CACHE_KEY = "label"
@@ -165,7 +178,7 @@ class Prompt:
             content = f.read()
         try:
             data = json.loads(content)
-        except:
+        except (json.JSONDecodeError, TypeError):
             self.text_template = content
             return content
@@ -203,7 +216,6 @@ class Prompt:
                     "Unable to interpolate empty prompt template. Please pull a prompt from Confident AI or set template manually to continue."
                 )
-            print("@@@@@")
             return interpolate_text(interpolation_type, text_template, **kwargs)
         elif prompt_type == PromptType.LIST:
@@ -248,7 +260,7 @@ class Prompt:
         version: Optional[str] = None,
         label: Optional[str] = None,
     ) -> Optional[CachedPrompt]:
-        if not os.path.exists(CACHE_FILE_NAME):
+        if portalocker is None or not os.path.exists(CACHE_FILE_NAME):
             return None
         try:
@@ -296,13 +308,12 @@ class Prompt:
         output_type: Optional[OutputType] = None,
         output_schema: Optional[OutputSchema] = None,
     ):
-        if not self.alias:
+        if portalocker is None or not self.alias:
             return
-        # Ensure directory exists
-        os.makedirs(HIDDEN_DIR, exist_ok=True)
         try:
+            # Ensure directory exists
+            os.makedirs(HIDDEN_DIR, exist_ok=True)
             # Use r+ mode if file exists, w mode if it doesn't
             mode = "r+" if os.path.exists(CACHE_FILE_NAME) else "w"
@@ -481,7 +492,7 @@ class Prompt:
                             cached_prompt.output_schema
                         )
                     return
-            except:
+            except Exception:
                 pass
         api = Api()

deepeval/simulator/template.py CHANGED Viewed

@@ -112,7 +112,7 @@ class ConversationSimulatorTemplate:
             ]
             Example JSON Output:
             {{
-                "is_complete": False,
+                "is_complete": false,
                 "reason": "The assistant explained how to forget password but ahas not confirmed that the user successfully set a new password."
             }}

deepeval/test_case/__init__.py CHANGED Viewed

@@ -10,7 +10,7 @@ from .conversational_test_case import (
     TurnParams,
 )
 from .mllm_test_case import MLLMTestCase, MLLMTestCaseParams, MLLMImage
-from .arena_test_case import ArenaTestCase
+from .arena_test_case import ArenaTestCase, Contestant
 from .mcp import (
     MCPServer,
     MCPPromptCall,
@@ -35,4 +35,5 @@ __all__ = [
     "MLLMTestCaseParams",
     "MLLMImage",
     "ArenaTestCase",
+    "Contestant",
 ]

deepeval/test_case/arena_test_case.py CHANGED Viewed

@@ -1,20 +1,31 @@
+from typing import List, Dict, Optional, Union
 from dataclasses import dataclass
-from typing import List, Dict
+from pydantic import BaseModel
 from deepeval.test_case import (
     LLMTestCase,
 )
+from deepeval.prompt import Prompt
+class Contestant(BaseModel):
+    name: str
+    test_case: LLMTestCase
+    hyperparameters: Optional[Dict[str, Union[str, int, float, Prompt]]] = None
+    model_config = {"arbitrary_types_allowed": True}
 @dataclass
 class ArenaTestCase:
-    contestants: Dict[str, LLMTestCase]
+    contestants: List[Contestant]
     def __post_init__(self):
-        contestant_names = list(self.contestants.keys())
+        contestant_names = [contestant.name for contestant in self.contestants]
         if len(contestant_names) != len(set(contestant_names)):
             raise ValueError("All contestant names must be unique.")
-        cases = list(self.contestants.values())
+        cases = [contestant.test_case for contestant in self.contestants]
         ref_input = cases[0].input
         for case in cases[1:]:
             if case.input != ref_input:

deepeval/test_case/mllm_test_case.py CHANGED Viewed

@@ -11,33 +11,50 @@ from deepeval.test_case import ToolCall
 @dataclass
 class MLLMImage:
-    url: str
+    dataBase64: Optional[str] = None
+    mimeType: Optional[str] = None
+    url: Optional[str] = None
     local: Optional[bool] = None
-    filename: Optional[str] = field(default=None, init=False, repr=False)
-    mimeType: Optional[str] = field(default=None, init=False, repr=False)
-    dataBase64: Optional[str] = field(default=None, init=False, repr=False)
+    filename: Optional[str] = None
     def __post_init__(self):
-        is_local = self.is_local_path(self.url)
-        if self.local is not None:
-            assert self.local == is_local, "Local path mismatch"
-        else:
-            self.local = is_local
-        # compute filename, mime_type, and Base64 data
-        if self.local:
-            path = self.process_url(self.url)
-            self.filename = os.path.basename(path)
-            self.mimeType = (
-                mimetypes.guess_type(path)[0] or "application/octet-stream"
+        if self.url and self.dataBase64:
+            raise ValueError(
+                "You cannot provide both 'url' and 'dataBase64' at the same time when creating an MLLMImage."
+            )
+        if not self.url and not self.dataBase64:
+            raise ValueError(
+                "You must provide either a 'url' or both 'dataBase64' and 'mimeType' to create an MLLMImage."
             )
-            with open(path, "rb") as f:
-                raw = f.read()
-            self.dataBase64 = base64.b64encode(raw).decode("ascii")
+        if self.dataBase64 is not None:
+            if self.mimeType is None:
+                raise ValueError(
+                    "mimeType must be provided when initializing from Base64 data."
+                )
         else:
-            self.filename = None
-            self.mimeType = None
-            self.dataBase64 = None
+            is_local = self.is_local_path(self.url)
+            if self.local is not None:
+                assert self.local == is_local, "Local path mismatch"
+            else:
+                self.local = is_local
+            # compute filename, mime_type, and Base64 data
+            if self.local:
+                path = self.process_url(self.url)
+                self.filename = os.path.basename(path)
+                self.mimeType = (
+                    mimetypes.guess_type(path)[0] or "application/octet-stream"
+                )
+                with open(path, "rb") as f:
+                    raw = f.read()
+                self.dataBase64 = base64.b64encode(raw).decode("ascii")
+            else:
+                self.filename = None
+                self.mimeType = None
+                self.dataBase64 = None
     @staticmethod
     def process_url(url: str) -> str:
@@ -69,6 +86,12 @@ class MLLMImage:
             return os.path.exists(path)
         return False
+    def as_data_uri(self) -> Optional[str]:
+        """Return the image as a data URI string, if Base64 data is available."""
+        if not self.dataBase64 or not self.mimeType:
+            return None
+        return f"data:{self.mimeType};base64,{self.dataBase64}"
 class MLLMTestCaseParams(Enum):
     INPUT = "input"

deepeval/test_run/cache.py CHANGED Viewed

@@ -1,8 +1,8 @@
-import portalocker
+import logging
 import sys
 import json
 import os
-from typing import List, Optional, Union, Dict, Union
+from typing import List, Optional, Dict, Union
 from enum import Enum
 from pydantic import BaseModel, Field
@@ -12,11 +12,26 @@ from deepeval.test_case import LLMTestCaseParams, LLMTestCase, ToolCallParams
 from deepeval.test_run.api import MetricData
 from deepeval.utils import (
     delete_file_if_exists,
+    is_read_only_env,
     serialize,
 )
 from deepeval.metrics import BaseMetric
 from deepeval.constants import HIDDEN_DIR
+logger = logging.getLogger(__name__)
+portalocker = None
+if not is_read_only_env():
+    try:
+        import portalocker
+    except Exception as e:
+        logger.warning("failed to import portalocker: %s", e)
+else:
+    logger.warning("READ_ONLY filesystem: skipping disk cache for test runs.")
 CACHE_FILE_NAME = f"{HIDDEN_DIR}/.deepeval-cache.json"
 TEMP_CACHE_FILE_NAME = f"{HIDDEN_DIR}/.temp-deepeval-cache.json"
@@ -97,7 +112,7 @@ class TestRunCacheManager:
     def get_cached_test_case(
         self, test_case: LLMTestCase, hyperparameters: Union[Dict, None]
     ) -> Union[CachedTestCase, None]:
-        if self.disable_write_cache:
+        if self.disable_write_cache or portalocker is None:
             return None
         cached_test_run = self.get_cached_test_run()
@@ -122,7 +137,7 @@ class TestRunCacheManager:
         hyperparameters: Union[Dict, None],
         to_temp: bool = False,
     ):
-        if self.disable_write_cache:
+        if self.disable_write_cache or portalocker is None:
             return
         cache_dict = {
             LLMTestCaseParams.INPUT.value: test_case.input,
@@ -142,7 +157,7 @@ class TestRunCacheManager:
     def set_cached_test_run(
         self, cached_test_run: CachedTestRun, temp: bool = False
     ):
-        if self.disable_write_cache:
+        if self.disable_write_cache or portalocker is None:
             return
         if temp:
@@ -151,7 +166,7 @@ class TestRunCacheManager:
             self.cached_test_run = cached_test_run
     def save_cached_test_run(self, to_temp: bool = False):
-        if self.disable_write_cache:
+        if self.disable_write_cache or portalocker is None:
             return
         if to_temp:
@@ -178,7 +193,7 @@ class TestRunCacheManager:
                 )
     def create_cached_test_run(self, temp: bool = False):
-        if self.disable_write_cache:
+        if self.disable_write_cache or portalocker is None:
             return
         cached_test_run = CachedTestRun()
@@ -188,7 +203,7 @@ class TestRunCacheManager:
     def get_cached_test_run(
         self, from_temp: bool = False
     ) -> Union[CachedTestRun, None]:
-        if self.disable_write_cache:
+        if self.disable_write_cache or portalocker is None:
             return
         should_create_cached_test_run = False
@@ -209,7 +224,7 @@ class TestRunCacheManager:
                     try:
                         data = json.loads(content)
                         self.temp_cached_test_run = CachedTestRun.load(data)
-                    except Exception as e:
+                    except Exception:
                         should_create_cached_test_run = True
             except portalocker.exceptions.LockException as e:
                 print(
@@ -217,6 +232,9 @@ class TestRunCacheManager:
                     file=sys.stderr,
                 )
+            if should_create_cached_test_run:
+                self.create_cached_test_run(temp=from_temp)
             return self.temp_cached_test_run
         else:
             if self.cached_test_run:
@@ -250,6 +268,9 @@ class TestRunCacheManager:
             return self.cached_test_run
     def wrap_up_cached_test_run(self):
+        if portalocker is None:
+            return
         if self.disable_write_cache:
             # Clear cache if write cache is disabled
             delete_file_if_exists(self.cache_file_name)
@@ -330,7 +351,7 @@ class Cache:
                             if criteria_value != cached_criteria_value:
                                 return False
                             continue
-                    except:
+                    except Exception:
                         # For non-GEval
                         continue

deepeval/test_run/hyperparameters.py CHANGED Viewed

@@ -33,7 +33,11 @@ def process_hyperparameters(
             )
         if isinstance(value, Prompt):
-            prompt_key = f"{value.alias}_{value.version}"
+            try:
+                prompt_key = f"{value.alias}_{value.version}"
+            except AttributeError:
+                prompt_key = f"{value.alias}_00.00.01"
             if value._prompt_version_id is not None and value.type is not None:
                 processed_hyperparameters[key] = PromptApi(
                     id=value._prompt_version_id,

deepeval/test_run/test_run.py CHANGED Viewed

@@ -6,11 +6,11 @@ from typing import Any, Optional, List, Dict, Union, Tuple
 import shutil
 import sys
 import datetime
-import portalocker
 from rich.table import Table
 from rich.console import Console
 from rich import print
 from deepeval.metrics import BaseMetric
 from deepeval.confident.api import Api, Endpoints, HttpMethods, is_confident
 from deepeval.test_run.api import (
@@ -25,6 +25,7 @@ from deepeval.test_case import LLMTestCase, ConversationalTestCase, MLLMTestCase
 from deepeval.utils import (
     delete_file_if_exists,
     get_is_running_deepeval,
+    is_read_only_env,
     open_browser,
     shorten,
     format_turn,
@@ -42,6 +43,21 @@ from rich.panel import Panel
 from rich.columns import Columns
+portalocker = None
+if not is_read_only_env():
+    try:
+        import portalocker
+    except Exception as e:
+        print(
+            f"Warning: failed to import portalocker: {e}",
+            file=sys.stderr,
+        )
+else:
+    print(
+        "Warning: DeepEval is configured for read only environment. Test runs will not be written to disk."
+    )
 TEMP_FILE_PATH = f"{HIDDEN_DIR}/.temp_test_run_data.json"
 LATEST_TEST_RUN_FILE_PATH = f"{HIDDEN_DIR}/.latest_test_run.json"
 LATEST_TEST_RUN_DATA_KEY = "testRunData"
@@ -456,7 +472,7 @@ class TestRunManager:
         if self.test_run is None:
             self.create_test_run(identifier=identifier)
-        if self.save_to_disk:
+        if portalocker and self.save_to_disk:
             try:
                 with portalocker.Lock(
                     self.temp_file_path,
@@ -479,7 +495,7 @@ class TestRunManager:
         return self.test_run
     def save_test_run(self, path: str, save_under_key: Optional[str] = None):
-        if self.save_to_disk:
+        if portalocker and self.save_to_disk:
             try:
                 # ensure parent directory exists
                 parent = os.path.dirname(path)
@@ -505,11 +521,14 @@ class TestRunManager:
                 pass
     def save_final_test_run_link(self, link: str):
-        try:
-            with portalocker.Lock(LATEST_TEST_RUN_FILE_PATH, mode="w") as file:
-                json.dump({LATEST_TEST_RUN_LINK_KEY: link}, file)
-        except portalocker.exceptions.LockException:
-            pass
+        if portalocker:
+            try:
+                with portalocker.Lock(
+                    LATEST_TEST_RUN_FILE_PATH, mode="w"
+                ) as file:
+                    json.dump({LATEST_TEST_RUN_LINK_KEY: link}, file)
+            except portalocker.exceptions.LockException:
+                pass
     def update_test_run(
         self,
@@ -523,7 +542,7 @@ class TestRunManager:
         ):
             return
-        if self.save_to_disk:
+        if portalocker and self.save_to_disk:
             try:
                 with portalocker.Lock(
                     self.temp_file_path,

deepeval/tracing/tracing.py CHANGED Viewed

@@ -144,7 +144,7 @@ class TraceManager:
     def mask(self, data: Any):
         if self.custom_mask_fn is not None:
-            self.custom_mask_fn(data)
+            return self.custom_mask_fn(data)
         else:
             return data

deepeval/utils.py CHANGED Viewed

@@ -810,3 +810,7 @@ def format_error_text(
         text += " (Run with LOG_LEVEL=DEBUG for stack trace.)"
     return text
+def is_read_only_env():
+    return get_settings().DEEPEVAL_FILE_SYSTEM == "READ_ONLY"

{deepeval-3.7.0.dist-info → deepeval-3.7.1.dist-info}/METADATA RENAMED Viewed

@@ -1,6 +1,6 @@
 Metadata-Version: 2.1
 Name: deepeval
-Version: 3.7.0
+Version: 3.7.1
 Summary: The LLM Evaluation Framework
 Home-page: https://github.com/confident-ai/deepeval
 License: Apache-2.0
@@ -32,7 +32,7 @@ Requires-Dist: pyfiglet
 Requires-Dist: pytest
 Requires-Dist: pytest-asyncio
 Requires-Dist: pytest-repeat
-Requires-Dist: pytest-rerunfailures (>=12.0,<13.0)
+Requires-Dist: pytest-rerunfailures
 Requires-Dist: pytest-xdist
 Requires-Dist: python-dotenv (>=1.1.1,<2.0.0)
 Requires-Dist: requests (>=2.31.0,<3.0.0)

deepeval 3.7.0__py3-none-any.whl → 3.7.1__py3-none-any.whl

deepeval 3.7.0__py3-none-any.whl → 3.7.1__py3-none-any.whl