deepeval 3.7.0__py3-none-any.whl → 3.7.2__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- deepeval/__init__.py +0 -4
- deepeval/_version.py +1 -1
- deepeval/cli/main.py +7 -0
- deepeval/confident/api.py +6 -1
- deepeval/config/settings.py +5 -0
- deepeval/evaluate/compare.py +219 -4
- deepeval/evaluate/types.py +6 -0
- deepeval/evaluate/utils.py +30 -0
- deepeval/key_handler.py +1 -0
- deepeval/metrics/arena_g_eval/arena_g_eval.py +5 -1
- deepeval/metrics/arena_g_eval/utils.py +5 -5
- deepeval/metrics/conversational_g_eval/conversational_g_eval.py +9 -18
- deepeval/metrics/g_eval/g_eval.py +5 -1
- deepeval/metrics/g_eval/utils.py +1 -1
- deepeval/metrics/multimodal_metrics/multimodal_g_eval/multimodal_g_eval.py +5 -1
- deepeval/metrics/utils.py +1 -1
- deepeval/models/llms/gemini_model.py +27 -5
- deepeval/openai_agents/callback_handler.py +12 -3
- deepeval/prompt/prompt.py +25 -14
- deepeval/simulator/template.py +1 -1
- deepeval/synthesizer/config.py +9 -0
- deepeval/synthesizer/schema.py +23 -0
- deepeval/synthesizer/synthesizer.py +1137 -2
- deepeval/synthesizer/templates/__init__.py +11 -2
- deepeval/synthesizer/templates/template.py +554 -1
- deepeval/synthesizer/templates/template_extraction.py +32 -0
- deepeval/synthesizer/templates/template_prompt.py +262 -0
- deepeval/test_case/__init__.py +2 -1
- deepeval/test_case/arena_test_case.py +15 -4
- deepeval/test_case/mllm_test_case.py +45 -22
- deepeval/test_run/cache.py +31 -10
- deepeval/test_run/hyperparameters.py +5 -1
- deepeval/test_run/test_run.py +28 -9
- deepeval/tracing/tracing.py +1 -1
- deepeval/utils.py +4 -0
- {deepeval-3.7.0.dist-info → deepeval-3.7.2.dist-info}/METADATA +3 -2
- {deepeval-3.7.0.dist-info → deepeval-3.7.2.dist-info}/RECORD +40 -40
- {deepeval-3.7.0.dist-info → deepeval-3.7.2.dist-info}/LICENSE.md +0 -0
- {deepeval-3.7.0.dist-info → deepeval-3.7.2.dist-info}/WHEEL +0 -0
- {deepeval-3.7.0.dist-info → deepeval-3.7.2.dist-info}/entry_points.txt +0 -0
@@ -37,6 +37,69 @@ class PromptSynthesizerTemplate:
         JSON:
         """

+    @staticmethod
+    def generate_synthetic_conversational_scenarios(
+        scenario: str,
+        conversational_task: str,
+        participant_roles: str,
+        num_goldens: int,
+    ):
+        return f"""
+        Generate a series of conversational SCENARIOS from scratch based on the provided scenario description,
+        conversational task, and participant roles.
+
+        A SCENARIO is a narrative description of a situation in which a conversation naturally occurs.
+        It is NOT a question, NOT a prompt, and NOT a user query. It MUST purely describe context.
+
+        Each scenario MUST depict a realistic MULTI-TURN conversational situation involving the given participants.
+
+        **
+        IMPORTANT FORMAT:
+        - Only return JSON
+        - JSON MUST contain: {{ "data": [ {{ "scenario": "..." }}, ... ] }}
+        - You MUST TRY to generate {num_goldens} items
+        **
+
+        Example of GOOD scenarios (situational descriptions):
+        - "During a late afternoon code review session, a junior engineer asks their senior engineer why an async function is inconsistent, leading to a detailed back-and-forth about race conditions."
+        - "While preparing for a sprint demo, a senior engineer helps a junior engineer interpret stack traces, prompting a step-by-step explanation."
+
+        Example of BAD scenarios (DO NOT DO):
+        - "Why does my async function return inconsistent results?" (This is a prompt)
+        - "Explain how to debug race conditions." (Instruction)
+        - "What is the freezing point of water?" (Question)
+
+        CRITICAL REQUIREMENTS:
+        - Scenario MUST be a narrative description of a SITUATION.
+        - Scenario MUST involve these participant roles: {participant_roles}
+        - Scenario MUST align with this conversational task: {conversational_task}
+        - Scenario MUST feel natural, real-world, and MULTI-TURN.
+        - Scenario MUST NOT contain:
+            • direct questions
+            • instructions
+            • tasks
+            • explicit prompts
+            • standalone facts
+        - Scenario MUST be grounded in the meaning of the provided base scenario description.
+
+        You MUST TRY to generate {num_goldens} high-quality, non-repetitive scenarios.
+        **
+
+        Base Scenario Description:
+        {scenario}
+
+        Conversational Task:
+        {conversational_task}
+
+        Participant Roles:
+        {participant_roles}
+
+        Num Scenarios:
+        {num_goldens}
+
+        JSON:
+        """
+

 ######################################################################################################
 ##### Approach similar to https://github.com/nlpxucan/WizardLM/blob/main/Evol_Instruct/depth.py ######
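
For orientation, a minimal sketch of driving the new scenario-generation template. The method signature matches the hunk above; the import path (`deepeval.synthesizer.templates.template_prompt`, the file that gains +262 lines in this release) and the downstream LLM call are assumptions, not shown in this diff.

    # Hedged sketch: builds the scenario-generation prompt added in 3.7.2.
    # Assumption: PromptSynthesizerTemplate is importable from this module path.
    from deepeval.synthesizer.templates.template_prompt import PromptSynthesizerTemplate

    prompt = PromptSynthesizerTemplate.generate_synthetic_conversational_scenarios(
        scenario="A support engineer helps a customer migrate a legacy deployment.",
        conversational_task="Troubleshoot and plan the migration together.",
        participant_roles="support engineer, customer",
        num_goldens=3,
    )

    # The returned string is a plain prompt; send it to any LLM client and parse
    # the {"data": [{"scenario": "..."}]} JSON it instructs the model to return.
    print(prompt[:200])
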
@@ -282,3 +345,202 @@ class PromptEvolutionTemplate:
             Rewritten Input:
             """
         )
+
+
+class ConversationalPromptEvolutionTemplate:
+
+    base_instruction = """I want you to act as a conversational scenario rewriter.
+    Your objective is to rewrite the given `Scenario`. You MUST complicate the `Scenario` using the following method:"""
+
+    @staticmethod
+    def reasoning_evolution(scenario):
+        return (
+            ConversationalPromptEvolutionTemplate.base_instruction
+            + f"""
+            1. Rewrite `Scenario` to force participants into multi-step conversational reasoning.
+            2. Add layered inferences or analytical leaps required in dialogue.
+            3. `Rewritten Scenario` must stay concise, human-readable, and remain a conversation setup.
+            4. Do NOT exceed **15 words**.
+
+            **
+            EXAMPLES
+
+            Example scenario:
+            Two students discuss climate change.
+            Example rewritten scenario:
+            Two students debate climate impacts, tracing cause-effect chains across multiple evidence sources.
+
+            --------------------------
+
+            Example scenario:
+            A doctor explains treatment options.
+            Example rewritten scenario:
+            Doctor and patient reason through symptoms requiring sequential diagnostic logic.
+
+            --------------------------
+
+            Scenario:
+            {scenario}
+            Rewritten Scenario:
+            """
+        )
+
+    @staticmethod
+    def concretizing_evolution(scenario):
+        return (
+            ConversationalPromptEvolutionTemplate.base_instruction
+            + f"""
+            1. Replace broad conversation setup with a **more specific, concrete** conversational scene.
+            2. Add real-world detail (location, constraint, specific topic).
+            3. Keep under **15 words**, concise, and still a dialogue setup.
+
+            **
+            EXAMPLES
+
+            Example scenario:
+            Two engineers talk about safety.
+            Example rewritten scenario:
+            Two engineers argue over failing brake-system logs during late-night review.
+
+            --------------------------
+
+            Example scenario:
+            Two friends discuss exercise.
+            Example rewritten scenario:
+            Two friends compare heart-rate sensor issues during a marathon-training chat.
+
+            --------------------------
+
+            Scenario:
+            {scenario}
+            Rewritten Scenario:
+            """
+        )
+
+    @staticmethod
+    def constrained_evolution(scenario):
+        return (
+            ConversationalPromptEvolutionTemplate.base_instruction
+            + f"""
+            1. Add at least one new constraint shaping the conversation.
+            2. Constraint must significantly affect the dialogue.
+            3. Keep under **15 words**, concise, conversational.
+
+            **
+            EXAMPLES
+
+            Example scenario:
+            Two coworkers plan a report.
+            Example rewritten scenario:
+            Two coworkers plan a report with strict no-internet constraint.
+
+            --------------------------
+
+            Example scenario:
+            A teacher reviews homework.
+            Example rewritten scenario:
+            Teacher and student discuss homework under urgent submission deadline.
+
+            --------------------------
+
+            Scenario:
+            {scenario}
+            Rewritten Scenario:
+            """
+        )
+
+    @staticmethod
+    def comparative_question_evolution(scenario):
+        return (
+            ConversationalPromptEvolutionTemplate.base_instruction
+            + f"""
+            1. Rewrite `Scenario` so the conversation centers on comparing two+ items.
+            2. Must highlight similarities/differences through dialogue.
+            3. Keep under **15 words**, concise, conversational.
+
+            **
+            EXAMPLES
+
+            Example scenario:
+            Two analysts discuss tools.
+            Example rewritten scenario:
+            Two analysts compare legacy analytics pipeline vs. new automated system.
+
+            --------------------------
+
+            Example scenario:
+            Two students study history.
+            Example rewritten scenario:
+            Two students contrast Renaissance ideals with Enlightenment philosophies.
+
+            --------------------------
+
+            Scenario:
+            {scenario}
+            Rewritten Scenario:
+            """
+        )
+
+    @staticmethod
+    def hypothetical_scenario_evolution(scenario):
+        return (
+            ConversationalPromptEvolutionTemplate.base_instruction
+            + f"""
+            1. Rewrite `Scenario` to introduce a hypothetical twist derived from the setup.
+            2. The hypothetical MUST drive the conversation.
+            3. Keep under **15 words**, concise, conversational.
+
+            **
+            EXAMPLES
+
+            Example scenario:
+            Two scientists discuss pollution.
+            Example rewritten scenario:
+            Two scientists debate effects if emissions doubled overnight.
+
+            --------------------------
+
+            Example scenario:
+            A medic trains a recruit.
+            Example rewritten scenario:
+            Medic and recruit plan response to hypothetical antibiotic-resistant outbreak.
+
+            --------------------------
+
+            Scenario:
+            {scenario}
+            Rewritten Scenario:
+            """
+        )
+
+    @staticmethod
+    def in_breadth_evolution(scenario):
+        return (
+            ConversationalPromptEvolutionTemplate.base_instruction
+            + f"""
+            1. Rewrite `Scenario` into a new conversation within the same domain.
+            2. The new conversation must explore a rarer, niche angle.
+            3. Keep under **15 words**, concise, conversational.
+
+            **
+            EXAMPLES
+
+            Example scenario:
+            Two doctors discuss patient care.
+            Example rewritten scenario:
+            Two doctors debate rare autoimmune disorder diagnostics.
+
+            --------------------------
+
+            Example scenario:
+            Two programmers discuss bugs.
+            Example rewritten scenario:
+            Two programmers examine obscure concurrency race-condition failures.
+
+            --------------------------
+
+            Scenario:
+            {scenario}
+            Rewritten Scenario:
+            """
+        )
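
A similar sketch for the new ConversationalPromptEvolutionTemplate: each static method returns `base_instruction` plus its rewrite rules, ending in "Rewritten Scenario:" for a model to complete. The import path is the same assumption as above.

    # Hedged sketch: complicates a generated scenario with one evolution method.
    # Assumption: ConversationalPromptEvolutionTemplate lives in template_prompt.py.
    from deepeval.synthesizer.templates.template_prompt import (
        ConversationalPromptEvolutionTemplate,
    )

    base_scenario = "Two students discuss climate change."

    # Returns the base instruction followed by the reasoning-specific rules;
    # the trailing "Rewritten Scenario:" is left for the LLM to fill in.
    evolution_prompt = ConversationalPromptEvolutionTemplate.reasoning_evolution(
        base_scenario
    )
    print(evolution_prompt)
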
deepeval/test_case/__init__.py
CHANGED

@@ -10,7 +10,7 @@ from .conversational_test_case import (
     TurnParams,
 )
 from .mllm_test_case import MLLMTestCase, MLLMTestCaseParams, MLLMImage
-from .arena_test_case import ArenaTestCase
+from .arena_test_case import ArenaTestCase, Contestant
 from .mcp import (
     MCPServer,
     MCPPromptCall,

@@ -35,4 +35,5 @@ __all__ = [
     "MLLMTestCaseParams",
     "MLLMImage",
     "ArenaTestCase",
+    "Contestant",
 ]
deepeval/test_case/arena_test_case.py
CHANGED

@@ -1,20 +1,31 @@
+from typing import List, Dict, Optional, Union
 from dataclasses import dataclass
-from
+from pydantic import BaseModel
+
 from deepeval.test_case import (
     LLMTestCase,
 )
+from deepeval.prompt import Prompt
+
+
+class Contestant(BaseModel):
+    name: str
+    test_case: LLMTestCase
+    hyperparameters: Optional[Dict[str, Union[str, int, float, Prompt]]] = None
+
+    model_config = {"arbitrary_types_allowed": True}


 @dataclass
 class ArenaTestCase:
-    contestants:
+    contestants: List[Contestant]

     def __post_init__(self):
-        contestant_names =
+        contestant_names = [contestant.name for contestant in self.contestants]
         if len(contestant_names) != len(set(contestant_names)):
             raise ValueError("All contestant names must be unique.")

-        cases =
+        cases = [contestant.test_case for contestant in self.contestants]
         ref_input = cases[0].input
         for case in cases[1:]:
             if case.input != ref_input:
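
Based only on the fields visible in this hunk, a minimal sketch of the reworked arena API; the contestant outputs and hyperparameter values are placeholders.

    # Hedged sketch: constructing the new Contestant-based ArenaTestCase.
    from deepeval.test_case import ArenaTestCase, Contestant, LLMTestCase

    shared_input = "Summarize the quarterly report in two sentences."

    arena_case = ArenaTestCase(
        contestants=[
            Contestant(
                name="variant-a",  # names must be unique across contestants
                test_case=LLMTestCase(input=shared_input, actual_output="Summary A..."),
                hyperparameters={"temperature": 0.2},
            ),
            Contestant(
                name="variant-b",
                test_case=LLMTestCase(input=shared_input, actual_output="Summary B..."),
            ),
        ]
    )
    # __post_init__ enforces unique contestant names and identical inputs.
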
deepeval/test_case/mllm_test_case.py
CHANGED

@@ -11,33 +11,50 @@ from deepeval.test_case import ToolCall

 @dataclass
 class MLLMImage:
-
+    dataBase64: Optional[str] = None
+    mimeType: Optional[str] = None
+    url: Optional[str] = None
     local: Optional[bool] = None
-    filename: Optional[str] =
-    mimeType: Optional[str] = field(default=None, init=False, repr=False)
-    dataBase64: Optional[str] = field(default=None, init=False, repr=False)
+    filename: Optional[str] = None

     def __post_init__(self):
-
-        if self.
-
-
-
-
-
-
-
-        self.filename = os.path.basename(path)
-        self.mimeType = (
-            mimetypes.guess_type(path)[0] or "application/octet-stream"
+
+        if self.url and self.dataBase64:
+            raise ValueError(
+                "You cannot provide both 'url' and 'dataBase64' at the same time when creating an MLLMImage."
+            )
+
+        if not self.url and not self.dataBase64:
+            raise ValueError(
+                "You must provide either a 'url' or both 'dataBase64' and 'mimeType' to create an MLLMImage."
             )
-
-
-        self.
+
+        if self.dataBase64 is not None:
+            if self.mimeType is None:
+                raise ValueError(
+                    "mimeType must be provided when initializing from Base64 data."
+                )
         else:
-
-        self.
-
+            is_local = self.is_local_path(self.url)
+            if self.local is not None:
+                assert self.local == is_local, "Local path mismatch"
+            else:
+                self.local = is_local
+
+            # compute filename, mime_type, and Base64 data
+            if self.local:
+                path = self.process_url(self.url)
+                self.filename = os.path.basename(path)
+                self.mimeType = (
+                    mimetypes.guess_type(path)[0] or "application/octet-stream"
+                )
+                with open(path, "rb") as f:
+                    raw = f.read()
+                self.dataBase64 = base64.b64encode(raw).decode("ascii")
+            else:
+                self.filename = None
+                self.mimeType = None
+                self.dataBase64 = None

     @staticmethod
     def process_url(url: str) -> str:

@@ -69,6 +86,12 @@ class MLLMImage:
             return os.path.exists(path)
         return False

+    def as_data_uri(self) -> Optional[str]:
+        """Return the image as a data URI string, if Base64 data is available."""
+        if not self.dataBase64 or not self.mimeType:
+            return None
+        return f"data:{self.mimeType};base64,{self.dataBase64}"
+

 class MLLMTestCaseParams(Enum):
     INPUT = "input"
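
A small sketch of the two construction paths MLLMImage now supports and the new as_data_uri() helper; the URL and Base64 payload below are placeholders.

    # Hedged sketch: the two ways to build an MLLMImage after this change.
    from deepeval.test_case import MLLMImage

    # 1) From a URL: for remote URLs, filename/mimeType/dataBase64 stay None;
    #    for local paths they are derived and the file is read in __post_init__.
    remote_image = MLLMImage(url="https://example.com/dashboard.png")  # placeholder URL

    # 2) From raw Base64 data: mimeType is now required alongside dataBase64.
    inline_image = MLLMImage(dataBase64="iVBORw0KGgo...", mimeType="image/png")

    # New helper: returns "data:<mimeType>;base64,<data>", or None when no data.
    print(inline_image.as_data_uri()[:30])
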
deepeval/test_run/cache.py
CHANGED

@@ -1,8 +1,8 @@
-import
+import logging
 import sys
 import json
 import os
-from typing import List, Optional,
+from typing import List, Optional, Dict, Union
 from enum import Enum
 from pydantic import BaseModel, Field

@@ -12,11 +12,26 @@ from deepeval.test_case import LLMTestCaseParams, LLMTestCase, ToolCallParams
 from deepeval.test_run.api import MetricData
 from deepeval.utils import (
     delete_file_if_exists,
+    is_read_only_env,
     serialize,
 )
 from deepeval.metrics import BaseMetric
 from deepeval.constants import HIDDEN_DIR

+
+logger = logging.getLogger(__name__)
+
+
+portalocker = None
+if not is_read_only_env():
+    try:
+        import portalocker
+    except Exception as e:
+        logger.warning("failed to import portalocker: %s", e)
+else:
+    logger.warning("READ_ONLY filesystem: skipping disk cache for test runs.")
+
+
 CACHE_FILE_NAME = f"{HIDDEN_DIR}/.deepeval-cache.json"
 TEMP_CACHE_FILE_NAME = f"{HIDDEN_DIR}/.temp-deepeval-cache.json"

@@ -97,7 +112,7 @@ class TestRunCacheManager:
     def get_cached_test_case(
         self, test_case: LLMTestCase, hyperparameters: Union[Dict, None]
     ) -> Union[CachedTestCase, None]:
-        if self.disable_write_cache:
+        if self.disable_write_cache or portalocker is None:
             return None

         cached_test_run = self.get_cached_test_run()

@@ -122,7 +137,7 @@ class TestRunCacheManager:
         hyperparameters: Union[Dict, None],
         to_temp: bool = False,
     ):
-        if self.disable_write_cache:
+        if self.disable_write_cache or portalocker is None:
             return
         cache_dict = {
             LLMTestCaseParams.INPUT.value: test_case.input,

@@ -142,7 +157,7 @@ class TestRunCacheManager:
     def set_cached_test_run(
         self, cached_test_run: CachedTestRun, temp: bool = False
     ):
-        if self.disable_write_cache:
+        if self.disable_write_cache or portalocker is None:
             return

         if temp:

@@ -151,7 +166,7 @@ class TestRunCacheManager:
         self.cached_test_run = cached_test_run

     def save_cached_test_run(self, to_temp: bool = False):
-        if self.disable_write_cache:
+        if self.disable_write_cache or portalocker is None:
             return

         if to_temp:

@@ -178,7 +193,7 @@ class TestRunCacheManager:
         )

     def create_cached_test_run(self, temp: bool = False):
-        if self.disable_write_cache:
+        if self.disable_write_cache or portalocker is None:
             return

         cached_test_run = CachedTestRun()

@@ -188,7 +203,7 @@ class TestRunCacheManager:
     def get_cached_test_run(
         self, from_temp: bool = False
     ) -> Union[CachedTestRun, None]:
-        if self.disable_write_cache:
+        if self.disable_write_cache or portalocker is None:
             return

         should_create_cached_test_run = False

@@ -209,7 +224,7 @@ class TestRunCacheManager:
                 try:
                     data = json.loads(content)
                     self.temp_cached_test_run = CachedTestRun.load(data)
-                except Exception
+                except Exception:
                     should_create_cached_test_run = True
             except portalocker.exceptions.LockException as e:
                 print(

@@ -217,6 +232,9 @@ class TestRunCacheManager:
                     file=sys.stderr,
                 )

+            if should_create_cached_test_run:
+                self.create_cached_test_run(temp=from_temp)
+
             return self.temp_cached_test_run
         else:
             if self.cached_test_run:

@@ -250,6 +268,9 @@ class TestRunCacheManager:
         return self.cached_test_run

     def wrap_up_cached_test_run(self):
+        if portalocker is None:
+            return
+
         if self.disable_write_cache:
             # Clear cache if write cache is disabled
             delete_file_if_exists(self.cache_file_name)

@@ -330,7 +351,7 @@ class Cache:
                     if criteria_value != cached_criteria_value:
                         return False
                     continue
-                except:
+                except Exception:
                     # For non-GEval
                     continue
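
The same guarded optional-import pattern appears in both cache.py (above) and test_run.py (further below): portalocker is only imported when the environment is not read-only, and every disk path short-circuits when it is missing. A standalone restatement of the pattern, with `read_only()` and `save_cache()` as illustrative stand-ins rather than deepeval API:

    # Hedged sketch of the guarded optional-import pattern; names are illustrative.
    import logging
    import os

    logger = logging.getLogger(__name__)

    def read_only() -> bool:
        # Illustrative stand-in for deepeval.utils.is_read_only_env().
        return os.environ.get("APP_READ_ONLY", "0") == "1"

    portalocker = None
    if not read_only():
        try:
            import portalocker  # file-locking dependency, may be absent
        except Exception as exc:
            logger.warning("failed to import portalocker: %s", exc)
    else:
        logger.warning("read-only filesystem: skipping disk cache.")

    def save_cache(payload: str) -> None:
        # Every write path bails out when the lock library is unavailable.
        if portalocker is None:
            return
        with portalocker.Lock(".cache.json", mode="w") as fh:
            fh.write(payload)
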
deepeval/test_run/hyperparameters.py
CHANGED

@@ -33,7 +33,11 @@ def process_hyperparameters(
         )

         if isinstance(value, Prompt):
-
+            try:
+                prompt_key = f"{value.alias}_{value.version}"
+            except AttributeError:
+                prompt_key = f"{value.alias}_00.00.01"
+
             if value._prompt_version_id is not None and value.type is not None:
                 processed_hyperparameters[key] = PromptApi(
                     id=value._prompt_version_id,
deepeval/test_run/test_run.py
CHANGED

@@ -6,11 +6,11 @@ from typing import Any, Optional, List, Dict, Union, Tuple
 import shutil
 import sys
 import datetime
-import portalocker
 from rich.table import Table
 from rich.console import Console
 from rich import print

+
 from deepeval.metrics import BaseMetric
 from deepeval.confident.api import Api, Endpoints, HttpMethods, is_confident
 from deepeval.test_run.api import (

@@ -25,6 +25,7 @@ from deepeval.test_case import LLMTestCase, ConversationalTestCase, MLLMTestCase
 from deepeval.utils import (
     delete_file_if_exists,
     get_is_running_deepeval,
+    is_read_only_env,
     open_browser,
     shorten,
     format_turn,

@@ -42,6 +43,21 @@ from rich.panel import Panel
 from rich.columns import Columns


+portalocker = None
+if not is_read_only_env():
+    try:
+        import portalocker
+    except Exception as e:
+        print(
+            f"Warning: failed to import portalocker: {e}",
+            file=sys.stderr,
+        )
+else:
+    print(
+        "Warning: DeepEval is configured for read only environment. Test runs will not be written to disk."
+    )
+
+
 TEMP_FILE_PATH = f"{HIDDEN_DIR}/.temp_test_run_data.json"
 LATEST_TEST_RUN_FILE_PATH = f"{HIDDEN_DIR}/.latest_test_run.json"
 LATEST_TEST_RUN_DATA_KEY = "testRunData"

@@ -456,7 +472,7 @@ class TestRunManager:
         if self.test_run is None:
             self.create_test_run(identifier=identifier)

-        if self.save_to_disk:
+        if portalocker and self.save_to_disk:
             try:
                 with portalocker.Lock(
                     self.temp_file_path,

@@ -479,7 +495,7 @@ class TestRunManager:
         return self.test_run

     def save_test_run(self, path: str, save_under_key: Optional[str] = None):
-        if self.save_to_disk:
+        if portalocker and self.save_to_disk:
             try:
                 # ensure parent directory exists
                 parent = os.path.dirname(path)

@@ -505,11 +521,14 @@ class TestRunManager:
                 pass

     def save_final_test_run_link(self, link: str):
-
-
-
-
-
+        if portalocker:
+            try:
+                with portalocker.Lock(
+                    LATEST_TEST_RUN_FILE_PATH, mode="w"
+                ) as file:
+                    json.dump({LATEST_TEST_RUN_LINK_KEY: link}, file)
+            except portalocker.exceptions.LockException:
+                pass

     def update_test_run(
         self,

@@ -523,7 +542,7 @@ class TestRunManager:
         ):
             return

-        if self.save_to_disk:
+        if portalocker and self.save_to_disk:
             try:
                 with portalocker.Lock(
                     self.temp_file_path,
deepeval/tracing/tracing.py
CHANGED

deepeval/utils.py
CHANGED
{deepeval-3.7.0.dist-info → deepeval-3.7.2.dist-info}/METADATA
CHANGED

@@ -1,6 +1,6 @@
 Metadata-Version: 2.1
 Name: deepeval
-Version: 3.7.0
+Version: 3.7.2
 Summary: The LLM Evaluation Framework
 Home-page: https://github.com/confident-ai/deepeval
 License: Apache-2.0

@@ -32,7 +32,7 @@ Requires-Dist: pyfiglet
 Requires-Dist: pytest
 Requires-Dist: pytest-asyncio
 Requires-Dist: pytest-repeat
-Requires-Dist: pytest-rerunfailures
+Requires-Dist: pytest-rerunfailures
 Requires-Dist: pytest-xdist
 Requires-Dist: python-dotenv (>=1.1.1,<2.0.0)
 Requires-Dist: requests (>=2.31.0,<3.0.0)

@@ -439,6 +439,7 @@ Using `.env.local` or `.env` is optional. If they are missing, DeepEval uses you
 ```bash
 cp .env.example .env.local
 # then edit .env.local (ignored by git)
+```

 <br />
