deepeval 3.7.5__py3-none-any.whl → 3.7.7__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- deepeval/_version.py +1 -1
- deepeval/cli/main.py +2022 -759
- deepeval/cli/utils.py +208 -36
- deepeval/config/dotenv_handler.py +19 -0
- deepeval/config/settings.py +675 -245
- deepeval/config/utils.py +9 -1
- deepeval/dataset/api.py +23 -1
- deepeval/dataset/golden.py +106 -21
- deepeval/evaluate/evaluate.py +0 -3
- deepeval/evaluate/execute.py +162 -315
- deepeval/evaluate/utils.py +6 -30
- deepeval/key_handler.py +124 -51
- deepeval/metrics/__init__.py +0 -4
- deepeval/metrics/answer_relevancy/answer_relevancy.py +89 -132
- deepeval/metrics/answer_relevancy/template.py +102 -179
- deepeval/metrics/arena_g_eval/arena_g_eval.py +98 -96
- deepeval/metrics/arena_g_eval/template.py +17 -1
- deepeval/metrics/argument_correctness/argument_correctness.py +81 -87
- deepeval/metrics/argument_correctness/template.py +19 -2
- deepeval/metrics/base_metric.py +19 -41
- deepeval/metrics/bias/bias.py +102 -108
- deepeval/metrics/bias/template.py +14 -2
- deepeval/metrics/contextual_precision/contextual_precision.py +56 -92
- deepeval/metrics/contextual_recall/contextual_recall.py +58 -85
- deepeval/metrics/contextual_relevancy/contextual_relevancy.py +53 -83
- deepeval/metrics/conversation_completeness/conversation_completeness.py +101 -119
- deepeval/metrics/conversation_completeness/template.py +23 -3
- deepeval/metrics/conversational_dag/conversational_dag.py +12 -8
- deepeval/metrics/conversational_dag/nodes.py +66 -123
- deepeval/metrics/conversational_dag/templates.py +16 -0
- deepeval/metrics/conversational_g_eval/conversational_g_eval.py +47 -66
- deepeval/metrics/dag/dag.py +10 -0
- deepeval/metrics/dag/nodes.py +63 -126
- deepeval/metrics/dag/templates.py +14 -0
- deepeval/metrics/exact_match/exact_match.py +9 -1
- deepeval/metrics/faithfulness/faithfulness.py +82 -136
- deepeval/metrics/g_eval/g_eval.py +93 -79
- deepeval/metrics/g_eval/template.py +18 -1
- deepeval/metrics/g_eval/utils.py +7 -6
- deepeval/metrics/goal_accuracy/goal_accuracy.py +91 -76
- deepeval/metrics/goal_accuracy/template.py +21 -3
- deepeval/metrics/hallucination/hallucination.py +60 -75
- deepeval/metrics/hallucination/template.py +13 -0
- deepeval/metrics/indicator.py +11 -10
- deepeval/metrics/json_correctness/json_correctness.py +40 -38
- deepeval/metrics/json_correctness/template.py +10 -0
- deepeval/metrics/knowledge_retention/knowledge_retention.py +60 -97
- deepeval/metrics/knowledge_retention/schema.py +9 -3
- deepeval/metrics/knowledge_retention/template.py +12 -0
- deepeval/metrics/mcp/mcp_task_completion.py +72 -43
- deepeval/metrics/mcp/multi_turn_mcp_use_metric.py +93 -75
- deepeval/metrics/mcp/schema.py +4 -0
- deepeval/metrics/mcp/template.py +59 -0
- deepeval/metrics/mcp_use_metric/mcp_use_metric.py +58 -64
- deepeval/metrics/mcp_use_metric/template.py +12 -0
- deepeval/metrics/misuse/misuse.py +77 -97
- deepeval/metrics/misuse/template.py +15 -0
- deepeval/metrics/multimodal_metrics/__init__.py +0 -1
- deepeval/metrics/multimodal_metrics/image_coherence/image_coherence.py +37 -38
- deepeval/metrics/multimodal_metrics/image_editing/image_editing.py +55 -76
- deepeval/metrics/multimodal_metrics/image_helpfulness/image_helpfulness.py +37 -38
- deepeval/metrics/multimodal_metrics/image_reference/image_reference.py +37 -38
- deepeval/metrics/multimodal_metrics/text_to_image/text_to_image.py +57 -76
- deepeval/metrics/non_advice/non_advice.py +79 -105
- deepeval/metrics/non_advice/template.py +12 -0
- deepeval/metrics/pattern_match/pattern_match.py +12 -4
- deepeval/metrics/pii_leakage/pii_leakage.py +75 -106
- deepeval/metrics/pii_leakage/template.py +14 -0
- deepeval/metrics/plan_adherence/plan_adherence.py +63 -89
- deepeval/metrics/plan_adherence/template.py +11 -0
- deepeval/metrics/plan_quality/plan_quality.py +63 -87
- deepeval/metrics/plan_quality/template.py +9 -0
- deepeval/metrics/prompt_alignment/prompt_alignment.py +78 -86
- deepeval/metrics/prompt_alignment/template.py +12 -0
- deepeval/metrics/role_adherence/role_adherence.py +48 -71
- deepeval/metrics/role_adherence/template.py +14 -0
- deepeval/metrics/role_violation/role_violation.py +75 -108
- deepeval/metrics/role_violation/template.py +12 -0
- deepeval/metrics/step_efficiency/step_efficiency.py +55 -65
- deepeval/metrics/step_efficiency/template.py +11 -0
- deepeval/metrics/summarization/summarization.py +115 -183
- deepeval/metrics/summarization/template.py +19 -0
- deepeval/metrics/task_completion/task_completion.py +67 -73
- deepeval/metrics/tool_correctness/tool_correctness.py +43 -42
- deepeval/metrics/tool_use/schema.py +4 -0
- deepeval/metrics/tool_use/template.py +16 -2
- deepeval/metrics/tool_use/tool_use.py +72 -94
- deepeval/metrics/topic_adherence/schema.py +4 -0
- deepeval/metrics/topic_adherence/template.py +21 -1
- deepeval/metrics/topic_adherence/topic_adherence.py +68 -81
- deepeval/metrics/toxicity/template.py +13 -0
- deepeval/metrics/toxicity/toxicity.py +80 -99
- deepeval/metrics/turn_contextual_precision/schema.py +3 -3
- deepeval/metrics/turn_contextual_precision/template.py +9 -2
- deepeval/metrics/turn_contextual_precision/turn_contextual_precision.py +154 -154
- deepeval/metrics/turn_contextual_recall/schema.py +3 -3
- deepeval/metrics/turn_contextual_recall/template.py +8 -1
- deepeval/metrics/turn_contextual_recall/turn_contextual_recall.py +148 -143
- deepeval/metrics/turn_contextual_relevancy/schema.py +2 -2
- deepeval/metrics/turn_contextual_relevancy/template.py +8 -1
- deepeval/metrics/turn_contextual_relevancy/turn_contextual_relevancy.py +154 -157
- deepeval/metrics/turn_faithfulness/schema.py +1 -1
- deepeval/metrics/turn_faithfulness/template.py +8 -1
- deepeval/metrics/turn_faithfulness/turn_faithfulness.py +180 -203
- deepeval/metrics/turn_relevancy/template.py +14 -0
- deepeval/metrics/turn_relevancy/turn_relevancy.py +56 -69
- deepeval/metrics/utils.py +161 -91
- deepeval/models/__init__.py +2 -0
- deepeval/models/base_model.py +44 -6
- deepeval/models/embedding_models/azure_embedding_model.py +34 -12
- deepeval/models/embedding_models/local_embedding_model.py +22 -7
- deepeval/models/embedding_models/ollama_embedding_model.py +17 -6
- deepeval/models/embedding_models/openai_embedding_model.py +3 -2
- deepeval/models/llms/__init__.py +2 -0
- deepeval/models/llms/amazon_bedrock_model.py +229 -73
- deepeval/models/llms/anthropic_model.py +143 -48
- deepeval/models/llms/azure_model.py +169 -95
- deepeval/models/llms/constants.py +2032 -0
- deepeval/models/llms/deepseek_model.py +82 -35
- deepeval/models/llms/gemini_model.py +126 -67
- deepeval/models/llms/grok_model.py +128 -65
- deepeval/models/llms/kimi_model.py +129 -87
- deepeval/models/llms/litellm_model.py +94 -18
- deepeval/models/llms/local_model.py +115 -16
- deepeval/models/llms/ollama_model.py +97 -76
- deepeval/models/llms/openai_model.py +169 -311
- deepeval/models/llms/portkey_model.py +58 -16
- deepeval/models/llms/utils.py +5 -2
- deepeval/models/retry_policy.py +10 -5
- deepeval/models/utils.py +56 -4
- deepeval/simulator/conversation_simulator.py +49 -2
- deepeval/simulator/template.py +16 -1
- deepeval/synthesizer/synthesizer.py +19 -17
- deepeval/test_case/api.py +24 -45
- deepeval/test_case/arena_test_case.py +7 -2
- deepeval/test_case/conversational_test_case.py +55 -6
- deepeval/test_case/llm_test_case.py +60 -6
- deepeval/test_run/api.py +3 -0
- deepeval/test_run/test_run.py +6 -1
- deepeval/utils.py +26 -0
- {deepeval-3.7.5.dist-info → deepeval-3.7.7.dist-info}/METADATA +3 -3
- {deepeval-3.7.5.dist-info → deepeval-3.7.7.dist-info}/RECORD +145 -148
- deepeval/metrics/multimodal_metrics/multimodal_g_eval/__init__.py +0 -0
- deepeval/metrics/multimodal_metrics/multimodal_g_eval/multimodal_g_eval.py +0 -386
- deepeval/metrics/multimodal_metrics/multimodal_g_eval/schema.py +0 -11
- deepeval/metrics/multimodal_metrics/multimodal_g_eval/template.py +0 -133
- deepeval/metrics/multimodal_metrics/multimodal_g_eval/utils.py +0 -68
- {deepeval-3.7.5.dist-info → deepeval-3.7.7.dist-info}/LICENSE.md +0 -0
- {deepeval-3.7.5.dist-info → deepeval-3.7.7.dist-info}/WHEEL +0 -0
- {deepeval-3.7.5.dist-info → deepeval-3.7.7.dist-info}/entry_points.txt +0 -0
deepeval/models/llms/portkey_model.py
CHANGED

@@ -3,10 +3,13 @@ import requests
 from typing import Any, Dict, List, Optional, Union
 from pydantic import AnyUrl, SecretStr
 
+from deepeval.errors import DeepEvalError
 from deepeval.config.settings import get_settings
 from deepeval.models.utils import (
     require_secret_api_key,
 )
+from deepeval.test_case import MLLMImage
+from deepeval.utils import check_if_multimodal, convert_to_multi_modal_array
 from deepeval.models import DeepEvalBaseLLM
 from deepeval.utils import require_param
 
@@ -29,16 +32,9 @@ class PortkeyModel(DeepEvalBaseLLM):
         settings = get_settings()
         model = model or settings.PORTKEY_MODEL_NAME
 
-        self.name = require_param(
-            model,
-            provider_label="Portkey",
-            env_var_name="PORTKEY_MODEL_NAME",
-            param_hint="model",
-        )
-
         if api_key is not None:
             # keep it secret, keep it safe from serializings, logging and alike
-            self.api_key: SecretStr
+            self.api_key: Optional[SecretStr] = SecretStr(api_key)
         else:
             self.api_key = settings.PORTKEY_API_KEY
 
@@ -47,6 +43,16 @@ class PortkeyModel(DeepEvalBaseLLM):
         elif settings.PORTKEY_BASE_URL is not None:
             base_url = str(settings.PORTKEY_BASE_URL).rstrip("/")
 
+        provider = provider or settings.PORTKEY_PROVIDER_NAME
+
+        # validation
+        model = require_param(
+            model,
+            provider_label="Portkey",
+            env_var_name="PORTKEY_MODEL_NAME",
+            param_hint="model",
+        )
+
         self.base_url = require_param(
             base_url,
             provider_label="Portkey",
@@ -54,7 +60,6 @@ class PortkeyModel(DeepEvalBaseLLM):
             param_hint="base_url",
         )
 
-        provider = provider or settings.PORTKEY_PROVIDER_NAME
         self.provider = require_param(
             provider,
             provider_label="Portkey",
@@ -64,6 +69,7 @@ class PortkeyModel(DeepEvalBaseLLM):
         # Keep sanitized kwargs for client call to strip legacy keys
         self.kwargs = kwargs
         self.generation_kwargs = generation_kwargs or {}
+        super().__init__(model)
 
     def _headers(self) -> Dict[str, str]:
         api_key = require_secret_api_key(
@@ -82,18 +88,51 @@ class PortkeyModel(DeepEvalBaseLLM):
         return headers
 
     def _payload(self, prompt: str) -> Dict[str, Any]:
+        if check_if_multimodal(prompt):
+            prompt = convert_to_multi_modal_array(input=prompt)
+            content = self.generate_content(prompt)
+        else:
+            content = [{"type": "text", "text": prompt}]
         payload = {
             "model": self.name,
-            "messages": [{"role": "user", "content": prompt}],
+            "messages": [{"role": "user", "content": content}],
         }
         if self.generation_kwargs:
             payload.update(self.generation_kwargs)
         return payload
 
+    def generate_content(
+        self, multimodal_input: List[Union[str, MLLMImage]] = []
+    ):
+        content = []
+        for element in multimodal_input:
+            if isinstance(element, str):
+                content.append({"type": "text", "text": element})
+            elif isinstance(element, MLLMImage):
+                if element.url and not element.local:
+                    content.append(
+                        {
+                            "type": "image_url",
+                            "image_url": {"url": element.url},
+                        }
+                    )
+                else:
+                    element.ensure_images_loaded()
+                    data_uri = (
+                        f"data:{element.mimeType};base64,{element.dataBase64}"
+                    )
+                    content.append(
+                        {
+                            "type": "image_url",
+                            "image_url": {"url": data_uri},
+                        }
+                    )
+        return content
+
     def _extract_content(self, data: Dict[str, Any]) -> str:
         choices: Union[List[Dict[str, Any]], None] = data.get("choices")
         if not choices:
-            raise
+            raise DeepEvalError("Portkey response did not include any choices.")
         message = choices[0].get("message", {})
         content: Union[str, List[Dict[str, Any]], None] = message.get("content")
         if isinstance(content, str):
@@ -109,7 +148,7 @@ class PortkeyModel(DeepEvalBaseLLM):
                 f"{self.base_url}/chat/completions",
                 json=self._payload(prompt),
                 headers=self._headers(),
-                timeout=
+                timeout=_request_timeout_seconds(),
             )
             response.raise_for_status()
         except requests.HTTPError as error:
@@ -118,11 +157,11 @@ class PortkeyModel(DeepEvalBaseLLM):
                 body = response.json()
             except Exception:
                 body = response.text
-            raise
+            raise DeepEvalError(
                 f"Portkey request failed with status {response.status_code}: {body}"
             ) from error
         except requests.RequestException as error:
-            raise
+            raise DeepEvalError(f"Portkey request failed: {error}") from error
        return self._extract_content(response.json())
 
    async def a_generate(self, prompt: str) -> str:
@@ -132,11 +171,11 @@ class PortkeyModel(DeepEvalBaseLLM):
                 f"{self.base_url}/chat/completions",
                 json=self._payload(prompt),
                 headers=self._headers(),
-                timeout=
+                timeout=_request_timeout_seconds(),
             ) as response:
                 if response.status >= 400:
                     body = await response.text()
-                    raise
+                    raise DeepEvalError(
                         f"Portkey request failed with status {response.status}: {body}"
                     )
                 data = await response.json()
@@ -147,3 +186,6 @@ class PortkeyModel(DeepEvalBaseLLM):
 
     def get_model_name(self):
         return f"{self.name} (Portkey)"
+
+    def supports_multimodal(self):
+        return True
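For orientation, this is roughly the OpenAI-style content array the new `_payload`/`generate_content` path builds when a prompt mixes text and images; the model name, URL, and base64 payload below are made-up illustrations, not values from the package:

```python
# Illustrative only: shape of the chat payload PortkeyModel now sends for a
# multimodal prompt. All literal values here are invented for the example.
payload = {
    "model": "gpt-4o",  # assumed model name
    "messages": [
        {
            "role": "user",
            "content": [
                {"type": "text", "text": "Describe this screenshot."},
                # remote image: passed through as a plain URL
                {"type": "image_url", "image_url": {"url": "https://example.com/shot.png"}},
                # local image: inlined as a base64 data URI
                {"type": "image_url", "image_url": {"url": "data:image/png;base64,iVBORw0KGgo="}},
            ],
        }
    ],
}
```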
deepeval/models/llms/utils.py
CHANGED

@@ -1,8 +1,11 @@
-from typing import Dict
+from typing import Dict
 import re
 import json
 import asyncio
 
+from deepeval.errors import DeepEvalError
+
+
 MULTIMODAL_MODELS = ["GPTModel", "AzureModel", "GeminiModel", "OllamaModel"]
 
 
@@ -20,7 +23,7 @@ def trim_and_load_json(
         return json.loads(jsonStr)
     except json.JSONDecodeError:
         error_str = "Evaluation LLM outputted an invalid JSON. Please use a better evaluation model."
-        raise
+        raise DeepEvalError(error_str)
     except Exception as e:
         raise Exception(f"An unexpected error occurred: {str(e)}")
 
deepeval/models/retry_policy.py
CHANGED

@@ -87,6 +87,8 @@ def set_outer_deadline(seconds: float | None):
    call, which must be passed to `reset_outer_deadline` to restore the
    previous value.
    """
+    if get_settings().DEEPEVAL_DISABLE_TIMEOUTS:
+        return _OUTER_DEADLINE.set(None)
     if seconds and seconds > 0:
         return _OUTER_DEADLINE.set(time.monotonic() + seconds)
     return _OUTER_DEADLINE.set(None)
@@ -131,11 +133,10 @@ def resolve_effective_attempt_timeout():
        float: Seconds to use for the inner per-attempt timeout. `0` means
        disable inner timeout and rely on the outer budget instead.
    """
-
-
-    )
+    settings = get_settings()
+    per_attempt = float(settings.DEEPEVAL_PER_ATTEMPT_TIMEOUT_SECONDS or 0)
     # 0 or None disable inner wait_for. That means rely on outer task cap for timeouts instead.
-    if per_attempt <= 0:
+    if settings.DEEPEVAL_DISABLE_TIMEOUTS or per_attempt <= 0:
         return 0
     # If we do have a positive per-attempt, use up to remaining outer budget.
     rem = _remaining_budget()
@@ -557,7 +558,11 @@ def run_sync_with_timeout(func, timeout_seconds, *args, **kwargs):
        BaseException: If `func` raises, the same exception is re-raised with its
        original traceback.
    """
-    if
+    if (
+        get_settings().DEEPEVAL_DISABLE_TIMEOUTS
+        or not timeout_seconds
+        or timeout_seconds <= 0
+    ):
         return func(*args, **kwargs)
 
     # try to respect the global cap on concurrent timeout workers
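All three retry-policy hunks apply the same pattern: a single `DEEPEVAL_DISABLE_TIMEOUTS` setting short-circuits every timeout path before any deadline math runs. A minimal standalone sketch of that pattern, with an illustrative flag and helper rather than the package's own settings object:

```python
import concurrent.futures

DISABLE_TIMEOUTS = False  # stand-in for settings.DEEPEVAL_DISABLE_TIMEOUTS


def call_with_timeout(func, timeout_seconds, *args, **kwargs):
    # When timeouts are globally disabled (or no timeout is given),
    # run the callable inline instead of through a worker with a deadline.
    if DISABLE_TIMEOUTS or not timeout_seconds or timeout_seconds <= 0:
        return func(*args, **kwargs)
    with concurrent.futures.ThreadPoolExecutor(max_workers=1) as pool:
        future = pool.submit(func, *args, **kwargs)
        return future.result(timeout=timeout_seconds)
```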
deepeval/models/utils.py
CHANGED

@@ -8,7 +8,7 @@ from deepeval.errors import DeepEvalError
 logger = logging.getLogger(__name__)
 
 
-def parse_model_name(model_name: Optional[str] = None) -> str:
+def parse_model_name(model_name: Optional[str] = None) -> Optional[str]:
     """Extract base model name from provider-prefixed format.
 
     This function is useful for extracting the actual model name from a
@@ -32,9 +32,9 @@ def parse_model_name(model_name: Optional[str] = None) -> str:
     if model_name is None:
         return None
 
-    if "/" in model_name:
-        _, parsed_model_name = model_name.split("/", 1)
-        return parsed_model_name
+    # if "/" in model_name:
+    #     _, parsed_model_name = model_name.split("/", 1)
+    #     return parsed_model_name
     return model_name
 
 
@@ -80,6 +80,58 @@ def require_secret_api_key(
     return api_key
 
 
+def require_costs(
+    model_data,
+    model_name: str,
+    input_token_envvar: str,
+    output_token_envvar: str,
+    cost_per_input_token: Optional[float] = None,
+    cost_per_output_token: Optional[float] = None,
+) -> Tuple[Optional[float], Optional[float]]:
+    """
+    Validates and returns the cost parameters (input and output tokens) for a model.
+
+    Arguments:
+        - model_data: The model's data object, which should contain `input_price` and `output_price`.
+        - model_name: The model name used for error messaging.
+        - cost_per_input_token: The input token cost provided during model initialization (optional).
+        - cost_per_output_token: The output token cost provided during model initialization (optional).
+        - input_token_envvar: The environment variable name for input cost.
+        - output_token_envvar: The environment variable name for output cost.
+
+    Returns:
+        - A tuple of validated values (input_cost, output_cost). If the values are provided, they are returned.
+          If not provided, they are fetched from settings or environment variables.
+    """
+
+    def validate_cost(
+        value: Optional[float], envvar_name: str
+    ) -> Optional[float]:
+        """Helper function to validate the cost values."""
+        if value is not None and value < 0:
+            raise DeepEvalError(f"{envvar_name} must be >= 0.")
+        return value
+
+    # Validate provided token costs
+    cost_per_input_token = validate_cost(
+        cost_per_input_token, input_token_envvar
+    )
+    cost_per_output_token = validate_cost(
+        cost_per_output_token, output_token_envvar
+    )
+
+    # If model data doesn't have pricing, use provided values or environment variables
+    if model_data.input_price is None or model_data.output_price is None:
+        if cost_per_input_token is None or cost_per_output_token is None:
+            return None, None
+
+        # Return the validated cost values as a tuple
+        return cost_per_input_token, cost_per_output_token
+
+    # If no custom cost values are provided, return model's default cost values
+    return model_data.input_price, model_data.output_price
+
+
 def normalize_kwargs_and_extract_aliases(
     provider_label: str,
     kwargs: Dict[str, Any],
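A rough usage sketch for the new `require_costs` helper; the `PricingRecord` class, model names, and environment-variable names below are hypothetical stand-ins for whatever pricing data a model wrapper would pass in (assumes deepeval >= 3.7.7):

```python
from dataclasses import dataclass
from typing import Optional

from deepeval.models.utils import require_costs


@dataclass
class PricingRecord:  # hypothetical pricing record with the two fields require_costs reads
    input_price: Optional[float] = None
    output_price: Optional[float] = None


# Model with catalog pricing: the catalog values win.
priced = PricingRecord(input_price=3e-06, output_price=1.5e-05)
assert require_costs(
    priced,
    "some-known-model",                 # illustrative model name
    "EXAMPLE_COST_PER_INPUT_TOKEN",     # illustrative env-var names (used for error messages)
    "EXAMPLE_COST_PER_OUTPUT_TOKEN",
) == (3e-06, 1.5e-05)

# Unknown model with user-supplied costs: the user values are validated and returned.
unpriced = PricingRecord()
assert require_costs(
    unpriced,
    "my-custom-model",
    "EXAMPLE_COST_PER_INPUT_TOKEN",
    "EXAMPLE_COST_PER_OUTPUT_TOKEN",
    cost_per_input_token=1e-06,
    cost_per_output_token=2e-06,
) == (1e-06, 2e-06)
```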
deepeval/simulator/conversation_simulator.py
CHANGED

@@ -20,6 +20,7 @@ from deepeval.simulator.template import (
     ConversationSimulatorTemplate,
 )
 from deepeval.models import DeepEvalBaseLLM
+from deepeval.metrics.utils import MULTIMODAL_SUPPORTED_MODELS
 from deepeval.simulator.schema import (
     SimulatedInput,
     ConversationCompletion,
@@ -94,6 +95,26 @@ class ConversationSimulator:
                 )
             )
         else:
+            multimodal = any(
+                [golden.multimodal for golden in conversational_goldens]
+            )
+            if multimodal:
+                if (
+                    not self.simulator_model
+                    or not self.simulator_model.supports_multimodal()
+                ):
+                    if (
+                        self.simulator_model
+                        and type(self.simulator_model)
+                        in MULTIMODAL_SUPPORTED_MODELS
+                    ):
+                        raise ValueError(
+                            f"The evaluation model {self.simulator_model.name} does not support multimodal evaluations at the moment. Available multi-modal models for the {self.simulator_model.__class__.__name__} provider includes {', '.join(self.simulator_model.__class__.valid_multimodal_models)}."
+                        )
+                    else:
+                        raise ValueError(
+                            f"The evaluation model {self.simulator_model.name} does not support multimodal inputs, please use one of the following evaluation models: {', '.join([cls.__name__ for cls in MULTIMODAL_SUPPORTED_MODELS])}"
+                        )
             conversational_test_cases: List[ConversationalTestCase] = []
             for conversation_index, golden in enumerate(
                 conversational_goldens
@@ -124,6 +145,28 @@ class ConversationSimulator:
         progress: Optional[Progress] = None,
         pbar_id: Optional[int] = None,
     ) -> List[ConversationalTestCase]:
+
+        multimodal = any(
+            [golden.multimodal for golden in conversational_goldens]
+        )
+        if multimodal:
+            if (
+                not self.simulator_model
+                or not self.simulator_model.supports_multimodal()
+            ):
+                if (
+                    self.simulator_model
+                    and type(self.simulator_model)
+                    in MULTIMODAL_SUPPORTED_MODELS
+                ):
+                    raise ValueError(
+                        f"The evaluation model {self.simulator_model.name} does not support multimodal evaluations at the moment. Available multi-modal models for the {self.simulator_model.__class__.__name__} provider includes {', '.join(self.simulator_model.__class__.valid_multimodal_models)}."
+                    )
+                else:
+                    raise ValueError(
+                        f"The evaluation model {self.simulator_model.name} does not support multimodal inputs, please use one of the following evaluation models: {', '.join([cls.__name__ for cls in MULTIMODAL_SUPPORTED_MODELS])}"
+                    )
+
         self.simulation_cost = 0 if self.using_native_model else None
 
         async def simulate_conversations(
@@ -471,7 +514,9 @@ class ConversationSimulator:
         ):
             if not self.run_remote:
                 conversation_history = json.dumps(
-                    [t.model_dump() for t in turns],
+                    [t.model_dump() for t in turns],
+                    indent=4,
+                    ensure_ascii=False,
                 )
                 prompt = self.template.stop_simulation(
                     conversation_history, golden.expected_outcome
@@ -516,7 +561,9 @@ class ConversationSimulator:
         ):
             if not self.run_remote:
                 conversation_history = json.dumps(
-                    [t.model_dump() for t in turns],
+                    [t.model_dump() for t in turns],
+                    indent=4,
+                    ensure_ascii=False,
                 )
                 prompt = self.template.stop_simulation(
                     conversation_history, golden.expected_outcome
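The simulator now checks the evaluation model's multimodal capability up front whenever any golden is marked multimodal. A condensed sketch of that gate with a stand-in model class (not the package's actual classes):

```python
class StubModel:  # stand-in for a DeepEvalBaseLLM subclass
    name = "text-only-model"

    def supports_multimodal(self) -> bool:
        return False


def assert_can_handle(goldens, model) -> None:
    # Mirror of the new gate: only complain when at least one golden is multimodal.
    if any(getattr(g, "multimodal", False) for g in goldens):
        if model is None or not model.supports_multimodal():
            raise ValueError(
                f"{getattr(model, 'name', model)} cannot evaluate multimodal goldens"
            )
```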
deepeval/simulator/template.py
CHANGED

@@ -7,6 +7,13 @@ from deepeval.test_case import Turn
 
 
 class ConversationSimulatorTemplate:
+    multimodal_rules = """
+    --- MULTIMODAL INPUT RULES ---
+    - Treat image content as factual evidence.
+    - Only reference visual details that are explicitly and clearly visible.
+    - Do not infer or guess objects, text, or details not visibly present.
+    - If an image is unclear or ambiguous, mark uncertainty explicitly.
+    """
 
     @staticmethod
     def simulate_first_user_turn(
@@ -23,6 +30,8 @@ class ConversationSimulatorTemplate:
         3. Avoid providing excessive details upfront; the goal is to initiate the conversation and build rapport, not to solve it in the first message.
         4. The message should be concise, ideally no more than 1-3 sentences.
 
+        {ConversationSimulatorTemplate.multimodal_rules}
+
         IMPORTANT: The output must be formatted as a JSON object with a single key `simulated_input`, where the value is the generated opening message in {language}.
 
         Example Language: english
@@ -48,7 +57,9 @@ class ConversationSimulatorTemplate:
         language: str,
     ) -> str:
         previous_conversation = json.dumps(
-            [t.model_dump() for t in turns],
+            [t.model_dump() for t in turns],
+            indent=4,
+            ensure_ascii=False,
         )
         prompt = textwrap.dedent(
             f"""
@@ -61,6 +72,8 @@ class ConversationSimulatorTemplate:
         3. Keep the tone consistent with the previous user inputs.
         4. The generated user input should be concise, ideally no more than 1-2 sentences.
 
+        {ConversationSimulatorTemplate.multimodal_rules}
+
         IMPORTANT: The output must be formatted as a JSON object with a single key `simulated_input`,
         where the value is the generated user input in {language}.
 
@@ -101,6 +114,8 @@ class ConversationSimulatorTemplate:
         2. If the expected outcome has been met, mark the conversation as complete.
         3. If not, mark it as incomplete and briefly describe what remains to be done.
 
+        {ConversationSimulatorTemplate.multimodal_rules}
+
         IMPORTANT: The output must be formatted as a JSON object with two keys:
         `is_complete` (a boolean) and `reason` (a string).
 
deepeval/synthesizer/synthesizer.py
CHANGED

@@ -25,7 +25,7 @@ from deepeval.metrics.utils import (
 from deepeval.progress_context import synthesizer_progress_context
 from deepeval.models import DeepEvalBaseLLM
 from deepeval.dataset.golden import Golden, ConversationalGolden
-from deepeval.synthesizer.types import
+from deepeval.synthesizer.types import Evolution, PromptEvolution
 from deepeval.synthesizer.templates import (
     EvolutionTemplate,
     SynthesizerTemplate,
@@ -246,7 +246,7 @@ class Synthesizer:
         )
         if self.cost_tracking and self.using_native_model:
             print(f"💰 API cost: {self.synthesis_cost:.6f}")
-        if _send_data
+        if _send_data:
             pass
         remove_pbars(
             progress,
@@ -546,7 +546,7 @@ class Synthesizer:
         # Remove pbar if not from docs
         remove_pbars(progress, [pbar_id]) if _progress is None else None
 
-        if _send_data
+        if _send_data:
             pass
         if _reset_cost and self.cost_tracking and self.using_native_model:
             print(f"💰 API cost: {self.synthesis_cost:.6f}")
@@ -567,7 +567,8 @@ class Synthesizer:
         if _reset_cost:
             self.synthetic_goldens = []
             self.synthesis_cost = 0 if self.using_native_model else None
-
+        context_semaphore = asyncio.Semaphore(self.max_concurrent)
+        worker_semaphore = asyncio.Semaphore(self.max_concurrent)
         goldens: List[Golden] = []
 
         with synthesizer_progress_context(
@@ -586,9 +587,9 @@ class Synthesizer:
         ):
             tasks = [
                 self.task_wrapper(
-
+                    context_semaphore,
                     self._a_generate_from_context,
-                    semaphore=
+                    semaphore=worker_semaphore,
                     context=context,
                     goldens=goldens,
                     include_expected_output=include_expected_output,
@@ -965,7 +966,7 @@ class Synthesizer:
 
         # Wrap up Synthesis
         self.synthetic_goldens.extend(goldens)
-        if _send_data
+        if _send_data:
             pass
         return goldens
 
@@ -1023,7 +1024,7 @@ class Synthesizer:
                 source_files.append(golden.source_file)
 
         # Extract styles from goldens if not already set
-        if self.set_styling_config
+        if not self.set_styling_config:
             example_inputs = random.sample(
                 [golden.input for golden in goldens], min(len(goldens), 10)
             )
@@ -1069,7 +1070,7 @@ class Synthesizer:
                 source_files.append(golden.source_file)
 
         # Extract styles from goldens if not already set
-        if self.set_styling_config
+        if not self.set_styling_config:
             example_inputs = random.sample(
                 [golden.input for golden in goldens], min(len(goldens), 10)
             )
@@ -1637,7 +1638,7 @@ class Synthesizer:
         )
         if self.cost_tracking and self.using_native_model:
             print(f"💰 API cost: {self.synthesis_cost:.6f}")
-        if _send_data
+        if _send_data:
             pass
         remove_pbars(
             progress,
@@ -1949,7 +1950,7 @@ class Synthesizer:
         # Remove pbar if not from docs
         remove_pbars(progress, [pbar_id]) if _progress is None else None
 
-        if _send_data
+        if _send_data:
             pass
         if _reset_cost and self.cost_tracking and self.using_native_model:
             print(f"💰 API cost: {self.synthesis_cost:.6f}")
@@ -1970,7 +1971,8 @@ class Synthesizer:
         if _reset_cost:
             self.synthetic_conversational_goldens = []
             self.synthesis_cost = 0 if self.using_native_model else None
-
+        context_semaphore = asyncio.Semaphore(self.max_concurrent)
+        worker_semaphore = asyncio.Semaphore(self.max_concurrent)
         goldens: List[ConversationalGolden] = []
 
         with synthesizer_progress_context(
@@ -1989,9 +1991,9 @@ class Synthesizer:
         ):
             tasks = [
                 self.task_wrapper(
-
+                    context_semaphore,
                     self._a_generate_conversational_from_context,
-                    semaphore=
+                    semaphore=worker_semaphore,
                     context=context,
                     goldens=goldens,
                     include_expected_outcome=include_expected_outcome,
@@ -2335,7 +2337,7 @@ class Synthesizer:
 
         # Wrap up Synthesis
         self.synthetic_conversational_goldens.extend(goldens)
-        if _send_data
+        if _send_data:
             pass
         return goldens
 
@@ -2567,7 +2569,7 @@ class Synthesizer:
                 contexts.append(golden.context)
 
         # Extract styles from conversational goldens if not already set
-        if self.set_conversational_styling_config
+        if not self.set_conversational_styling_config:
             example_scenarios = random.sample(
                 [golden.scenario for golden in goldens],
                 min(len(goldens), 10),
@@ -2612,7 +2614,7 @@ class Synthesizer:
                 contexts.append(golden.context)
 
         # Extract styles from conversational goldens if not already set
-        if self.set_conversational_styling_config
+        if not self.set_conversational_styling_config:
             example_scenarios = random.sample(
                 [golden.scenario for golden in goldens], min(len(goldens), 10)
             )
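The synthesizer hunks replace the single shared semaphore with two, one acquired by the per-context wrapper and one handed down to the worker coroutine, presumably so the outer tasks and their children no longer compete for the same permits. A minimal standalone sketch of that two-pool pattern; all names here are illustrative:

```python
import asyncio

MAX_CONCURRENT = 4


async def worker(semaphore: asyncio.Semaphore, item: int) -> int:
    # Inner work acquires a permit from its own pool.
    async with semaphore:
        await asyncio.sleep(0.01)
        return item * 2


async def per_context_task(
    context_semaphore: asyncio.Semaphore,
    worker_semaphore: asyncio.Semaphore,
    item: int,
) -> int:
    # The outer task holds a permit from a different pool than its children,
    # so a saturated outer pool cannot starve the inner workers.
    async with context_semaphore:
        return await worker(worker_semaphore, item)


async def main() -> list:
    context_semaphore = asyncio.Semaphore(MAX_CONCURRENT)
    worker_semaphore = asyncio.Semaphore(MAX_CONCURRENT)
    tasks = [
        per_context_task(context_semaphore, worker_semaphore, i) for i in range(10)
    ]
    return await asyncio.gather(*tasks)


if __name__ == "__main__":
    print(asyncio.run(main()))
```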
deepeval/test_case/api.py
CHANGED

@@ -12,7 +12,6 @@ from deepeval.test_case import (
     ConversationalTestCase,
     Turn,
 )
-from deepeval.test_case.llm_test_case import _MLLM_IMAGE_REGISTRY
 from deepeval.constants import PYTEST_RUN_TEST_NAME
 
 
@@ -33,7 +32,6 @@ def create_api_test_case(
     trace: Optional[TraceApi] = None,
     index: Optional[int] = None,
 ) -> Union[LLMApiTestCase, ConversationalApiTestCase]:
-    from deepeval.utils import convert_to_multi_modal_array
 
     if isinstance(test_case, ConversationalTestCase):
         order = (
@@ -61,8 +59,10 @@ def create_api_test_case(
             context=test_case.context,
             tags=test_case.tags,
             comments=test_case.comments,
+            imagesMapping=test_case._get_images_mapping(),
             additionalMetadata=test_case.additional_metadata,
         )
+
         api_test_case.turns = [
             create_api_turn(
                 turn=turn,
@@ -86,48 +86,27 @@ def create_api_test_case(
         name = os.getenv(PYTEST_RUN_TEST_NAME, f"test_case_{order}")
         metrics_data = []
 
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-        elif isinstance(test_case, LLMTestCase) and test_case.multimodal:
-            api_test_case = LLMApiTestCase(
-                name=name,
-                input=test_case.input,
-                actualOutput=test_case.actual_output,
-                expectedOutput=test_case.expected_output,
-                retrievalContext=test_case.retrieval_context,
-                context=test_case.context,
-                imagesMapping=_MLLM_IMAGE_REGISTRY,
-                toolsCalled=test_case.tools_called,
-                expectedTools=test_case.expected_tools,
-                tokenCost=test_case.token_cost,
-                completionTime=test_case.completion_time,
-                success=success,
-                metricsData=metrics_data,
-                runDuration=None,
-                evaluationCost=None,
-                order=order,
-                additionalMetadata=test_case.additional_metadata,
-                comments=test_case.comments,
-            )
+        api_test_case = LLMApiTestCase(
+            name=name,
+            input=test_case.input,
+            actualOutput=test_case.actual_output,
+            expectedOutput=test_case.expected_output,
+            retrievalContext=test_case.retrieval_context,
+            context=test_case.context,
+            imagesMapping=test_case._get_images_mapping(),
+            toolsCalled=test_case.tools_called,
+            expectedTools=test_case.expected_tools,
+            tokenCost=test_case.token_cost,
+            completionTime=test_case.completion_time,
+            success=success,
+            metricsData=metrics_data,
+            runDuration=None,
+            evaluationCost=None,
+            order=order,
+            additionalMetadata=test_case.additional_metadata,
+            comments=test_case.comments,
+            tags=test_case.tags,
+            trace=trace,
+        )
         # llm_test_case_lookup_map[instance_id] = api_test_case
         return api_test_case