deepeval 3.7.5__py3-none-any.whl → 3.7.6__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (133)
  1. deepeval/_version.py +1 -1
  2. deepeval/config/settings.py +35 -1
  3. deepeval/dataset/api.py +23 -1
  4. deepeval/dataset/golden.py +106 -21
  5. deepeval/evaluate/evaluate.py +0 -3
  6. deepeval/evaluate/execute.py +10 -222
  7. deepeval/evaluate/utils.py +6 -30
  8. deepeval/key_handler.py +3 -0
  9. deepeval/metrics/__init__.py +0 -4
  10. deepeval/metrics/answer_relevancy/answer_relevancy.py +89 -132
  11. deepeval/metrics/answer_relevancy/template.py +102 -179
  12. deepeval/metrics/arena_g_eval/arena_g_eval.py +98 -96
  13. deepeval/metrics/arena_g_eval/template.py +17 -1
  14. deepeval/metrics/argument_correctness/argument_correctness.py +81 -87
  15. deepeval/metrics/argument_correctness/template.py +19 -2
  16. deepeval/metrics/base_metric.py +13 -41
  17. deepeval/metrics/bias/bias.py +102 -108
  18. deepeval/metrics/bias/template.py +14 -2
  19. deepeval/metrics/contextual_precision/contextual_precision.py +56 -92
  20. deepeval/metrics/contextual_recall/contextual_recall.py +58 -85
  21. deepeval/metrics/contextual_relevancy/contextual_relevancy.py +53 -83
  22. deepeval/metrics/conversation_completeness/conversation_completeness.py +101 -119
  23. deepeval/metrics/conversation_completeness/template.py +23 -3
  24. deepeval/metrics/conversational_dag/conversational_dag.py +12 -8
  25. deepeval/metrics/conversational_dag/nodes.py +66 -123
  26. deepeval/metrics/conversational_dag/templates.py +16 -0
  27. deepeval/metrics/conversational_g_eval/conversational_g_eval.py +47 -66
  28. deepeval/metrics/dag/dag.py +10 -0
  29. deepeval/metrics/dag/nodes.py +63 -126
  30. deepeval/metrics/dag/templates.py +14 -0
  31. deepeval/metrics/exact_match/exact_match.py +9 -1
  32. deepeval/metrics/faithfulness/faithfulness.py +82 -136
  33. deepeval/metrics/g_eval/g_eval.py +87 -78
  34. deepeval/metrics/g_eval/template.py +18 -1
  35. deepeval/metrics/g_eval/utils.py +7 -6
  36. deepeval/metrics/goal_accuracy/goal_accuracy.py +91 -76
  37. deepeval/metrics/goal_accuracy/template.py +21 -3
  38. deepeval/metrics/hallucination/hallucination.py +60 -75
  39. deepeval/metrics/hallucination/template.py +13 -0
  40. deepeval/metrics/indicator.py +3 -6
  41. deepeval/metrics/json_correctness/json_correctness.py +40 -38
  42. deepeval/metrics/json_correctness/template.py +10 -0
  43. deepeval/metrics/knowledge_retention/knowledge_retention.py +60 -97
  44. deepeval/metrics/knowledge_retention/schema.py +9 -3
  45. deepeval/metrics/knowledge_retention/template.py +12 -0
  46. deepeval/metrics/mcp/mcp_task_completion.py +68 -38
  47. deepeval/metrics/mcp/multi_turn_mcp_use_metric.py +92 -74
  48. deepeval/metrics/mcp/template.py +52 -0
  49. deepeval/metrics/mcp_use_metric/mcp_use_metric.py +58 -64
  50. deepeval/metrics/mcp_use_metric/template.py +12 -0
  51. deepeval/metrics/misuse/misuse.py +77 -97
  52. deepeval/metrics/misuse/template.py +15 -0
  53. deepeval/metrics/multimodal_metrics/__init__.py +0 -1
  54. deepeval/metrics/multimodal_metrics/image_coherence/image_coherence.py +37 -38
  55. deepeval/metrics/multimodal_metrics/image_editing/image_editing.py +55 -76
  56. deepeval/metrics/multimodal_metrics/image_helpfulness/image_helpfulness.py +37 -38
  57. deepeval/metrics/multimodal_metrics/image_reference/image_reference.py +37 -38
  58. deepeval/metrics/multimodal_metrics/text_to_image/text_to_image.py +57 -76
  59. deepeval/metrics/non_advice/non_advice.py +79 -105
  60. deepeval/metrics/non_advice/template.py +12 -0
  61. deepeval/metrics/pattern_match/pattern_match.py +12 -4
  62. deepeval/metrics/pii_leakage/pii_leakage.py +75 -106
  63. deepeval/metrics/pii_leakage/template.py +14 -0
  64. deepeval/metrics/plan_adherence/plan_adherence.py +63 -89
  65. deepeval/metrics/plan_adherence/template.py +11 -0
  66. deepeval/metrics/plan_quality/plan_quality.py +63 -87
  67. deepeval/metrics/plan_quality/template.py +9 -0
  68. deepeval/metrics/prompt_alignment/prompt_alignment.py +72 -83
  69. deepeval/metrics/prompt_alignment/template.py +12 -0
  70. deepeval/metrics/role_adherence/role_adherence.py +48 -71
  71. deepeval/metrics/role_adherence/template.py +14 -0
  72. deepeval/metrics/role_violation/role_violation.py +75 -108
  73. deepeval/metrics/role_violation/template.py +12 -0
  74. deepeval/metrics/step_efficiency/step_efficiency.py +55 -65
  75. deepeval/metrics/step_efficiency/template.py +11 -0
  76. deepeval/metrics/summarization/summarization.py +115 -183
  77. deepeval/metrics/summarization/template.py +19 -0
  78. deepeval/metrics/task_completion/task_completion.py +67 -73
  79. deepeval/metrics/tool_correctness/tool_correctness.py +43 -42
  80. deepeval/metrics/tool_use/tool_use.py +42 -66
  81. deepeval/metrics/topic_adherence/template.py +13 -0
  82. deepeval/metrics/topic_adherence/topic_adherence.py +53 -67
  83. deepeval/metrics/toxicity/template.py +13 -0
  84. deepeval/metrics/toxicity/toxicity.py +80 -99
  85. deepeval/metrics/turn_contextual_precision/schema.py +3 -3
  86. deepeval/metrics/turn_contextual_precision/template.py +1 -1
  87. deepeval/metrics/turn_contextual_precision/turn_contextual_precision.py +110 -68
  88. deepeval/metrics/turn_contextual_recall/schema.py +3 -3
  89. deepeval/metrics/turn_contextual_recall/turn_contextual_recall.py +104 -61
  90. deepeval/metrics/turn_contextual_relevancy/schema.py +2 -2
  91. deepeval/metrics/turn_contextual_relevancy/turn_contextual_relevancy.py +106 -65
  92. deepeval/metrics/turn_faithfulness/schema.py +1 -1
  93. deepeval/metrics/turn_faithfulness/turn_faithfulness.py +104 -73
  94. deepeval/metrics/turn_relevancy/template.py +14 -0
  95. deepeval/metrics/turn_relevancy/turn_relevancy.py +56 -69
  96. deepeval/metrics/utils.py +145 -90
  97. deepeval/models/base_model.py +44 -6
  98. deepeval/models/embedding_models/azure_embedding_model.py +34 -12
  99. deepeval/models/embedding_models/local_embedding_model.py +22 -7
  100. deepeval/models/embedding_models/ollama_embedding_model.py +17 -6
  101. deepeval/models/embedding_models/openai_embedding_model.py +3 -2
  102. deepeval/models/llms/amazon_bedrock_model.py +226 -71
  103. deepeval/models/llms/anthropic_model.py +141 -47
  104. deepeval/models/llms/azure_model.py +167 -94
  105. deepeval/models/llms/constants.py +2032 -0
  106. deepeval/models/llms/deepseek_model.py +79 -29
  107. deepeval/models/llms/gemini_model.py +126 -67
  108. deepeval/models/llms/grok_model.py +125 -59
  109. deepeval/models/llms/kimi_model.py +126 -81
  110. deepeval/models/llms/litellm_model.py +92 -18
  111. deepeval/models/llms/local_model.py +114 -15
  112. deepeval/models/llms/ollama_model.py +97 -76
  113. deepeval/models/llms/openai_model.py +167 -310
  114. deepeval/models/llms/portkey_model.py +58 -16
  115. deepeval/models/llms/utils.py +5 -2
  116. deepeval/models/utils.py +60 -4
  117. deepeval/simulator/conversation_simulator.py +43 -0
  118. deepeval/simulator/template.py +13 -0
  119. deepeval/test_case/api.py +24 -45
  120. deepeval/test_case/arena_test_case.py +7 -2
  121. deepeval/test_case/conversational_test_case.py +55 -6
  122. deepeval/test_case/llm_test_case.py +60 -6
  123. deepeval/test_run/api.py +3 -0
  124. {deepeval-3.7.5.dist-info → deepeval-3.7.6.dist-info}/METADATA +1 -1
  125. {deepeval-3.7.5.dist-info → deepeval-3.7.6.dist-info}/RECORD +128 -132
  126. deepeval/metrics/multimodal_metrics/multimodal_g_eval/__init__.py +0 -0
  127. deepeval/metrics/multimodal_metrics/multimodal_g_eval/multimodal_g_eval.py +0 -386
  128. deepeval/metrics/multimodal_metrics/multimodal_g_eval/schema.py +0 -11
  129. deepeval/metrics/multimodal_metrics/multimodal_g_eval/template.py +0 -133
  130. deepeval/metrics/multimodal_metrics/multimodal_g_eval/utils.py +0 -68
  131. {deepeval-3.7.5.dist-info → deepeval-3.7.6.dist-info}/LICENSE.md +0 -0
  132. {deepeval-3.7.5.dist-info → deepeval-3.7.6.dist-info}/WHEEL +0 -0
  133. {deepeval-3.7.5.dist-info → deepeval-3.7.6.dist-info}/entry_points.txt +0 -0
deepeval/models/llms/portkey_model.py CHANGED
@@ -3,10 +3,13 @@ import requests
 from typing import Any, Dict, List, Optional, Union
 from pydantic import AnyUrl, SecretStr
 
+from deepeval.errors import DeepEvalError
 from deepeval.config.settings import get_settings
 from deepeval.models.utils import (
     require_secret_api_key,
 )
+from deepeval.test_case import MLLMImage
+from deepeval.utils import check_if_multimodal, convert_to_multi_modal_array
 from deepeval.models import DeepEvalBaseLLM
 from deepeval.utils import require_param
 
@@ -29,16 +32,9 @@ class PortkeyModel(DeepEvalBaseLLM):
         settings = get_settings()
         model = model or settings.PORTKEY_MODEL_NAME
 
-        self.name = require_param(
-            model,
-            provider_label="Portkey",
-            env_var_name="PORTKEY_MODEL_NAME",
-            param_hint="model",
-        )
-
         if api_key is not None:
             # keep it secret, keep it safe from serializings, logging and alike
-            self.api_key: SecretStr | None = SecretStr(api_key)
+            self.api_key: Optional[SecretStr] = SecretStr(api_key)
         else:
             self.api_key = settings.PORTKEY_API_KEY
 
@@ -47,6 +43,16 @@ class PortkeyModel(DeepEvalBaseLLM):
         elif settings.PORTKEY_BASE_URL is not None:
             base_url = str(settings.PORTKEY_BASE_URL).rstrip("/")
 
+        provider = provider or settings.PORTKEY_PROVIDER_NAME
+
+        # validation
+        model = require_param(
+            model,
+            provider_label="Portkey",
+            env_var_name="PORTKEY_MODEL_NAME",
+            param_hint="model",
+        )
+
         self.base_url = require_param(
             base_url,
             provider_label="Portkey",
@@ -54,7 +60,6 @@ class PortkeyModel(DeepEvalBaseLLM):
             param_hint="base_url",
         )
 
-        provider = provider or settings.PORTKEY_PROVIDER_NAME
         self.provider = require_param(
             provider,
             provider_label="Portkey",
@@ -64,6 +69,7 @@ class PortkeyModel(DeepEvalBaseLLM):
         # Keep sanitized kwargs for client call to strip legacy keys
         self.kwargs = kwargs
         self.generation_kwargs = generation_kwargs or {}
+        super().__init__(model)
 
     def _headers(self) -> Dict[str, str]:
         api_key = require_secret_api_key(
@@ -82,18 +88,51 @@ class PortkeyModel(DeepEvalBaseLLM):
         return headers
 
     def _payload(self, prompt: str) -> Dict[str, Any]:
+        if check_if_multimodal(prompt):
+            prompt = convert_to_multi_modal_array(input=prompt)
+            content = self.generate_content(prompt)
+        else:
+            content = [{"type": "text", "text": prompt}]
         payload = {
             "model": self.name,
-            "messages": [{"role": "user", "content": prompt}],
+            "messages": [{"role": "user", "content": content}],
         }
         if self.generation_kwargs:
             payload.update(self.generation_kwargs)
         return payload
 
+    def generate_content(
+        self, multimodal_input: List[Union[str, MLLMImage]] = []
+    ):
+        content = []
+        for element in multimodal_input:
+            if isinstance(element, str):
+                content.append({"type": "text", "text": element})
+            elif isinstance(element, MLLMImage):
+                if element.url and not element.local:
+                    content.append(
+                        {
+                            "type": "image_url",
+                            "image_url": {"url": element.url},
+                        }
+                    )
+                else:
+                    element.ensure_images_loaded()
+                    data_uri = (
+                        f"data:{element.mimeType};base64,{element.dataBase64}"
+                    )
+                    content.append(
+                        {
+                            "type": "image_url",
+                            "image_url": {"url": data_uri},
+                        }
+                    )
+        return content
+
     def _extract_content(self, data: Dict[str, Any]) -> str:
         choices: Union[List[Dict[str, Any]], None] = data.get("choices")
         if not choices:
-            raise ValueError("Portkey response did not include any choices.")
+            raise DeepEvalError("Portkey response did not include any choices.")
         message = choices[0].get("message", {})
         content: Union[str, List[Dict[str, Any]], None] = message.get("content")
         if isinstance(content, str):
@@ -109,7 +148,7 @@ class PortkeyModel(DeepEvalBaseLLM):
                 f"{self.base_url}/chat/completions",
                 json=self._payload(prompt),
                 headers=self._headers(),
-                timeout=60,
+                timeout=_request_timeout_seconds(),
             )
             response.raise_for_status()
         except requests.HTTPError as error:
@@ -118,11 +157,11 @@ class PortkeyModel(DeepEvalBaseLLM):
                 body = response.json()
             except Exception:
                 body = response.text
-            raise ValueError(
+            raise DeepEvalError(
                 f"Portkey request failed with status {response.status_code}: {body}"
             ) from error
         except requests.RequestException as error:
-            raise ValueError(f"Portkey request failed: {error}") from error
+            raise DeepEvalError(f"Portkey request failed: {error}") from error
         return self._extract_content(response.json())
 
     async def a_generate(self, prompt: str) -> str:
@@ -132,11 +171,11 @@ class PortkeyModel(DeepEvalBaseLLM):
                 f"{self.base_url}/chat/completions",
                 json=self._payload(prompt),
                 headers=self._headers(),
-                timeout=60,
+                timeout=_request_timeout_seconds(),
             ) as response:
                 if response.status >= 400:
                     body = await response.text()
-                    raise ValueError(
+                    raise DeepEvalError(
                         f"Portkey request failed with status {response.status}: {body}"
                     )
                 data = await response.json()
@@ -147,3 +186,6 @@ class PortkeyModel(DeepEvalBaseLLM):
 
     def get_model_name(self):
         return f"{self.name} (Portkey)"
+
+    def supports_multimodal(self):
+        return True
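For orientation, a minimal sketch of what the new multimodal path enables (the model name, key, base URL, and image URL below are illustrative placeholders, not values from this release):

from deepeval.test_case import MLLMImage
from deepeval.models.llms.portkey_model import PortkeyModel

# Hypothetical configuration, for illustration only.
model = PortkeyModel(
    model="gpt-4o",
    api_key="pk-...",
    base_url="https://api.portkey.ai/v1",
    provider="openai",
)

# Remote images pass through as URL references; local images are lazily
# base64-encoded via MLLMImage.ensure_images_loaded() and sent as data URIs.
image = MLLMImage(url="https://example.com/chart.png")
content = model.generate_content(["Describe this image:", image])
# content is an OpenAI-style content array:
# [{"type": "text", "text": "Describe this image:"},
#  {"type": "image_url", "image_url": {"url": "https://example.com/chart.png"}}]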
deepeval/models/llms/utils.py CHANGED
@@ -1,8 +1,11 @@
-from typing import Dict, List, Optional
+from typing import Dict
 import re
 import json
 import asyncio
 
+from deepeval.errors import DeepEvalError
+
+
 MULTIMODAL_MODELS = ["GPTModel", "AzureModel", "GeminiModel", "OllamaModel"]
 
 
@@ -20,7 +23,7 @@ def trim_and_load_json(
         return json.loads(jsonStr)
     except json.JSONDecodeError:
         error_str = "Evaluation LLM outputted an invalid JSON. Please use a better evaluation model."
-        raise ValueError(error_str)
+        raise DeepEvalError(error_str)
     except Exception as e:
         raise Exception(f"An unexpected error occurred: {str(e)}")
 
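The practical effect of the DeepEvalError switch, as a small sketch (assuming the single-argument call shape visible in the hunk):

from deepeval.errors import DeepEvalError
from deepeval.models.llms.utils import trim_and_load_json

try:
    trim_and_load_json("not valid json")
except DeepEvalError as e:
    # Previously surfaced as a bare ValueError; callers can now catch
    # the library-specific error type instead.
    print(e)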
deepeval/models/utils.py CHANGED
@@ -8,7 +8,7 @@ from deepeval.errors import DeepEvalError
 logger = logging.getLogger(__name__)
 
 
-def parse_model_name(model_name: Optional[str] = None) -> str:
+def parse_model_name(model_name: Optional[str] = None) -> Optional[str]:
     """Extract base model name from provider-prefixed format.
 
     This function is useful for extracting the actual model name from a
@@ -32,9 +32,9 @@ def parse_model_name(model_name: Optional[str] = None) -> str:
     if model_name is None:
         return None
 
-    if "/" in model_name:
-        _, parsed_model_name = model_name.split("/", 1)
-        return parsed_model_name
+    # if "/" in model_name:
+    #     _, parsed_model_name = model_name.split("/", 1)
+    #     return parsed_model_name
     return model_name
 
 
@@ -80,6 +80,62 @@ def require_secret_api_key(
     return api_key
 
 
+def require_costs(
+    model_data,
+    model_name: str,
+    input_token_envvar: str,
+    output_token_envvar: str,
+    cost_per_input_token: Optional[float] = None,
+    cost_per_output_token: Optional[float] = None,
+) -> Tuple[Optional[float], Optional[float]]:
+    """
+    Validates and returns the cost parameters (input and output tokens) for a model.
+
+    Arguments:
+    - model_data: The model's data object, which should contain `input_price` and `output_price`.
+    - model_name: The model name used for error messaging.
+    - cost_per_input_token: The input token cost provided during model initialization (optional).
+    - cost_per_output_token: The output token cost provided during model initialization (optional).
+    - input_token_envvar: The environment variable name for input cost.
+    - output_token_envvar: The environment variable name for output cost.
+
+    Returns:
+    - A tuple of validated values (input_cost, output_cost). If the values are provided, they are returned.
+      If not provided, they are fetched from settings or environment variables.
+    """
+
+    def validate_cost(
+        value: Optional[float], envvar_name: str
+    ) -> Optional[float]:
+        """Helper function to validate the cost values."""
+        if value is not None and value < 0:
+            raise DeepEvalError(f"{envvar_name} must be >= 0.")
+        return value
+
+    # Validate provided token costs
+    cost_per_input_token = validate_cost(
+        cost_per_input_token, input_token_envvar
+    )
+    cost_per_output_token = validate_cost(
+        cost_per_output_token, output_token_envvar
+    )
+
+    # If model data doesn't have pricing, use provided values or environment variables
+    if model_data.input_price is None or model_data.output_price is None:
+        if cost_per_input_token is None or cost_per_output_token is None:
+            raise DeepEvalError(
+                f"No pricing available for `{model_name}`. "
+                f"Please provide both `cost_per_input_token` and `cost_per_output_token` when initializing `{model_name}`, "
+                f"or set {input_token_envvar} and {output_token_envvar} environment variables."
+            )
+
+        # Return the validated cost values as a tuple
+        return cost_per_input_token, cost_per_output_token
+
+    # If no custom cost values are provided, return model's default cost values
+    return model_data.input_price, model_data.output_price
+
+
 def normalize_kwargs_and_extract_aliases(
     provider_label: str,
     kwargs: Dict[str, Any],
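A usage sketch for the new require_costs helper (the model_data object and env var names here are hypothetical stand-ins; real callers pass entries from the new pricing table in deepeval/models/llms/constants.py):

from types import SimpleNamespace
from deepeval.models.utils import require_costs

# Stand-in for a pricing-table entry with no known prices.
model_data = SimpleNamespace(input_price=None, output_price=None)

input_cost, output_cost = require_costs(
    model_data,
    model_name="my-local-model",
    input_token_envvar="LOCAL_MODEL_COST_PER_INPUT_TOKEN",
    output_token_envvar="LOCAL_MODEL_COST_PER_OUTPUT_TOKEN",
    cost_per_input_token=2e-7,   # USD per input token
    cost_per_output_token=6e-7,  # USD per output token
)
# Raises DeepEvalError if the table has no pricing and either override is
# missing, or if any provided cost is negative.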
deepeval/simulator/conversation_simulator.py CHANGED
@@ -20,6 +20,7 @@ from deepeval.simulator.template import (
     ConversationSimulatorTemplate,
 )
 from deepeval.models import DeepEvalBaseLLM
+from deepeval.metrics.utils import MULTIMODAL_SUPPORTED_MODELS
 from deepeval.simulator.schema import (
     SimulatedInput,
     ConversationCompletion,
@@ -94,6 +95,26 @@ class ConversationSimulator:
                 )
             )
         else:
+            multimodal = any(
+                [golden.multimodal for golden in conversational_goldens]
+            )
+            if multimodal:
+                if (
+                    not self.simulator_model
+                    or not self.simulator_model.supports_multimodal()
+                ):
+                    if (
+                        self.simulator_model
+                        and type(self.simulator_model)
+                        in MULTIMODAL_SUPPORTED_MODELS
+                    ):
+                        raise ValueError(
+                            f"The evaluation model {self.simulator_model.name} does not support multimodal evaluations at the moment. Available multi-modal models for the {self.simulator_model.__class__.__name__} provider includes {', '.join(self.simulator_model.__class__.valid_multimodal_models)}."
+                        )
+                    else:
+                        raise ValueError(
+                            f"The evaluation model {self.simulator_model.name} does not support multimodal inputs, please use one of the following evaluation models: {', '.join([cls.__name__ for cls in MULTIMODAL_SUPPORTED_MODELS])}"
+                        )
             conversational_test_cases: List[ConversationalTestCase] = []
             for conversation_index, golden in enumerate(
                 conversational_goldens
@@ -124,6 +145,28 @@ class ConversationSimulator:
         progress: Optional[Progress] = None,
         pbar_id: Optional[int] = None,
     ) -> List[ConversationalTestCase]:
+
+        multimodal = any(
+            [golden.multimodal for golden in conversational_goldens]
+        )
+        if multimodal:
+            if (
+                not self.simulator_model
+                or not self.simulator_model.supports_multimodal()
+            ):
+                if (
+                    self.simulator_model
+                    and type(self.simulator_model)
+                    in MULTIMODAL_SUPPORTED_MODELS
+                ):
+                    raise ValueError(
+                        f"The evaluation model {self.simulator_model.name} does not support multimodal evaluations at the moment. Available multi-modal models for the {self.simulator_model.__class__.__name__} provider includes {', '.join(self.simulator_model.__class__.valid_multimodal_models)}."
+                    )
+                else:
+                    raise ValueError(
+                        f"The evaluation model {self.simulator_model.name} does not support multimodal inputs, please use one of the following evaluation models: {', '.join([cls.__name__ for cls in MULTIMODAL_SUPPORTED_MODELS])}"
+                    )
+
         self.simulation_cost = 0 if self.using_native_model else None
 
         async def simulate_conversations(
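Distilled, the guard added to both code paths behaves roughly like this (a sketch, not the exact source):

def assert_multimodal_support(simulator_model, goldens):
    # Simulation now fails fast when any golden is flagged multimodal
    # (e.g. its scenario or turns embed a [DEEPEVAL:IMAGE:<id>] placeholder)
    # but the simulator model cannot handle images.
    if not any(golden.multimodal for golden in goldens):
        return
    if simulator_model is not None and simulator_model.supports_multimodal():
        return
    raise ValueError(
        "The simulator model does not support multimodal inputs; "
        "use one of the MULTIMODAL_SUPPORTED_MODELS providers."
    )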
deepeval/simulator/template.py CHANGED
@@ -7,6 +7,13 @@ from deepeval.test_case import Turn
 
 
 class ConversationSimulatorTemplate:
+    multimodal_rules = """
+    --- MULTIMODAL INPUT RULES ---
+    - Treat image content as factual evidence.
+    - Only reference visual details that are explicitly and clearly visible.
+    - Do not infer or guess objects, text, or details not visibly present.
+    - If an image is unclear or ambiguous, mark uncertainty explicitly.
+    """
 
     @staticmethod
     def simulate_first_user_turn(
@@ -23,6 +30,8 @@ class ConversationSimulatorTemplate:
         3. Avoid providing excessive details upfront; the goal is to initiate the conversation and build rapport, not to solve it in the first message.
         4. The message should be concise, ideally no more than 1-3 sentences.
 
+        {ConversationSimulatorTemplate.multimodal_rules}
+
         IMPORTANT: The output must be formatted as a JSON object with a single key `simulated_input`, where the value is the generated opening message in {language}.
 
         Example Language: english
@@ -61,6 +70,8 @@ class ConversationSimulatorTemplate:
         3. Keep the tone consistent with the previous user inputs.
         4. The generated user input should be concise, ideally no more than 1-2 sentences.
 
+        {ConversationSimulatorTemplate.multimodal_rules}
+
         IMPORTANT: The output must be formatted as a JSON object with a single key `simulated_input`,
         where the value is the generated user input in {language}.
 
@@ -101,6 +112,8 @@ class ConversationSimulatorTemplate:
         2. If the expected outcome has been met, mark the conversation as complete.
         3. If not, mark it as incomplete and briefly describe what remains to be done.
 
+        {ConversationSimulatorTemplate.multimodal_rules}
+
         IMPORTANT: The output must be formatted as a JSON object with two keys:
         `is_complete` (a boolean) and `reason` (a string).
 
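Because multimodal_rules is a plain class attribute interpolated into each prompt f-string, the shared rules block can be inspected directly; a trivial sanity check:

from deepeval.simulator.template import ConversationSimulatorTemplate

assert "MULTIMODAL INPUT RULES" in ConversationSimulatorTemplate.multimodal_rules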
deepeval/test_case/api.py CHANGED
@@ -12,7 +12,6 @@ from deepeval.test_case import (
     ConversationalTestCase,
     Turn,
 )
-from deepeval.test_case.llm_test_case import _MLLM_IMAGE_REGISTRY
 from deepeval.constants import PYTEST_RUN_TEST_NAME
 
 
@@ -33,7 +32,6 @@ def create_api_test_case(
     trace: Optional[TraceApi] = None,
     index: Optional[int] = None,
 ) -> Union[LLMApiTestCase, ConversationalApiTestCase]:
-    from deepeval.utils import convert_to_multi_modal_array
 
     if isinstance(test_case, ConversationalTestCase):
         order = (
@@ -61,8 +59,10 @@
             context=test_case.context,
             tags=test_case.tags,
             comments=test_case.comments,
+            imagesMapping=test_case._get_images_mapping(),
             additionalMetadata=test_case.additional_metadata,
         )
+
         api_test_case.turns = [
             create_api_turn(
                 turn=turn,
@@ -86,48 +86,27 @@
         name = os.getenv(PYTEST_RUN_TEST_NAME, f"test_case_{order}")
         metrics_data = []
 
-        if isinstance(test_case, LLMTestCase) and test_case.multimodal is False:
-            api_test_case = LLMApiTestCase(
-                name=name,
-                input=test_case.input,
-                actualOutput=test_case.actual_output,
-                expectedOutput=test_case.expected_output,
-                context=test_case.context,
-                retrievalContext=test_case.retrieval_context,
-                toolsCalled=test_case.tools_called,
-                expectedTools=test_case.expected_tools,
-                tokenCost=test_case.token_cost,
-                completionTime=test_case.completion_time,
-                tags=test_case.tags,
-                success=success,
-                metricsData=metrics_data,
-                runDuration=None,
-                evaluationCost=None,
-                order=order,
-                additionalMetadata=test_case.additional_metadata,
-                comments=test_case.comments,
-                trace=trace,
-            )
-        elif isinstance(test_case, LLMTestCase) and test_case.multimodal:
-            api_test_case = LLMApiTestCase(
-                name=name,
-                input=test_case.input,
-                actualOutput=test_case.actual_output,
-                expectedOutput=test_case.expected_output,
-                retrievalContext=test_case.retrieval_context,
-                context=test_case.context,
-                imagesMapping=_MLLM_IMAGE_REGISTRY,
-                toolsCalled=test_case.tools_called,
-                expectedTools=test_case.expected_tools,
-                tokenCost=test_case.token_cost,
-                completionTime=test_case.completion_time,
-                success=success,
-                metricsData=metrics_data,
-                runDuration=None,
-                evaluationCost=None,
-                order=order,
-                additionalMetadata=test_case.additional_metadata,
-                comments=test_case.comments,
-            )
+        api_test_case = LLMApiTestCase(
+            name=name,
+            input=test_case.input,
+            actualOutput=test_case.actual_output,
+            expectedOutput=test_case.expected_output,
+            retrievalContext=test_case.retrieval_context,
+            context=test_case.context,
+            imagesMapping=test_case._get_images_mapping(),
+            toolsCalled=test_case.tools_called,
+            expectedTools=test_case.expected_tools,
+            tokenCost=test_case.token_cost,
+            completionTime=test_case.completion_time,
+            success=success,
+            metricsData=metrics_data,
+            runDuration=None,
+            evaluationCost=None,
+            order=order,
+            additionalMetadata=test_case.additional_metadata,
+            comments=test_case.comments,
+            tags=test_case.tags,
+            trace=trace,
+        )
         # llm_test_case_lookup_map[instance_id] = api_test_case
         return api_test_case
deepeval/test_case/arena_test_case.py CHANGED
@@ -1,7 +1,7 @@
 from typing import List, Dict, Optional, Union
-from dataclasses import dataclass
+from dataclasses import dataclass, field
 from pydantic import BaseModel
-
+import re
 from deepeval.test_case import (
     LLMTestCase,
 )
@@ -19,6 +19,7 @@ class Contestant(BaseModel):
 @dataclass
 class ArenaTestCase:
     contestants: List[Contestant]
+    multimodal: bool = field(default=False)
 
     def __post_init__(self):
         contestant_names = [contestant.name for contestant in self.contestants]
@@ -38,6 +39,10 @@ class ArenaTestCase:
                 "All contestants must have the same 'expected_output'."
             )
 
+        for contestant in self.contestants:
+            if contestant.test_case.multimodal:
+                self.multimodal = True
+
 
 class Arena:
     test_cases: List[ArenaTestCase]
deepeval/test_case/conversational_test_case.py CHANGED
@@ -1,3 +1,4 @@
+import re
 from pydantic import (
     BaseModel,
     Field,
@@ -17,6 +18,7 @@ from deepeval.test_case.mcp import (
     MCPToolCall,
     validate_mcp_servers,
 )
+from deepeval.test_case.llm_test_case import _MLLM_IMAGE_REGISTRY
 
 
 class TurnParams(Enum):
@@ -170,12 +172,28 @@ class ConversationalTestCase(BaseModel):
             return self
 
         pattern = r"\[DEEPEVAL:IMAGE:(.*?)\]"
-        self.multimodal = any(
-            [
-                re.search(pattern, turn.content) is not None
-                for turn in self.turns
-            ]
-        )
+        if self.scenario:
+            if re.search(pattern, self.scenario) is not None:
+                self.multimodal = True
+                return self
+        if self.expected_outcome:
+            if re.search(pattern, self.expected_outcome) is not None:
+                self.multimodal = True
+                return self
+        if self.user_description:
+            if re.search(pattern, self.user_description) is not None:
+                self.multimodal = True
+                return self
+        if self.turns:
+            for turn in self.turns:
+                if re.search(pattern, turn.content) is not None:
+                    self.multimodal = True
+                    return self
+                if turn.retrieval_context is not None:
+                    self.multimodal = any(
+                        re.search(pattern, context) is not None
+                        for context in turn.retrieval_context
+                    )
 
         return self
 
@@ -215,3 +233,34 @@ class ConversationalTestCase(BaseModel):
         data["turns"] = copied_turns
 
         return data
+
+    def _get_images_mapping(self) -> Dict[str, MLLMImage]:
+        pattern = r"\[DEEPEVAL:IMAGE:(.*?)\]"
+        image_ids = set()
+
+        def extract_ids_from_string(s: Optional[str]) -> None:
+            """Helper to extract image IDs from a string."""
+            if s is not None and isinstance(s, str):
+                matches = re.findall(pattern, s)
+                image_ids.update(matches)
+
+        def extract_ids_from_list(lst: Optional[List[str]]) -> None:
+            """Helper to extract image IDs from a list of strings."""
+            if lst is not None:
+                for item in lst:
+                    extract_ids_from_string(item)
+
+        extract_ids_from_string(self.scenario)
+        extract_ids_from_string(self.expected_outcome)
+        extract_ids_from_list(self.context)
+        extract_ids_from_string(self.user_description)
+        for turn in self.turns:
+            extract_ids_from_string(turn.content)
+            extract_ids_from_list(turn.retrieval_context)
+
+        images_mapping = {}
+        for img_id in image_ids:
+            if img_id in _MLLM_IMAGE_REGISTRY:
+                images_mapping[img_id] = _MLLM_IMAGE_REGISTRY[img_id]
+
+        return images_mapping if len(images_mapping) > 0 else None
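The placeholder scan behind _get_images_mapping reduces to a stdlib regex pass; a self-contained sketch with hypothetical data:

import re

pattern = r"\[DEEPEVAL:IMAGE:(.*?)\]"
scenario = "User uploads a receipt [DEEPEVAL:IMAGE:img-123] and asks for a refund."

print(re.findall(pattern, scenario))  # ['img-123']
# _get_images_mapping() collects such IDs from the scenario, expected outcome,
# user description, and every turn, then resolves them against
# _MLLM_IMAGE_REGISTRY, returning None when nothing matches.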
deepeval/test_case/llm_test_case.py CHANGED
@@ -60,19 +60,34 @@ class MLLMImage:
         if self.local:
             path = self.process_url(self.url)
             self.filename = os.path.basename(path)
-            self.mimeType = (
-                mimetypes.guess_type(path)[0] or "application/octet-stream"
-            )
-            with open(path, "rb") as f:
-                raw = f.read()
-            self.dataBase64 = base64.b64encode(raw).decode("ascii")
+            self.mimeType = mimetypes.guess_type(path)[0] or "image/jpeg"
+
+            if not os.path.exists(path):
+                raise FileNotFoundError(f"Image file not found: {path}")
+
+            self._load_base64(path)
         else:
+            if not self.url.startswith(("http://", "https://")):
+                raise ValueError(
+                    f"Invalid remote URL format: {self.url}. URL must start with http:// or https://"
+                )
             self.filename = None
             self.mimeType = None
             self.dataBase64 = None
 
         _MLLM_IMAGE_REGISTRY[self._id] = self
 
+    def _load_base64(self, path: str):
+        with open(path, "rb") as f:
+            raw = f.read()
+        self.dataBase64 = base64.b64encode(raw).decode("ascii")
+
+    def ensure_images_loaded(self):
+        if self.local and self.dataBase64 is None:
+            path = self.process_url(self.url)
+            self._load_base64(path)
+        return self
+
     def _placeholder(self) -> str:
         return f"[DEEPEVAL:IMAGE:{self._id}]"
 
@@ -376,6 +391,16 @@ class LLMTestCase(BaseModel):
             if isinstance(self.input, str)
             else self.multimodal
         )
+        if self.retrieval_context is not None:
+            auto_detect = auto_detect or any(
+                re.search(pattern, context) is not None
+                for context in self.retrieval_context
+            )
+        if self.context is not None:
+            auto_detect = auto_detect or any(
+                re.search(pattern, context) is not None
+                for context in self.context
+            )
 
         self.multimodal = auto_detect
         return self
@@ -486,3 +511,32 @@ class LLMTestCase(BaseModel):
         )
 
         return data
+
+    def _get_images_mapping(self) -> Dict[str, MLLMImage]:
+        pattern = r"\[DEEPEVAL:IMAGE:(.*?)\]"
+        image_ids = set()
+
+        def extract_ids_from_string(s: Optional[str]) -> None:
+            """Helper to extract image IDs from a string."""
+            if s is not None and isinstance(s, str):
+                matches = re.findall(pattern, s)
+                image_ids.update(matches)
+
+        def extract_ids_from_list(lst: Optional[List[str]]) -> None:
+            """Helper to extract image IDs from a list of strings."""
+            if lst is not None:
+                for item in lst:
+                    extract_ids_from_string(item)
+
+        extract_ids_from_string(self.input)
+        extract_ids_from_string(self.actual_output)
+        extract_ids_from_string(self.expected_output)
+        extract_ids_from_list(self.context)
+        extract_ids_from_list(self.retrieval_context)
+
+        images_mapping = {}
+        for img_id in image_ids:
+            if img_id in _MLLM_IMAGE_REGISTRY:
+                images_mapping[img_id] = _MLLM_IMAGE_REGISTRY[img_id]
+
+        return images_mapping if len(images_mapping) > 0 else None
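A sketch of the new lazy-loading behavior (the file path is hypothetical, and this assumes the constructor accepts a url and flags local paths via `local`):

from deepeval.test_case import MLLMImage

img = MLLMImage(url="./screenshots/login.png")  # hypothetical local file
img.dataBase64 = None          # e.g. payload stripped during serialization
img.ensure_images_loaded()     # re-reads and re-encodes the file on demand
assert img.dataBase64 is not None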
deepeval/test_run/api.py CHANGED
@@ -126,6 +126,9 @@ class ConversationalApiTestCase(BaseModel):
     additional_metadata: Optional[Dict] = Field(
         None, alias="additionalMetadata"
     )
+    images_mapping: Optional[Dict[str, MLLMImage]] = Field(
+        None, alias="imagesMapping"
+    )
     tags: Optional[List[str]] = Field(None)
 
     def update_metric_data(self, metrics_data: MetricData):