deepeval 3.7.5__py3-none-any.whl → 3.7.7__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (150)
  1. deepeval/_version.py +1 -1
  2. deepeval/cli/main.py +2022 -759
  3. deepeval/cli/utils.py +208 -36
  4. deepeval/config/dotenv_handler.py +19 -0
  5. deepeval/config/settings.py +675 -245
  6. deepeval/config/utils.py +9 -1
  7. deepeval/dataset/api.py +23 -1
  8. deepeval/dataset/golden.py +106 -21
  9. deepeval/evaluate/evaluate.py +0 -3
  10. deepeval/evaluate/execute.py +162 -315
  11. deepeval/evaluate/utils.py +6 -30
  12. deepeval/key_handler.py +124 -51
  13. deepeval/metrics/__init__.py +0 -4
  14. deepeval/metrics/answer_relevancy/answer_relevancy.py +89 -132
  15. deepeval/metrics/answer_relevancy/template.py +102 -179
  16. deepeval/metrics/arena_g_eval/arena_g_eval.py +98 -96
  17. deepeval/metrics/arena_g_eval/template.py +17 -1
  18. deepeval/metrics/argument_correctness/argument_correctness.py +81 -87
  19. deepeval/metrics/argument_correctness/template.py +19 -2
  20. deepeval/metrics/base_metric.py +19 -41
  21. deepeval/metrics/bias/bias.py +102 -108
  22. deepeval/metrics/bias/template.py +14 -2
  23. deepeval/metrics/contextual_precision/contextual_precision.py +56 -92
  24. deepeval/metrics/contextual_recall/contextual_recall.py +58 -85
  25. deepeval/metrics/contextual_relevancy/contextual_relevancy.py +53 -83
  26. deepeval/metrics/conversation_completeness/conversation_completeness.py +101 -119
  27. deepeval/metrics/conversation_completeness/template.py +23 -3
  28. deepeval/metrics/conversational_dag/conversational_dag.py +12 -8
  29. deepeval/metrics/conversational_dag/nodes.py +66 -123
  30. deepeval/metrics/conversational_dag/templates.py +16 -0
  31. deepeval/metrics/conversational_g_eval/conversational_g_eval.py +47 -66
  32. deepeval/metrics/dag/dag.py +10 -0
  33. deepeval/metrics/dag/nodes.py +63 -126
  34. deepeval/metrics/dag/templates.py +14 -0
  35. deepeval/metrics/exact_match/exact_match.py +9 -1
  36. deepeval/metrics/faithfulness/faithfulness.py +82 -136
  37. deepeval/metrics/g_eval/g_eval.py +93 -79
  38. deepeval/metrics/g_eval/template.py +18 -1
  39. deepeval/metrics/g_eval/utils.py +7 -6
  40. deepeval/metrics/goal_accuracy/goal_accuracy.py +91 -76
  41. deepeval/metrics/goal_accuracy/template.py +21 -3
  42. deepeval/metrics/hallucination/hallucination.py +60 -75
  43. deepeval/metrics/hallucination/template.py +13 -0
  44. deepeval/metrics/indicator.py +11 -10
  45. deepeval/metrics/json_correctness/json_correctness.py +40 -38
  46. deepeval/metrics/json_correctness/template.py +10 -0
  47. deepeval/metrics/knowledge_retention/knowledge_retention.py +60 -97
  48. deepeval/metrics/knowledge_retention/schema.py +9 -3
  49. deepeval/metrics/knowledge_retention/template.py +12 -0
  50. deepeval/metrics/mcp/mcp_task_completion.py +72 -43
  51. deepeval/metrics/mcp/multi_turn_mcp_use_metric.py +93 -75
  52. deepeval/metrics/mcp/schema.py +4 -0
  53. deepeval/metrics/mcp/template.py +59 -0
  54. deepeval/metrics/mcp_use_metric/mcp_use_metric.py +58 -64
  55. deepeval/metrics/mcp_use_metric/template.py +12 -0
  56. deepeval/metrics/misuse/misuse.py +77 -97
  57. deepeval/metrics/misuse/template.py +15 -0
  58. deepeval/metrics/multimodal_metrics/__init__.py +0 -1
  59. deepeval/metrics/multimodal_metrics/image_coherence/image_coherence.py +37 -38
  60. deepeval/metrics/multimodal_metrics/image_editing/image_editing.py +55 -76
  61. deepeval/metrics/multimodal_metrics/image_helpfulness/image_helpfulness.py +37 -38
  62. deepeval/metrics/multimodal_metrics/image_reference/image_reference.py +37 -38
  63. deepeval/metrics/multimodal_metrics/text_to_image/text_to_image.py +57 -76
  64. deepeval/metrics/non_advice/non_advice.py +79 -105
  65. deepeval/metrics/non_advice/template.py +12 -0
  66. deepeval/metrics/pattern_match/pattern_match.py +12 -4
  67. deepeval/metrics/pii_leakage/pii_leakage.py +75 -106
  68. deepeval/metrics/pii_leakage/template.py +14 -0
  69. deepeval/metrics/plan_adherence/plan_adherence.py +63 -89
  70. deepeval/metrics/plan_adherence/template.py +11 -0
  71. deepeval/metrics/plan_quality/plan_quality.py +63 -87
  72. deepeval/metrics/plan_quality/template.py +9 -0
  73. deepeval/metrics/prompt_alignment/prompt_alignment.py +78 -86
  74. deepeval/metrics/prompt_alignment/template.py +12 -0
  75. deepeval/metrics/role_adherence/role_adherence.py +48 -71
  76. deepeval/metrics/role_adherence/template.py +14 -0
  77. deepeval/metrics/role_violation/role_violation.py +75 -108
  78. deepeval/metrics/role_violation/template.py +12 -0
  79. deepeval/metrics/step_efficiency/step_efficiency.py +55 -65
  80. deepeval/metrics/step_efficiency/template.py +11 -0
  81. deepeval/metrics/summarization/summarization.py +115 -183
  82. deepeval/metrics/summarization/template.py +19 -0
  83. deepeval/metrics/task_completion/task_completion.py +67 -73
  84. deepeval/metrics/tool_correctness/tool_correctness.py +43 -42
  85. deepeval/metrics/tool_use/schema.py +4 -0
  86. deepeval/metrics/tool_use/template.py +16 -2
  87. deepeval/metrics/tool_use/tool_use.py +72 -94
  88. deepeval/metrics/topic_adherence/schema.py +4 -0
  89. deepeval/metrics/topic_adherence/template.py +21 -1
  90. deepeval/metrics/topic_adherence/topic_adherence.py +68 -81
  91. deepeval/metrics/toxicity/template.py +13 -0
  92. deepeval/metrics/toxicity/toxicity.py +80 -99
  93. deepeval/metrics/turn_contextual_precision/schema.py +3 -3
  94. deepeval/metrics/turn_contextual_precision/template.py +9 -2
  95. deepeval/metrics/turn_contextual_precision/turn_contextual_precision.py +154 -154
  96. deepeval/metrics/turn_contextual_recall/schema.py +3 -3
  97. deepeval/metrics/turn_contextual_recall/template.py +8 -1
  98. deepeval/metrics/turn_contextual_recall/turn_contextual_recall.py +148 -143
  99. deepeval/metrics/turn_contextual_relevancy/schema.py +2 -2
  100. deepeval/metrics/turn_contextual_relevancy/template.py +8 -1
  101. deepeval/metrics/turn_contextual_relevancy/turn_contextual_relevancy.py +154 -157
  102. deepeval/metrics/turn_faithfulness/schema.py +1 -1
  103. deepeval/metrics/turn_faithfulness/template.py +8 -1
  104. deepeval/metrics/turn_faithfulness/turn_faithfulness.py +180 -203
  105. deepeval/metrics/turn_relevancy/template.py +14 -0
  106. deepeval/metrics/turn_relevancy/turn_relevancy.py +56 -69
  107. deepeval/metrics/utils.py +161 -91
  108. deepeval/models/__init__.py +2 -0
  109. deepeval/models/base_model.py +44 -6
  110. deepeval/models/embedding_models/azure_embedding_model.py +34 -12
  111. deepeval/models/embedding_models/local_embedding_model.py +22 -7
  112. deepeval/models/embedding_models/ollama_embedding_model.py +17 -6
  113. deepeval/models/embedding_models/openai_embedding_model.py +3 -2
  114. deepeval/models/llms/__init__.py +2 -0
  115. deepeval/models/llms/amazon_bedrock_model.py +229 -73
  116. deepeval/models/llms/anthropic_model.py +143 -48
  117. deepeval/models/llms/azure_model.py +169 -95
  118. deepeval/models/llms/constants.py +2032 -0
  119. deepeval/models/llms/deepseek_model.py +82 -35
  120. deepeval/models/llms/gemini_model.py +126 -67
  121. deepeval/models/llms/grok_model.py +128 -65
  122. deepeval/models/llms/kimi_model.py +129 -87
  123. deepeval/models/llms/litellm_model.py +94 -18
  124. deepeval/models/llms/local_model.py +115 -16
  125. deepeval/models/llms/ollama_model.py +97 -76
  126. deepeval/models/llms/openai_model.py +169 -311
  127. deepeval/models/llms/portkey_model.py +58 -16
  128. deepeval/models/llms/utils.py +5 -2
  129. deepeval/models/retry_policy.py +10 -5
  130. deepeval/models/utils.py +56 -4
  131. deepeval/simulator/conversation_simulator.py +49 -2
  132. deepeval/simulator/template.py +16 -1
  133. deepeval/synthesizer/synthesizer.py +19 -17
  134. deepeval/test_case/api.py +24 -45
  135. deepeval/test_case/arena_test_case.py +7 -2
  136. deepeval/test_case/conversational_test_case.py +55 -6
  137. deepeval/test_case/llm_test_case.py +60 -6
  138. deepeval/test_run/api.py +3 -0
  139. deepeval/test_run/test_run.py +6 -1
  140. deepeval/utils.py +26 -0
  141. {deepeval-3.7.5.dist-info → deepeval-3.7.7.dist-info}/METADATA +3 -3
  142. {deepeval-3.7.5.dist-info → deepeval-3.7.7.dist-info}/RECORD +145 -148
  143. deepeval/metrics/multimodal_metrics/multimodal_g_eval/__init__.py +0 -0
  144. deepeval/metrics/multimodal_metrics/multimodal_g_eval/multimodal_g_eval.py +0 -386
  145. deepeval/metrics/multimodal_metrics/multimodal_g_eval/schema.py +0 -11
  146. deepeval/metrics/multimodal_metrics/multimodal_g_eval/template.py +0 -133
  147. deepeval/metrics/multimodal_metrics/multimodal_g_eval/utils.py +0 -68
  148. {deepeval-3.7.5.dist-info → deepeval-3.7.7.dist-info}/LICENSE.md +0 -0
  149. {deepeval-3.7.5.dist-info → deepeval-3.7.7.dist-info}/WHEEL +0 -0
  150. {deepeval-3.7.5.dist-info → deepeval-3.7.7.dist-info}/entry_points.txt +0 -0
deepeval/models/llms/portkey_model.py CHANGED
@@ -3,10 +3,13 @@ import requests
 from typing import Any, Dict, List, Optional, Union
 from pydantic import AnyUrl, SecretStr
 
+from deepeval.errors import DeepEvalError
 from deepeval.config.settings import get_settings
 from deepeval.models.utils import (
     require_secret_api_key,
 )
+from deepeval.test_case import MLLMImage
+from deepeval.utils import check_if_multimodal, convert_to_multi_modal_array
 from deepeval.models import DeepEvalBaseLLM
 from deepeval.utils import require_param
 
@@ -29,16 +32,9 @@ class PortkeyModel(DeepEvalBaseLLM):
         settings = get_settings()
         model = model or settings.PORTKEY_MODEL_NAME
 
-        self.name = require_param(
-            model,
-            provider_label="Portkey",
-            env_var_name="PORTKEY_MODEL_NAME",
-            param_hint="model",
-        )
-
         if api_key is not None:
             # keep it secret, keep it safe from serializings, logging and alike
-            self.api_key: SecretStr | None = SecretStr(api_key)
+            self.api_key: Optional[SecretStr] = SecretStr(api_key)
         else:
             self.api_key = settings.PORTKEY_API_KEY
 
@@ -47,6 +43,16 @@ class PortkeyModel(DeepEvalBaseLLM):
         elif settings.PORTKEY_BASE_URL is not None:
             base_url = str(settings.PORTKEY_BASE_URL).rstrip("/")
 
+        provider = provider or settings.PORTKEY_PROVIDER_NAME
+
+        # validation
+        model = require_param(
+            model,
+            provider_label="Portkey",
+            env_var_name="PORTKEY_MODEL_NAME",
+            param_hint="model",
+        )
+
         self.base_url = require_param(
             base_url,
             provider_label="Portkey",
@@ -54,7 +60,6 @@ class PortkeyModel(DeepEvalBaseLLM):
             param_hint="base_url",
         )
 
-        provider = provider or settings.PORTKEY_PROVIDER_NAME
         self.provider = require_param(
             provider,
             provider_label="Portkey",
@@ -64,6 +69,7 @@ class PortkeyModel(DeepEvalBaseLLM):
         # Keep sanitized kwargs for client call to strip legacy keys
         self.kwargs = kwargs
         self.generation_kwargs = generation_kwargs or {}
+        super().__init__(model)
 
     def _headers(self) -> Dict[str, str]:
         api_key = require_secret_api_key(
@@ -82,18 +88,51 @@ class PortkeyModel(DeepEvalBaseLLM):
         return headers
 
     def _payload(self, prompt: str) -> Dict[str, Any]:
+        if check_if_multimodal(prompt):
+            prompt = convert_to_multi_modal_array(input=prompt)
+            content = self.generate_content(prompt)
+        else:
+            content = [{"type": "text", "text": prompt}]
         payload = {
             "model": self.name,
-            "messages": [{"role": "user", "content": prompt}],
+            "messages": [{"role": "user", "content": content}],
         }
         if self.generation_kwargs:
             payload.update(self.generation_kwargs)
         return payload
 
+    def generate_content(
+        self, multimodal_input: List[Union[str, MLLMImage]] = []
+    ):
+        content = []
+        for element in multimodal_input:
+            if isinstance(element, str):
+                content.append({"type": "text", "text": element})
+            elif isinstance(element, MLLMImage):
+                if element.url and not element.local:
+                    content.append(
+                        {
+                            "type": "image_url",
+                            "image_url": {"url": element.url},
+                        }
+                    )
+                else:
+                    element.ensure_images_loaded()
+                    data_uri = (
+                        f"data:{element.mimeType};base64,{element.dataBase64}"
+                    )
+                    content.append(
+                        {
+                            "type": "image_url",
+                            "image_url": {"url": data_uri},
+                        }
+                    )
+        return content
+
     def _extract_content(self, data: Dict[str, Any]) -> str:
         choices: Union[List[Dict[str, Any]], None] = data.get("choices")
         if not choices:
-            raise ValueError("Portkey response did not include any choices.")
+            raise DeepEvalError("Portkey response did not include any choices.")
         message = choices[0].get("message", {})
         content: Union[str, List[Dict[str, Any]], None] = message.get("content")
         if isinstance(content, str):
@@ -109,7 +148,7 @@ class PortkeyModel(DeepEvalBaseLLM):
                 f"{self.base_url}/chat/completions",
                 json=self._payload(prompt),
                 headers=self._headers(),
-                timeout=60,
+                timeout=_request_timeout_seconds(),
             )
             response.raise_for_status()
         except requests.HTTPError as error:
@@ -118,11 +157,11 @@ class PortkeyModel(DeepEvalBaseLLM):
                 body = response.json()
             except Exception:
                 body = response.text
-            raise ValueError(
+            raise DeepEvalError(
                 f"Portkey request failed with status {response.status_code}: {body}"
             ) from error
         except requests.RequestException as error:
-            raise ValueError(f"Portkey request failed: {error}") from error
+            raise DeepEvalError(f"Portkey request failed: {error}") from error
        return self._extract_content(response.json())
 
     async def a_generate(self, prompt: str) -> str:
@@ -132,11 +171,11 @@ class PortkeyModel(DeepEvalBaseLLM):
                 f"{self.base_url}/chat/completions",
                 json=self._payload(prompt),
                 headers=self._headers(),
-                timeout=60,
+                timeout=_request_timeout_seconds(),
             ) as response:
                 if response.status >= 400:
                     body = await response.text()
-                    raise ValueError(
+                    raise DeepEvalError(
                         f"Portkey request failed with status {response.status}: {body}"
                     )
                 data = await response.json()
@@ -147,3 +186,6 @@ class PortkeyModel(DeepEvalBaseLLM):
 
     def get_model_name(self):
         return f"{self.name} (Portkey)"
+
+    def supports_multimodal(self):
+        return True
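
For orientation, a minimal usage sketch of the new multimodal path. It relies only on attributes visible in the hunks above (url, local, ensure_images_loaded, mimeType, dataBase64 on MLLMImage); the constructor arguments shown are illustrative fallbacks per the require_param calls, not a signature confirmed by this diff:

    from deepeval.models.llms.portkey_model import PortkeyModel
    from deepeval.test_case import MLLMImage

    # Illustrative constructor call; each argument also falls back to its
    # PORTKEY_* setting (PORTKEY_MODEL_NAME, PORTKEY_BASE_URL, PORTKEY_PROVIDER_NAME).
    model = PortkeyModel(
        model="gpt-4o",
        base_url="https://api.portkey.ai/v1",
        provider="openai",
    )

    # Remote images pass through as URLs; local images are inlined as data URIs.
    content = model.generate_content(
        ["Describe this image:", MLLMImage(url="https://example.com/cat.png")]
    )
    # -> [{"type": "text", "text": "Describe this image:"},
    #     {"type": "image_url", "image_url": {"url": "https://example.com/cat.png"}}]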
deepeval/metrics/utils.py CHANGED
@@ -1,8 +1,11 @@
-from typing import Dict, List, Optional
+from typing import Dict
 import re
 import json
 import asyncio
 
+from deepeval.errors import DeepEvalError
+
+
 MULTIMODAL_MODELS = ["GPTModel", "AzureModel", "GeminiModel", "OllamaModel"]
 
 
@@ -20,7 +23,7 @@ def trim_and_load_json(
         return json.loads(jsonStr)
     except json.JSONDecodeError:
         error_str = "Evaluation LLM outputted an invalid JSON. Please use a better evaluation model."
-        raise ValueError(error_str)
+        raise DeepEvalError(error_str)
     except Exception as e:
         raise Exception(f"An unexpected error occurred: {str(e)}")
 
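The practical consequence of this swap is that callers catching parse failures from trim_and_load_json should now catch DeepEvalError rather than ValueError. A rough sketch, assuming the single-string call shape:

    from deepeval.errors import DeepEvalError
    from deepeval.metrics.utils import trim_and_load_json

    try:
        data = trim_and_load_json('Sure! {"score": 0.9, "reason": "..."}')
    except DeepEvalError:
        # raised when the evaluation LLM emits JSON that cannot be parsed
        data = None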
@@ -87,6 +87,8 @@ def set_outer_deadline(seconds: float | None):
     call, which must be passed to `reset_outer_deadline` to restore the
     previous value.
     """
+    if get_settings().DEEPEVAL_DISABLE_TIMEOUTS:
+        return _OUTER_DEADLINE.set(None)
     if seconds and seconds > 0:
         return _OUTER_DEADLINE.set(time.monotonic() + seconds)
     return _OUTER_DEADLINE.set(None)
@@ -131,11 +133,10 @@ def resolve_effective_attempt_timeout():
         float: Seconds to use for the inner per-attempt timeout. `0` means
         disable inner timeout and rely on the outer budget instead.
     """
-    per_attempt = float(
-        get_settings().DEEPEVAL_PER_ATTEMPT_TIMEOUT_SECONDS or 0
-    )
+    settings = get_settings()
+    per_attempt = float(settings.DEEPEVAL_PER_ATTEMPT_TIMEOUT_SECONDS or 0)
     # 0 or None disable inner wait_for. That means rely on outer task cap for timeouts instead.
-    if per_attempt <= 0:
+    if settings.DEEPEVAL_DISABLE_TIMEOUTS or per_attempt <= 0:
         return 0
     # If we do have a positive per-attempt, use up to remaining outer budget.
     rem = _remaining_budget()
@@ -557,7 +558,11 @@ def run_sync_with_timeout(func, timeout_seconds, *args, **kwargs):
         BaseException: If `func` raises, the same exception is re-raised with its
         original traceback.
     """
-    if not timeout_seconds or timeout_seconds <= 0:
+    if (
+        get_settings().DEEPEVAL_DISABLE_TIMEOUTS
+        or not timeout_seconds
+        or timeout_seconds <= 0
+    ):
         return func(*args, **kwargs)
 
     # try to respect the global cap on concurrent timeout workers
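
All three timeout helpers above now short-circuit on a single kill switch. A sketch of the intended effect, assuming DEEPEVAL_DISABLE_TIMEOUTS is sourced from the environment like other deepeval settings:

    import os

    # With the flag set: set_outer_deadline() stores no deadline,
    # resolve_effective_attempt_timeout() returns 0 (no inner wait_for),
    # and run_sync_with_timeout() invokes func directly on the calling thread.
    os.environ["DEEPEVAL_DISABLE_TIMEOUTS"] = "1"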
deepeval/models/utils.py CHANGED
@@ -8,7 +8,7 @@ from deepeval.errors import DeepEvalError
 logger = logging.getLogger(__name__)
 
 
-def parse_model_name(model_name: Optional[str] = None) -> str:
+def parse_model_name(model_name: Optional[str] = None) -> Optional[str]:
     """Extract base model name from provider-prefixed format.
 
     This function is useful for extracting the actual model name from a
@@ -32,9 +32,9 @@ def parse_model_name(model_name: Optional[str] = None) -> str:
     if model_name is None:
         return None
 
-    if "/" in model_name:
-        _, parsed_model_name = model_name.split("/", 1)
-        return parsed_model_name
+    # if "/" in model_name:
+    #     _, parsed_model_name = model_name.split("/", 1)
+    #     return parsed_model_name
     return model_name
 
 
@@ -80,6 +80,58 @@ def require_secret_api_key(
     return api_key
 
 
+def require_costs(
+    model_data,
+    model_name: str,
+    input_token_envvar: str,
+    output_token_envvar: str,
+    cost_per_input_token: Optional[float] = None,
+    cost_per_output_token: Optional[float] = None,
+) -> Tuple[Optional[float], Optional[float]]:
+    """
+    Validates and returns the cost parameters (input and output tokens) for a model.
+
+    Arguments:
+    - model_data: The model's data object, which should contain `input_price` and `output_price`.
+    - model_name: The model name used for error messaging.
+    - cost_per_input_token: The input token cost provided during model initialization (optional).
+    - cost_per_output_token: The output token cost provided during model initialization (optional).
+    - input_token_envvar: The environment variable name for input cost.
+    - output_token_envvar: The environment variable name for output cost.
+
+    Returns:
+    - A tuple of validated values (input_cost, output_cost). If the values are provided, they are returned.
+      If not provided, they are fetched from settings or environment variables.
+    """
+
+    def validate_cost(
+        value: Optional[float], envvar_name: str
+    ) -> Optional[float]:
+        """Helper function to validate the cost values."""
+        if value is not None and value < 0:
+            raise DeepEvalError(f"{envvar_name} must be >= 0.")
+        return value
+
+    # Validate provided token costs
+    cost_per_input_token = validate_cost(
+        cost_per_input_token, input_token_envvar
+    )
+    cost_per_output_token = validate_cost(
+        cost_per_output_token, output_token_envvar
+    )
+
+    # If model data doesn't have pricing, use provided values or environment variables
+    if model_data.input_price is None or model_data.output_price is None:
+        if cost_per_input_token is None or cost_per_output_token is None:
+            return None, None
+
+        # Return the validated cost values as a tuple
+        return cost_per_input_token, cost_per_output_token
+
+    # If no custom cost values are provided, return model's default cost values
+    return model_data.input_price, model_data.output_price
+
+
 def normalize_kwargs_and_extract_aliases(
     provider_label: str,
     kwargs: Dict[str, Any],
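
A usage sketch for the new require_costs helper. The model_data stand-in and the environment-variable names are hypothetical, since the concrete pricing records live in the newly added deepeval/models/llms/constants.py and are not shown here:

    from types import SimpleNamespace
    from deepeval.models.utils import require_costs

    # Hypothetical pricing record with no registry prices, so the
    # caller-provided values win after validation.
    model_data = SimpleNamespace(input_price=None, output_price=None)

    input_cost, output_cost = require_costs(
        model_data,
        model_name="my-local-model",
        input_token_envvar="LOCAL_MODEL_COST_PER_INPUT_TOKEN",
        output_token_envvar="LOCAL_MODEL_COST_PER_OUTPUT_TOKEN",
        cost_per_input_token=2e-06,
        cost_per_output_token=6e-06,
    )
    # -> (2e-06, 6e-06); a negative cost raises DeepEvalError instead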
deepeval/simulator/conversation_simulator.py CHANGED
@@ -20,6 +20,7 @@ from deepeval.simulator.template import (
     ConversationSimulatorTemplate,
 )
 from deepeval.models import DeepEvalBaseLLM
+from deepeval.metrics.utils import MULTIMODAL_SUPPORTED_MODELS
 from deepeval.simulator.schema import (
     SimulatedInput,
     ConversationCompletion,
@@ -94,6 +95,26 @@ class ConversationSimulator:
                 )
             )
         else:
+            multimodal = any(
+                [golden.multimodal for golden in conversational_goldens]
+            )
+            if multimodal:
+                if (
+                    not self.simulator_model
+                    or not self.simulator_model.supports_multimodal()
+                ):
+                    if (
+                        self.simulator_model
+                        and type(self.simulator_model)
+                        in MULTIMODAL_SUPPORTED_MODELS
+                    ):
+                        raise ValueError(
+                            f"The evaluation model {self.simulator_model.name} does not support multimodal evaluations at the moment. Available multi-modal models for the {self.simulator_model.__class__.__name__} provider includes {', '.join(self.simulator_model.__class__.valid_multimodal_models)}."
+                        )
+                    else:
+                        raise ValueError(
+                            f"The evaluation model {self.simulator_model.name} does not support multimodal inputs, please use one of the following evaluation models: {', '.join([cls.__name__ for cls in MULTIMODAL_SUPPORTED_MODELS])}"
+                        )
             conversational_test_cases: List[ConversationalTestCase] = []
             for conversation_index, golden in enumerate(
                 conversational_goldens
@@ -124,6 +145,28 @@ class ConversationSimulator:
         progress: Optional[Progress] = None,
         pbar_id: Optional[int] = None,
     ) -> List[ConversationalTestCase]:
+
+        multimodal = any(
+            [golden.multimodal for golden in conversational_goldens]
+        )
+        if multimodal:
+            if (
+                not self.simulator_model
+                or not self.simulator_model.supports_multimodal()
+            ):
+                if (
+                    self.simulator_model
+                    and type(self.simulator_model)
+                    in MULTIMODAL_SUPPORTED_MODELS
+                ):
+                    raise ValueError(
+                        f"The evaluation model {self.simulator_model.name} does not support multimodal evaluations at the moment. Available multi-modal models for the {self.simulator_model.__class__.__name__} provider includes {', '.join(self.simulator_model.__class__.valid_multimodal_models)}."
+                    )
+                else:
+                    raise ValueError(
+                        f"The evaluation model {self.simulator_model.name} does not support multimodal inputs, please use one of the following evaluation models: {', '.join([cls.__name__ for cls in MULTIMODAL_SUPPORTED_MODELS])}"
+                    )
+
         self.simulation_cost = 0 if self.using_native_model else None
 
         async def simulate_conversations(
@@ -471,7 +514,9 @@ class ConversationSimulator:
         ):
             if not self.run_remote:
                 conversation_history = json.dumps(
-                    [t.model_dump() for t in turns], indent=4
+                    [t.model_dump() for t in turns],
+                    indent=4,
+                    ensure_ascii=False,
                 )
                 prompt = self.template.stop_simulation(
                     conversation_history, golden.expected_outcome
@@ -516,7 +561,9 @@ class ConversationSimulator:
         ):
             if not self.run_remote:
                 conversation_history = json.dumps(
-                    [t.model_dump() for t in turns], indent=4
+                    [t.model_dump() for t in turns],
+                    indent=4,
+                    ensure_ascii=False,
                 )
                 prompt = self.template.stop_simulation(
                     conversation_history, golden.expected_outcome
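
The repeated ensure_ascii=False additions matter for non-English simulations: by default json.dumps escapes non-ASCII characters, which garbles the conversation history fed back into the simulator's prompts. A quick illustration:

    import json

    json.dumps({"content": "こんにちは"})
    # '{"content": "\\u3053\\u3093\\u306b\\u3061\\u306f"}'
    json.dumps({"content": "こんにちは"}, ensure_ascii=False)
    # '{"content": "こんにちは"}'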
deepeval/simulator/template.py CHANGED
@@ -7,6 +7,13 @@ from deepeval.test_case import Turn
 
 
 class ConversationSimulatorTemplate:
+    multimodal_rules = """
+    --- MULTIMODAL INPUT RULES ---
+    - Treat image content as factual evidence.
+    - Only reference visual details that are explicitly and clearly visible.
+    - Do not infer or guess objects, text, or details not visibly present.
+    - If an image is unclear or ambiguous, mark uncertainty explicitly.
+    """
 
     @staticmethod
     def simulate_first_user_turn(
@@ -23,6 +30,8 @@ class ConversationSimulatorTemplate:
         3. Avoid providing excessive details upfront; the goal is to initiate the conversation and build rapport, not to solve it in the first message.
         4. The message should be concise, ideally no more than 1-3 sentences.
 
+        {ConversationSimulatorTemplate.multimodal_rules}
+
         IMPORTANT: The output must be formatted as a JSON object with a single key `simulated_input`, where the value is the generated opening message in {language}.
 
         Example Language: english
@@ -48,7 +57,9 @@ class ConversationSimulatorTemplate:
         language: str,
     ) -> str:
         previous_conversation = json.dumps(
-            [t.model_dump() for t in turns], indent=4
+            [t.model_dump() for t in turns],
+            indent=4,
+            ensure_ascii=False,
         )
         prompt = textwrap.dedent(
             f"""
@@ -61,6 +72,8 @@ class ConversationSimulatorTemplate:
         3. Keep the tone consistent with the previous user inputs.
         4. The generated user input should be concise, ideally no more than 1-2 sentences.
 
+        {ConversationSimulatorTemplate.multimodal_rules}
+
         IMPORTANT: The output must be formatted as a JSON object with a single key `simulated_input`,
         where the value is the generated user input in {language}.
 
@@ -101,6 +114,8 @@ class ConversationSimulatorTemplate:
         2. If the expected outcome has been met, mark the conversation as complete.
         3. If not, mark it as incomplete and briefly describe what remains to be done.
 
+        {ConversationSimulatorTemplate.multimodal_rules}
+
         IMPORTANT: The output must be formatted as a JSON object with two keys:
         `is_complete` (a boolean) and `reason` (a string).
 
deepeval/synthesizer/synthesizer.py CHANGED
@@ -25,7 +25,7 @@ from deepeval.metrics.utils import (
 from deepeval.progress_context import synthesizer_progress_context
 from deepeval.models import DeepEvalBaseLLM
 from deepeval.dataset.golden import Golden, ConversationalGolden
-from deepeval.synthesizer.types import *
+from deepeval.synthesizer.types import Evolution, PromptEvolution
 from deepeval.synthesizer.templates import (
     EvolutionTemplate,
     SynthesizerTemplate,
@@ -246,7 +246,7 @@ class Synthesizer:
             )
         if self.cost_tracking and self.using_native_model:
             print(f"💰 API cost: {self.synthesis_cost:.6f}")
-        if _send_data == True:
+        if _send_data:
             pass
         remove_pbars(
             progress,
@@ -546,7 +546,7 @@ class Synthesizer:
         # Remove pbar if not from docs
         remove_pbars(progress, [pbar_id]) if _progress is None else None
 
-        if _send_data == True:
+        if _send_data:
             pass
         if _reset_cost and self.cost_tracking and self.using_native_model:
             print(f"💰 API cost: {self.synthesis_cost:.6f}")
@@ -567,7 +567,8 @@ class Synthesizer:
         if _reset_cost:
             self.synthetic_goldens = []
         self.synthesis_cost = 0 if self.using_native_model else None
-        semaphore = asyncio.Semaphore(self.max_concurrent)
+        context_semaphore = asyncio.Semaphore(self.max_concurrent)
+        worker_semaphore = asyncio.Semaphore(self.max_concurrent)
         goldens: List[Golden] = []
 
         with synthesizer_progress_context(
@@ -586,9 +587,9 @@ class Synthesizer:
         ):
             tasks = [
                 self.task_wrapper(
-                    semaphore,
+                    context_semaphore,
                     self._a_generate_from_context,
-                    semaphore=semaphore,
+                    semaphore=worker_semaphore,
                     context=context,
                     goldens=goldens,
                     include_expected_output=include_expected_output,
@@ -965,7 +966,7 @@ class Synthesizer:
 
         # Wrap up Synthesis
         self.synthetic_goldens.extend(goldens)
-        if _send_data == True:
+        if _send_data:
             pass
         return goldens
 
@@ -1023,7 +1024,7 @@ class Synthesizer:
                 source_files.append(golden.source_file)
 
         # Extract styles from goldens if not already set
-        if self.set_styling_config == False:
+        if not self.set_styling_config:
             example_inputs = random.sample(
                 [golden.input for golden in goldens], min(len(goldens), 10)
             )
@@ -1069,7 +1070,7 @@ class Synthesizer:
                 source_files.append(golden.source_file)
 
         # Extract styles from goldens if not already set
-        if self.set_styling_config == False:
+        if not self.set_styling_config:
             example_inputs = random.sample(
                 [golden.input for golden in goldens], min(len(goldens), 10)
             )
@@ -1637,7 +1638,7 @@ class Synthesizer:
             )
         if self.cost_tracking and self.using_native_model:
             print(f"💰 API cost: {self.synthesis_cost:.6f}")
-        if _send_data == True:
+        if _send_data:
             pass
         remove_pbars(
             progress,
@@ -1949,7 +1950,7 @@ class Synthesizer:
         # Remove pbar if not from docs
         remove_pbars(progress, [pbar_id]) if _progress is None else None
 
-        if _send_data == True:
+        if _send_data:
             pass
         if _reset_cost and self.cost_tracking and self.using_native_model:
             print(f"💰 API cost: {self.synthesis_cost:.6f}")
@@ -1970,7 +1971,8 @@ class Synthesizer:
         if _reset_cost:
             self.synthetic_conversational_goldens = []
         self.synthesis_cost = 0 if self.using_native_model else None
-        semaphore = asyncio.Semaphore(self.max_concurrent)
+        context_semaphore = asyncio.Semaphore(self.max_concurrent)
+        worker_semaphore = asyncio.Semaphore(self.max_concurrent)
         goldens: List[ConversationalGolden] = []
 
         with synthesizer_progress_context(
@@ -1989,9 +1991,9 @@ class Synthesizer:
         ):
             tasks = [
                 self.task_wrapper(
-                    semaphore,
+                    context_semaphore,
                     self._a_generate_conversational_from_context,
-                    semaphore=semaphore,
+                    semaphore=worker_semaphore,
                     context=context,
                     goldens=goldens,
                     include_expected_outcome=include_expected_outcome,
@@ -2335,7 +2337,7 @@ class Synthesizer:
 
         # Wrap up Synthesis
         self.synthetic_conversational_goldens.extend(goldens)
-        if _send_data == True:
+        if _send_data:
             pass
         return goldens
 
@@ -2567,7 +2569,7 @@ class Synthesizer:
                 contexts.append(golden.context)
 
         # Extract styles from conversational goldens if not already set
-        if self.set_conversational_styling_config == False:
+        if not self.set_conversational_styling_config:
             example_scenarios = random.sample(
                 [golden.scenario for golden in goldens],
                 min(len(goldens), 10),
@@ -2612,7 +2614,7 @@ class Synthesizer:
                 contexts.append(golden.context)
 
         # Extract styles from conversational goldens if not already set
-        if self.set_conversational_styling_config == False:
+        if not self.set_conversational_styling_config:
             example_scenarios = random.sample(
                 [golden.scenario for golden in goldens], min(len(goldens), 10)
             )
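
Both async generation paths above split the single shared semaphore into a context_semaphore (acquired by task_wrapper) and a worker_semaphore (passed into the inner coroutine). With one semaphore, a task holding an outer permit could block waiting on an inner permit held by other outer tasks, risking deadlock at full concurrency. A minimal sketch of the two-permit pattern, with illustrative names:

    import asyncio

    async def inner_work(worker_sem: asyncio.Semaphore, i: int) -> int:
        async with worker_sem:  # permit for the actual generation work
            await asyncio.sleep(0)
            return i

    async def task_wrapper(context_sem, worker_sem, i: int) -> int:
        async with context_sem:  # permit for scheduling the task at all
            return await inner_work(worker_sem, i)

    async def main() -> None:
        context_sem = asyncio.Semaphore(5)
        worker_sem = asyncio.Semaphore(5)
        # With a single Semaphore(5) used in both places, 5 wrappers could hold
        # every permit while each waits for an inner permit that never frees.
        print(await asyncio.gather(
            *(task_wrapper(context_sem, worker_sem, i) for i in range(20))
        ))

    asyncio.run(main())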
deepeval/test_case/api.py CHANGED
@@ -12,7 +12,6 @@ from deepeval.test_case import (
     ConversationalTestCase,
     Turn,
 )
-from deepeval.test_case.llm_test_case import _MLLM_IMAGE_REGISTRY
 from deepeval.constants import PYTEST_RUN_TEST_NAME
 
 
@@ -33,7 +32,6 @@ def create_api_test_case(
     trace: Optional[TraceApi] = None,
     index: Optional[int] = None,
 ) -> Union[LLMApiTestCase, ConversationalApiTestCase]:
-    from deepeval.utils import convert_to_multi_modal_array
 
     if isinstance(test_case, ConversationalTestCase):
         order = (
@@ -61,8 +59,10 @@ def create_api_test_case(
             context=test_case.context,
             tags=test_case.tags,
             comments=test_case.comments,
+            imagesMapping=test_case._get_images_mapping(),
             additionalMetadata=test_case.additional_metadata,
         )
+
         api_test_case.turns = [
             create_api_turn(
                 turn=turn,
@@ -86,48 +86,27 @@ def create_api_test_case(
         name = os.getenv(PYTEST_RUN_TEST_NAME, f"test_case_{order}")
         metrics_data = []
 
-        if isinstance(test_case, LLMTestCase) and test_case.multimodal is False:
-            api_test_case = LLMApiTestCase(
-                name=name,
-                input=test_case.input,
-                actualOutput=test_case.actual_output,
-                expectedOutput=test_case.expected_output,
-                context=test_case.context,
-                retrievalContext=test_case.retrieval_context,
-                toolsCalled=test_case.tools_called,
-                expectedTools=test_case.expected_tools,
-                tokenCost=test_case.token_cost,
-                completionTime=test_case.completion_time,
-                tags=test_case.tags,
-                success=success,
-                metricsData=metrics_data,
-                runDuration=None,
-                evaluationCost=None,
-                order=order,
-                additionalMetadata=test_case.additional_metadata,
-                comments=test_case.comments,
-                trace=trace,
-            )
-        elif isinstance(test_case, LLMTestCase) and test_case.multimodal:
-            api_test_case = LLMApiTestCase(
-                name=name,
-                input=test_case.input,
-                actualOutput=test_case.actual_output,
-                expectedOutput=test_case.expected_output,
-                retrievalContext=test_case.retrieval_context,
-                context=test_case.context,
-                imagesMapping=_MLLM_IMAGE_REGISTRY,
-                toolsCalled=test_case.tools_called,
-                expectedTools=test_case.expected_tools,
-                tokenCost=test_case.token_cost,
-                completionTime=test_case.completion_time,
-                success=success,
-                metricsData=metrics_data,
-                runDuration=None,
-                evaluationCost=None,
-                order=order,
-                additionalMetadata=test_case.additional_metadata,
-                comments=test_case.comments,
-            )
+        api_test_case = LLMApiTestCase(
+            name=name,
+            input=test_case.input,
+            actualOutput=test_case.actual_output,
+            expectedOutput=test_case.expected_output,
+            retrievalContext=test_case.retrieval_context,
+            context=test_case.context,
+            imagesMapping=test_case._get_images_mapping(),
+            toolsCalled=test_case.tools_called,
+            expectedTools=test_case.expected_tools,
+            tokenCost=test_case.token_cost,
+            completionTime=test_case.completion_time,
+            success=success,
+            metricsData=metrics_data,
+            runDuration=None,
+            evaluationCost=None,
+            order=order,
+            additionalMetadata=test_case.additional_metadata,
+            comments=test_case.comments,
+            tags=test_case.tags,
+            trace=trace,
+        )
         # llm_test_case_lookup_map[instance_id] = api_test_case
         return api_test_case
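
Net effect of the final hunk: create_api_test_case no longer branches on test_case.multimodal. Every LLMTestCase builds the same LLMApiTestCase, with images resolved per test case via _get_images_mapping() rather than the removed module-level _MLLM_IMAGE_REGISTRY; this also restores tags and trace, which the old multimodal branch silently dropped. A hypothetical multimodal test case exercising the unified path (the list-valued input follows the check_if_multimodal/convert_to_multi_modal_array utilities seen earlier, not a signature confirmed by this diff):

    from deepeval.test_case import LLMTestCase, MLLMImage

    test_case = LLMTestCase(
        input=["What breed is this dog?", MLLMImage(url="https://example.com/dog.png")],
        actual_output="It looks like a border collie.",
    )
    # create_api_test_case(test_case, ...) now serializes this identically to a
    # text-only case, attaching the image mapping from _get_images_mapping().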