deepeval 3.7.4__py3-none-any.whl → 3.7.6__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (224)
  1. deepeval/_version.py +1 -1
  2. deepeval/config/settings.py +35 -1
  3. deepeval/dataset/api.py +23 -1
  4. deepeval/dataset/golden.py +139 -2
  5. deepeval/evaluate/evaluate.py +16 -11
  6. deepeval/evaluate/execute.py +13 -181
  7. deepeval/evaluate/utils.py +6 -26
  8. deepeval/integrations/pydantic_ai/agent.py +19 -2
  9. deepeval/integrations/pydantic_ai/instrumentator.py +62 -23
  10. deepeval/key_handler.py +3 -0
  11. deepeval/metrics/__init__.py +14 -16
  12. deepeval/metrics/answer_relevancy/answer_relevancy.py +118 -116
  13. deepeval/metrics/answer_relevancy/template.py +22 -3
  14. deepeval/metrics/arena_g_eval/arena_g_eval.py +98 -96
  15. deepeval/metrics/arena_g_eval/template.py +17 -1
  16. deepeval/metrics/argument_correctness/argument_correctness.py +81 -87
  17. deepeval/metrics/argument_correctness/template.py +19 -2
  18. deepeval/metrics/base_metric.py +13 -44
  19. deepeval/metrics/bias/bias.py +102 -108
  20. deepeval/metrics/bias/template.py +14 -2
  21. deepeval/metrics/contextual_precision/contextual_precision.py +96 -94
  22. deepeval/metrics/contextual_precision/template.py +115 -66
  23. deepeval/metrics/contextual_recall/contextual_recall.py +94 -84
  24. deepeval/metrics/contextual_recall/template.py +106 -55
  25. deepeval/metrics/contextual_relevancy/contextual_relevancy.py +86 -84
  26. deepeval/metrics/contextual_relevancy/template.py +87 -58
  27. deepeval/metrics/conversation_completeness/conversation_completeness.py +101 -119
  28. deepeval/metrics/conversation_completeness/template.py +23 -3
  29. deepeval/metrics/conversational_dag/conversational_dag.py +12 -8
  30. deepeval/metrics/conversational_dag/nodes.py +66 -123
  31. deepeval/metrics/conversational_dag/templates.py +16 -0
  32. deepeval/metrics/conversational_g_eval/conversational_g_eval.py +47 -66
  33. deepeval/metrics/dag/dag.py +10 -0
  34. deepeval/metrics/dag/nodes.py +63 -126
  35. deepeval/metrics/dag/templates.py +16 -2
  36. deepeval/metrics/exact_match/exact_match.py +9 -1
  37. deepeval/metrics/faithfulness/faithfulness.py +138 -149
  38. deepeval/metrics/faithfulness/schema.py +1 -1
  39. deepeval/metrics/faithfulness/template.py +200 -115
  40. deepeval/metrics/g_eval/g_eval.py +87 -78
  41. deepeval/metrics/g_eval/template.py +18 -1
  42. deepeval/metrics/g_eval/utils.py +7 -6
  43. deepeval/metrics/goal_accuracy/goal_accuracy.py +91 -76
  44. deepeval/metrics/goal_accuracy/template.py +21 -3
  45. deepeval/metrics/hallucination/hallucination.py +60 -75
  46. deepeval/metrics/hallucination/template.py +13 -0
  47. deepeval/metrics/indicator.py +7 -10
  48. deepeval/metrics/json_correctness/json_correctness.py +40 -38
  49. deepeval/metrics/json_correctness/template.py +10 -0
  50. deepeval/metrics/knowledge_retention/knowledge_retention.py +60 -97
  51. deepeval/metrics/knowledge_retention/schema.py +9 -3
  52. deepeval/metrics/knowledge_retention/template.py +12 -0
  53. deepeval/metrics/mcp/mcp_task_completion.py +68 -38
  54. deepeval/metrics/mcp/multi_turn_mcp_use_metric.py +92 -74
  55. deepeval/metrics/mcp/template.py +52 -0
  56. deepeval/metrics/mcp_use_metric/mcp_use_metric.py +58 -64
  57. deepeval/metrics/mcp_use_metric/template.py +12 -0
  58. deepeval/metrics/misuse/misuse.py +77 -97
  59. deepeval/metrics/misuse/template.py +15 -0
  60. deepeval/metrics/multimodal_metrics/__init__.py +0 -19
  61. deepeval/metrics/multimodal_metrics/image_coherence/image_coherence.py +59 -53
  62. deepeval/metrics/multimodal_metrics/image_editing/image_editing.py +79 -95
  63. deepeval/metrics/multimodal_metrics/image_helpfulness/image_helpfulness.py +59 -53
  64. deepeval/metrics/multimodal_metrics/image_reference/image_reference.py +59 -53
  65. deepeval/metrics/multimodal_metrics/text_to_image/text_to_image.py +111 -109
  66. deepeval/metrics/non_advice/non_advice.py +79 -105
  67. deepeval/metrics/non_advice/template.py +12 -0
  68. deepeval/metrics/pattern_match/pattern_match.py +12 -4
  69. deepeval/metrics/pii_leakage/pii_leakage.py +75 -106
  70. deepeval/metrics/pii_leakage/template.py +14 -0
  71. deepeval/metrics/plan_adherence/plan_adherence.py +63 -89
  72. deepeval/metrics/plan_adherence/template.py +11 -0
  73. deepeval/metrics/plan_quality/plan_quality.py +63 -87
  74. deepeval/metrics/plan_quality/template.py +9 -0
  75. deepeval/metrics/prompt_alignment/prompt_alignment.py +72 -83
  76. deepeval/metrics/prompt_alignment/template.py +12 -0
  77. deepeval/metrics/ragas.py +3 -3
  78. deepeval/metrics/role_adherence/role_adherence.py +48 -71
  79. deepeval/metrics/role_adherence/template.py +14 -0
  80. deepeval/metrics/role_violation/role_violation.py +75 -108
  81. deepeval/metrics/role_violation/template.py +12 -0
  82. deepeval/metrics/step_efficiency/step_efficiency.py +55 -65
  83. deepeval/metrics/step_efficiency/template.py +11 -0
  84. deepeval/metrics/summarization/summarization.py +115 -183
  85. deepeval/metrics/summarization/template.py +19 -0
  86. deepeval/metrics/task_completion/task_completion.py +67 -73
  87. deepeval/metrics/tool_correctness/tool_correctness.py +45 -44
  88. deepeval/metrics/tool_use/tool_use.py +42 -66
  89. deepeval/metrics/topic_adherence/template.py +13 -0
  90. deepeval/metrics/topic_adherence/topic_adherence.py +53 -67
  91. deepeval/metrics/toxicity/template.py +13 -0
  92. deepeval/metrics/toxicity/toxicity.py +80 -99
  93. deepeval/metrics/turn_contextual_precision/schema.py +21 -0
  94. deepeval/metrics/turn_contextual_precision/template.py +187 -0
  95. deepeval/metrics/turn_contextual_precision/turn_contextual_precision.py +592 -0
  96. deepeval/metrics/turn_contextual_recall/schema.py +21 -0
  97. deepeval/metrics/turn_contextual_recall/template.py +178 -0
  98. deepeval/metrics/turn_contextual_recall/turn_contextual_recall.py +563 -0
  99. deepeval/metrics/{multimodal_metrics/multimodal_contextual_relevancy → turn_contextual_relevancy}/schema.py +7 -1
  100. deepeval/metrics/turn_contextual_relevancy/template.py +161 -0
  101. deepeval/metrics/turn_contextual_relevancy/turn_contextual_relevancy.py +576 -0
  102. deepeval/metrics/{multimodal_metrics/multimodal_faithfulness → turn_faithfulness}/schema.py +11 -3
  103. deepeval/metrics/turn_faithfulness/template.py +218 -0
  104. deepeval/metrics/turn_faithfulness/turn_faithfulness.py +627 -0
  105. deepeval/metrics/turn_relevancy/template.py +14 -0
  106. deepeval/metrics/turn_relevancy/turn_relevancy.py +56 -69
  107. deepeval/metrics/utils.py +158 -122
  108. deepeval/models/__init__.py +0 -12
  109. deepeval/models/base_model.py +49 -33
  110. deepeval/models/embedding_models/__init__.py +7 -0
  111. deepeval/models/embedding_models/azure_embedding_model.py +79 -33
  112. deepeval/models/embedding_models/local_embedding_model.py +39 -20
  113. deepeval/models/embedding_models/ollama_embedding_model.py +52 -19
  114. deepeval/models/embedding_models/openai_embedding_model.py +42 -22
  115. deepeval/models/llms/amazon_bedrock_model.py +226 -72
  116. deepeval/models/llms/anthropic_model.py +178 -63
  117. deepeval/models/llms/azure_model.py +218 -60
  118. deepeval/models/llms/constants.py +2032 -0
  119. deepeval/models/llms/deepseek_model.py +95 -40
  120. deepeval/models/llms/gemini_model.py +209 -64
  121. deepeval/models/llms/grok_model.py +139 -68
  122. deepeval/models/llms/kimi_model.py +140 -90
  123. deepeval/models/llms/litellm_model.py +131 -37
  124. deepeval/models/llms/local_model.py +125 -21
  125. deepeval/models/llms/ollama_model.py +147 -24
  126. deepeval/models/llms/openai_model.py +222 -269
  127. deepeval/models/llms/portkey_model.py +81 -22
  128. deepeval/models/llms/utils.py +8 -3
  129. deepeval/models/retry_policy.py +17 -14
  130. deepeval/models/utils.py +106 -5
  131. deepeval/optimizer/__init__.py +5 -0
  132. deepeval/optimizer/algorithms/__init__.py +6 -0
  133. deepeval/optimizer/algorithms/base.py +29 -0
  134. deepeval/optimizer/algorithms/configs.py +18 -0
  135. deepeval/optimizer/algorithms/copro/__init__.py +5 -0
  136. deepeval/{optimization/copro/loop.py → optimizer/algorithms/copro/copro.py} +112 -113
  137. deepeval/optimizer/algorithms/gepa/__init__.py +5 -0
  138. deepeval/{optimization/gepa/loop.py → optimizer/algorithms/gepa/gepa.py} +175 -115
  139. deepeval/optimizer/algorithms/miprov2/__init__.py +17 -0
  140. deepeval/optimizer/algorithms/miprov2/bootstrapper.py +435 -0
  141. deepeval/optimizer/algorithms/miprov2/miprov2.py +752 -0
  142. deepeval/optimizer/algorithms/miprov2/proposer.py +301 -0
  143. deepeval/optimizer/algorithms/simba/__init__.py +5 -0
  144. deepeval/{optimization/simba/loop.py → optimizer/algorithms/simba/simba.py} +128 -112
  145. deepeval/{optimization → optimizer}/configs.py +5 -8
  146. deepeval/{optimization/policies/selection.py → optimizer/policies.py} +63 -2
  147. deepeval/optimizer/prompt_optimizer.py +263 -0
  148. deepeval/optimizer/rewriter/__init__.py +5 -0
  149. deepeval/optimizer/rewriter/rewriter.py +124 -0
  150. deepeval/optimizer/rewriter/utils.py +214 -0
  151. deepeval/optimizer/scorer/__init__.py +5 -0
  152. deepeval/optimizer/scorer/base.py +86 -0
  153. deepeval/optimizer/scorer/scorer.py +316 -0
  154. deepeval/optimizer/scorer/utils.py +30 -0
  155. deepeval/optimizer/types.py +148 -0
  156. deepeval/{optimization → optimizer}/utils.py +47 -165
  157. deepeval/prompt/prompt.py +5 -9
  158. deepeval/simulator/conversation_simulator.py +43 -0
  159. deepeval/simulator/template.py +13 -0
  160. deepeval/test_case/__init__.py +1 -3
  161. deepeval/test_case/api.py +26 -45
  162. deepeval/test_case/arena_test_case.py +7 -2
  163. deepeval/test_case/conversational_test_case.py +68 -1
  164. deepeval/test_case/llm_test_case.py +206 -1
  165. deepeval/test_case/utils.py +4 -8
  166. deepeval/test_run/api.py +18 -14
  167. deepeval/test_run/test_run.py +3 -3
  168. deepeval/tracing/patchers.py +9 -4
  169. deepeval/tracing/tracing.py +2 -2
  170. deepeval/utils.py +65 -0
  171. {deepeval-3.7.4.dist-info → deepeval-3.7.6.dist-info}/METADATA +1 -4
  172. {deepeval-3.7.4.dist-info → deepeval-3.7.6.dist-info}/RECORD +180 -193
  173. deepeval/metrics/multimodal_metrics/multimodal_answer_relevancy/multimodal_answer_relevancy.py +0 -343
  174. deepeval/metrics/multimodal_metrics/multimodal_answer_relevancy/schema.py +0 -19
  175. deepeval/metrics/multimodal_metrics/multimodal_answer_relevancy/template.py +0 -122
  176. deepeval/metrics/multimodal_metrics/multimodal_contextual_precision/multimodal_contextual_precision.py +0 -301
  177. deepeval/metrics/multimodal_metrics/multimodal_contextual_precision/schema.py +0 -15
  178. deepeval/metrics/multimodal_metrics/multimodal_contextual_precision/template.py +0 -132
  179. deepeval/metrics/multimodal_metrics/multimodal_contextual_recall/multimodal_contextual_recall.py +0 -285
  180. deepeval/metrics/multimodal_metrics/multimodal_contextual_recall/schema.py +0 -15
  181. deepeval/metrics/multimodal_metrics/multimodal_contextual_recall/template.py +0 -112
  182. deepeval/metrics/multimodal_metrics/multimodal_contextual_relevancy/multimodal_contextual_relevancy.py +0 -282
  183. deepeval/metrics/multimodal_metrics/multimodal_contextual_relevancy/template.py +0 -102
  184. deepeval/metrics/multimodal_metrics/multimodal_faithfulness/__init__.py +0 -0
  185. deepeval/metrics/multimodal_metrics/multimodal_faithfulness/multimodal_faithfulness.py +0 -356
  186. deepeval/metrics/multimodal_metrics/multimodal_faithfulness/template.py +0 -175
  187. deepeval/metrics/multimodal_metrics/multimodal_g_eval/__init__.py +0 -0
  188. deepeval/metrics/multimodal_metrics/multimodal_g_eval/multimodal_g_eval.py +0 -386
  189. deepeval/metrics/multimodal_metrics/multimodal_g_eval/schema.py +0 -11
  190. deepeval/metrics/multimodal_metrics/multimodal_g_eval/template.py +0 -148
  191. deepeval/metrics/multimodal_metrics/multimodal_g_eval/utils.py +0 -68
  192. deepeval/metrics/multimodal_metrics/multimodal_tool_correctness/__init__.py +0 -0
  193. deepeval/metrics/multimodal_metrics/multimodal_tool_correctness/multimodal_tool_correctness.py +0 -290
  194. deepeval/models/mlllms/__init__.py +0 -4
  195. deepeval/models/mlllms/azure_model.py +0 -343
  196. deepeval/models/mlllms/gemini_model.py +0 -313
  197. deepeval/models/mlllms/ollama_model.py +0 -175
  198. deepeval/models/mlllms/openai_model.py +0 -309
  199. deepeval/optimization/__init__.py +0 -13
  200. deepeval/optimization/adapters/__init__.py +0 -2
  201. deepeval/optimization/adapters/deepeval_scoring_adapter.py +0 -588
  202. deepeval/optimization/aggregates.py +0 -14
  203. deepeval/optimization/copro/configs.py +0 -31
  204. deepeval/optimization/gepa/__init__.py +0 -7
  205. deepeval/optimization/gepa/configs.py +0 -115
  206. deepeval/optimization/miprov2/configs.py +0 -134
  207. deepeval/optimization/miprov2/loop.py +0 -785
  208. deepeval/optimization/mutations/__init__.py +0 -0
  209. deepeval/optimization/mutations/prompt_rewriter.py +0 -458
  210. deepeval/optimization/policies/__init__.py +0 -16
  211. deepeval/optimization/policies/tie_breaker.py +0 -67
  212. deepeval/optimization/prompt_optimizer.py +0 -462
  213. deepeval/optimization/simba/__init__.py +0 -0
  214. deepeval/optimization/simba/configs.py +0 -33
  215. deepeval/optimization/types.py +0 -361
  216. deepeval/test_case/mllm_test_case.py +0 -170
  217. /deepeval/metrics/{multimodal_metrics/multimodal_answer_relevancy → turn_contextual_precision}/__init__.py +0 -0
  218. /deepeval/metrics/{multimodal_metrics/multimodal_contextual_precision → turn_contextual_recall}/__init__.py +0 -0
  219. /deepeval/metrics/{multimodal_metrics/multimodal_contextual_recall → turn_contextual_relevancy}/__init__.py +0 -0
  220. /deepeval/metrics/{multimodal_metrics/multimodal_contextual_relevancy → turn_faithfulness}/__init__.py +0 -0
  221. /deepeval/{optimization → optimizer/algorithms}/simba/types.py +0 -0
  222. {deepeval-3.7.4.dist-info → deepeval-3.7.6.dist-info}/LICENSE.md +0 -0
  223. {deepeval-3.7.4.dist-info → deepeval-3.7.6.dist-info}/WHEEL +0 -0
  224. {deepeval-3.7.4.dist-info → deepeval-3.7.6.dist-info}/entry_points.txt +0 -0
@@ -2,11 +2,14 @@ from __future__ import annotations
  import inspect
  import random
  import re
+ import statistics
  from typing import (
      Any,
      Callable,
      List,
      Optional,
+     Protocol,
+     Sequence,
      Tuple,
      TYPE_CHECKING,
      Union,
@@ -17,11 +20,13 @@ from typing import (
  from deepeval.errors import DeepEvalError
  from deepeval.metrics.base_metric import BaseMetric, BaseConversationalMetric
  from deepeval.prompt.prompt import Prompt
- from deepeval.prompt.api import PromptType, PromptMessage
- from deepeval.optimization.types import (
+ from deepeval.prompt.api import PromptMessage
+ from deepeval.optimizer.types import (
+     ModelCallback,
      ModuleId,
      PromptConfigurationId,
      PromptConfiguration,
+     PromptConfigSnapshot,
      OptimizationReport,
  )

@@ -54,7 +59,7 @@ def split_goldens(
          pareto_size: Number of items to allocate to the Pareto set bound between [0, len(goldens)].
          random_state: A shared `random.Random` instance that provides the source
              of randomness. For reproducible runs, pass the same object used by
-             the GEPA loop constructed from `GEPAConfig.random_seed`
+             the GEPA loop constructed from `GEPA.random_seed`

      Returns:
          (d_feedback, d_pareto)
@@ -151,87 +156,22 @@ def normalize_seed_prompts(
      return mapping


- def build_model_callback_kwargs(
-     *,
-     # scoring context
-     golden: Optional[Union["Golden", "ConversationalGolden"]] = None,
-     # rewriter context
-     feedback_text: Optional[str] = None,
-     # shared
-     prompt: Optional[Prompt] = None,
-     prompt_type: Optional[str] = None,
-     prompt_text: Optional[str] = None,
-     prompt_messages: Optional[List["PromptMessage"]] = None,
- ) -> Dict[str, Any]:
-     """
-     Build a superset of kwargs for GEPA model callbacks.
-
-     All keys are present in the dict so callbacks can declare any subset of:
-
-         hook: str  # injected by (a_)invoke_model_callback
-         prompt: Prompt
-         prompt_type: str
-         prompt_text: str
-         prompt_messages: List[PromptMessage]
-         golden: Golden | ConversationalGolden
-         feedback_text: str
-
-     Non applicable fields are set to None.
-     """
-     return {
-         # scoring context
-         "golden": golden,
-         # rewriter context
-         "feedback_text": feedback_text,
-         # shared
-         "prompt": prompt,
-         "prompt_text": prompt_text,
-         "prompt_messages": prompt_messages,
-     }
-
-
  def invoke_model_callback(
      *,
-     hook: str,
-     model_callback: Callable[
-         ...,
-         Union[
-             str,
-             Dict,
-             Tuple[Union[str, Dict], float],
-         ],
-     ],
-     candidate_kwargs: Dict[str, Any],
- ) -> Union[
-     str,
-     Dict,
-     Tuple[Union[str, Dict], float],
- ]:
+     model_callback: ModelCallback,
+     prompt: Prompt,
+     golden: Union["Golden", "ConversationalGolden"],
+ ) -> str:
      """
      Call a user provided model_callback in a synchronous context.

-     - Filters kwargs to only those the callback accepts.
-     - Injects `hook` if the callback declares it.
-     - Raises if the callback returns an awaitable; callers must use async
-       helpers for async callbacks.
+     Raises if the callback returns an awaitable.
      """
-     sig = inspect.signature(model_callback)
-     supported = set(sig.parameters.keys())
-
-     filtered = {
-         key: value
-         for key, value in candidate_kwargs.items()
-         if key in supported
-     }
-
-     if "hook" in supported:
-         filtered["hook"] = hook
-
-     result = model_callback(**filtered)
+     result = model_callback(prompt, golden)
      if inspect.isawaitable(result):
          raise DeepEvalError(
              "model_callback returned an awaitable from a synchronous context. "
-             "Either declare the callback as `async def` and use async GEPA, or call "
+             "Either declare the callback as `async def` and use async optimization, or call "
              "`model.generate(...)` instead of `model.a_generate(...)` inside a sync callback."
          )
      return result
@@ -239,41 +179,16 @@ def invoke_model_callback(

  async def a_invoke_model_callback(
      *,
-     hook: str,
-     model_callback: Callable[
-         ...,
-         Union[
-             str,
-             Dict,
-             Tuple[Union[str, Dict], float],
-         ],
-     ],
-     candidate_kwargs: Dict[str, Any],
- ) -> Union[
-     str,
-     Dict,
-     Tuple[Union[str, Dict], float],
- ]:
+     model_callback: ModelCallback,
+     prompt: Prompt,
+     golden: Union["Golden", "ConversationalGolden"],
+ ) -> str:
      """
      Call a user provided model_callback in an async context.

-     - Filters kwargs to only those the callback accepts.
-     - Injects `hook` if the callback declares it.
-     - Supports both sync and async callbacks.
+     Supports both sync and async callbacks.
      """
-     sig = inspect.signature(model_callback)
-     supported = set(sig.parameters.keys())
-
-     filtered = {
-         key: value
-         for key, value in candidate_kwargs.items()
-         if key in supported
-     }
-
-     if "hook" in supported:
-         filtered["hook"] = hook
-
-     result = model_callback(**filtered)
+     result = model_callback(prompt, golden)
      if inspect.isawaitable(result):
          return await result
      return result
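Taken together, the two hunks above drop the kwargs-filtering, hook-injecting callback protocol in favor of a fixed `(prompt, golden) -> str` signature. Below is a minimal sketch of a callback that satisfies the new contract, assuming only what the diff shows; the import paths follow common deepeval usage and the stub completion stands in for a real LLM call such as `model.a_generate(...)`:

    from deepeval.prompt import Prompt
    from deepeval.dataset import Golden

    # Illustrative callback matching the new (prompt, golden) -> str contract.
    # Replace the stub return with a real model call in practice.
    async def model_callback(prompt: Prompt, golden: Golden) -> str:
        rendered = (prompt.text_template or "").replace("{input}", golden.input or "")
        return f"[stub completion for]: {rendered}"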
@@ -288,58 +203,17 @@ def build_prompt_config_snapshots(
      prompt_configurations_by_id: Dict[
          PromptConfigurationId, "PromptConfiguration"
      ],
- ) -> Dict[PromptConfigurationId, Dict[str, Any]]:
+ ) -> Dict[PromptConfigurationId, PromptConfigSnapshot]:
      """
-     Build a serializable snapshot of all prompt configurations.
-
-     Shape matches the docs for `prompt_configurations`:
-
-         {
-             "<config_id>": {
-                 "parent": "<parent_id or None>",
-                 "prompts": {
-                     "<module_id>": {
-                         "type": "TEXT",
-                         "text_template": "...",
-                     }
-                     # or
-                     "<module_id>": {
-                         "type": "LIST",
-                         "messages": [
-                             {"role": "system", "content": "..."},
-                             ...
-                         ],
-                     },
-                 },
-             },
-             ...
-         }
+     Build snapshots of all prompt configurations.
      """
-     snapshots: Dict[PromptConfigurationId, Dict[str, Any]] = {}
+     snapshots: Dict[PromptConfigurationId, PromptConfigSnapshot] = {}

      for cfg_id, cfg in prompt_configurations_by_id.items():
-         prompts_snapshot: Dict[str, Any] = {}
-
-         for module_id, prompt in cfg.prompts.items():
-             if prompt.type is PromptType.LIST:
-                 messages = [
-                     {"role": msg.role, "content": (msg.content or "")}
-                     for msg in (prompt.messages_template or [])
-                 ]
-                 prompts_snapshot[module_id] = {
-                     "type": "LIST",
-                     "messages": messages,
-                 }
-             else:
-                 prompts_snapshot[module_id] = {
-                     "type": "TEXT",
-                     "text_template": (prompt.text_template or ""),
-                 }
-
-         snapshots[cfg_id] = {
-             "parent": cfg.parent,
-             "prompts": prompts_snapshot,
-         }
+         snapshots[cfg_id] = PromptConfigSnapshot(
+             parent=cfg.parent,
+             prompts=dict(cfg.prompts),
+         )

      return snapshots

@@ -494,17 +368,8 @@ def validate_sequence_of(
  def validate_callback(
      *,
      component: str,
-     model_callback: Optional[
-         Callable[
-             ...,
-             Union[
-                 str,
-                 Dict,
-                 Tuple[Union[str, Dict], float],
-             ],
-         ]
-     ],
- ) -> Callable[..., Union[str, Dict, Tuple[Union[str, Dict], float]]]:
+     model_callback: Optional[ModelCallback],
+ ) -> ModelCallback:
      """
      Ensure that `model_callback` is provided.

@@ -596,3 +461,20 @@ def validate_int_in_range(
          )

      return value
+
+
+ ##############
+ # Aggregates #
+ ##############
+
+
+ class Aggregator(Protocol):
+     def __call__(self, scores: Sequence[float]) -> float: ...
+
+
+ def mean_of_all(scores: Sequence[float]) -> float:
+     return statistics.fmean(scores) if scores else 0.0
+
+
+ def median_of_all(scores: Sequence[float]) -> float:
+     return statistics.median(scores) if scores else 0.0
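The new `Aggregator` protocol is structural: any callable that maps a sequence of scores to a float conforms, so custom aggregates can sit alongside `mean_of_all` and `median_of_all`. A small self-contained sketch (the `min_of_all` helper is hypothetical, not part of the package):

    import statistics
    from typing import Sequence

    # Any callable with this shape satisfies the Aggregator protocol.
    def min_of_all(scores: Sequence[float]) -> float:
        return min(scores) if scores else 0.0

    print(min_of_all([0.9, 0.4, 0.7]))        # 0.4
    print(statistics.fmean([0.9, 0.4, 0.7]))  # 0.666..., what mean_of_all computes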
deepeval/prompt/prompt.py CHANGED
@@ -4,7 +4,7 @@ import json
  import os

  from enum import Enum
- from typing import Optional, List, Dict, Type, Literal, TYPE_CHECKING
+ from typing import Optional, List, Dict, Type, Literal
  from rich.progress import Progress, SpinnerColumn, TextColumn, BarColumn
  from rich.console import Console
  from pydantic import BaseModel, ValidationError
@@ -34,10 +34,6 @@ from deepeval.prompt.utils import (
  from deepeval.confident.api import Api, Endpoints, HttpMethods
  from deepeval.constants import HIDDEN_DIR

-
- if TYPE_CHECKING:
-     from deepeval.optimization.types import OptimizationReport
-
  logger = logging.getLogger(__name__)

  portalocker = None
@@ -117,6 +113,7 @@ class Prompt:
          model_settings: Optional[ModelSettings] = None,
          output_type: Optional[OutputType] = None,
          output_schema: Optional[Type[BaseModel]] = None,
+         interpolation_type: Optional[PromptInterpolationType] = None,
      ):
          if text_template and messages_template:
              raise TypeError(
@@ -129,7 +126,9 @@ class Prompt:
          self.output_type: Optional[OutputType] = output_type
          self.output_schema: Optional[Type[BaseModel]] = output_schema
          self.label: Optional[str] = None
-         self.interpolation_type: Optional[PromptInterpolationType] = None
+         self.interpolation_type: PromptInterpolationType = (
+             interpolation_type or PromptInterpolationType.FSTRING
+         )

          self._version = None
          self._prompt_version_id: Optional[str] = None
@@ -145,9 +144,6 @@ class Prompt:
          elif messages_template:
              self.type = PromptType.LIST

-         # updated after optimization runs
-         self.optimization_report: Optional["OptimizationReport"] = None
-
      def __del__(self):
          """Cleanup polling tasks when instance is destroyed"""
          try:
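The change above means `interpolation_type` can now be passed at construction time and defaults to `PromptInterpolationType.FSTRING` rather than `None`. A hedged usage sketch (the `alias` keyword and the `deepeval.prompt.api` import path are assumptions based on existing deepeval usage, not shown in this diff):

    from deepeval.prompt import Prompt
    from deepeval.prompt.api import PromptInterpolationType  # import path assumed

    # In 3.7.6 the interpolation type no longer has to be set after construction.
    prompt = Prompt(
        alias="greeting",  # alias keyword assumed
        text_template="Hello {name}, how can I help today?",
        interpolation_type=PromptInterpolationType.FSTRING,
    )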
deepeval/simulator/conversation_simulator.py CHANGED
@@ -20,6 +20,7 @@ from deepeval.simulator.template import (
      ConversationSimulatorTemplate,
  )
  from deepeval.models import DeepEvalBaseLLM
+ from deepeval.metrics.utils import MULTIMODAL_SUPPORTED_MODELS
  from deepeval.simulator.schema import (
      SimulatedInput,
      ConversationCompletion,
@@ -94,6 +95,26 @@ class ConversationSimulator:
                  )
              )
          else:
+             multimodal = any(
+                 [golden.multimodal for golden in conversational_goldens]
+             )
+             if multimodal:
+                 if (
+                     not self.simulator_model
+                     or not self.simulator_model.supports_multimodal()
+                 ):
+                     if (
+                         self.simulator_model
+                         and type(self.simulator_model)
+                         in MULTIMODAL_SUPPORTED_MODELS
+                     ):
+                         raise ValueError(
+                             f"The evaluation model {self.simulator_model.name} does not support multimodal evaluations at the moment. Available multi-modal models for the {self.simulator_model.__class__.__name__} provider includes {', '.join(self.simulator_model.__class__.valid_multimodal_models)}."
+                         )
+                     else:
+                         raise ValueError(
+                             f"The evaluation model {self.simulator_model.name} does not support multimodal inputs, please use one of the following evaluation models: {', '.join([cls.__name__ for cls in MULTIMODAL_SUPPORTED_MODELS])}"
+                         )
              conversational_test_cases: List[ConversationalTestCase] = []
              for conversation_index, golden in enumerate(
                  conversational_goldens
@@ -124,6 +145,28 @@ class ConversationSimulator:
          progress: Optional[Progress] = None,
          pbar_id: Optional[int] = None,
      ) -> List[ConversationalTestCase]:
+
+         multimodal = any(
+             [golden.multimodal for golden in conversational_goldens]
+         )
+         if multimodal:
+             if (
+                 not self.simulator_model
+                 or not self.simulator_model.supports_multimodal()
+             ):
+                 if (
+                     self.simulator_model
+                     and type(self.simulator_model)
+                     in MULTIMODAL_SUPPORTED_MODELS
+                 ):
+                     raise ValueError(
+                         f"The evaluation model {self.simulator_model.name} does not support multimodal evaluations at the moment. Available multi-modal models for the {self.simulator_model.__class__.__name__} provider includes {', '.join(self.simulator_model.__class__.valid_multimodal_models)}."
+                     )
+                 else:
+                     raise ValueError(
+                         f"The evaluation model {self.simulator_model.name} does not support multimodal inputs, please use one of the following evaluation models: {', '.join([cls.__name__ for cls in MULTIMODAL_SUPPORTED_MODELS])}"
+                     )
+
          self.simulation_cost = 0 if self.using_native_model else None

          async def simulate_conversations(
deepeval/simulator/template.py CHANGED
@@ -7,6 +7,13 @@ from deepeval.test_case import Turn


  class ConversationSimulatorTemplate:
+     multimodal_rules = """
+     --- MULTIMODAL INPUT RULES ---
+     - Treat image content as factual evidence.
+     - Only reference visual details that are explicitly and clearly visible.
+     - Do not infer or guess objects, text, or details not visibly present.
+     - If an image is unclear or ambiguous, mark uncertainty explicitly.
+     """

      @staticmethod
      def simulate_first_user_turn(
@@ -23,6 +30,8 @@ class ConversationSimulatorTemplate:
      3. Avoid providing excessive details upfront; the goal is to initiate the conversation and build rapport, not to solve it in the first message.
      4. The message should be concise, ideally no more than 1-3 sentences.

+     {ConversationSimulatorTemplate.multimodal_rules}
+
      IMPORTANT: The output must be formatted as a JSON object with a single key `simulated_input`, where the value is the generated opening message in {language}.

      Example Language: english
@@ -61,6 +70,8 @@ class ConversationSimulatorTemplate:
      3. Keep the tone consistent with the previous user inputs.
      4. The generated user input should be concise, ideally no more than 1-2 sentences.

+     {ConversationSimulatorTemplate.multimodal_rules}
+
      IMPORTANT: The output must be formatted as a JSON object with a single key `simulated_input`,
      where the value is the generated user input in {language}.

@@ -101,6 +112,8 @@ class ConversationSimulatorTemplate:
      2. If the expected outcome has been met, mark the conversation as complete.
      3. If not, mark it as incomplete and briefly describe what remains to be done.

+     {ConversationSimulatorTemplate.multimodal_rules}
+
      IMPORTANT: The output must be formatted as a JSON object with two keys:
      `is_complete` (a boolean) and `reason` (a string).

deepeval/test_case/__init__.py CHANGED
@@ -3,13 +3,13 @@ from .llm_test_case import (
      LLMTestCaseParams,
      ToolCall,
      ToolCallParams,
+     MLLMImage,
  )
  from .conversational_test_case import (
      ConversationalTestCase,
      Turn,
      TurnParams,
  )
- from .mllm_test_case import MLLMTestCase, MLLMTestCaseParams, MLLMImage
  from .arena_test_case import ArenaTestCase, Contestant
  from .mcp import (
      MCPServer,
@@ -31,8 +31,6 @@ __all__ = [
      "MCPPromptCall",
      "MCPResourceCall",
      "MCPToolCall",
-     "MLLMTestCase",
-     "MLLMTestCaseParams",
      "MLLMImage",
      "ArenaTestCase",
      "Contestant",
deepeval/test_case/api.py CHANGED
@@ -10,7 +10,6 @@ from deepeval.test_run.api import (
  from deepeval.test_case import (
      LLMTestCase,
      ConversationalTestCase,
-     MLLMTestCase,
      Turn,
  )
  from deepeval.constants import PYTEST_RUN_TEST_NAME
@@ -29,10 +28,11 @@ def create_api_turn(turn: Turn, index: int) -> TurnApi:


  def create_api_test_case(
-     test_case: Union[LLMTestCase, ConversationalTestCase, MLLMTestCase],
+     test_case: Union[LLMTestCase, ConversationalTestCase],
      trace: Optional[TraceApi] = None,
      index: Optional[int] = None,
  ) -> Union[LLMApiTestCase, ConversationalApiTestCase]:
+
      if isinstance(test_case, ConversationalTestCase):
          order = (
              test_case._dataset_rank
@@ -59,8 +59,10 @@ def create_api_test_case(
              context=test_case.context,
              tags=test_case.tags,
              comments=test_case.comments,
+             imagesMapping=test_case._get_images_mapping(),
              additionalMetadata=test_case.additional_metadata,
          )
+
          api_test_case.turns = [
              create_api_turn(
                  turn=turn,
@@ -84,48 +86,27 @@ def create_api_test_case(
          name = os.getenv(PYTEST_RUN_TEST_NAME, f"test_case_{order}")
          metrics_data = []

-         if isinstance(test_case, LLMTestCase):
-             api_test_case = LLMApiTestCase(
-                 name=name,
-                 input=test_case.input,
-                 actualOutput=test_case.actual_output,
-                 expectedOutput=test_case.expected_output,
-                 context=test_case.context,
-                 retrievalContext=test_case.retrieval_context,
-                 toolsCalled=test_case.tools_called,
-                 expectedTools=test_case.expected_tools,
-                 tokenCost=test_case.token_cost,
-                 completionTime=test_case.completion_time,
-                 tags=test_case.tags,
-                 success=success,
-                 metricsData=metrics_data,
-                 runDuration=None,
-                 evaluationCost=None,
-                 order=order,
-                 additionalMetadata=test_case.additional_metadata,
-                 comments=test_case.comments,
-                 trace=trace,
-             )
-         elif isinstance(test_case, MLLMTestCase):
-             api_test_case = LLMApiTestCase(
-                 name=name,
-                 input="",
-                 multimodalInput=test_case.input,
-                 multimodalActualOutput=test_case.actual_output,
-                 multimodalExpectedOutput=test_case.expected_output,
-                 multimodalRetrievalContext=test_case.retrieval_context,
-                 multimodalContext=test_case.context,
-                 toolsCalled=test_case.tools_called,
-                 expectedTools=test_case.expected_tools,
-                 tokenCost=test_case.token_cost,
-                 completionTime=test_case.completion_time,
-                 success=success,
-                 metricsData=metrics_data,
-                 runDuration=None,
-                 evaluationCost=None,
-                 order=order,
-                 additionalMetadata=test_case.additional_metadata,
-                 comments=test_case.comments,
-             )
+         api_test_case = LLMApiTestCase(
+             name=name,
+             input=test_case.input,
+             actualOutput=test_case.actual_output,
+             expectedOutput=test_case.expected_output,
+             retrievalContext=test_case.retrieval_context,
+             context=test_case.context,
+             imagesMapping=test_case._get_images_mapping(),
+             toolsCalled=test_case.tools_called,
+             expectedTools=test_case.expected_tools,
+             tokenCost=test_case.token_cost,
+             completionTime=test_case.completion_time,
+             success=success,
+             metricsData=metrics_data,
+             runDuration=None,
+             evaluationCost=None,
+             order=order,
+             additionalMetadata=test_case.additional_metadata,
+             comments=test_case.comments,
+             tags=test_case.tags,
+             trace=trace,
+         )
          # llm_test_case_lookup_map[instance_id] = api_test_case
          return api_test_case
deepeval/test_case/arena_test_case.py CHANGED
@@ -1,7 +1,7 @@
  from typing import List, Dict, Optional, Union
- from dataclasses import dataclass
+ from dataclasses import dataclass, field
  from pydantic import BaseModel
-
+ import re
  from deepeval.test_case import (
      LLMTestCase,
  )
@@ -19,6 +19,7 @@ class Contestant(BaseModel):
  @dataclass
  class ArenaTestCase:
      contestants: List[Contestant]
+     multimodal: bool = field(default=False)

      def __post_init__(self):
          contestant_names = [contestant.name for contestant in self.contestants]
@@ -38,6 +39,10 @@ class ArenaTestCase:
                  "All contestants must have the same 'expected_output'."
              )

+         for contestant in self.contestants:
+             if contestant.test_case.multimodal:
+                 self.multimodal = True
+

  class Arena:
      test_cases: List[ArenaTestCase]