deepeval 3.7.4__py3-none-any.whl → 3.7.5__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (155)
  1. deepeval/_version.py +1 -1
  2. deepeval/dataset/golden.py +54 -2
  3. deepeval/evaluate/evaluate.py +16 -8
  4. deepeval/evaluate/execute.py +70 -26
  5. deepeval/evaluate/utils.py +26 -22
  6. deepeval/integrations/pydantic_ai/agent.py +19 -2
  7. deepeval/integrations/pydantic_ai/instrumentator.py +62 -23
  8. deepeval/metrics/__init__.py +14 -12
  9. deepeval/metrics/answer_relevancy/answer_relevancy.py +74 -29
  10. deepeval/metrics/answer_relevancy/template.py +188 -92
  11. deepeval/metrics/base_metric.py +2 -5
  12. deepeval/metrics/contextual_precision/contextual_precision.py +53 -15
  13. deepeval/metrics/contextual_precision/template.py +115 -66
  14. deepeval/metrics/contextual_recall/contextual_recall.py +50 -13
  15. deepeval/metrics/contextual_recall/template.py +106 -55
  16. deepeval/metrics/contextual_relevancy/contextual_relevancy.py +47 -15
  17. deepeval/metrics/contextual_relevancy/template.py +87 -58
  18. deepeval/metrics/dag/templates.py +2 -2
  19. deepeval/metrics/faithfulness/faithfulness.py +70 -27
  20. deepeval/metrics/faithfulness/schema.py +1 -1
  21. deepeval/metrics/faithfulness/template.py +200 -115
  22. deepeval/metrics/g_eval/utils.py +2 -2
  23. deepeval/metrics/indicator.py +4 -4
  24. deepeval/metrics/multimodal_metrics/__init__.py +0 -18
  25. deepeval/metrics/multimodal_metrics/image_coherence/image_coherence.py +24 -17
  26. deepeval/metrics/multimodal_metrics/image_editing/image_editing.py +26 -21
  27. deepeval/metrics/multimodal_metrics/image_helpfulness/image_helpfulness.py +24 -17
  28. deepeval/metrics/multimodal_metrics/image_reference/image_reference.py +24 -17
  29. deepeval/metrics/multimodal_metrics/multimodal_g_eval/multimodal_g_eval.py +19 -19
  30. deepeval/metrics/multimodal_metrics/multimodal_g_eval/template.py +63 -78
  31. deepeval/metrics/multimodal_metrics/multimodal_g_eval/utils.py +20 -20
  32. deepeval/metrics/multimodal_metrics/text_to_image/text_to_image.py +71 -50
  33. deepeval/metrics/ragas.py +3 -3
  34. deepeval/metrics/tool_correctness/tool_correctness.py +2 -2
  35. deepeval/metrics/turn_contextual_precision/schema.py +21 -0
  36. deepeval/metrics/turn_contextual_precision/template.py +187 -0
  37. deepeval/metrics/turn_contextual_precision/turn_contextual_precision.py +550 -0
  38. deepeval/metrics/turn_contextual_recall/schema.py +21 -0
  39. deepeval/metrics/turn_contextual_recall/template.py +178 -0
  40. deepeval/metrics/turn_contextual_recall/turn_contextual_recall.py +520 -0
  41. deepeval/metrics/{multimodal_metrics/multimodal_contextual_relevancy → turn_contextual_relevancy}/schema.py +7 -1
  42. deepeval/metrics/turn_contextual_relevancy/template.py +161 -0
  43. deepeval/metrics/turn_contextual_relevancy/turn_contextual_relevancy.py +535 -0
  44. deepeval/metrics/{multimodal_metrics/multimodal_faithfulness → turn_faithfulness}/schema.py +11 -3
  45. deepeval/metrics/turn_faithfulness/template.py +218 -0
  46. deepeval/metrics/turn_faithfulness/turn_faithfulness.py +596 -0
  47. deepeval/metrics/utils.py +39 -58
  48. deepeval/models/__init__.py +0 -12
  49. deepeval/models/base_model.py +16 -38
  50. deepeval/models/embedding_models/__init__.py +7 -0
  51. deepeval/models/embedding_models/azure_embedding_model.py +52 -28
  52. deepeval/models/embedding_models/local_embedding_model.py +18 -14
  53. deepeval/models/embedding_models/ollama_embedding_model.py +38 -16
  54. deepeval/models/embedding_models/openai_embedding_model.py +40 -21
  55. deepeval/models/llms/amazon_bedrock_model.py +1 -2
  56. deepeval/models/llms/anthropic_model.py +44 -23
  57. deepeval/models/llms/azure_model.py +121 -36
  58. deepeval/models/llms/deepseek_model.py +18 -13
  59. deepeval/models/llms/gemini_model.py +129 -43
  60. deepeval/models/llms/grok_model.py +18 -13
  61. deepeval/models/llms/kimi_model.py +18 -13
  62. deepeval/models/llms/litellm_model.py +42 -22
  63. deepeval/models/llms/local_model.py +12 -7
  64. deepeval/models/llms/ollama_model.py +114 -12
  65. deepeval/models/llms/openai_model.py +137 -41
  66. deepeval/models/llms/portkey_model.py +24 -7
  67. deepeval/models/llms/utils.py +5 -3
  68. deepeval/models/retry_policy.py +17 -14
  69. deepeval/models/utils.py +46 -1
  70. deepeval/optimizer/__init__.py +5 -0
  71. deepeval/optimizer/algorithms/__init__.py +6 -0
  72. deepeval/optimizer/algorithms/base.py +29 -0
  73. deepeval/optimizer/algorithms/configs.py +18 -0
  74. deepeval/optimizer/algorithms/copro/__init__.py +5 -0
  75. deepeval/{optimization/copro/loop.py → optimizer/algorithms/copro/copro.py} +112 -113
  76. deepeval/optimizer/algorithms/gepa/__init__.py +5 -0
  77. deepeval/{optimization/gepa/loop.py → optimizer/algorithms/gepa/gepa.py} +175 -115
  78. deepeval/optimizer/algorithms/miprov2/__init__.py +17 -0
  79. deepeval/optimizer/algorithms/miprov2/bootstrapper.py +435 -0
  80. deepeval/optimizer/algorithms/miprov2/miprov2.py +752 -0
  81. deepeval/optimizer/algorithms/miprov2/proposer.py +301 -0
  82. deepeval/optimizer/algorithms/simba/__init__.py +5 -0
  83. deepeval/{optimization/simba/loop.py → optimizer/algorithms/simba/simba.py} +128 -112
  84. deepeval/{optimization → optimizer}/configs.py +5 -8
  85. deepeval/{optimization/policies/selection.py → optimizer/policies.py} +63 -2
  86. deepeval/optimizer/prompt_optimizer.py +263 -0
  87. deepeval/optimizer/rewriter/__init__.py +5 -0
  88. deepeval/optimizer/rewriter/rewriter.py +124 -0
  89. deepeval/optimizer/rewriter/utils.py +214 -0
  90. deepeval/optimizer/scorer/__init__.py +5 -0
  91. deepeval/optimizer/scorer/base.py +86 -0
  92. deepeval/optimizer/scorer/scorer.py +316 -0
  93. deepeval/optimizer/scorer/utils.py +30 -0
  94. deepeval/optimizer/types.py +148 -0
  95. deepeval/{optimization → optimizer}/utils.py +47 -165
  96. deepeval/prompt/prompt.py +5 -9
  97. deepeval/test_case/__init__.py +1 -3
  98. deepeval/test_case/api.py +12 -10
  99. deepeval/test_case/conversational_test_case.py +19 -1
  100. deepeval/test_case/llm_test_case.py +152 -1
  101. deepeval/test_case/utils.py +4 -8
  102. deepeval/test_run/api.py +15 -14
  103. deepeval/test_run/test_run.py +3 -3
  104. deepeval/tracing/patchers.py +9 -4
  105. deepeval/tracing/tracing.py +2 -2
  106. deepeval/utils.py +65 -0
  107. {deepeval-3.7.4.dist-info → deepeval-3.7.5.dist-info}/METADATA +1 -4
  108. {deepeval-3.7.4.dist-info → deepeval-3.7.5.dist-info}/RECORD +116 -125
  109. deepeval/metrics/multimodal_metrics/multimodal_answer_relevancy/multimodal_answer_relevancy.py +0 -343
  110. deepeval/metrics/multimodal_metrics/multimodal_answer_relevancy/schema.py +0 -19
  111. deepeval/metrics/multimodal_metrics/multimodal_answer_relevancy/template.py +0 -122
  112. deepeval/metrics/multimodal_metrics/multimodal_contextual_precision/multimodal_contextual_precision.py +0 -301
  113. deepeval/metrics/multimodal_metrics/multimodal_contextual_precision/schema.py +0 -15
  114. deepeval/metrics/multimodal_metrics/multimodal_contextual_precision/template.py +0 -132
  115. deepeval/metrics/multimodal_metrics/multimodal_contextual_recall/multimodal_contextual_recall.py +0 -285
  116. deepeval/metrics/multimodal_metrics/multimodal_contextual_recall/schema.py +0 -15
  117. deepeval/metrics/multimodal_metrics/multimodal_contextual_recall/template.py +0 -112
  118. deepeval/metrics/multimodal_metrics/multimodal_contextual_relevancy/multimodal_contextual_relevancy.py +0 -282
  119. deepeval/metrics/multimodal_metrics/multimodal_contextual_relevancy/template.py +0 -102
  120. deepeval/metrics/multimodal_metrics/multimodal_faithfulness/__init__.py +0 -0
  121. deepeval/metrics/multimodal_metrics/multimodal_faithfulness/multimodal_faithfulness.py +0 -356
  122. deepeval/metrics/multimodal_metrics/multimodal_faithfulness/template.py +0 -175
  123. deepeval/metrics/multimodal_metrics/multimodal_tool_correctness/__init__.py +0 -0
  124. deepeval/metrics/multimodal_metrics/multimodal_tool_correctness/multimodal_tool_correctness.py +0 -290
  125. deepeval/models/mlllms/__init__.py +0 -4
  126. deepeval/models/mlllms/azure_model.py +0 -343
  127. deepeval/models/mlllms/gemini_model.py +0 -313
  128. deepeval/models/mlllms/ollama_model.py +0 -175
  129. deepeval/models/mlllms/openai_model.py +0 -309
  130. deepeval/optimization/__init__.py +0 -13
  131. deepeval/optimization/adapters/__init__.py +0 -2
  132. deepeval/optimization/adapters/deepeval_scoring_adapter.py +0 -588
  133. deepeval/optimization/aggregates.py +0 -14
  134. deepeval/optimization/copro/configs.py +0 -31
  135. deepeval/optimization/gepa/__init__.py +0 -7
  136. deepeval/optimization/gepa/configs.py +0 -115
  137. deepeval/optimization/miprov2/configs.py +0 -134
  138. deepeval/optimization/miprov2/loop.py +0 -785
  139. deepeval/optimization/mutations/__init__.py +0 -0
  140. deepeval/optimization/mutations/prompt_rewriter.py +0 -458
  141. deepeval/optimization/policies/__init__.py +0 -16
  142. deepeval/optimization/policies/tie_breaker.py +0 -67
  143. deepeval/optimization/prompt_optimizer.py +0 -462
  144. deepeval/optimization/simba/__init__.py +0 -0
  145. deepeval/optimization/simba/configs.py +0 -33
  146. deepeval/optimization/types.py +0 -361
  147. deepeval/test_case/mllm_test_case.py +0 -170
  148. /deepeval/metrics/{multimodal_metrics/multimodal_answer_relevancy → turn_contextual_precision}/__init__.py +0 -0
  149. /deepeval/metrics/{multimodal_metrics/multimodal_contextual_precision → turn_contextual_recall}/__init__.py +0 -0
  150. /deepeval/metrics/{multimodal_metrics/multimodal_contextual_recall → turn_contextual_relevancy}/__init__.py +0 -0
  151. /deepeval/metrics/{multimodal_metrics/multimodal_contextual_relevancy → turn_faithfulness}/__init__.py +0 -0
  152. /deepeval/{optimization → optimizer/algorithms}/simba/types.py +0 -0
  153. {deepeval-3.7.4.dist-info → deepeval-3.7.5.dist-info}/LICENSE.md +0 -0
  154. {deepeval-3.7.4.dist-info → deepeval-3.7.5.dist-info}/WHEEL +0 -0
  155. {deepeval-3.7.4.dist-info → deepeval-3.7.5.dist-info}/entry_points.txt +0 -0
deepeval/{optimization → optimizer}/utils.py RENAMED
@@ -2,11 +2,14 @@ from __future__ import annotations
 import inspect
 import random
 import re
+import statistics
 from typing import (
     Any,
     Callable,
     List,
     Optional,
+    Protocol,
+    Sequence,
     Tuple,
     TYPE_CHECKING,
     Union,
@@ -17,11 +20,13 @@ from typing import (
 from deepeval.errors import DeepEvalError
 from deepeval.metrics.base_metric import BaseMetric, BaseConversationalMetric
 from deepeval.prompt.prompt import Prompt
-from deepeval.prompt.api import PromptType, PromptMessage
-from deepeval.optimization.types import (
+from deepeval.prompt.api import PromptMessage
+from deepeval.optimizer.types import (
+    ModelCallback,
     ModuleId,
     PromptConfigurationId,
     PromptConfiguration,
+    PromptConfigSnapshot,
     OptimizationReport,
 )

@@ -54,7 +59,7 @@ def split_goldens(
         pareto_size: Number of items to allocate to the Pareto set bound between [0, len(goldens)].
         random_state: A shared `random.Random` instance that provides the source
             of randomness. For reproducible runs, pass the same object used by
-            the GEPA loop constructed from `GEPAConfig.random_seed`
+            the GEPA loop constructed from `GEPA.random_seed`

     Returns:
         (d_feedback, d_pareto)
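For orientation, a minimal sketch of how `split_goldens` is typically called, based only on the parameter names and return shape documented above (the `goldens` variable and the sizes are hypothetical, and whether the first argument is positional is an assumption):

    import random

    rng = random.Random(42)  # shared RNG, mirroring the reproducibility note above
    # `pareto_size` goldens go to the Pareto set, the remainder to feedback.
    d_feedback, d_pareto = split_goldens(
        goldens, pareto_size=10, random_state=rng
    )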
@@ -151,87 +156,22 @@ def normalize_seed_prompts(
     return mapping


-def build_model_callback_kwargs(
-    *,
-    # scoring context
-    golden: Optional[Union["Golden", "ConversationalGolden"]] = None,
-    # rewriter context
-    feedback_text: Optional[str] = None,
-    # shared
-    prompt: Optional[Prompt] = None,
-    prompt_type: Optional[str] = None,
-    prompt_text: Optional[str] = None,
-    prompt_messages: Optional[List["PromptMessage"]] = None,
-) -> Dict[str, Any]:
-    """
-    Build a superset of kwargs for GEPA model callbacks.
-
-    All keys are present in the dict so callbacks can declare any subset of:
-
-        hook: str  # injected by (a_)invoke_model_callback
-        prompt: Prompt
-        prompt_type: str
-        prompt_text: str
-        prompt_messages: List[PromptMessage]
-        golden: Golden | ConversationalGolden
-        feedback_text: str
-
-    Non applicable fields are set to None.
-    """
-    return {
-        # scoring context
-        "golden": golden,
-        # rewriter context
-        "feedback_text": feedback_text,
-        # shared
-        "prompt": prompt,
-        "prompt_text": prompt_text,
-        "prompt_messages": prompt_messages,
-    }
-
-
 def invoke_model_callback(
     *,
-    hook: str,
-    model_callback: Callable[
-        ...,
-        Union[
-            str,
-            Dict,
-            Tuple[Union[str, Dict], float],
-        ],
-    ],
-    candidate_kwargs: Dict[str, Any],
-) -> Union[
-    str,
-    Dict,
-    Tuple[Union[str, Dict], float],
-]:
+    model_callback: ModelCallback,
+    prompt: Prompt,
+    golden: Union["Golden", "ConversationalGolden"],
+) -> str:
     """
     Call a user provided model_callback in a synchronous context.

-    - Filters kwargs to only those the callback accepts.
-    - Injects `hook` if the callback declares it.
-    - Raises if the callback returns an awaitable; callers must use async
-      helpers for async callbacks.
+    Raises if the callback returns an awaitable.
     """
-    sig = inspect.signature(model_callback)
-    supported = set(sig.parameters.keys())
-
-    filtered = {
-        key: value
-        for key, value in candidate_kwargs.items()
-        if key in supported
-    }
-
-    if "hook" in supported:
-        filtered["hook"] = hook
-
-    result = model_callback(**filtered)
+    result = model_callback(prompt, golden)
     if inspect.isawaitable(result):
         raise DeepEvalError(
             "model_callback returned an awaitable from a synchronous context. "
-            "Either declare the callback as `async def` and use async GEPA, or call "
+            "Either declare the callback as `async def` and use async optimization, or call "
             "`model.generate(...)` instead of `model.a_generate(...)` inside a sync callback."
         )
     return result
@@ -239,41 +179,16 @@ def invoke_model_callback(

 async def a_invoke_model_callback(
     *,
-    hook: str,
-    model_callback: Callable[
-        ...,
-        Union[
-            str,
-            Dict,
-            Tuple[Union[str, Dict], float],
-        ],
-    ],
-    candidate_kwargs: Dict[str, Any],
-) -> Union[
-    str,
-    Dict,
-    Tuple[Union[str, Dict], float],
-]:
+    model_callback: ModelCallback,
+    prompt: Prompt,
+    golden: Union["Golden", "ConversationalGolden"],
+) -> str:
     """
     Call a user provided model_callback in an async context.

-    - Filters kwargs to only those the callback accepts.
-    - Injects `hook` if the callback declares it.
-    - Supports both sync and async callbacks.
+    Supports both sync and async callbacks.
     """
-    sig = inspect.signature(model_callback)
-    supported = set(sig.parameters.keys())
-
-    filtered = {
-        key: value
-        for key, value in candidate_kwargs.items()
-        if key in supported
-    }
-
-    if "hook" in supported:
-        filtered["hook"] = hook
-
-    result = model_callback(**filtered)
+    result = model_callback(prompt, golden)
     if inspect.isawaitable(result):
         return await result
     return result
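These two hunks collapse the old kwargs-filtering machinery into the `ModelCallback` alias: a callable taking `(prompt, golden)` and returning a string, either sync or async. A minimal sketch of both shapes under that contract (the toy callbacks and the `p`/`g` objects are hypothetical; a real callback would call an LLM instead of echoing the template):

    from deepeval.prompt.prompt import Prompt

    def echo_callback(prompt: Prompt, golden) -> str:
        # Toy sync callback: returns the rendered template text.
        return prompt.text_template or ""

    async def a_echo_callback(prompt: Prompt, golden) -> str:
        # Async variant: a_invoke_model_callback awaits its result.
        return prompt.text_template or ""

    out = invoke_model_callback(model_callback=echo_callback, prompt=p, golden=g)
    # Passing a_echo_callback to the sync helper raises DeepEvalError,
    # per the error message above; use a_invoke_model_callback for async callbacks.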
@@ -288,58 +203,17 @@ def build_prompt_config_snapshots(
     prompt_configurations_by_id: Dict[
         PromptConfigurationId, "PromptConfiguration"
     ],
-) -> Dict[PromptConfigurationId, Dict[str, Any]]:
+) -> Dict[PromptConfigurationId, PromptConfigSnapshot]:
     """
-    Build a serializable snapshot of all prompt configurations.
-
-    Shape matches the docs for `prompt_configurations`:
-
-        {
-            "<config_id>": {
-                "parent": "<parent_id or None>",
-                "prompts": {
-                    "<module_id>": {
-                        "type": "TEXT",
-                        "text_template": "...",
-                    }
-                    # or
-                    "<module_id>": {
-                        "type": "LIST",
-                        "messages": [
-                            {"role": "system", "content": "..."},
-                            ...
-                        ],
-                    },
-                },
-            },
-            ...
-        }
+    Build snapshots of all prompt configurations.
     """
-    snapshots: Dict[PromptConfigurationId, Dict[str, Any]] = {}
+    snapshots: Dict[PromptConfigurationId, PromptConfigSnapshot] = {}

     for cfg_id, cfg in prompt_configurations_by_id.items():
-        prompts_snapshot: Dict[str, Any] = {}
-
-        for module_id, prompt in cfg.prompts.items():
-            if prompt.type is PromptType.LIST:
-                messages = [
-                    {"role": msg.role, "content": (msg.content or "")}
-                    for msg in (prompt.messages_template or [])
-                ]
-                prompts_snapshot[module_id] = {
-                    "type": "LIST",
-                    "messages": messages,
-                }
-            else:
-                prompts_snapshot[module_id] = {
-                    "type": "TEXT",
-                    "text_template": (prompt.text_template or ""),
-                }
-
-        snapshots[cfg_id] = {
-            "parent": cfg.parent,
-            "prompts": prompts_snapshot,
-        }
+        snapshots[cfg_id] = PromptConfigSnapshot(
+            parent=cfg.parent,
+            prompts=dict(cfg.prompts),
+        )

     return snapshots

@@ -494,17 +368,8 @@ def validate_sequence_of(
 def validate_callback(
     *,
     component: str,
-    model_callback: Optional[
-        Callable[
-            ...,
-            Union[
-                str,
-                Dict,
-                Tuple[Union[str, Dict], float],
-            ],
-        ]
-    ],
-) -> Callable[..., Union[str, Dict, Tuple[Union[str, Dict], float]]]:
+    model_callback: Optional[ModelCallback],
+) -> ModelCallback:
     """
     Ensure that `model_callback` is provided.

@@ -596,3 +461,20 @@ def validate_int_in_range(
         )

     return value
+
+
+##############
+# Aggregates #
+##############
+
+
+class Aggregator(Protocol):
+    def __call__(self, scores: Sequence[float]) -> float: ...
+
+
+def mean_of_all(scores: Sequence[float]) -> float:
+    return statistics.fmean(scores) if scores else 0.0
+
+
+def median_of_all(scores: Sequence[float]) -> float:
+    return statistics.median(scores) if scores else 0.0
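Since `Aggregator` is a `Protocol`, any `(Sequence[float]) -> float` callable satisfies it without subclassing. A short sketch of the two shipped helpers plus a hypothetical custom aggregator:

    import statistics

    scores = [0.2, 0.7, 0.9]
    mean_of_all(scores)    # ≈ 0.6
    median_of_all(scores)  # 0.7
    mean_of_all([])        # 0.0 -- both helpers fall back to 0.0 on empty input

    def trimmed_mean(scores):  # hypothetical custom aggregator
        # Drop the lowest and highest score before averaging.
        trimmed = sorted(scores)[1:-1]
        return statistics.fmean(trimmed) if trimmed else 0.0

    aggregator: Aggregator = trimmed_mean  # structural typing, no inheritance needed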
deepeval/prompt/prompt.py CHANGED
@@ -4,7 +4,7 @@ import json
 import os

 from enum import Enum
-from typing import Optional, List, Dict, Type, Literal, TYPE_CHECKING
+from typing import Optional, List, Dict, Type, Literal
 from rich.progress import Progress, SpinnerColumn, TextColumn, BarColumn
 from rich.console import Console
 from pydantic import BaseModel, ValidationError
@@ -34,10 +34,6 @@ from deepeval.prompt.utils import (
 from deepeval.confident.api import Api, Endpoints, HttpMethods
 from deepeval.constants import HIDDEN_DIR

-
-if TYPE_CHECKING:
-    from deepeval.optimization.types import OptimizationReport
-
 logger = logging.getLogger(__name__)

 portalocker = None
@@ -117,6 +113,7 @@ class Prompt:
         model_settings: Optional[ModelSettings] = None,
         output_type: Optional[OutputType] = None,
         output_schema: Optional[Type[BaseModel]] = None,
+        interpolation_type: Optional[PromptInterpolationType] = None,
     ):
         if text_template and messages_template:
             raise TypeError(
@@ -129,7 +126,9 @@ class Prompt:
         self.output_type: Optional[OutputType] = output_type
         self.output_schema: Optional[Type[BaseModel]] = output_schema
         self.label: Optional[str] = None
-        self.interpolation_type: Optional[PromptInterpolationType] = None
+        self.interpolation_type: PromptInterpolationType = (
+            interpolation_type or PromptInterpolationType.FSTRING
+        )

         self._version = None
         self._prompt_version_id: Optional[str] = None
@@ -145,9 +144,6 @@ class Prompt:
         elif messages_template:
             self.type = PromptType.LIST

-        # updated after optimization runs
-        self.optimization_report: Optional["OptimizationReport"] = None
-
     def __del__(self):
         """Cleanup polling tasks when instance is destroyed"""
         try:
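Net effect: the interpolation type can now be set at construction and the attribute is no longer Optional, defaulting to `PromptInterpolationType.FSTRING`. A sketch, assuming `PromptInterpolationType` is importable from `deepeval.prompt.api`, that the existing `alias` constructor argument applies, and that a non-default member such as `MUSTACHE` exists:

    from deepeval.prompt.prompt import Prompt
    from deepeval.prompt.api import PromptInterpolationType

    p = Prompt(alias="greeting", text_template="Hello {name}!")
    assert p.interpolation_type is PromptInterpolationType.FSTRING  # new default

    p2 = Prompt(
        alias="greeting-mustache",
        text_template="Hello {{name}}!",
        interpolation_type=PromptInterpolationType.MUSTACHE,  # assumed enum member
    )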
deepeval/test_case/__init__.py CHANGED
@@ -3,13 +3,13 @@ from .llm_test_case import (
     LLMTestCaseParams,
     ToolCall,
     ToolCallParams,
+    MLLMImage,
 )
 from .conversational_test_case import (
     ConversationalTestCase,
     Turn,
     TurnParams,
 )
-from .mllm_test_case import MLLMTestCase, MLLMTestCaseParams, MLLMImage
 from .arena_test_case import ArenaTestCase, Contestant
 from .mcp import (
     MCPServer,
@@ -31,8 +31,6 @@ __all__ = [
     "MCPPromptCall",
     "MCPResourceCall",
     "MCPToolCall",
-    "MLLMTestCase",
-    "MLLMTestCaseParams",
     "MLLMImage",
     "ArenaTestCase",
     "Contestant",
deepeval/test_case/api.py CHANGED
@@ -10,9 +10,9 @@ from deepeval.test_run.api import (
 from deepeval.test_case import (
     LLMTestCase,
     ConversationalTestCase,
-    MLLMTestCase,
     Turn,
 )
+from deepeval.test_case.llm_test_case import _MLLM_IMAGE_REGISTRY
 from deepeval.constants import PYTEST_RUN_TEST_NAME

@@ -29,10 +29,12 @@ def create_api_turn(turn: Turn, index: int) -> TurnApi:


 def create_api_test_case(
-    test_case: Union[LLMTestCase, ConversationalTestCase, MLLMTestCase],
+    test_case: Union[LLMTestCase, ConversationalTestCase],
    trace: Optional[TraceApi] = None,
    index: Optional[int] = None,
 ) -> Union[LLMApiTestCase, ConversationalApiTestCase]:
+    from deepeval.utils import convert_to_multi_modal_array
+
     if isinstance(test_case, ConversationalTestCase):
         order = (
             test_case._dataset_rank
@@ -84,7 +86,7 @@ def create_api_test_case(
     name = os.getenv(PYTEST_RUN_TEST_NAME, f"test_case_{order}")
     metrics_data = []

-    if isinstance(test_case, LLMTestCase):
+    if isinstance(test_case, LLMTestCase) and test_case.multimodal is False:
         api_test_case = LLMApiTestCase(
             name=name,
             input=test_case.input,
@@ -106,15 +108,15 @@ def create_api_test_case(
             comments=test_case.comments,
             trace=trace,
         )
-    elif isinstance(test_case, MLLMTestCase):
+    elif isinstance(test_case, LLMTestCase) and test_case.multimodal:
         api_test_case = LLMApiTestCase(
             name=name,
-            input="",
-            multimodalInput=test_case.input,
-            multimodalActualOutput=test_case.actual_output,
-            multimodalExpectedOutput=test_case.expected_output,
-            multimodalRetrievalContext=test_case.retrieval_context,
-            multimodalContext=test_case.context,
+            input=test_case.input,
+            actualOutput=test_case.actual_output,
+            expectedOutput=test_case.expected_output,
+            retrievalContext=test_case.retrieval_context,
+            context=test_case.context,
+            imagesMapping=_MLLM_IMAGE_REGISTRY,
             toolsCalled=test_case.tools_called,
             expectedTools=test_case.expected_tools,
             tokenCost=test_case.token_cost,
deepeval/test_case/conversational_test_case.py CHANGED
@@ -9,7 +9,7 @@ from typing import List, Optional, Dict, Literal
 from copy import deepcopy
 from enum import Enum

-from deepeval.test_case import ToolCall
+from deepeval.test_case import ToolCall, MLLMImage
 from deepeval.test_case.mcp import (
     MCPServer,
     MCPPromptCall,
@@ -156,11 +156,29 @@ class ConversationalTestCase(BaseModel):
     comments: Optional[str] = Field(default=None)
     tags: Optional[List[str]] = Field(default=None)
     mcp_servers: Optional[List[MCPServer]] = Field(default=None)
+    multimodal: bool = False

     _dataset_rank: Optional[int] = PrivateAttr(default=None)
     _dataset_alias: Optional[str] = PrivateAttr(default=None)
     _dataset_id: Optional[str] = PrivateAttr(default=None)

+    @model_validator(mode="after")
+    def set_is_multimodal(self):
+        import re
+
+        if self.multimodal is True:
+            return self
+
+        pattern = r"\[DEEPEVAL:IMAGE:(.*?)\]"
+        self.multimodal = any(
+            [
+                re.search(pattern, turn.content) is not None
+                for turn in self.turns
+            ]
+        )
+
+        return self
+
     @model_validator(mode="before")
     def validate_input(cls, data):
         turns = data.get("turns")
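Together with the `MLLMImage` placeholder mechanism added in `llm_test_case.py` below, this means a conversation becomes multimodal automatically whenever any turn's content contains a `[DEEPEVAL:IMAGE:<id>]` marker. A sketch (the image path is hypothetical):

    from deepeval.test_case import ConversationalTestCase, Turn, MLLMImage

    img = MLLMImage(url="photos/receipt.png")  # hypothetical local file
    convo = ConversationalTestCase(
        turns=[
            Turn(role="user", content=f"What is the total on this receipt? {img}"),
            Turn(role="assistant", content="The total is $12.40."),
        ]
    )
    assert convo.multimodal is True  # flipped by the set_is_multimodal validator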
deepeval/test_case/llm_test_case.py CHANGED
@@ -9,7 +9,12 @@ from typing import List, Optional, Dict, Any
 from enum import Enum
 import json
 import uuid
-
+import re
+import os
+import mimetypes
+import base64
+from dataclasses import dataclass, field
+from urllib.parse import urlparse, unquote
 from deepeval.utils import make_model_config

 from deepeval.test_case.mcp import (
@@ -20,6 +25,128 @@ from deepeval.test_case.mcp import (
     validate_mcp_servers,
 )

+_MLLM_IMAGE_REGISTRY: Dict[str, "MLLMImage"] = {}
+
+
+@dataclass
+class MLLMImage:
+    dataBase64: Optional[str] = None
+    mimeType: Optional[str] = None
+    url: Optional[str] = None
+    local: Optional[bool] = None
+    filename: Optional[str] = None
+    _id: str = field(default_factory=lambda: uuid.uuid4().hex)
+
+    def __post_init__(self):
+
+        if not self.url and not self.dataBase64:
+            raise ValueError(
+                "You must provide either a 'url' or both 'dataBase64' and 'mimeType' to create an MLLMImage."
+            )
+
+        if self.dataBase64 is not None:
+            if self.mimeType is None:
+                raise ValueError(
+                    "mimeType must be provided when initializing from Base64 data."
+                )
+        else:
+            is_local = self.is_local_path(self.url)
+            if self.local is not None:
+                assert self.local == is_local, "Local path mismatch"
+            else:
+                self.local = is_local
+
+            # compute filename, mime_type, and Base64 data
+            if self.local:
+                path = self.process_url(self.url)
+                self.filename = os.path.basename(path)
+                self.mimeType = (
+                    mimetypes.guess_type(path)[0] or "application/octet-stream"
+                )
+                with open(path, "rb") as f:
+                    raw = f.read()
+                self.dataBase64 = base64.b64encode(raw).decode("ascii")
+            else:
+                self.filename = None
+                self.mimeType = None
+                self.dataBase64 = None
+
+        _MLLM_IMAGE_REGISTRY[self._id] = self
+
+    def _placeholder(self) -> str:
+        return f"[DEEPEVAL:IMAGE:{self._id}]"
+
+    def __str__(self) -> str:
+        return self._placeholder()
+
+    def __repr__(self) -> str:
+        return self._placeholder()
+
+    def __format__(self, format_spec: str) -> str:
+        return self._placeholder()
+
+    @staticmethod
+    def process_url(url: str) -> str:
+        if os.path.exists(url):
+            return url
+        parsed = urlparse(url)
+        if parsed.scheme == "file":
+            raw_path = (
+                f"//{parsed.netloc}{parsed.path}"
+                if parsed.netloc
+                else parsed.path
+            )
+            path = unquote(raw_path)
+            return path
+        return url
+
+    @staticmethod
+    def is_local_path(url: str) -> bool:
+        if os.path.exists(url):
+            return True
+        parsed = urlparse(url)
+        if parsed.scheme == "file":
+            raw_path = (
+                f"//{parsed.netloc}{parsed.path}"
+                if parsed.netloc
+                else parsed.path
+            )
+            path = unquote(raw_path)
+            return os.path.exists(path)
+        return False
+
+    def parse_multimodal_string(s: str):
+        pattern = r"\[DEEPEVAL:IMAGE:(.*?)\]"
+        matches = list(re.finditer(pattern, s))
+
+        result = []
+        last_end = 0
+
+        for m in matches:
+            start, end = m.span()
+
+            if start > last_end:
+                result.append(s[last_end:start])
+
+            img_id = m.group(1)
+
+            if img_id not in _MLLM_IMAGE_REGISTRY:
+                MLLMImage(url=img_id, _id=img_id)
+
+            result.append(_MLLM_IMAGE_REGISTRY[img_id])
+            last_end = end
+
+        if last_end < len(s):
+            result.append(s[last_end:])
+
+        return result
+
+    def as_data_uri(self) -> Optional[str]:
+        """Return the image as a data URI string, if Base64 data is available."""
+        if not self.dataBase64 or not self.mimeType:
+            return None
+        return f"data:{self.mimeType};base64,{self.dataBase64}"
+

 class LLMTestCaseParams(Enum):
     INPUT = "input"
@@ -208,6 +335,7 @@ class LLMTestCase(BaseModel):
         serialization_alias="completionTime",
         validation_alias=AliasChoices("completionTime", "completion_time"),
     )
+    multimodal: bool = Field(default=False)
     name: Optional[str] = Field(default=None)
     tags: Optional[List[str]] = Field(default=None)
     mcp_servers: Optional[List[MCPServer]] = Field(default=None)
@@ -229,6 +357,29 @@ class LLMTestCase(BaseModel):
         default_factory=lambda: str(uuid.uuid4())
     )

+    @model_validator(mode="after")
+    def set_is_multimodal(self):
+        import re
+
+        if self.multimodal is True:
+            return self
+
+        pattern = r"\[DEEPEVAL:IMAGE:(.*?)\]"
+
+        auto_detect = (
+            any(
+                [
+                    re.search(pattern, self.input or "") is not None,
+                    re.search(pattern, self.actual_output or "") is not None,
+                ]
+            )
+            if isinstance(self.input, str)
+            else self.multimodal
+        )
+
+        self.multimodal = auto_detect
+        return self
+
     @model_validator(mode="before")
     def validate_input(cls, data):
         input = data.get("input")
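These hunks replace the removed `MLLMTestCase`: an `MLLMImage` interpolated into a plain string renders as a `[DEEPEVAL:IMAGE:<id>]` placeholder (via `__str__`/`__format__`), registers itself in `_MLLM_IMAGE_REGISTRY`, and the `set_is_multimodal` validator flags the test case. A sketch of the round trip (the file path is hypothetical):

    from deepeval.test_case import LLMTestCase, MLLMImage

    img = MLLMImage(url="images/chart.png")  # local file: Base64 data is computed eagerly
    tc = LLMTestCase(
        input=f"Describe this chart: {img}",  # f-string emits the placeholder token
        actual_output="It shows revenue by quarter.",
    )
    assert tc.multimodal is True  # auto-detected from the placeholder

    # Split the text back into [str, MLLMImage, ...] segments; note that
    # parse_multimodal_string is defined on the class without `self`,
    # so it is called through the class object.
    parts = MLLMImage.parse_multimodal_string(tc.input)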
deepeval/test_case/utils.py CHANGED
@@ -1,24 +1,20 @@
 from typing import Union, List

-from deepeval.test_case import LLMTestCase, MLLMTestCase, ConversationalTestCase
+from deepeval.test_case import LLMTestCase, ConversationalTestCase


 def check_valid_test_cases_type(
-    test_cases: Union[
-        List[Union[LLMTestCase, MLLMTestCase]], List[ConversationalTestCase]
-    ],
+    test_cases: Union[List[LLMTestCase], List[ConversationalTestCase]],
 ):
     llm_test_case_count = 0
     conversational_test_case_count = 0
     for test_case in test_cases:
-        if isinstance(test_case, LLMTestCase) or isinstance(
-            test_case, MLLMTestCase
-        ):
+        if isinstance(test_case, LLMTestCase):
             llm_test_case_count += 1
         else:
             conversational_test_case_count += 1

     if llm_test_case_count > 0 and conversational_test_case_count > 0:
         raise ValueError(
-            "You cannot supply a mixture of `LLMTestCase`/`MLLMTestCase`(s) and `ConversationalTestCase`(s) as the list of test cases."
+            "You cannot supply a mixture of `LLMTestCase`(s) and `ConversationalTestCase`(s) as the list of test cases."
         )