deepeval 3.7.4__py3-none-any.whl → 3.7.6__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (224)
  1. deepeval/_version.py +1 -1
  2. deepeval/config/settings.py +35 -1
  3. deepeval/dataset/api.py +23 -1
  4. deepeval/dataset/golden.py +139 -2
  5. deepeval/evaluate/evaluate.py +16 -11
  6. deepeval/evaluate/execute.py +13 -181
  7. deepeval/evaluate/utils.py +6 -26
  8. deepeval/integrations/pydantic_ai/agent.py +19 -2
  9. deepeval/integrations/pydantic_ai/instrumentator.py +62 -23
  10. deepeval/key_handler.py +3 -0
  11. deepeval/metrics/__init__.py +14 -16
  12. deepeval/metrics/answer_relevancy/answer_relevancy.py +118 -116
  13. deepeval/metrics/answer_relevancy/template.py +22 -3
  14. deepeval/metrics/arena_g_eval/arena_g_eval.py +98 -96
  15. deepeval/metrics/arena_g_eval/template.py +17 -1
  16. deepeval/metrics/argument_correctness/argument_correctness.py +81 -87
  17. deepeval/metrics/argument_correctness/template.py +19 -2
  18. deepeval/metrics/base_metric.py +13 -44
  19. deepeval/metrics/bias/bias.py +102 -108
  20. deepeval/metrics/bias/template.py +14 -2
  21. deepeval/metrics/contextual_precision/contextual_precision.py +96 -94
  22. deepeval/metrics/contextual_precision/template.py +115 -66
  23. deepeval/metrics/contextual_recall/contextual_recall.py +94 -84
  24. deepeval/metrics/contextual_recall/template.py +106 -55
  25. deepeval/metrics/contextual_relevancy/contextual_relevancy.py +86 -84
  26. deepeval/metrics/contextual_relevancy/template.py +87 -58
  27. deepeval/metrics/conversation_completeness/conversation_completeness.py +101 -119
  28. deepeval/metrics/conversation_completeness/template.py +23 -3
  29. deepeval/metrics/conversational_dag/conversational_dag.py +12 -8
  30. deepeval/metrics/conversational_dag/nodes.py +66 -123
  31. deepeval/metrics/conversational_dag/templates.py +16 -0
  32. deepeval/metrics/conversational_g_eval/conversational_g_eval.py +47 -66
  33. deepeval/metrics/dag/dag.py +10 -0
  34. deepeval/metrics/dag/nodes.py +63 -126
  35. deepeval/metrics/dag/templates.py +16 -2
  36. deepeval/metrics/exact_match/exact_match.py +9 -1
  37. deepeval/metrics/faithfulness/faithfulness.py +138 -149
  38. deepeval/metrics/faithfulness/schema.py +1 -1
  39. deepeval/metrics/faithfulness/template.py +200 -115
  40. deepeval/metrics/g_eval/g_eval.py +87 -78
  41. deepeval/metrics/g_eval/template.py +18 -1
  42. deepeval/metrics/g_eval/utils.py +7 -6
  43. deepeval/metrics/goal_accuracy/goal_accuracy.py +91 -76
  44. deepeval/metrics/goal_accuracy/template.py +21 -3
  45. deepeval/metrics/hallucination/hallucination.py +60 -75
  46. deepeval/metrics/hallucination/template.py +13 -0
  47. deepeval/metrics/indicator.py +7 -10
  48. deepeval/metrics/json_correctness/json_correctness.py +40 -38
  49. deepeval/metrics/json_correctness/template.py +10 -0
  50. deepeval/metrics/knowledge_retention/knowledge_retention.py +60 -97
  51. deepeval/metrics/knowledge_retention/schema.py +9 -3
  52. deepeval/metrics/knowledge_retention/template.py +12 -0
  53. deepeval/metrics/mcp/mcp_task_completion.py +68 -38
  54. deepeval/metrics/mcp/multi_turn_mcp_use_metric.py +92 -74
  55. deepeval/metrics/mcp/template.py +52 -0
  56. deepeval/metrics/mcp_use_metric/mcp_use_metric.py +58 -64
  57. deepeval/metrics/mcp_use_metric/template.py +12 -0
  58. deepeval/metrics/misuse/misuse.py +77 -97
  59. deepeval/metrics/misuse/template.py +15 -0
  60. deepeval/metrics/multimodal_metrics/__init__.py +0 -19
  61. deepeval/metrics/multimodal_metrics/image_coherence/image_coherence.py +59 -53
  62. deepeval/metrics/multimodal_metrics/image_editing/image_editing.py +79 -95
  63. deepeval/metrics/multimodal_metrics/image_helpfulness/image_helpfulness.py +59 -53
  64. deepeval/metrics/multimodal_metrics/image_reference/image_reference.py +59 -53
  65. deepeval/metrics/multimodal_metrics/text_to_image/text_to_image.py +111 -109
  66. deepeval/metrics/non_advice/non_advice.py +79 -105
  67. deepeval/metrics/non_advice/template.py +12 -0
  68. deepeval/metrics/pattern_match/pattern_match.py +12 -4
  69. deepeval/metrics/pii_leakage/pii_leakage.py +75 -106
  70. deepeval/metrics/pii_leakage/template.py +14 -0
  71. deepeval/metrics/plan_adherence/plan_adherence.py +63 -89
  72. deepeval/metrics/plan_adherence/template.py +11 -0
  73. deepeval/metrics/plan_quality/plan_quality.py +63 -87
  74. deepeval/metrics/plan_quality/template.py +9 -0
  75. deepeval/metrics/prompt_alignment/prompt_alignment.py +72 -83
  76. deepeval/metrics/prompt_alignment/template.py +12 -0
  77. deepeval/metrics/ragas.py +3 -3
  78. deepeval/metrics/role_adherence/role_adherence.py +48 -71
  79. deepeval/metrics/role_adherence/template.py +14 -0
  80. deepeval/metrics/role_violation/role_violation.py +75 -108
  81. deepeval/metrics/role_violation/template.py +12 -0
  82. deepeval/metrics/step_efficiency/step_efficiency.py +55 -65
  83. deepeval/metrics/step_efficiency/template.py +11 -0
  84. deepeval/metrics/summarization/summarization.py +115 -183
  85. deepeval/metrics/summarization/template.py +19 -0
  86. deepeval/metrics/task_completion/task_completion.py +67 -73
  87. deepeval/metrics/tool_correctness/tool_correctness.py +45 -44
  88. deepeval/metrics/tool_use/tool_use.py +42 -66
  89. deepeval/metrics/topic_adherence/template.py +13 -0
  90. deepeval/metrics/topic_adherence/topic_adherence.py +53 -67
  91. deepeval/metrics/toxicity/template.py +13 -0
  92. deepeval/metrics/toxicity/toxicity.py +80 -99
  93. deepeval/metrics/turn_contextual_precision/schema.py +21 -0
  94. deepeval/metrics/turn_contextual_precision/template.py +187 -0
  95. deepeval/metrics/turn_contextual_precision/turn_contextual_precision.py +592 -0
  96. deepeval/metrics/turn_contextual_recall/schema.py +21 -0
  97. deepeval/metrics/turn_contextual_recall/template.py +178 -0
  98. deepeval/metrics/turn_contextual_recall/turn_contextual_recall.py +563 -0
  99. deepeval/metrics/{multimodal_metrics/multimodal_contextual_relevancy → turn_contextual_relevancy}/schema.py +7 -1
  100. deepeval/metrics/turn_contextual_relevancy/template.py +161 -0
  101. deepeval/metrics/turn_contextual_relevancy/turn_contextual_relevancy.py +576 -0
  102. deepeval/metrics/{multimodal_metrics/multimodal_faithfulness → turn_faithfulness}/schema.py +11 -3
  103. deepeval/metrics/turn_faithfulness/template.py +218 -0
  104. deepeval/metrics/turn_faithfulness/turn_faithfulness.py +627 -0
  105. deepeval/metrics/turn_relevancy/template.py +14 -0
  106. deepeval/metrics/turn_relevancy/turn_relevancy.py +56 -69
  107. deepeval/metrics/utils.py +158 -122
  108. deepeval/models/__init__.py +0 -12
  109. deepeval/models/base_model.py +49 -33
  110. deepeval/models/embedding_models/__init__.py +7 -0
  111. deepeval/models/embedding_models/azure_embedding_model.py +79 -33
  112. deepeval/models/embedding_models/local_embedding_model.py +39 -20
  113. deepeval/models/embedding_models/ollama_embedding_model.py +52 -19
  114. deepeval/models/embedding_models/openai_embedding_model.py +42 -22
  115. deepeval/models/llms/amazon_bedrock_model.py +226 -72
  116. deepeval/models/llms/anthropic_model.py +178 -63
  117. deepeval/models/llms/azure_model.py +218 -60
  118. deepeval/models/llms/constants.py +2032 -0
  119. deepeval/models/llms/deepseek_model.py +95 -40
  120. deepeval/models/llms/gemini_model.py +209 -64
  121. deepeval/models/llms/grok_model.py +139 -68
  122. deepeval/models/llms/kimi_model.py +140 -90
  123. deepeval/models/llms/litellm_model.py +131 -37
  124. deepeval/models/llms/local_model.py +125 -21
  125. deepeval/models/llms/ollama_model.py +147 -24
  126. deepeval/models/llms/openai_model.py +222 -269
  127. deepeval/models/llms/portkey_model.py +81 -22
  128. deepeval/models/llms/utils.py +8 -3
  129. deepeval/models/retry_policy.py +17 -14
  130. deepeval/models/utils.py +106 -5
  131. deepeval/optimizer/__init__.py +5 -0
  132. deepeval/optimizer/algorithms/__init__.py +6 -0
  133. deepeval/optimizer/algorithms/base.py +29 -0
  134. deepeval/optimizer/algorithms/configs.py +18 -0
  135. deepeval/optimizer/algorithms/copro/__init__.py +5 -0
  136. deepeval/{optimization/copro/loop.py → optimizer/algorithms/copro/copro.py} +112 -113
  137. deepeval/optimizer/algorithms/gepa/__init__.py +5 -0
  138. deepeval/{optimization/gepa/loop.py → optimizer/algorithms/gepa/gepa.py} +175 -115
  139. deepeval/optimizer/algorithms/miprov2/__init__.py +17 -0
  140. deepeval/optimizer/algorithms/miprov2/bootstrapper.py +435 -0
  141. deepeval/optimizer/algorithms/miprov2/miprov2.py +752 -0
  142. deepeval/optimizer/algorithms/miprov2/proposer.py +301 -0
  143. deepeval/optimizer/algorithms/simba/__init__.py +5 -0
  144. deepeval/{optimization/simba/loop.py → optimizer/algorithms/simba/simba.py} +128 -112
  145. deepeval/{optimization → optimizer}/configs.py +5 -8
  146. deepeval/{optimization/policies/selection.py → optimizer/policies.py} +63 -2
  147. deepeval/optimizer/prompt_optimizer.py +263 -0
  148. deepeval/optimizer/rewriter/__init__.py +5 -0
  149. deepeval/optimizer/rewriter/rewriter.py +124 -0
  150. deepeval/optimizer/rewriter/utils.py +214 -0
  151. deepeval/optimizer/scorer/__init__.py +5 -0
  152. deepeval/optimizer/scorer/base.py +86 -0
  153. deepeval/optimizer/scorer/scorer.py +316 -0
  154. deepeval/optimizer/scorer/utils.py +30 -0
  155. deepeval/optimizer/types.py +148 -0
  156. deepeval/{optimization → optimizer}/utils.py +47 -165
  157. deepeval/prompt/prompt.py +5 -9
  158. deepeval/simulator/conversation_simulator.py +43 -0
  159. deepeval/simulator/template.py +13 -0
  160. deepeval/test_case/__init__.py +1 -3
  161. deepeval/test_case/api.py +26 -45
  162. deepeval/test_case/arena_test_case.py +7 -2
  163. deepeval/test_case/conversational_test_case.py +68 -1
  164. deepeval/test_case/llm_test_case.py +206 -1
  165. deepeval/test_case/utils.py +4 -8
  166. deepeval/test_run/api.py +18 -14
  167. deepeval/test_run/test_run.py +3 -3
  168. deepeval/tracing/patchers.py +9 -4
  169. deepeval/tracing/tracing.py +2 -2
  170. deepeval/utils.py +65 -0
  171. {deepeval-3.7.4.dist-info → deepeval-3.7.6.dist-info}/METADATA +1 -4
  172. {deepeval-3.7.4.dist-info → deepeval-3.7.6.dist-info}/RECORD +180 -193
  173. deepeval/metrics/multimodal_metrics/multimodal_answer_relevancy/multimodal_answer_relevancy.py +0 -343
  174. deepeval/metrics/multimodal_metrics/multimodal_answer_relevancy/schema.py +0 -19
  175. deepeval/metrics/multimodal_metrics/multimodal_answer_relevancy/template.py +0 -122
  176. deepeval/metrics/multimodal_metrics/multimodal_contextual_precision/multimodal_contextual_precision.py +0 -301
  177. deepeval/metrics/multimodal_metrics/multimodal_contextual_precision/schema.py +0 -15
  178. deepeval/metrics/multimodal_metrics/multimodal_contextual_precision/template.py +0 -132
  179. deepeval/metrics/multimodal_metrics/multimodal_contextual_recall/multimodal_contextual_recall.py +0 -285
  180. deepeval/metrics/multimodal_metrics/multimodal_contextual_recall/schema.py +0 -15
  181. deepeval/metrics/multimodal_metrics/multimodal_contextual_recall/template.py +0 -112
  182. deepeval/metrics/multimodal_metrics/multimodal_contextual_relevancy/multimodal_contextual_relevancy.py +0 -282
  183. deepeval/metrics/multimodal_metrics/multimodal_contextual_relevancy/template.py +0 -102
  184. deepeval/metrics/multimodal_metrics/multimodal_faithfulness/__init__.py +0 -0
  185. deepeval/metrics/multimodal_metrics/multimodal_faithfulness/multimodal_faithfulness.py +0 -356
  186. deepeval/metrics/multimodal_metrics/multimodal_faithfulness/template.py +0 -175
  187. deepeval/metrics/multimodal_metrics/multimodal_g_eval/__init__.py +0 -0
  188. deepeval/metrics/multimodal_metrics/multimodal_g_eval/multimodal_g_eval.py +0 -386
  189. deepeval/metrics/multimodal_metrics/multimodal_g_eval/schema.py +0 -11
  190. deepeval/metrics/multimodal_metrics/multimodal_g_eval/template.py +0 -148
  191. deepeval/metrics/multimodal_metrics/multimodal_g_eval/utils.py +0 -68
  192. deepeval/metrics/multimodal_metrics/multimodal_tool_correctness/__init__.py +0 -0
  193. deepeval/metrics/multimodal_metrics/multimodal_tool_correctness/multimodal_tool_correctness.py +0 -290
  194. deepeval/models/mlllms/__init__.py +0 -4
  195. deepeval/models/mlllms/azure_model.py +0 -343
  196. deepeval/models/mlllms/gemini_model.py +0 -313
  197. deepeval/models/mlllms/ollama_model.py +0 -175
  198. deepeval/models/mlllms/openai_model.py +0 -309
  199. deepeval/optimization/__init__.py +0 -13
  200. deepeval/optimization/adapters/__init__.py +0 -2
  201. deepeval/optimization/adapters/deepeval_scoring_adapter.py +0 -588
  202. deepeval/optimization/aggregates.py +0 -14
  203. deepeval/optimization/copro/configs.py +0 -31
  204. deepeval/optimization/gepa/__init__.py +0 -7
  205. deepeval/optimization/gepa/configs.py +0 -115
  206. deepeval/optimization/miprov2/configs.py +0 -134
  207. deepeval/optimization/miprov2/loop.py +0 -785
  208. deepeval/optimization/mutations/__init__.py +0 -0
  209. deepeval/optimization/mutations/prompt_rewriter.py +0 -458
  210. deepeval/optimization/policies/__init__.py +0 -16
  211. deepeval/optimization/policies/tie_breaker.py +0 -67
  212. deepeval/optimization/prompt_optimizer.py +0 -462
  213. deepeval/optimization/simba/__init__.py +0 -0
  214. deepeval/optimization/simba/configs.py +0 -33
  215. deepeval/optimization/types.py +0 -361
  216. deepeval/test_case/mllm_test_case.py +0 -170
  217. /deepeval/metrics/{multimodal_metrics/multimodal_answer_relevancy → turn_contextual_precision}/__init__.py +0 -0
  218. /deepeval/metrics/{multimodal_metrics/multimodal_contextual_precision → turn_contextual_recall}/__init__.py +0 -0
  219. /deepeval/metrics/{multimodal_metrics/multimodal_contextual_recall → turn_contextual_relevancy}/__init__.py +0 -0
  220. /deepeval/metrics/{multimodal_metrics/multimodal_contextual_relevancy → turn_faithfulness}/__init__.py +0 -0
  221. /deepeval/{optimization → optimizer/algorithms}/simba/types.py +0 -0
  222. {deepeval-3.7.4.dist-info → deepeval-3.7.6.dist-info}/LICENSE.md +0 -0
  223. {deepeval-3.7.4.dist-info → deepeval-3.7.6.dist-info}/WHEEL +0 -0
  224. {deepeval-3.7.4.dist-info → deepeval-3.7.6.dist-info}/entry_points.txt +0 -0
deepeval/test_case/conversational_test_case.py CHANGED
@@ -1,3 +1,4 @@
+ import re
  from pydantic import (
      BaseModel,
      Field,
@@ -9,7 +10,7 @@ from typing import List, Optional, Dict, Literal
  from copy import deepcopy
  from enum import Enum

- from deepeval.test_case import ToolCall
+ from deepeval.test_case import ToolCall, MLLMImage
  from deepeval.test_case.mcp import (
      MCPServer,
      MCPPromptCall,
@@ -17,6 +18,7 @@ from deepeval.test_case.mcp import (
      MCPToolCall,
      validate_mcp_servers,
  )
+ from deepeval.test_case.llm_test_case import _MLLM_IMAGE_REGISTRY


  class TurnParams(Enum):
@@ -156,11 +158,45 @@ class ConversationalTestCase(BaseModel):
      comments: Optional[str] = Field(default=None)
      tags: Optional[List[str]] = Field(default=None)
      mcp_servers: Optional[List[MCPServer]] = Field(default=None)
+     multimodal: bool = False

      _dataset_rank: Optional[int] = PrivateAttr(default=None)
      _dataset_alias: Optional[str] = PrivateAttr(default=None)
      _dataset_id: Optional[str] = PrivateAttr(default=None)

+     @model_validator(mode="after")
+     def set_is_multimodal(self):
+         import re
+
+         if self.multimodal is True:
+             return self
+
+         pattern = r"\[DEEPEVAL:IMAGE:(.*?)\]"
+         if self.scenario:
+             if re.search(pattern, self.scenario) is not None:
+                 self.multimodal = True
+                 return self
+         if self.expected_outcome:
+             if re.search(pattern, self.expected_outcome) is not None:
+                 self.multimodal = True
+                 return self
+         if self.user_description:
+             if re.search(pattern, self.user_description) is not None:
+                 self.multimodal = True
+                 return self
+         if self.turns:
+             for turn in self.turns:
+                 if re.search(pattern, turn.content) is not None:
+                     self.multimodal = True
+                     return self
+                 if turn.retrieval_context is not None:
+                     self.multimodal = any(
+                         re.search(pattern, context) is not None
+                         for context in turn.retrieval_context
+                     )
+
+         return self
+
      @model_validator(mode="before")
      def validate_input(cls, data):
          turns = data.get("turns")
@@ -197,3 +233,34 @@ class ConversationalTestCase(BaseModel):
          data["turns"] = copied_turns

          return data
+
+     def _get_images_mapping(self) -> Dict[str, MLLMImage]:
+         pattern = r"\[DEEPEVAL:IMAGE:(.*?)\]"
+         image_ids = set()
+
+         def extract_ids_from_string(s: Optional[str]) -> None:
+             """Helper to extract image IDs from a string."""
+             if s is not None and isinstance(s, str):
+                 matches = re.findall(pattern, s)
+                 image_ids.update(matches)
+
+         def extract_ids_from_list(lst: Optional[List[str]]) -> None:
+             """Helper to extract image IDs from a list of strings."""
+             if lst is not None:
+                 for item in lst:
+                     extract_ids_from_string(item)
+
+         extract_ids_from_string(self.scenario)
+         extract_ids_from_string(self.expected_outcome)
+         extract_ids_from_list(self.context)
+         extract_ids_from_string(self.user_description)
+         for turn in self.turns:
+             extract_ids_from_string(turn.content)
+             extract_ids_from_list(turn.retrieval_context)
+
+         images_mapping = {}
+         for img_id in image_ids:
+             if img_id in _MLLM_IMAGE_REGISTRY:
+                 images_mapping[img_id] = _MLLM_IMAGE_REGISTRY[img_id]
+
+         return images_mapping if len(images_mapping) > 0 else None
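Illustrative usage, not part of the diff: with the placeholder pattern and `multimodal` flag added above, embedding an `MLLMImage` in any turn is meant to mark the whole conversation as multimodal. The sketch assumes the public `Turn(role=..., content=...)` and `MLLMImage` exports keep the shapes shown in this diff.

from deepeval.test_case import ConversationalTestCase, Turn, MLLMImage

image = MLLMImage(url="https://example.com/receipt.png")  # remote URL: nothing is downloaded eagerly

convo = ConversationalTestCase(
    turns=[
        # f-string formatting renders the image as its "[DEEPEVAL:IMAGE:<id>]" placeholder
        Turn(role="user", content=f"What is the total on this receipt? {image}"),
        Turn(role="assistant", content="The total is $42.10."),
    ]
)
assert convo.multimodal is True  # flipped by the set_is_multimodal validator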
deepeval/test_case/llm_test_case.py CHANGED
@@ -9,7 +9,12 @@ from typing import List, Optional, Dict, Any
  from enum import Enum
  import json
  import uuid
-
+ import re
+ import os
+ import mimetypes
+ import base64
+ from dataclasses import dataclass, field
+ from urllib.parse import urlparse, unquote
  from deepeval.utils import make_model_config

  from deepeval.test_case.mcp import (
@@ -20,6 +25,143 @@ from deepeval.test_case.mcp import (
      validate_mcp_servers,
  )

+ _MLLM_IMAGE_REGISTRY: Dict[str, "MLLMImage"] = {}
+
+
+ @dataclass
+ class MLLMImage:
+     dataBase64: Optional[str] = None
+     mimeType: Optional[str] = None
+     url: Optional[str] = None
+     local: Optional[bool] = None
+     filename: Optional[str] = None
+     _id: str = field(default_factory=lambda: uuid.uuid4().hex)
+
+     def __post_init__(self):
+
+         if not self.url and not self.dataBase64:
+             raise ValueError(
+                 "You must provide either a 'url' or both 'dataBase64' and 'mimeType' to create an MLLMImage."
+             )
+
+         if self.dataBase64 is not None:
+             if self.mimeType is None:
+                 raise ValueError(
+                     "mimeType must be provided when initializing from Base64 data."
+                 )
+         else:
+             is_local = self.is_local_path(self.url)
+             if self.local is not None:
+                 assert self.local == is_local, "Local path mismatch"
+             else:
+                 self.local = is_local
+
+             # compute filename, mime_type, and Base64 data
+             if self.local:
+                 path = self.process_url(self.url)
+                 self.filename = os.path.basename(path)
+                 self.mimeType = mimetypes.guess_type(path)[0] or "image/jpeg"
+
+                 if not os.path.exists(path):
+                     raise FileNotFoundError(f"Image file not found: {path}")
+
+                 self._load_base64(path)
+             else:
+                 if not self.url.startswith(("http://", "https://")):
+                     raise ValueError(
+                         f"Invalid remote URL format: {self.url}. URL must start with http:// or https://"
+                     )
+                 self.filename = None
+                 self.mimeType = None
+                 self.dataBase64 = None
+
+         _MLLM_IMAGE_REGISTRY[self._id] = self
+
+     def _load_base64(self, path: str):
+         with open(path, "rb") as f:
+             raw = f.read()
+         self.dataBase64 = base64.b64encode(raw).decode("ascii")
+
+     def ensure_images_loaded(self):
+         if self.local and self.dataBase64 is None:
+             path = self.process_url(self.url)
+             self._load_base64(path)
+         return self
+
+     def _placeholder(self) -> str:
+         return f"[DEEPEVAL:IMAGE:{self._id}]"
+
+     def __str__(self) -> str:
+         return self._placeholder()
+
+     def __repr__(self) -> str:
+         return self._placeholder()
+
+     def __format__(self, format_spec: str) -> str:
+         return self._placeholder()
+
+     @staticmethod
+     def process_url(url: str) -> str:
+         if os.path.exists(url):
+             return url
+         parsed = urlparse(url)
+         if parsed.scheme == "file":
+             raw_path = (
+                 f"//{parsed.netloc}{parsed.path}"
+                 if parsed.netloc
+                 else parsed.path
+             )
+             path = unquote(raw_path)
+             return path
+         return url
+
+     @staticmethod
+     def is_local_path(url: str) -> bool:
+         if os.path.exists(url):
+             return True
+         parsed = urlparse(url)
+         if parsed.scheme == "file":
+             raw_path = (
+                 f"//{parsed.netloc}{parsed.path}"
+                 if parsed.netloc
+                 else parsed.path
+             )
+             path = unquote(raw_path)
+             return os.path.exists(path)
+         return False
+
+     def parse_multimodal_string(s: str):
+         pattern = r"\[DEEPEVAL:IMAGE:(.*?)\]"
+         matches = list(re.finditer(pattern, s))
+
+         result = []
+         last_end = 0
+
+         for m in matches:
+             start, end = m.span()
+
+             if start > last_end:
+                 result.append(s[last_end:start])
+
+             img_id = m.group(1)
+
+             if img_id not in _MLLM_IMAGE_REGISTRY:
+                 MLLMImage(url=img_id, _id=img_id)
+
+             result.append(_MLLM_IMAGE_REGISTRY[img_id])
+             last_end = end
+
+         if last_end < len(s):
+             result.append(s[last_end:])
+
+         return result
+
+     def as_data_uri(self) -> Optional[str]:
+         """Return the image as a data URI string, if Base64 data is available."""
+         if not self.dataBase64 or not self.mimeType:
+             return None
+         return f"data:{self.mimeType};base64,{self.dataBase64}"
+

  class LLMTestCaseParams(Enum):
      INPUT = "input"
@@ -208,6 +350,7 @@ class LLMTestCase(BaseModel):
          serialization_alias="completionTime",
          validation_alias=AliasChoices("completionTime", "completion_time"),
      )
+     multimodal: bool = Field(default=False)
      name: Optional[str] = Field(default=None)
      tags: Optional[List[str]] = Field(default=None)
      mcp_servers: Optional[List[MCPServer]] = Field(default=None)
@@ -229,6 +372,39 @@ class LLMTestCase(BaseModel):
          default_factory=lambda: str(uuid.uuid4())
      )

+     @model_validator(mode="after")
+     def set_is_multimodal(self):
+         import re
+
+         if self.multimodal is True:
+             return self
+
+         pattern = r"\[DEEPEVAL:IMAGE:(.*?)\]"
+
+         auto_detect = (
+             any(
+                 [
+                     re.search(pattern, self.input or "") is not None,
+                     re.search(pattern, self.actual_output or "") is not None,
+                 ]
+             )
+             if isinstance(self.input, str)
+             else self.multimodal
+         )
+         if self.retrieval_context is not None:
+             auto_detect = auto_detect or any(
+                 re.search(pattern, context) is not None
+                 for context in self.retrieval_context
+             )
+         if self.context is not None:
+             auto_detect = auto_detect or any(
+                 re.search(pattern, context) is not None
+                 for context in self.context
+             )
+
+         self.multimodal = auto_detect
+         return self
+
      @model_validator(mode="before")
      def validate_input(cls, data):
          input = data.get("input")
@@ -335,3 +511,32 @@ class LLMTestCase(BaseModel):
          )

          return data
+
+     def _get_images_mapping(self) -> Dict[str, MLLMImage]:
+         pattern = r"\[DEEPEVAL:IMAGE:(.*?)\]"
+         image_ids = set()
+
+         def extract_ids_from_string(s: Optional[str]) -> None:
+             """Helper to extract image IDs from a string."""
+             if s is not None and isinstance(s, str):
+                 matches = re.findall(pattern, s)
+                 image_ids.update(matches)
+
+         def extract_ids_from_list(lst: Optional[List[str]]) -> None:
+             """Helper to extract image IDs from a list of strings."""
+             if lst is not None:
+                 for item in lst:
+                     extract_ids_from_string(item)
+
+         extract_ids_from_string(self.input)
+         extract_ids_from_string(self.actual_output)
+         extract_ids_from_string(self.expected_output)
+         extract_ids_from_list(self.context)
+         extract_ids_from_list(self.retrieval_context)
+
+         images_mapping = {}
+         for img_id in image_ids:
+             if img_id in _MLLM_IMAGE_REGISTRY:
+                 images_mapping[img_id] = _MLLM_IMAGE_REGISTRY[img_id]
+
+         return images_mapping if len(images_mapping) > 0 else None
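Illustrative usage, not part of the diff: the placeholder workflow these hunks introduce, end to end. The chart path is made up, and the sketch assumes `LLMTestCase` and `MLLMImage` are re-exported from `deepeval.test_case` as the imports above suggest.

from deepeval.test_case import LLMTestCase, MLLMImage

chart = MLLMImage(url="./charts/q3_revenue.png")  # local file: filename, mimeType and Base64 data are loaded

tc = LLMTestCase(
    input=f"Summarize this chart: {chart}",  # stored as "Summarize this chart: [DEEPEVAL:IMAGE:<id>]"
    actual_output="Revenue grew 12% quarter over quarter.",
)
assert tc.multimodal is True  # auto-detected by set_is_multimodal

# Split the stored string back into text and MLLMImage segments, e.g. before a model call:
segments = MLLMImage.parse_multimodal_string(tc.input)  # ["Summarize this chart: ", <image>]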
deepeval/test_case/utils.py CHANGED
@@ -1,24 +1,20 @@
  from typing import Union, List

- from deepeval.test_case import LLMTestCase, MLLMTestCase, ConversationalTestCase
+ from deepeval.test_case import LLMTestCase, ConversationalTestCase


  def check_valid_test_cases_type(
-     test_cases: Union[
-         List[Union[LLMTestCase, MLLMTestCase]], List[ConversationalTestCase]
-     ],
+     test_cases: Union[List[LLMTestCase], List[ConversationalTestCase]],
  ):
      llm_test_case_count = 0
      conversational_test_case_count = 0
      for test_case in test_cases:
-         if isinstance(test_case, LLMTestCase) or isinstance(
-             test_case, MLLMTestCase
-         ):
+         if isinstance(test_case, LLMTestCase):
              llm_test_case_count += 1
          else:
              conversational_test_case_count += 1

      if llm_test_case_count > 0 and conversational_test_case_count > 0:
          raise ValueError(
-             "You cannot supply a mixture of `LLMTestCase`/`MLLMTestCase`(s) and `ConversationalTestCase`(s) as the list of test cases."
+             "You cannot supply a mixture of `LLMTestCase`(s) and `ConversationalTestCase`(s) as the list of test cases."
          )
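Illustrative behavior, not part of the diff: with `MLLMTestCase` removed, the guard above only distinguishes single-turn from conversational cases, so a mixed list still fails fast.

from deepeval.test_case import LLMTestCase, ConversationalTestCase, Turn

mixed = [
    LLMTestCase(input="hi", actual_output="hello"),
    ConversationalTestCase(turns=[Turn(role="user", content="hi")]),
]
# check_valid_test_cases_type(mixed) raises:
# ValueError: You cannot supply a mixture of `LLMTestCase`(s) and `ConversationalTestCase`(s) ...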
deepeval/test_run/api.py CHANGED
@@ -18,20 +18,21 @@ class LLMApiTestCase(BaseModel):
      token_cost: Optional[float] = Field(None, alias="tokenCost")
      completion_time: Optional[float] = Field(None, alias="completionTime")
      tags: Optional[List[str]] = Field(None)
-     multimodal_input: Optional[List[Union[str, MLLMImage]]] = Field(
-         None, alias="multimodalInput"
-     )
-     multimodal_input_actual_output: Optional[List[Union[str, MLLMImage]]] = (
-         Field(None, alias="multimodalActualOutput")
-     )
-     multimodal_expected_output: Optional[List[Union[str, MLLMImage]]] = Field(
-         None, alias="multimodalExpectedOutput"
-     )
-     multimodal_retrieval_context: Optional[List[Union[str, MLLMImage]]] = Field(
-         None, alias="multimodalRetrievalContext"
-     )
-     multimodal_context: Optional[List[Union[str, MLLMImage]]] = Field(
-         None, alias="multimodalContext"
+     # multimodal_input: Optional[str] = Field(None, alias="multimodalInput")
+     # multimodal_input_actual_output: Optional[str] = Field(
+     #     None, alias="multimodalActualOutput"
+     # )
+     # multimodal_expected_output: Optional[str] = Field(
+     #     None, alias="multimodalExpectedOutput"
+     # )
+     # multimodal_retrieval_context: Optional[List[str]] = Field(
+     #     None, alias="multimodalRetrievalContext"
+     # )
+     # multimodal_context: Optional[List[str]] = Field(
+     #     None, alias="multimodalContext"
+     # )
+     images_mapping: Optional[Dict[str, MLLMImage]] = Field(
+         None, alias="imagesMapping"
      )

      # make these optional, not all test cases in a conversation will be evaluated
@@ -125,6 +126,9 @@ class ConversationalApiTestCase(BaseModel):
      additional_metadata: Optional[Dict] = Field(
          None, alias="additionalMetadata"
      )
+     images_mapping: Optional[Dict[str, MLLMImage]] = Field(
+         None, alias="imagesMapping"
+     )
      tags: Optional[List[str]] = Field(None)

      def update_metric_data(self, metrics_data: MetricData):
deepeval/test_run/test_run.py CHANGED
@@ -21,7 +21,7 @@ from deepeval.test_run.api import (
  )
  from deepeval.tracing.utils import make_json_serializable
  from deepeval.tracing.api import SpanApiType, span_api_type_literals
- from deepeval.test_case import LLMTestCase, ConversationalTestCase, MLLMTestCase
+ from deepeval.test_case import LLMTestCase, ConversationalTestCase
  from deepeval.utils import (
      delete_file_if_exists,
      get_is_running_deepeval,
@@ -182,7 +182,7 @@ class TestRun(BaseModel):

      def set_dataset_properties(
          self,
-         test_case: Union[LLMTestCase, ConversationalTestCase, MLLMTestCase],
+         test_case: Union[LLMTestCase, ConversationalTestCase],
      ):
          if self.dataset_alias is None:
              self.dataset_alias = test_case._dataset_alias
@@ -538,7 +538,7 @@ class TestRunManager:
      def update_test_run(
          self,
          api_test_case: Union[LLMApiTestCase, ConversationalApiTestCase],
-         test_case: Union[LLMTestCase, ConversationalTestCase, MLLMTestCase],
+         test_case: Union[LLMTestCase, ConversationalTestCase],
      ):
          if (
              api_test_case.metrics_data is not None
deepeval/tracing/patchers.py CHANGED
@@ -1,6 +1,7 @@
  import functools

- from anthropic import Anthropic
+ from typing import TYPE_CHECKING
+
  from openai import OpenAI

  from deepeval.tracing.context import update_current_span, update_llm_span
@@ -8,6 +9,10 @@ from deepeval.tracing.context import current_span_context
  from deepeval.tracing.types import LlmSpan


+ if TYPE_CHECKING:
+     from anthropic import Anthropic
+
+
  def patch_openai_client(client: OpenAI):

      original_methods = {}
@@ -61,7 +66,7 @@ def patch_openai_client(client: OpenAI):
          output = None
          try:
              output = response.choices[0].message.content
-         except Exception as e:
+         except Exception:
              pass

          # extract input output token counts
@@ -70,7 +75,7 @@ def patch_openai_client(client: OpenAI):
          try:
              input_token_count = response.usage.prompt_tokens
              output_token_count = response.usage.completion_tokens
-         except Exception as e:
+         except Exception:
              pass

          update_current_span(
@@ -86,7 +91,7 @@ def patch_openai_client(client: OpenAI):
          setattr(current_obj, method_name, wrapped_method)


- def patch_anthropic_client(client: Anthropic):
+ def patch_anthropic_client(client: "Anthropic"):
      """
      Patch an Anthropic client instance to add tracing capabilities.

deepeval/tracing/tracing.py CHANGED
@@ -19,7 +19,6 @@ import random
  import atexit
  import queue
  import uuid
- from anthropic import Anthropic
  from openai import OpenAI
  from rich.console import Console
  from rich.progress import Progress
@@ -74,6 +73,7 @@ from deepeval.tracing.trace_test_manager import trace_testing_manager

  if TYPE_CHECKING:
      from deepeval.dataset.golden import Golden
+     from anthropic import Anthropic

  EVAL_DUMMY_SPAN_NAME = "evals_iterator"

@@ -154,7 +154,7 @@ class TraceManager:
          environment: Optional[str] = None,
          sampling_rate: Optional[float] = None,
          confident_api_key: Optional[str] = None,
-         anthropic_client: Optional[Anthropic] = None,
+         anthropic_client: Optional["Anthropic"] = None,
          openai_client: Optional[OpenAI] = None,
          tracing_enabled: Optional[bool] = None,
      ) -> None:
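Both tracing modules now defer the `anthropic` import to type-checking time, which is what allows the hard dependency to drop out of the wheel metadata further down. A generic sketch of the pattern, with made-up names rather than deepeval's actual functions:

from typing import TYPE_CHECKING, Optional

if TYPE_CHECKING:
    # Seen only by static type checkers; no runtime import of the optional package.
    from anthropic import Anthropic


def configure_tracing(anthropic_client: Optional["Anthropic"] = None) -> None:
    if anthropic_client is None:
        return
    # Import the optional dependency only on the code path that actually needs it,
    # so a missing `anthropic` package fails here instead of at `import deepeval`.
    import anthropic  # noqa: F401

    ...  # patch/instrument the client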
deepeval/utils.py CHANGED
@@ -14,6 +14,7 @@ import logging

  from contextvars import ContextVar
  from enum import Enum
+ from importlib import import_module
  from typing import Any, Dict, List, Optional, Protocol, Sequence, Union
  from collections.abc import Iterable
  from dataclasses import asdict, is_dataclass
@@ -537,6 +538,25 @@ def shorten(
      return stext[:cut] + suffix


+ def convert_to_multi_modal_array(input: Union[str, List[str]]):
+     from deepeval.test_case import MLLMImage
+
+     if isinstance(input, str):
+         return MLLMImage.parse_multimodal_string(input)
+     elif isinstance(input, list):
+         new_list = []
+         for context in input:
+             parsed_array = MLLMImage.parse_multimodal_string(context)
+             new_list.extend(parsed_array)
+         return new_list
+
+
+ def check_if_multimodal(input: str):
+     pattern = r"\[DEEPEVAL:IMAGE:(.*?)\]"
+     matches = list(re.finditer(pattern, input))
+     return bool(matches)
+
+
  def format_turn(
      turn: TurnLike,
      *,
@@ -829,7 +849,22 @@
      env_var_name: str,
      param_hint: str,
  ) -> Any:
+     """
+     Ensures that a required parameter is provided. If the parameter is `None`, raises a
+     `DeepEvalError` with a helpful message indicating the missing parameter and how to resolve it.

+     Args:
+         param (Optional[Any]): The parameter to validate.
+         provider_label (str): A label for the provider to be used in the error message.
+         env_var_name (str): The name of the environment variable where the parameter can be set.
+         param_hint (str): A hint for the parameter, usually the name of the argument.
+
+     Raises:
+         DeepEvalError: If the `param` is `None`, indicating that a required parameter is missing.
+
+     Returns:
+         Any: The value of `param` if it is provided.
+     """
      if param is None:
          raise DeepEvalError(
              f"{provider_label} is missing a required parameter. "
@@ -838,3 +873,33 @@
          )

      return param
+
+
+ def require_dependency(
+     module_name: str,
+     *,
+     provider_label: str,
+     install_hint: Optional[str] = None,
+ ) -> Any:
+     """
+     Imports an optional dependency module or raises a `DeepEvalError` if the module is not found.
+     The error message includes a suggestion on how to install the missing module.
+
+     Args:
+         module_name (str): The name of the module to import.
+         provider_label (str): A label for the provider to be used in the error message.
+         install_hint (Optional[str]): A hint on how to install the missing module, usually a pip command.
+
+     Raises:
+         DeepEvalError: If the module cannot be imported, indicating that the dependency is missing.
+
+     Returns:
+         Any: The imported module if successful.
+     """
+     try:
+         return import_module(module_name)
+     except ImportError as exc:
+         hint = install_hint or f"Install it with `pip install {module_name}`."
+         raise DeepEvalError(
+             f"{provider_label} requires the `{module_name}` package. {hint}"
+         ) from exc
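Illustrative usage, not from the package: how the new `require_dependency` helper pairs with the existing `require_param`. `GeminiModel` and the argument values are placeholders, and the keyword-style call to `require_param` is an assumption based on the parameter names in its docstring.

from deepeval.utils import require_dependency, require_param

# Resolve an optional SDK lazily; a missing package raises DeepEvalError with the install hint.
genai = require_dependency(
    "google.genai",
    provider_label="GeminiModel",
    install_hint="Install it with `pip install google-genai`.",
)

# Validate a required setting; passing None raises DeepEvalError naming the env var and argument.
api_key = require_param(
    param="sk-...",  # e.g. an explicit constructor argument or an env lookup
    provider_label="GeminiModel",
    env_var_name="GOOGLE_API_KEY",
    param_hint="api_key",
)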
{deepeval-3.7.4.dist-info → deepeval-3.7.6.dist-info}/METADATA CHANGED
@@ -1,6 +1,6 @@
  Metadata-Version: 2.1
  Name: deepeval
- Version: 3.7.4
+ Version: 3.7.6
  Summary: The LLM Evaluation Framework
  Home-page: https://github.com/confident-ai/deepeval
  License: Apache-2.0
@@ -13,13 +13,10 @@ Classifier: Programming Language :: Python :: 3.9
  Classifier: Programming Language :: Python :: 3.10
  Classifier: Programming Language :: Python :: 3.11
  Requires-Dist: aiohttp
- Requires-Dist: anthropic
  Requires-Dist: click (>=8.0.0,<8.3.0)
- Requires-Dist: google-genai (>=1.9.0,<2.0.0)
  Requires-Dist: grpcio (>=1.67.1,<2.0.0)
  Requires-Dist: jinja2
  Requires-Dist: nest_asyncio
- Requires-Dist: ollama
  Requires-Dist: openai
  Requires-Dist: opentelemetry-api (>=1.24.0,<2.0.0)
  Requires-Dist: opentelemetry-exporter-otlp-proto-grpc (>=1.24.0,<2.0.0)