deepeval 3.7.3__py3-none-any.whl → 3.7.5__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (156)
  1. deepeval/_version.py +1 -1
  2. deepeval/cli/test.py +1 -1
  3. deepeval/config/settings.py +102 -13
  4. deepeval/dataset/golden.py +54 -2
  5. deepeval/evaluate/configs.py +1 -1
  6. deepeval/evaluate/evaluate.py +16 -8
  7. deepeval/evaluate/execute.py +74 -27
  8. deepeval/evaluate/utils.py +26 -22
  9. deepeval/integrations/pydantic_ai/agent.py +19 -2
  10. deepeval/integrations/pydantic_ai/instrumentator.py +62 -23
  11. deepeval/metrics/__init__.py +14 -12
  12. deepeval/metrics/answer_relevancy/answer_relevancy.py +74 -29
  13. deepeval/metrics/answer_relevancy/template.py +188 -92
  14. deepeval/metrics/argument_correctness/template.py +2 -2
  15. deepeval/metrics/base_metric.py +2 -5
  16. deepeval/metrics/bias/template.py +3 -3
  17. deepeval/metrics/contextual_precision/contextual_precision.py +53 -15
  18. deepeval/metrics/contextual_precision/template.py +115 -66
  19. deepeval/metrics/contextual_recall/contextual_recall.py +50 -13
  20. deepeval/metrics/contextual_recall/template.py +106 -55
  21. deepeval/metrics/contextual_relevancy/contextual_relevancy.py +47 -15
  22. deepeval/metrics/contextual_relevancy/template.py +87 -58
  23. deepeval/metrics/conversation_completeness/template.py +2 -2
  24. deepeval/metrics/conversational_dag/templates.py +4 -4
  25. deepeval/metrics/conversational_g_eval/template.py +4 -3
  26. deepeval/metrics/dag/templates.py +5 -5
  27. deepeval/metrics/faithfulness/faithfulness.py +70 -27
  28. deepeval/metrics/faithfulness/schema.py +1 -1
  29. deepeval/metrics/faithfulness/template.py +200 -115
  30. deepeval/metrics/g_eval/utils.py +2 -2
  31. deepeval/metrics/hallucination/template.py +4 -4
  32. deepeval/metrics/indicator.py +4 -4
  33. deepeval/metrics/misuse/template.py +2 -2
  34. deepeval/metrics/multimodal_metrics/__init__.py +0 -18
  35. deepeval/metrics/multimodal_metrics/image_coherence/image_coherence.py +24 -17
  36. deepeval/metrics/multimodal_metrics/image_editing/image_editing.py +26 -21
  37. deepeval/metrics/multimodal_metrics/image_helpfulness/image_helpfulness.py +24 -17
  38. deepeval/metrics/multimodal_metrics/image_reference/image_reference.py +24 -17
  39. deepeval/metrics/multimodal_metrics/multimodal_g_eval/multimodal_g_eval.py +19 -19
  40. deepeval/metrics/multimodal_metrics/multimodal_g_eval/template.py +63 -78
  41. deepeval/metrics/multimodal_metrics/multimodal_g_eval/utils.py +20 -20
  42. deepeval/metrics/multimodal_metrics/text_to_image/text_to_image.py +71 -50
  43. deepeval/metrics/non_advice/template.py +2 -2
  44. deepeval/metrics/pii_leakage/template.py +2 -2
  45. deepeval/metrics/prompt_alignment/template.py +4 -4
  46. deepeval/metrics/ragas.py +3 -3
  47. deepeval/metrics/role_violation/template.py +2 -2
  48. deepeval/metrics/step_efficiency/step_efficiency.py +1 -1
  49. deepeval/metrics/tool_correctness/tool_correctness.py +2 -2
  50. deepeval/metrics/toxicity/template.py +4 -4
  51. deepeval/metrics/turn_contextual_precision/schema.py +21 -0
  52. deepeval/metrics/turn_contextual_precision/template.py +187 -0
  53. deepeval/metrics/turn_contextual_precision/turn_contextual_precision.py +550 -0
  54. deepeval/metrics/turn_contextual_recall/schema.py +21 -0
  55. deepeval/metrics/turn_contextual_recall/template.py +178 -0
  56. deepeval/metrics/turn_contextual_recall/turn_contextual_recall.py +520 -0
  57. deepeval/metrics/{multimodal_metrics/multimodal_contextual_relevancy → turn_contextual_relevancy}/schema.py +7 -1
  58. deepeval/metrics/turn_contextual_relevancy/template.py +161 -0
  59. deepeval/metrics/turn_contextual_relevancy/turn_contextual_relevancy.py +535 -0
  60. deepeval/metrics/{multimodal_metrics/multimodal_faithfulness → turn_faithfulness}/schema.py +11 -3
  61. deepeval/metrics/turn_faithfulness/template.py +218 -0
  62. deepeval/metrics/turn_faithfulness/turn_faithfulness.py +596 -0
  63. deepeval/metrics/turn_relevancy/template.py +2 -2
  64. deepeval/metrics/utils.py +39 -58
  65. deepeval/models/__init__.py +0 -12
  66. deepeval/models/base_model.py +16 -38
  67. deepeval/models/embedding_models/__init__.py +7 -0
  68. deepeval/models/embedding_models/azure_embedding_model.py +69 -32
  69. deepeval/models/embedding_models/local_embedding_model.py +39 -22
  70. deepeval/models/embedding_models/ollama_embedding_model.py +42 -18
  71. deepeval/models/embedding_models/openai_embedding_model.py +50 -15
  72. deepeval/models/llms/amazon_bedrock_model.py +1 -2
  73. deepeval/models/llms/anthropic_model.py +53 -20
  74. deepeval/models/llms/azure_model.py +140 -43
  75. deepeval/models/llms/deepseek_model.py +38 -23
  76. deepeval/models/llms/gemini_model.py +222 -103
  77. deepeval/models/llms/grok_model.py +39 -27
  78. deepeval/models/llms/kimi_model.py +39 -23
  79. deepeval/models/llms/litellm_model.py +103 -45
  80. deepeval/models/llms/local_model.py +35 -22
  81. deepeval/models/llms/ollama_model.py +129 -17
  82. deepeval/models/llms/openai_model.py +151 -50
  83. deepeval/models/llms/portkey_model.py +149 -0
  84. deepeval/models/llms/utils.py +5 -3
  85. deepeval/models/retry_policy.py +17 -14
  86. deepeval/models/utils.py +94 -4
  87. deepeval/optimizer/__init__.py +5 -0
  88. deepeval/optimizer/algorithms/__init__.py +6 -0
  89. deepeval/optimizer/algorithms/base.py +29 -0
  90. deepeval/optimizer/algorithms/configs.py +18 -0
  91. deepeval/optimizer/algorithms/copro/__init__.py +5 -0
  92. deepeval/optimizer/algorithms/copro/copro.py +836 -0
  93. deepeval/optimizer/algorithms/gepa/__init__.py +5 -0
  94. deepeval/optimizer/algorithms/gepa/gepa.py +737 -0
  95. deepeval/optimizer/algorithms/miprov2/__init__.py +17 -0
  96. deepeval/optimizer/algorithms/miprov2/bootstrapper.py +435 -0
  97. deepeval/optimizer/algorithms/miprov2/miprov2.py +752 -0
  98. deepeval/optimizer/algorithms/miprov2/proposer.py +301 -0
  99. deepeval/optimizer/algorithms/simba/__init__.py +5 -0
  100. deepeval/optimizer/algorithms/simba/simba.py +999 -0
  101. deepeval/optimizer/algorithms/simba/types.py +15 -0
  102. deepeval/optimizer/configs.py +31 -0
  103. deepeval/optimizer/policies.py +227 -0
  104. deepeval/optimizer/prompt_optimizer.py +263 -0
  105. deepeval/optimizer/rewriter/__init__.py +5 -0
  106. deepeval/optimizer/rewriter/rewriter.py +124 -0
  107. deepeval/optimizer/rewriter/utils.py +214 -0
  108. deepeval/optimizer/scorer/__init__.py +5 -0
  109. deepeval/optimizer/scorer/base.py +86 -0
  110. deepeval/optimizer/scorer/scorer.py +316 -0
  111. deepeval/optimizer/scorer/utils.py +30 -0
  112. deepeval/optimizer/types.py +148 -0
  113. deepeval/optimizer/utils.py +480 -0
  114. deepeval/prompt/prompt.py +7 -6
  115. deepeval/test_case/__init__.py +1 -3
  116. deepeval/test_case/api.py +12 -10
  117. deepeval/test_case/conversational_test_case.py +19 -1
  118. deepeval/test_case/llm_test_case.py +152 -1
  119. deepeval/test_case/utils.py +4 -8
  120. deepeval/test_run/api.py +15 -14
  121. deepeval/test_run/cache.py +2 -0
  122. deepeval/test_run/test_run.py +9 -4
  123. deepeval/tracing/patchers.py +9 -4
  124. deepeval/tracing/tracing.py +2 -2
  125. deepeval/utils.py +89 -0
  126. {deepeval-3.7.3.dist-info → deepeval-3.7.5.dist-info}/METADATA +1 -4
  127. {deepeval-3.7.3.dist-info → deepeval-3.7.5.dist-info}/RECORD +134 -118
  128. deepeval/metrics/multimodal_metrics/multimodal_answer_relevancy/multimodal_answer_relevancy.py +0 -343
  129. deepeval/metrics/multimodal_metrics/multimodal_answer_relevancy/schema.py +0 -19
  130. deepeval/metrics/multimodal_metrics/multimodal_answer_relevancy/template.py +0 -122
  131. deepeval/metrics/multimodal_metrics/multimodal_contextual_precision/multimodal_contextual_precision.py +0 -301
  132. deepeval/metrics/multimodal_metrics/multimodal_contextual_precision/schema.py +0 -15
  133. deepeval/metrics/multimodal_metrics/multimodal_contextual_precision/template.py +0 -132
  134. deepeval/metrics/multimodal_metrics/multimodal_contextual_recall/multimodal_contextual_recall.py +0 -285
  135. deepeval/metrics/multimodal_metrics/multimodal_contextual_recall/schema.py +0 -15
  136. deepeval/metrics/multimodal_metrics/multimodal_contextual_recall/template.py +0 -112
  137. deepeval/metrics/multimodal_metrics/multimodal_contextual_relevancy/multimodal_contextual_relevancy.py +0 -282
  138. deepeval/metrics/multimodal_metrics/multimodal_contextual_relevancy/template.py +0 -102
  139. deepeval/metrics/multimodal_metrics/multimodal_faithfulness/__init__.py +0 -0
  140. deepeval/metrics/multimodal_metrics/multimodal_faithfulness/multimodal_faithfulness.py +0 -356
  141. deepeval/metrics/multimodal_metrics/multimodal_faithfulness/template.py +0 -175
  142. deepeval/metrics/multimodal_metrics/multimodal_tool_correctness/__init__.py +0 -0
  143. deepeval/metrics/multimodal_metrics/multimodal_tool_correctness/multimodal_tool_correctness.py +0 -290
  144. deepeval/models/mlllms/__init__.py +0 -4
  145. deepeval/models/mlllms/azure_model.py +0 -334
  146. deepeval/models/mlllms/gemini_model.py +0 -284
  147. deepeval/models/mlllms/ollama_model.py +0 -144
  148. deepeval/models/mlllms/openai_model.py +0 -258
  149. deepeval/test_case/mllm_test_case.py +0 -170
  150. /deepeval/metrics/{multimodal_metrics/multimodal_answer_relevancy → turn_contextual_precision}/__init__.py +0 -0
  151. /deepeval/metrics/{multimodal_metrics/multimodal_contextual_precision → turn_contextual_recall}/__init__.py +0 -0
  152. /deepeval/metrics/{multimodal_metrics/multimodal_contextual_recall → turn_contextual_relevancy}/__init__.py +0 -0
  153. /deepeval/metrics/{multimodal_metrics/multimodal_contextual_relevancy → turn_faithfulness}/__init__.py +0 -0
  154. {deepeval-3.7.3.dist-info → deepeval-3.7.5.dist-info}/LICENSE.md +0 -0
  155. {deepeval-3.7.3.dist-info → deepeval-3.7.5.dist-info}/WHEEL +0 -0
  156. {deepeval-3.7.3.dist-info → deepeval-3.7.5.dist-info}/entry_points.txt +0 -0
deepeval/test_case/api.py CHANGED
@@ -10,9 +10,9 @@ from deepeval.test_run.api import (
10
10
  from deepeval.test_case import (
11
11
  LLMTestCase,
12
12
  ConversationalTestCase,
13
- MLLMTestCase,
14
13
  Turn,
15
14
  )
15
+ from deepeval.test_case.llm_test_case import _MLLM_IMAGE_REGISTRY
16
16
  from deepeval.constants import PYTEST_RUN_TEST_NAME
17
17
 
18
18
 
@@ -29,10 +29,12 @@ def create_api_turn(turn: Turn, index: int) -> TurnApi:
29
29
 
30
30
 
31
31
  def create_api_test_case(
32
- test_case: Union[LLMTestCase, ConversationalTestCase, MLLMTestCase],
32
+ test_case: Union[LLMTestCase, ConversationalTestCase],
33
33
  trace: Optional[TraceApi] = None,
34
34
  index: Optional[int] = None,
35
35
  ) -> Union[LLMApiTestCase, ConversationalApiTestCase]:
36
+ from deepeval.utils import convert_to_multi_modal_array
37
+
36
38
  if isinstance(test_case, ConversationalTestCase):
37
39
  order = (
38
40
  test_case._dataset_rank
@@ -84,7 +86,7 @@ def create_api_test_case(
84
86
  name = os.getenv(PYTEST_RUN_TEST_NAME, f"test_case_{order}")
85
87
  metrics_data = []
86
88
 
87
- if isinstance(test_case, LLMTestCase):
89
+ if isinstance(test_case, LLMTestCase) and test_case.multimodal is False:
88
90
  api_test_case = LLMApiTestCase(
89
91
  name=name,
90
92
  input=test_case.input,
@@ -106,15 +108,15 @@ def create_api_test_case(
106
108
  comments=test_case.comments,
107
109
  trace=trace,
108
110
  )
109
- elif isinstance(test_case, MLLMTestCase):
111
+ elif isinstance(test_case, LLMTestCase) and test_case.multimodal:
110
112
  api_test_case = LLMApiTestCase(
111
113
  name=name,
112
- input="",
113
- multimodalInput=test_case.input,
114
- multimodalActualOutput=test_case.actual_output,
115
- multimodalExpectedOutput=test_case.expected_output,
116
- multimodalRetrievalContext=test_case.retrieval_context,
117
- multimodalContext=test_case.context,
114
+ input=test_case.input,
115
+ actualOutput=test_case.actual_output,
116
+ expectedOutput=test_case.expected_output,
117
+ retrievalContext=test_case.retrieval_context,
118
+ context=test_case.context,
119
+ imagesMapping=_MLLM_IMAGE_REGISTRY,
118
120
  toolsCalled=test_case.tools_called,
119
121
  expectedTools=test_case.expected_tools,
120
122
  tokenCost=test_case.token_cost,
@@ -9,7 +9,7 @@ from typing import List, Optional, Dict, Literal
9
9
  from copy import deepcopy
10
10
  from enum import Enum
11
11
 
12
- from deepeval.test_case import ToolCall
12
+ from deepeval.test_case import ToolCall, MLLMImage
13
13
  from deepeval.test_case.mcp import (
14
14
  MCPServer,
15
15
  MCPPromptCall,
@@ -156,11 +156,29 @@ class ConversationalTestCase(BaseModel):
156
156
  comments: Optional[str] = Field(default=None)
157
157
  tags: Optional[List[str]] = Field(default=None)
158
158
  mcp_servers: Optional[List[MCPServer]] = Field(default=None)
159
+ multimodal: bool = False
159
160
 
160
161
  _dataset_rank: Optional[int] = PrivateAttr(default=None)
161
162
  _dataset_alias: Optional[str] = PrivateAttr(default=None)
162
163
  _dataset_id: Optional[str] = PrivateAttr(default=None)
163
164
 
165
+ @model_validator(mode="after")
166
+ def set_is_multimodal(self):
167
+ import re
168
+
169
+ if self.multimodal is True:
170
+ return self
171
+
172
+ pattern = r"\[DEEPEVAL:IMAGE:(.*?)\]"
173
+ self.multimodal = any(
174
+ [
175
+ re.search(pattern, turn.content) is not None
176
+ for turn in self.turns
177
+ ]
178
+ )
179
+
180
+ return self
181
+
164
182
  @model_validator(mode="before")
165
183
  def validate_input(cls, data):
166
184
  turns = data.get("turns")
@@ -9,7 +9,12 @@ from typing import List, Optional, Dict, Any
9
9
  from enum import Enum
10
10
  import json
11
11
  import uuid
12
-
12
+ import re
13
+ import os
14
+ import mimetypes
15
+ import base64
16
+ from dataclasses import dataclass, field
17
+ from urllib.parse import urlparse, unquote
13
18
  from deepeval.utils import make_model_config
14
19
 
15
20
  from deepeval.test_case.mcp import (
@@ -20,6 +25,128 @@ from deepeval.test_case.mcp import (
20
25
  validate_mcp_servers,
21
26
  )
22
27
 
28
+ _MLLM_IMAGE_REGISTRY: Dict[str, "MLLMImage"] = {}
29
+
30
+
31
+ @dataclass
32
+ class MLLMImage:
33
+ dataBase64: Optional[str] = None
34
+ mimeType: Optional[str] = None
35
+ url: Optional[str] = None
36
+ local: Optional[bool] = None
37
+ filename: Optional[str] = None
38
+ _id: str = field(default_factory=lambda: uuid.uuid4().hex)
39
+
40
+ def __post_init__(self):
41
+
42
+ if not self.url and not self.dataBase64:
43
+ raise ValueError(
44
+ "You must provide either a 'url' or both 'dataBase64' and 'mimeType' to create an MLLMImage."
45
+ )
46
+
47
+ if self.dataBase64 is not None:
48
+ if self.mimeType is None:
49
+ raise ValueError(
50
+ "mimeType must be provided when initializing from Base64 data."
51
+ )
52
+ else:
53
+ is_local = self.is_local_path(self.url)
54
+ if self.local is not None:
55
+ assert self.local == is_local, "Local path mismatch"
56
+ else:
57
+ self.local = is_local
58
+
59
+ # compute filename, mime_type, and Base64 data
60
+ if self.local:
61
+ path = self.process_url(self.url)
62
+ self.filename = os.path.basename(path)
63
+ self.mimeType = (
64
+ mimetypes.guess_type(path)[0] or "application/octet-stream"
65
+ )
66
+ with open(path, "rb") as f:
67
+ raw = f.read()
68
+ self.dataBase64 = base64.b64encode(raw).decode("ascii")
69
+ else:
70
+ self.filename = None
71
+ self.mimeType = None
72
+ self.dataBase64 = None
73
+
74
+ _MLLM_IMAGE_REGISTRY[self._id] = self
75
+
76
+ def _placeholder(self) -> str:
77
+ return f"[DEEPEVAL:IMAGE:{self._id}]"
78
+
79
+ def __str__(self) -> str:
80
+ return self._placeholder()
81
+
82
+ def __repr__(self) -> str:
83
+ return self._placeholder()
84
+
85
+ def __format__(self, format_spec: str) -> str:
86
+ return self._placeholder()
87
+
88
+ @staticmethod
89
+ def process_url(url: str) -> str:
90
+ if os.path.exists(url):
91
+ return url
92
+ parsed = urlparse(url)
93
+ if parsed.scheme == "file":
94
+ raw_path = (
95
+ f"//{parsed.netloc}{parsed.path}"
96
+ if parsed.netloc
97
+ else parsed.path
98
+ )
99
+ path = unquote(raw_path)
100
+ return path
101
+ return url
102
+
103
+ @staticmethod
104
+ def is_local_path(url: str) -> bool:
105
+ if os.path.exists(url):
106
+ return True
107
+ parsed = urlparse(url)
108
+ if parsed.scheme == "file":
109
+ raw_path = (
110
+ f"//{parsed.netloc}{parsed.path}"
111
+ if parsed.netloc
112
+ else parsed.path
113
+ )
114
+ path = unquote(raw_path)
115
+ return os.path.exists(path)
116
+ return False
117
+
118
+ def parse_multimodal_string(s: str):
119
+ pattern = r"\[DEEPEVAL:IMAGE:(.*?)\]"
120
+ matches = list(re.finditer(pattern, s))
121
+
122
+ result = []
123
+ last_end = 0
124
+
125
+ for m in matches:
126
+ start, end = m.span()
127
+
128
+ if start > last_end:
129
+ result.append(s[last_end:start])
130
+
131
+ img_id = m.group(1)
132
+
133
+ if img_id not in _MLLM_IMAGE_REGISTRY:
134
+ MLLMImage(url=img_id, _id=img_id)
135
+
136
+ result.append(_MLLM_IMAGE_REGISTRY[img_id])
137
+ last_end = end
138
+
139
+ if last_end < len(s):
140
+ result.append(s[last_end:])
141
+
142
+ return result
143
+
144
+ def as_data_uri(self) -> Optional[str]:
145
+ """Return the image as a data URI string, if Base64 data is available."""
146
+ if not self.dataBase64 or not self.mimeType:
147
+ return None
148
+ return f"data:{self.mimeType};base64,{self.dataBase64}"
149
+
23
150
 
24
151
  class LLMTestCaseParams(Enum):
25
152
  INPUT = "input"
@@ -208,6 +335,7 @@ class LLMTestCase(BaseModel):
208
335
  serialization_alias="completionTime",
209
336
  validation_alias=AliasChoices("completionTime", "completion_time"),
210
337
  )
338
+ multimodal: bool = Field(default=False)
211
339
  name: Optional[str] = Field(default=None)
212
340
  tags: Optional[List[str]] = Field(default=None)
213
341
  mcp_servers: Optional[List[MCPServer]] = Field(default=None)
@@ -229,6 +357,29 @@ class LLMTestCase(BaseModel):
229
357
  default_factory=lambda: str(uuid.uuid4())
230
358
  )
231
359
 
360
+ @model_validator(mode="after")
361
+ def set_is_multimodal(self):
362
+ import re
363
+
364
+ if self.multimodal is True:
365
+ return self
366
+
367
+ pattern = r"\[DEEPEVAL:IMAGE:(.*?)\]"
368
+
369
+ auto_detect = (
370
+ any(
371
+ [
372
+ re.search(pattern, self.input or "") is not None,
373
+ re.search(pattern, self.actual_output or "") is not None,
374
+ ]
375
+ )
376
+ if isinstance(self.input, str)
377
+ else self.multimodal
378
+ )
379
+
380
+ self.multimodal = auto_detect
381
+ return self
382
+
232
383
  @model_validator(mode="before")
233
384
  def validate_input(cls, data):
234
385
  input = data.get("input")
@@ -1,24 +1,20 @@
1
1
  from typing import Union, List
2
2
 
3
- from deepeval.test_case import LLMTestCase, MLLMTestCase, ConversationalTestCase
3
+ from deepeval.test_case import LLMTestCase, ConversationalTestCase
4
4
 
5
5
 
6
6
  def check_valid_test_cases_type(
7
- test_cases: Union[
8
- List[Union[LLMTestCase, MLLMTestCase]], List[ConversationalTestCase]
9
- ],
7
+ test_cases: Union[List[LLMTestCase], List[ConversationalTestCase]],
10
8
  ):
11
9
  llm_test_case_count = 0
12
10
  conversational_test_case_count = 0
13
11
  for test_case in test_cases:
14
- if isinstance(test_case, LLMTestCase) or isinstance(
15
- test_case, MLLMTestCase
16
- ):
12
+ if isinstance(test_case, LLMTestCase):
17
13
  llm_test_case_count += 1
18
14
  else:
19
15
  conversational_test_case_count += 1
20
16
 
21
17
  if llm_test_case_count > 0 and conversational_test_case_count > 0:
22
18
  raise ValueError(
23
- "You cannot supply a mixture of `LLMTestCase`/`MLLMTestCase`(s) and `ConversationalTestCase`(s) as the list of test cases."
19
+ "You cannot supply a mixture of `LLMTestCase`(s) and `ConversationalTestCase`(s) as the list of test cases."
24
20
  )
deepeval/test_run/api.py CHANGED
@@ -18,20 +18,21 @@ class LLMApiTestCase(BaseModel):
18
18
  token_cost: Optional[float] = Field(None, alias="tokenCost")
19
19
  completion_time: Optional[float] = Field(None, alias="completionTime")
20
20
  tags: Optional[List[str]] = Field(None)
21
- multimodal_input: Optional[List[Union[str, MLLMImage]]] = Field(
22
- None, alias="multimodalInput"
23
- )
24
- multimodal_input_actual_output: Optional[List[Union[str, MLLMImage]]] = (
25
- Field(None, alias="multimodalActualOutput")
26
- )
27
- multimodal_expected_output: Optional[List[Union[str, MLLMImage]]] = Field(
28
- None, alias="multimodalExpectedOutput"
29
- )
30
- multimodal_retrieval_context: Optional[List[Union[str, MLLMImage]]] = Field(
31
- None, alias="multimodalRetrievalContext"
32
- )
33
- multimodal_context: Optional[List[Union[str, MLLMImage]]] = Field(
34
- None, alias="multimodalContext"
21
+ # multimodal_input: Optional[str] = Field(None, alias="multimodalInput")
22
+ # multimodal_input_actual_output: Optional[str] = Field(
23
+ # None, alias="multimodalActualOutput"
24
+ # )
25
+ # multimodal_expected_output: Optional[str] = Field(
26
+ # None, alias="multimodalExpectedOutput"
27
+ # )
28
+ # multimodal_retrieval_context: Optional[List[str]] = Field(
29
+ # None, alias="multimodalRetrievalContext"
30
+ # )
31
+ # multimodal_context: Optional[List[str]] = Field(
32
+ # None, alias="multimodalContext"
33
+ # )
34
+ images_mapping: Optional[Dict[str, MLLMImage]] = Field(
35
+ None, alias="imagesMapping"
35
36
  )
36
37
 
37
38
  # make these optional, not all test cases in a conversation will be evaluated
@@ -90,6 +90,8 @@ class CachedTestRun(BaseModel):
90
90
  # Pydantic version below 2.0
91
91
  body = self.dict(by_alias=True, exclude_none=True)
92
92
  json.dump(body, f, cls=CustomEncoder)
93
+ f.flush()
94
+ os.fsync(f.fileno())
93
95
  return self
94
96
 
95
97
  # load from file (this happens initially during a test run)
@@ -21,7 +21,7 @@ from deepeval.test_run.api import (
21
21
  )
22
22
  from deepeval.tracing.utils import make_json_serializable
23
23
  from deepeval.tracing.api import SpanApiType, span_api_type_literals
24
- from deepeval.test_case import LLMTestCase, ConversationalTestCase, MLLMTestCase
24
+ from deepeval.test_case import LLMTestCase, ConversationalTestCase
25
25
  from deepeval.utils import (
26
26
  delete_file_if_exists,
27
27
  get_is_running_deepeval,
@@ -182,7 +182,7 @@ class TestRun(BaseModel):
182
182
 
183
183
  def set_dataset_properties(
184
184
  self,
185
- test_case: Union[LLMTestCase, ConversationalTestCase, MLLMTestCase],
185
+ test_case: Union[LLMTestCase, ConversationalTestCase],
186
186
  ):
187
187
  if self.dataset_alias is None:
188
188
  self.dataset_alias = test_case._dataset_alias
@@ -406,9 +406,10 @@ class TestRun(BaseModel):
406
406
  try:
407
407
  body = self.model_dump(by_alias=True, exclude_none=True)
408
408
  except AttributeError:
409
- # Pydantic version below 2.0
410
409
  body = self.dict(by_alias=True, exclude_none=True)
411
410
  json.dump(body, f, cls=TestRunEncoder)
411
+ f.flush()
412
+ os.fsync(f.fileno())
412
413
  return self
413
414
 
414
415
  @classmethod
@@ -515,6 +516,8 @@ class TestRunManager:
515
516
  )
516
517
  wrapper_data = {save_under_key: test_run_data}
517
518
  json.dump(wrapper_data, file, cls=TestRunEncoder)
519
+ file.flush()
520
+ os.fsync(file.fileno())
518
521
  else:
519
522
  self.test_run.save(file)
520
523
  except portalocker.exceptions.LockException:
@@ -527,13 +530,15 @@ class TestRunManager:
527
530
  LATEST_TEST_RUN_FILE_PATH, mode="w"
528
531
  ) as file:
529
532
  json.dump({LATEST_TEST_RUN_LINK_KEY: link}, file)
533
+ file.flush()
534
+ os.fsync(file.fileno())
530
535
  except portalocker.exceptions.LockException:
531
536
  pass
532
537
 
533
538
  def update_test_run(
534
539
  self,
535
540
  api_test_case: Union[LLMApiTestCase, ConversationalApiTestCase],
536
- test_case: Union[LLMTestCase, ConversationalTestCase, MLLMTestCase],
541
+ test_case: Union[LLMTestCase, ConversationalTestCase],
537
542
  ):
538
543
  if (
539
544
  api_test_case.metrics_data is not None
@@ -1,6 +1,7 @@
1
1
  import functools
2
2
 
3
- from anthropic import Anthropic
3
+ from typing import TYPE_CHECKING
4
+
4
5
  from openai import OpenAI
5
6
 
6
7
  from deepeval.tracing.context import update_current_span, update_llm_span
@@ -8,6 +9,10 @@ from deepeval.tracing.context import current_span_context
8
9
  from deepeval.tracing.types import LlmSpan
9
10
 
10
11
 
12
+ if TYPE_CHECKING:
13
+ from anthropic import Anthropic
14
+
15
+
11
16
  def patch_openai_client(client: OpenAI):
12
17
 
13
18
  original_methods = {}
@@ -61,7 +66,7 @@ def patch_openai_client(client: OpenAI):
61
66
  output = None
62
67
  try:
63
68
  output = response.choices[0].message.content
64
- except Exception as e:
69
+ except Exception:
65
70
  pass
66
71
 
67
72
  # extract input output token counts
@@ -70,7 +75,7 @@ def patch_openai_client(client: OpenAI):
70
75
  try:
71
76
  input_token_count = response.usage.prompt_tokens
72
77
  output_token_count = response.usage.completion_tokens
73
- except Exception as e:
78
+ except Exception:
74
79
  pass
75
80
 
76
81
  update_current_span(
@@ -86,7 +91,7 @@ def patch_openai_client(client: OpenAI):
86
91
  setattr(current_obj, method_name, wrapped_method)
87
92
 
88
93
 
89
- def patch_anthropic_client(client: Anthropic):
94
+ def patch_anthropic_client(client: "Anthropic"):
90
95
  """
91
96
  Patch an Anthropic client instance to add tracing capabilities.
92
97
 
@@ -19,7 +19,6 @@ import random
19
19
  import atexit
20
20
  import queue
21
21
  import uuid
22
- from anthropic import Anthropic
23
22
  from openai import OpenAI
24
23
  from rich.console import Console
25
24
  from rich.progress import Progress
@@ -74,6 +73,7 @@ from deepeval.tracing.trace_test_manager import trace_testing_manager
74
73
 
75
74
  if TYPE_CHECKING:
76
75
  from deepeval.dataset.golden import Golden
76
+ from anthropic import Anthropic
77
77
 
78
78
  EVAL_DUMMY_SPAN_NAME = "evals_iterator"
79
79
 
@@ -154,7 +154,7 @@ class TraceManager:
154
154
  environment: Optional[str] = None,
155
155
  sampling_rate: Optional[float] = None,
156
156
  confident_api_key: Optional[str] = None,
157
- anthropic_client: Optional[Anthropic] = None,
157
+ anthropic_client: Optional["Anthropic"] = None,
158
158
  openai_client: Optional[OpenAI] = None,
159
159
  tracing_enabled: Optional[bool] = None,
160
160
  ) -> None:
deepeval/utils.py CHANGED
@@ -14,6 +14,7 @@ import logging
14
14
 
15
15
  from contextvars import ContextVar
16
16
  from enum import Enum
17
+ from importlib import import_module
17
18
  from typing import Any, Dict, List, Optional, Protocol, Sequence, Union
18
19
  from collections.abc import Iterable
19
20
  from dataclasses import asdict, is_dataclass
@@ -21,6 +22,7 @@ from pydantic import BaseModel
21
22
  from rich.progress import Progress
22
23
  from rich.console import Console, Theme
23
24
 
25
+ from deepeval.errors import DeepEvalError
24
26
  from deepeval.config.settings import get_settings
25
27
  from deepeval.config.utils import (
26
28
  get_env_bool,
@@ -536,6 +538,25 @@ def shorten(
536
538
  return stext[:cut] + suffix
537
539
 
538
540
 
541
+ def convert_to_multi_modal_array(input: Union[str, List[str]]):
542
+ from deepeval.test_case import MLLMImage
543
+
544
+ if isinstance(input, str):
545
+ return MLLMImage.parse_multimodal_string(input)
546
+ elif isinstance(input, list):
547
+ new_list = []
548
+ for context in input:
549
+ parsed_array = MLLMImage.parse_multimodal_string(context)
550
+ new_list.extend(parsed_array)
551
+ return new_list
552
+
553
+
554
+ def check_if_multimodal(input: str):
555
+ pattern = r"\[DEEPEVAL:IMAGE:(.*?)\]"
556
+ matches = list(re.finditer(pattern, input))
557
+ return bool(matches)
558
+
559
+
539
560
  def format_turn(
540
561
  turn: TurnLike,
541
562
  *,
@@ -814,3 +835,71 @@ def format_error_text(
814
835
 
815
836
  def is_read_only_env():
816
837
  return get_settings().DEEPEVAL_FILE_SYSTEM == "READ_ONLY"
838
+
839
+
840
+ ##############
841
+ # validation #
842
+ ##############
843
+
844
+
845
+ def require_param(
846
+ param: Optional[Any] = None,
847
+ *,
848
+ provider_label: str,
849
+ env_var_name: str,
850
+ param_hint: str,
851
+ ) -> Any:
852
+ """
853
+ Ensures that a required parameter is provided. If the parameter is `None`, raises a
854
+ `DeepEvalError` with a helpful message indicating the missing parameter and how to resolve it.
855
+
856
+ Args:
857
+ param (Optional[Any]): The parameter to validate.
858
+ provider_label (str): A label for the provider to be used in the error message.
859
+ env_var_name (str): The name of the environment variable where the parameter can be set.
860
+ param_hint (str): A hint for the parameter, usually the name of the argument.
861
+
862
+ Raises:
863
+ DeepEvalError: If the `param` is `None`, indicating that a required parameter is missing.
864
+
865
+ Returns:
866
+ Any: The value of `param` if it is provided.
867
+ """
868
+ if param is None:
869
+ raise DeepEvalError(
870
+ f"{provider_label} is missing a required parameter. "
871
+ f"Set {env_var_name} in your environment or pass "
872
+ f"{param_hint}."
873
+ )
874
+
875
+ return param
876
+
877
+
878
+ def require_dependency(
879
+ module_name: str,
880
+ *,
881
+ provider_label: str,
882
+ install_hint: Optional[str] = None,
883
+ ) -> Any:
884
+ """
885
+ Imports an optional dependency module or raises a `DeepEvalError` if the module is not found.
886
+ The error message includes a suggestion on how to install the missing module.
887
+
888
+ Args:
889
+ module_name (str): The name of the module to import.
890
+ provider_label (str): A label for the provider to be used in the error message.
891
+ install_hint (Optional[str]): A hint on how to install the missing module, usually a pip command.
892
+
893
+ Raises:
894
+ DeepEvalError: If the module cannot be imported, indicating that the dependency is missing.
895
+
896
+ Returns:
897
+ Any: The imported module if successful.
898
+ """
899
+ try:
900
+ return import_module(module_name)
901
+ except ImportError as exc:
902
+ hint = install_hint or f"Install it with `pip install {module_name}`."
903
+ raise DeepEvalError(
904
+ f"{provider_label} requires the `{module_name}` package. {hint}"
905
+ ) from exc
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.1
2
2
  Name: deepeval
3
- Version: 3.7.3
3
+ Version: 3.7.5
4
4
  Summary: The LLM Evaluation Framework
5
5
  Home-page: https://github.com/confident-ai/deepeval
6
6
  License: Apache-2.0
@@ -13,13 +13,10 @@ Classifier: Programming Language :: Python :: 3.9
13
13
  Classifier: Programming Language :: Python :: 3.10
14
14
  Classifier: Programming Language :: Python :: 3.11
15
15
  Requires-Dist: aiohttp
16
- Requires-Dist: anthropic
17
16
  Requires-Dist: click (>=8.0.0,<8.3.0)
18
- Requires-Dist: google-genai (>=1.9.0,<2.0.0)
19
17
  Requires-Dist: grpcio (>=1.67.1,<2.0.0)
20
18
  Requires-Dist: jinja2
21
19
  Requires-Dist: nest_asyncio
22
- Requires-Dist: ollama
23
20
  Requires-Dist: openai
24
21
  Requires-Dist: opentelemetry-api (>=1.24.0,<2.0.0)
25
22
  Requires-Dist: opentelemetry-exporter-otlp-proto-grpc (>=1.24.0,<2.0.0)